### Imputing missing values in the dataset using global most common substitution

In [5]:
import pandas as pd
import numpy as np

import random

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import pandas_profiling

In [788]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
          'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'y']

In [789]:
df_train = pd.read_csv('adult.data', names=cols, header=None)
df_test = pd.read_csv('adult.test', names=cols, header=None).iloc[1:]

In [790]:
num_cols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
cat_cols = [col for col in df_train.columns if col not in num_cols and col != "y"]
cat_cols

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country']

In [791]:
df_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [792]:
# numeric imputation
numeric_imputer = SimpleImputer(missing_values = 0, strategy = 'median')
df_train[['capital_loss', 'capital_gain']] = numeric_imputer.fit_transform(df_train[['capital_loss', 'capital_gain']])
df_test[['capital_loss', 'capital_gain']] = numeric_imputer.fit_transform(df_test[['capital_loss', 'capital_gain']])

# categorical imputation
categoric_imputer = SimpleImputer(missing_values= ' ?', strategy='most_frequent')
df_train[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(df_train[['workclass', 'occupation', 'native_country']])
df_test[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(df_test[['workclass', 'occupation', 'native_country']])

df_train['race'] = df_train['race'].str.replace(' ', '')
races = list(np.unique(df_train['race'].values))
race = df_train['race']
df_train_original = df_train.copy()

df_train = pd.get_dummies(df_train, columns = cat_cols)
df_test = pd.get_dummies(df_test, columns = cat_cols)

df_train['y'] = df_train['y'].str.replace(' ', '')

df_tr = df_train.copy()
df_te = df_test.copy()

Y = df_train['y']
df_train.drop('y', axis=1, inplace=True)

x_train, x_test, y_train, y_test = train_test_split(df_train, Y)
model = LogisticRegression(solver='lbfgs')
model.fit(x_train, y_train)
model.fit(x_test, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

### Original results

In [793]:
train_acc_original = model.score(x_train, y_train)
print("Train accuracy: ", train_acc_original)
test_acc_original = metrics.accuracy_score(y_test, model.predict(x_test))
print("Test accuracy: ", test_acc_original)

Train accuracy:  0.7696969696969697
Test accuracy:  0.7718953445522663


In [794]:
def train(df_train):
    Y = df_train['y']
    df_train.drop('y', axis=1, inplace=True)

    x_train, x_test, y_train, y_test = train_test_split(df_train, Y)
    model = LogisticRegression(solver='lbfgs', multi_class='auto')
    model.fit(x_train, y_train)
    model.fit(x_test, y_test)
    
    train_score = model.score(x_train, y_train)
    test_score = metrics.accuracy_score(y_test, model.predict(x_test))
    return train_score, test_score

# Misclassification noise

In [795]:
df_tr2 = df_tr.copy()
ns = [1, 5, 10, 20]
accs_2 = dict()

In [796]:
for n in ns:
    df_tr2 = df_tr.copy()
    curr_len = int(len(df_tr2['y']) * n / 100)
    for i in range(int(len(df_tr2['y'][:curr_len]))):
        if df_tr2['y'][i] == '<=50K':
            df_tr2['y'][i] = '>50K'
        else:
            df_tr2['y'][i] = '<=50K'
    accs_2[n] = train(df_tr2)



In [797]:
for n in ns:
    if abs(accs_2[n][0] - train_acc_original) <= 0.01 and abs(accs_2[n][1] - test_acc_original) <= 0.01:
        print("Flipping ", n, "% is safe", sep='')
    else:
        print("Flipping ", n, "% is not safe", sep='')
        

Flipping 1% is safe
Flipping 5% is not safe
Flipping 10% is not safe
Flipping 20% is not safe


# Attribute noise

In [798]:
df_tr3 = df_train_original.copy()
accs_3 = dict()

In [799]:
for n in ns:
    df_tr3 = df_train_original.copy()
    curr_len = int(len(df_tr3) * n / 100)
    for i in range(int(len(df_tr3[:curr_len]))):
        df_tr3['age'][i] = -df_tr3['age'][i]
        df_tr3['education_num'][i] = random.randint(20, 100)
        df_tr3['race'][i] = random.sample(races, 1)[0]
    df_tr3 = pd.get_dummies(df_tr3, columns = cat_cols)
    accs_3[n] = train(df_tr3)





## Impact comparison

In [800]:
lst = []
for n in ns:
    res = []
    for i in range(6):
        if i == 0:
            res.append(train_acc_original)
        elif i == 1:
            res.append(test_acc_original)
        elif i == 2:
            res.append(accs_2[n][0])
        elif i == 3:
            res.append(accs_2[n][1])
        elif i == 4:
            res.append(accs_3[n][0])
        elif i == 5:
            res.append(accs_3[n][1])
    lst.append(res)

In [801]:
result = pd.DataFrame(lst, columns=['Original train', 'Original test', 
                               'Misclassification noise train', 'Misclassification noise test',
                               'Attribute noise train', 'Attribute noise test'], index=[1, 5, 10, 20])
result.index.name = "% flipped"

In [802]:
result

Unnamed: 0_level_0,Original train,Original test,Misclassification noise train,Misclassification noise test,Attribute noise train,Attribute noise test
% flipped,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.769697,0.771895,0.764373,0.766491,0.769533,0.780985
5,0.769697,0.771895,0.744922,0.742906,0.771785,0.766491
10,0.769697,0.771895,0.713022,0.715268,0.769779,0.770421
20,0.769697,0.771895,0.657985,0.660238,0.769328,0.763174


### Clearly, misclassification noise has a greater impact on the accuracy of the model, especially when n = 10 or 20.

# Misclassification noise elimination