In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import classification_report,confusion_matrix,roc_curve,auc
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [35]:
X_train = pd.read_csv("../data/X_train.csv", index_col=0)  # Use the first column as index
y_train = pd.read_csv("../data/y_train.csv", index_col=0)  # Use the first column as index
X_test = pd.read_csv("../data/X_test.csv", index_col=0)    # Use the first column as index
y_test = pd.read_csv("../data/y_test.csv", index_col=0)    # Use the first column as index

print(X_train.isnull().sum())
print(X_train['workclass'].unique())

age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
dtype: int64
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' nan
 'Self-emp-inc' 'Without-pay' 'Never-worked']


In [36]:
# We thus want to factorize the object variables. To do this to both the test and the train data whilst ensuring the same key is used
# for both, we briefly concatinate the dataframes before splitting them back up
X_all = pd.concat({'X_train':X_train, 'X_test':X_test})

# Then we factorize
objects = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']
keys = [0]*len(objects)

for i in range(len(objects)):
    X_all[objects[i]], keys[i] = pd.factorize(X_all[objects[i]])

# Then we return the original test train split
X_train = X_all.loc['X_train']
X_test = X_all.loc['X_test']

X_train['workclass'].unique()

array([ 0,  1,  2,  3,  4, -1,  5,  6,  7], dtype=int64)

In [37]:
X_train['workclass'].dtypes

dtype('int64')

In [46]:
missingcols = ['workclass','occupation','native-country']

X_train_nan=X_train.copy()

for col in missingcols:
    X_train_nan.loc[X_train[col] == -1,col] = np.nan

X_train_nan['workclass'].unique()

array([ 0.,  1.,  2.,  3.,  4., nan,  5.,  6.,  7.])

In [39]:
X_train['workclass'].dtypes

dtype('int64')

In [40]:
X_train_med = X_train_nan.copy()

for col in missingcols:
    col_median=X_train_nan[col].median()
    X_train_med.fillna({col: col_median}, inplace=True)    

X_train_med['workclass'].unique()

array([0., 1., 2., 3., 4., 5., 6., 7.])

In [41]:
X_train_mean = X_train_nan.copy()

for col in missingcols:
    col_mean=int(X_train_nan[col].mean())
    X_train_mean.fillna({col: col_mean}, inplace=True)    

X_train_mean['workclass'].unique()

array([0., 1., 2., 3., 4., 5., 6., 7.])

In [42]:
pipe = make_pipeline(StandardScaler(), linear_model.LogisticRegression())

pipe.fit(X_train, y_train.values.ravel())  # apply scaling on training data

print(pipe.score(X_train, y_train))
print(pipe.score(X_test, y_test))

0.8258912857645698
0.8201367445963829


In [43]:
pipe = make_pipeline(StandardScaler(), linear_model.LogisticRegression())

pipe.fit(X_train_med, y_train.values.ravel())  # apply scaling on training data

print(pipe.score(X_train_med, y_train))
print(pipe.score(X_test, y_test))

0.8261929903957359
0.8194750771945302


In [44]:
pipe = make_pipeline(StandardScaler(), linear_model.LogisticRegression())

pipe.fit(X_train_mean, y_train.values.ravel())  # apply scaling on training data

print(pipe.score(X_train_mean, y_train))
print(pipe.score(X_test, y_test))

0.8261175642379444
0.819585355094839


In [47]:
X_train_nan.notnull().mean()

age               1.000000
workclass         0.929627
fnlwgt            1.000000
education         1.000000
education-num     1.000000
marital-status    1.000000
occupation        0.929376
relationship      1.000000
race              1.000000
sex               1.000000
capital-gain      1.000000
capital-loss      1.000000
hours-per-week    1.000000
native-country    0.978453
dtype: float64