# Logistic regression  for Census dataset

In [1]:
import pandas as pd
import numpy as np

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import pandas_profiling

In [138]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
          'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'y']

In [139]:
df_train = pd.read_csv('adult.data', names=cols, header=None)
df_test = pd.read_csv('adult.test', names=cols, header=None).iloc[1:]

In [140]:
df_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [141]:
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
5,18,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K.


# Strategy 1 -- Do not deal with missing values

In [142]:
num_cols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
cat_cols = [col for col in df_train.columns if col not in num_cols and col != "y"]
cat_cols

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country']

In [143]:
df_test.shape

(16281, 15)

In [144]:
df_train.shape

(32561, 15)

In [116]:
df_train.dropna(subset=cat_cols, axis=0)
df_test.dropna(subset=cat_cols, axis=0)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
5,18,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K.
6,34,Private,198693.0,10th,6.0,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,30.0,United-States,<=50K.
7,29,?,227026.0,HS-grad,9.0,Never-married,?,Unmarried,Black,Male,0.0,0.0,40.0,United-States,<=50K.
8,63,Self-emp-not-inc,104626.0,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103.0,0.0,32.0,United-States,>50K.
9,24,Private,369667.0,Some-college,10.0,Never-married,Other-service,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K.
10,55,Private,104996.0,7th-8th,4.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,10.0,United-States,<=50K.


In [117]:
# drop rows from categorical columns with "?"
for item in cat_cols:
    df_train = df_train[~df_train[item].isin([" ?"])]
    df_test = df_test[~df_test[item].isin([' ?'])]

In [118]:
# transforming categorical values to numeric
df_train = pd.get_dummies(df_train, columns = cat_cols)
df_test = pd.get_dummies(df_test, columns = cat_cols)

## Modelling

In [119]:
Y = df_train['y']
df_train.drop('y', axis=1, inplace=True)

x_train, x_test, y_train, y_test = train_test_split(df_train, Y)
model = LogisticRegression()
model.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### Train classification accuracy

In [120]:
train_acc_1 = model.score(x_train, y_train)
print("Train accuracy: ", train_acc_1)

Train accuracy:  0.7896202643561293


### Test classification accuracy

In [121]:
test_acc_1 = metrics.accuracy_score(y_test, model.predict(x_test))
print("Test accuracy: ", test_acc_1)

Test accuracy:  0.7951200106086725


### In conclusion, this strategy is indeed worh noting, though I believe there are some more effective ones :)

# Strategy 2 -- Global most common substitution

In [147]:
# reloading the data to try out the 2nd strategy
df_train2 = pd.read_csv('adult.data', names=cols, header=None)
df_test2 = pd.read_csv('adult.test', names=cols, header=None).iloc[1:]

In [148]:
# numeric imputation
numeric_imputer = SimpleImputer(missing_values = 0, strategy = 'median')
df_train2[['capital_loss', 'capital_gain']] = numeric_imputer.fit_transform(df_train2[['capital_loss', 'capital_gain']])
df_test2[['capital_loss', 'capital_gain']] = numeric_imputer.fit_transform(df_test2[['capital_loss', 'capital_gain']])

In [149]:
# categorical imputation
categoric_imputer = SimpleImputer(missing_values= ' ?', strategy='most_frequent')
df_train2[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(df_train2[['workclass', 'occupation', 'native_country']])
df_test2[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(df_test2[['workclass', 'occupation', 'native_country']])

In [None]:
# cat_cols.append("y")
# cat_cols

In [150]:
df_train2 = pd.get_dummies(df_train2, columns = cat_cols)
df_test2 = pd.get_dummies(df_test2, columns = cat_cols)

In [151]:
Y = df_train2['y']
df_train2.drop('y', axis=1, inplace=True)

x_train, x_test, y_train, y_test = train_test_split(df_train2, Y)
model = LogisticRegression()
model.fit(x_train, y_train)
model.fit(x_test, y_test)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### Train classification accuracy

In [152]:
train_acc_2 = model.score(x_train, y_train)
print("Train accuracy: ", train_acc_2)

Train accuracy:  0.7746519246519247


### Test classification accuracy

In [153]:
test_acc_2 = metrics.accuracy_score(y_test, model.predict(x_test))
print("Test accuracy: ", test_acc_2)

Test accuracy:  0.7831961675469844


### Looks like this strategy works is a little less effective than the previous one