In [1]:
import datetime
import itertools
import os
import pathlib
import sklearn

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import scipy.stats as stats

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier


In [2]:
## from: https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
header_names = [
    'age',
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'wage_per_hour',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'capital_gains',
    'capital_losses',
    'stock_dividends',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'instance_weight', ## this field is not used as a feature
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'num_emp',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'weeks_worked',
    'year',
    'income_50k',
]

In [3]:
data_dir = os.path.join(pathlib.Path(os.getcwd()).parent, 'data')
df = pd.read_csv(os.path.join(data_dir, 'census-income.data.csv'), header=None, names=header_names)
df_test = pd.read_csv(os.path.join(data_dir, 'census-income.test.csv'), header=None, names=header_names)
df = pd.concat([df,df_test]) ## the test file is also labelled so they can be merged
df.drop(columns=['instance_weight']) ## not used for our analysis

Unnamed: 0,age,class_worker,det_ind_code,det_occ_code,education,wage_per_hour,hs_college,marital_stat,major_ind_code,major_occ_code,...,country_father,country_mother,country_self,citizenship,own_or_self,vet_question,vet_benefits,weeks_worked,year,income_50k
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99757,14,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,95,- 50000.
99758,61,Private,8,36,11th grade,0,Not in universe,Separated,Manufacturing-durable goods,Machine operators assmblrs & inspctrs,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,95,- 50000.
99759,24,Self-employed-not incorporated,1,43,7th and 8th grade,0,Not in universe,Married-civilian spouse present,Agriculture,Farming forestry and fishing,...,Mexico,Mexico,Mexico,Foreign born- U S citizen by naturalization,0,Not in universe,2,52,94,- 50000.
99760,30,Private,45,2,Bachelors degree(BA AB BS),0,Not in universe,Married-civilian spouse present,Other professional services,Executive admin and managerial,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,95,- 50000.


In [4]:
categorical_features = [
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'year',
]
df[categorical_features] = df[categorical_features].astype('category')

In [5]:
### Drop columns not used in modelling
df = df.drop(
    columns=[
        'state_prev_res',
        'country_father',
        'country_mother',
        'country_self',
    ]
)


In [6]:
def get_model(classifier, numeric_features, categorical_features):
    ### Scale numerical, one hot categorical
    numeric_transformer = Pipeline(
        steps=[
            #('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ]
    )
    categorical_transformer = Pipeline(
        steps=[
            #('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)]
    )
    model = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', classifier),
        ]
    )
    
    return model

In [7]:
### Find the numerical/ categorical features
target = 'income_50k'
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object','bool', 'category']).drop([target], axis=1).columns

X = df.drop(target, axis=1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


param_scores = {}


class_weights =  [None, 'balanced']
penalties = ['none','l1', 'l2',]
solvers = ['lbfgs', 'liblinear']



for cw in class_weights:
    for penalty in penalties:
        for solver in solvers:
            if solver == 'lbfgs' and penalty == 'l1': 
                continue
            if solver == 'liblinear' and penalty == 'none': 
                continue
            params = (cw, penalty, solver)
            print(params)

            classifier = LogisticRegression(class_weight=cw, penalty=penalty, solver=solver, max_iter=100000)
            model = get_model(classifier, numeric_features, categorical_features)
            model.fit(X_train,y_train)
            y_pred = model.predict(X_test)

            acc = accuracy_score(y_pred, y_test)
            param_scores[params] = acc
            print(acc)

print('=' * 60)
for params, score in param_scores.items():
    print(params, score)

(None, 'none', 'lbfgs')
0.952703944400822
(None, 'l1', 'liblinear')
0.9528208897873265
(None, 'l2', 'lbfgs')
0.9527206508846083
(None, 'l2', 'liblinear')
0.9526872379170356
('balanced', 'none', 'lbfgs')
0.8561571745994621
('balanced', 'l1', 'liblinear')
0.8560903486643167
('balanced', 'l2', 'lbfgs')
0.8562240005346075
('balanced', 'l2', 'liblinear')
0.8562407070183938
(None, 'none', 'lbfgs') 0.952703944400822
(None, 'l1', 'liblinear') 0.9528208897873265
(None, 'l2', 'lbfgs') 0.9527206508846083
(None, 'l2', 'liblinear') 0.9526872379170356
('balanced', 'none', 'lbfgs') 0.8561571745994621
('balanced', 'l1', 'liblinear') 0.8560903486643167
('balanced', 'l2', 'lbfgs') 0.8562240005346075
('balanced', 'l2', 'liblinear') 0.8562407070183938


In [8]:
X_train.info()
X_test.info()
print(y_test)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 239428 entries, 18574 to 128037
Data columns (total 37 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   age               239428 non-null  int64   
 1   class_worker      239428 non-null  category
 2   det_ind_code      239428 non-null  category
 3   det_occ_code      239428 non-null  category
 4   education         239428 non-null  category
 5   wage_per_hour     239428 non-null  int64   
 6   hs_college        239428 non-null  category
 7   marital_stat      239428 non-null  category
 8   major_ind_code    239428 non-null  category
 9   major_occ_code    239428 non-null  category
 10  race              239428 non-null  category
 11  hisp_origin       239428 non-null  category
 12  sex               239428 non-null  category
 13  union_member      239428 non-null  category
 14  unemp_reason      239428 non-null  category
 15  full_or_part_emp  239428 non-null  category
 16

In [12]:
regularize_const = 0.1
iterations = 10

svm_sgd_classifier = SGDClassifier(alpha=regularize_const,fit_intercept=True, l1_ratio=0.0, learning_rate='optimal',loss='hinge', n_iter_no_change=iterations, n_jobs=-1, penalty='l2')
model = get_model(svm_sgd_classifier, numeric_features, categorical_features)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

svm_sgd_acc = accuracy_score(y_pred, y_test)
print('SVM:', svm_sgd_acc)



SVM: 0.940040429690763


In [13]:
print(svm_sgd_classifier.n_iter_)


11


In [14]:
print(svm_sgd_classifier.coef_)

[[ 1.36683078e-02 -5.54069593e-03  9.58023997e-02  7.01230471e-03
   5.54154313e-02  2.29285301e-03  9.54522067e-03  1.72583118e-02
  -8.43296258e-03 -2.07084097e-02 -1.82631923e-03 -1.90985150e-02
  -2.91755446e-02  5.75613295e-03 -1.43751447e-02 -1.81948477e-02
  -9.53027288e-04 -2.09248342e-02 -2.59329736e-03 -4.98915481e-03
   2.57810968e-03 -6.40540652e-03 -2.63506350e-03 -1.55294088e-03
  -9.90996503e-04  3.11347560e-04 -1.37068865e-03 -4.55630576e-05
   1.62508239e-03  1.68583313e-03  5.13723475e-03  1.29095330e-04
   4.02473676e-04  6.45476650e-04 -2.73378346e-04 -1.18843642e-03
  -4.27533357e-03  4.17661361e-04 -6.91039707e-04 -2.48318664e-03
   7.29008922e-04 -2.13386986e-03  6.22695121e-03  1.39726710e-03
  -5.20178241e-04 -2.20221445e-04 -5.04610863e-03  2.24018367e-03
   1.10110723e-03 -1.95541456e-03 -1.43675508e-02  2.11868218e-03
  -1.92124226e-03 -2.93881722e-03 -1.96680532e-03 -4.62844727e-03
  -5.90041596e-03 -5.68019452e-03 -3.46279238e-03 -5.58147456e-04
  -1.41093