# MiniLab: Logistic Regression and Support Vector Machines

In [1]:
import datetime
import itertools
import os
import pathlib
import sklearn

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import scipy.stats as stats

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

## Create Models

In [2]:
## from: https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
header_names = [
    'age',
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'wage_per_hour',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'capital_gains',
    'capital_losses',
    'stock_dividends',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'instance_weight', ## this field is not used as a feature
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'num_emp',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'weeks_worked',
    'year',
    'income_50k',
]

In [3]:
data_dir = os.path.join(pathlib.Path(os.getcwd()).parent, 'data')
df = pd.read_csv(os.path.join(data_dir, 'census-income.data.csv'), header=None, names=header_names)
df_test = pd.read_csv(os.path.join(data_dir, 'census-income.test.csv'), header=None, names=header_names)
df = pd.concat([df,df_test]) ## the test file is also labelled so they can be merged
df.drop(columns=['instance_weight']) ## not used for our analysis

Unnamed: 0,age,class_worker,det_ind_code,det_occ_code,education,wage_per_hour,hs_college,marital_stat,major_ind_code,major_occ_code,...,country_father,country_mother,country_self,citizenship,own_or_self,vet_question,vet_benefits,weeks_worked,year,income_50k
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99757,14,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,95,- 50000.
99758,61,Private,8,36,11th grade,0,Not in universe,Separated,Manufacturing-durable goods,Machine operators assmblrs & inspctrs,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,95,- 50000.
99759,24,Self-employed-not incorporated,1,43,7th and 8th grade,0,Not in universe,Married-civilian spouse present,Agriculture,Farming forestry and fishing,...,Mexico,Mexico,Mexico,Foreign born- U S citizen by naturalization,0,Not in universe,2,52,94,- 50000.
99760,30,Private,45,2,Bachelors degree(BA AB BS),0,Not in universe,Married-civilian spouse present,Other professional services,Executive admin and managerial,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,95,- 50000.


In [4]:
categorical_features = [
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'year',
]
df[categorical_features] = df[categorical_features].astype('category')

In [5]:
### Drop columns not used in modelling
df = df.drop(
    columns=[
        'state_prev_res',
        'country_father',
        'country_mother',
        'country_self',
    ]
)


Create a logistic regression model and a support vector machine model for the classification task involved with your dataset. Assess how well each model performs (use 80/20 training/testing split for your data). Adjust parameters of the models to make them more accurate. If your dataset size requires the use of stochastic gradient descent, then linear kernel only is fine to use. That is, the SGDClassifier is fine to use for optimizing logistic regression and linear support vector machines. For many problems, SGD will be required in order to train the SVM model in a reasonable timeframe. 

In [6]:
def get_model(classifier, numeric_features, categorical_features):
    ### Scale numerical, one hot categorical
    numeric_transformer = Pipeline(
        steps=[
            #('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ]
    )
    categorical_transformer = Pipeline(
        steps=[
            #('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)]
    )
    model = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('classifier', classifier),
        ]
    )
    
    return model

In [7]:
### Find the numerical/ categorical features
target = 'income_50k'
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object','bool', 'category']).drop([target], axis=1).columns

X = df.drop(target, axis=1)
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [None]:
logreg_param_scores = {}


class_weights =  [None, 'balanced']
penalties = ['none','l1', 'l2',]
solvers = ['lbfgs', 'liblinear']

for cw in class_weights:
    for penalty in penalties:
        for solver in solvers:
            if solver == 'lbfgs' and penalty == 'l1': 
                continue
            if solver == 'liblinear' and penalty == 'none': 
                continue
            params = (cw, penalty, solver)
            print(params)

            classifier = LogisticRegression(class_weight=cw, penalty=penalty, solver=solver, max_iter=100000)
            model = get_model(classifier, numeric_features, categorical_features)
            model.fit(X_train,y_train)
            y_pred = model.predict(X_test)

            acc = accuracy_score(y_pred, y_test)
            logreg_param_scores[params] = acc
            print(acc)

print('=' * 60)
for params, score in logreg_param_scores.items():
    print(params, score)

In [None]:
logreg_param_scores = {}


class_weights =  [None, 'balanced']
penalties = ['none','l1', 'l2',]
solvers = ['lbfgs', 'liblinear']

for cw in class_weights:
    for penalty in penalties:
        for solver in solvers:
            if solver == 'lbfgs' and penalty == 'l1': 
                continue
            if solver == 'liblinear' and penalty == 'none': 
                continue
            params = (cw, penalty, solver)
            print(params)

            classifier = LogisticRegression(class_weight=cw, penalty=penalty, solver=solver, max_iter=100000)
            model = get_model(classifier, numeric_features, categorical_features)
            model.fit(X_train,y_train)
            y_pred = model.predict(X_test)

            acc = accuracy_score(y_pred, y_test)
            logreg_param_scores[params] = acc
            print(acc)

print('=' * 60)
for params, score in logreg_param_scores.items():
    print(params, score)

In [8]:
sgd_param_scores = {}


class_weights =  [None, 'balanced']
penalties = ['l1', 'l2']
alphas = [0.00001, 0.0001, 0.001, 0.01, 0.1]

for cw in class_weights:
    for penalty in penalties:
        for alpha in alphas:
            params = (cw, penalty, alpha)
            print(params)

            classifier = SGDClassifier(class_weight=cw, penalty=penalty, alpha=alpha, max_iter=100000, loss="hinge")
            model = get_model(classifier, numeric_features, categorical_features)
            model.fit(X_train,y_train)
            y_pred = model.predict(X_test)

            acc = accuracy_score(y_pred, y_test)
            sgd_param_scores[params] = acc
            print(acc)

print('=' * 60)
for params, score in sgd_param_scores.items():
    print(params, score)

(None, 'l1', 1e-05)
0.9507158728302454
(None, 'l1', 0.0001)
0.9479091835541373
(None, 'l1', 0.001)
0.9438829209616252
(None, 'l1', 0.01)
0.939572648144745
(None, 'l1', 0.1)
0.9381860099904773
(None, 'l2', 1e-05)
0.9482600197136508
(None, 'l2', 0.0001)
0.9519521526304359
(None, 'l2', 0.001)
0.9483936715839417
(None, 'l2', 0.01)
0.9428638254506574
(None, 'l2', 0.1)
0.9400571361745493
('balanced', 'l1', 1e-05)
0.8088444125164976
('balanced', 'l1', 0.0001)
0.829142790316922
('balanced', 'l1', 0.001)
0.848071236446865
('balanced', 'l1', 0.01)
0.7764839534223232
('balanced', 'l1', 0.1)
0.5999298327680973
('balanced', 'l2', 1e-05)
0.8587132666187748
('balanced', 'l2', 0.0001)
0.8244148554053828
('balanced', 'l2', 0.001)
0.8546201780911171
('balanced', 'l2', 0.01)
0.825918438946155
('balanced', 'l2', 0.1)
0.7773861035467865
(None, 'l1', 1e-05) 0.9507158728302454
(None, 'l1', 0.0001) 0.9479091835541373
(None, 'l1', 0.001) 0.9438829209616252
(None, 'l1', 0.01) 0.939572648144745
(None, 'l1', 0.1)

In [10]:
#Subsampling the dataset
sampleDF = df.sample(frac = .5)

if(0.5*(len(df)) == len(sampleDF)):
    print("good")
    print(len(df), len(sampleDF))
    
#show
sampleDF.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 149642 entries, 16136 to 127732
Data columns (total 38 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   age               149642 non-null  int64   
 1   class_worker      149642 non-null  category
 2   det_ind_code      149642 non-null  category
 3   det_occ_code      149642 non-null  category
 4   education         149642 non-null  category
 5   wage_per_hour     149642 non-null  int64   
 6   hs_college        149642 non-null  category
 7   marital_stat      149642 non-null  category
 8   major_ind_code    149642 non-null  category
 9   major_occ_code    149642 non-null  category
 10  race              149642 non-null  category
 11  hisp_origin       149642 non-null  category
 12  sex               149642 non-null  category
 13  union_member      149642 non-null  category
 14  unemp_reason      149642 non-null  category
 15  full_or_part_emp  149642 non-null  category
 16

In [None]:
#svm_param_scores = {}


# class_weights =  [None, 'balanced']
# kernals = ['linear','rbf']
# gammas = ['scale', 'auto']
# 
# for cw in class_weights:
#     for k in kernals:
#         for gam in gammas:
#             if gam == 'scale' and k == 'linear': 
#                 continue
#             if gam == 'auto' and k == 'rbf': 
#                 continue
#             params = (cw, k, gam)
#             print(params)
from sklearn import svm

classifier = svm.SVC(max_iter=100000, gamma ='auto', C=0.0001, kernel='rbf')
model = get_model(classifier, numeric_features, categorical_features)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

acc_svm = accuracy_score(y_pred, y_test)
#svm_param_scores[params] = acc
print(acc_svm)

# print('=' * 60)
# for params, score in svm_param_scores.items():
#     print(params, score)

## Model Advantages

Discuss the advantages of each model for each classification task. Does one type of model offer superior performance over another in terms of prediction accuracy? In terms of training time or efficiency? Explain in detail.

## Interpret Feature Importance

In [None]:
## get catergorical cols after one hot
cat_columns = best_logistic_regression_model.named_steps['preprocessor'].transformers_[1][1]\
    .named_steps['onehot'].get_feature_names(categorical_features)
## combine numerical and categorical (same order as pipeline)
all_cols = np.concatenate((numeric_features, cat_columns), axis=0)
coef_scores = [coef_score for coef_score in zip(all_cols,best_logistic_regression_coefs[0])]
coef_scores.sort(key=lambda tup: abs(tup[1]), reverse=True)
for coef, weight in coef_scores:
    print('{coef:70}{weight:.3f}'.format(coef=coef, weight=weight))

In [None]:
# Add sns plot

## Interpret Support Vectors

In [None]:
#Getting a classification report
from sklearn.metrics import classification_report

classification_report(y_test, y_hat)

In [None]:
#Look at support vectors
print(classifier.support_vectors_.shape)
print(classifier.support_.shape)
print(classifier.n_support_)
print(classifier.classes_)