In [1]:
#Importing the libraries we need for this analysis
import datetime
import itertools
import os
import pathlib
import sklearn
import random

import numpy as np
import pandas as pd
#import plotly.graph_objects as go
import seaborn as sns
import scipy.stats as stats

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [2]:
## from: https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
header_names = [
    'age',
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'wage_per_hour',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'capital_gains',
    'capital_losses',
    'stock_dividends',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'instance_weight', ## this field is not used as a feature
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'num_emp',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'weeks_worked',
    'year',
    'income_50k',
]

In [3]:
df = pd.read_csv('/Users/password1234/Documents/Machine Learning/census-income.data.csv', header=None, names=header_names)
df_test = pd.read_csv('/Users/password1234/Documents/Machine Learning/census-income.test.csv', header=None, names=header_names)
df = pd.concat([df,df_test]) ## the test file is also labelled so they can be merged
df.drop(columns=['instance_weight']) ## not used for our analysis

Unnamed: 0,age,class_worker,det_ind_code,det_occ_code,education,wage_per_hour,hs_college,marital_stat,major_ind_code,major_occ_code,...,country_father,country_mother,country_self,citizenship,own_or_self,vet_question,vet_benefits,weeks_worked,year,income_50k
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99757,14,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,95,- 50000.
99758,61,Private,8,36,11th grade,0,Not in universe,Separated,Manufacturing-durable goods,Machine operators assmblrs & inspctrs,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,95,- 50000.
99759,24,Self-employed-not incorporated,1,43,7th and 8th grade,0,Not in universe,Married-civilian spouse present,Agriculture,Farming forestry and fishing,...,Mexico,Mexico,Mexico,Foreign born- U S citizen by naturalization,0,Not in universe,2,52,94,- 50000.
99760,30,Private,45,2,Bachelors degree(BA AB BS),0,Not in universe,Married-civilian spouse present,Other professional services,Executive admin and managerial,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,95,- 50000.


In [4]:
categorical_features = [
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'year',
]
df[categorical_features] = df[categorical_features].astype('category')

In [5]:
### Drop columns not used in modelling
model_df = df.drop(
    columns=[
        'region_prev_res',
        'state_prev_res',
        'country_father',
        'country_mother',
        'country_self',
        'year',
    ]
)

model_df.head()

Unnamed: 0,age,class_worker,det_ind_code,det_occ_code,education,wage_per_hour,hs_college,marital_stat,major_ind_code,major_occ_code,...,mig_same,mig_prev_sunbelt,num_emp,fam_under_18,citizenship,own_or_self,vet_question,vet_benefits,weeks_worked,income_50k
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,Not in universe under 1 year old,?,0,Not in universe,Native- Born in the United States,0,Not in universe,2,0,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,No,Yes,1,Not in universe,Native- Born in the United States,0,Not in universe,2,52,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Not in universe under 1 year old,?,0,Not in universe,Foreign born- Not a citizen of U S,0,Not in universe,2,0,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,Yes,Not in universe,0,Both parents present,Native- Born in the United States,0,Not in universe,0,0,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,Yes,Not in universe,0,Both parents present,Native- Born in the United States,0,Not in universe,0,0,- 50000.


In [11]:
#Subsampling the dataset
sampleDF = model_df.sample(frac = .5)

if(0.5*(len(model_df)) == len(sampleDF)):
    print("good")
    print(len(model_df), len(sampleDF))
    
#show
sampleDF.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 149642 entries, 171721 to 184731
Data columns (total 36 columns):
age                 149642 non-null int64
class_worker        149642 non-null category
det_ind_code        149642 non-null category
det_occ_code        149642 non-null category
education           149642 non-null category
wage_per_hour       149642 non-null int64
hs_college          149642 non-null category
marital_stat        149642 non-null category
major_ind_code      149642 non-null category
major_occ_code      149642 non-null category
race                149642 non-null category
hisp_origin         149642 non-null category
sex                 149642 non-null category
union_member        149642 non-null category
unemp_reason        149642 non-null category
full_or_part_emp    149642 non-null category
capital_gains       149642 non-null int64
capital_losses      149642 non-null int64
stock_dividends     149642 non-null int64
tax_filer_stat      149642 non-null category

In [12]:
from sklearn import svm

numeric_features = sampleDF.select_dtypes(include=['int64', 'float64']).columns
categorical_features = sampleDF.select_dtypes(include=['object','bool', 'category']).drop(['income_50k'], axis=1).columns


Scaler = StandardScaler()

one_encode = OneHotEncoder(handle_unknown='ignore')

classifier = svm.SVC(kernel = 'rbf', gamma = 'auto', C =2)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Scaler, numeric_features),
        ('cat', one_encode, categorical_features)]
)
        
model_up = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ]
)


y = sampleDF['income_50k']
X = sampleDF.drop('income_50k', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [14]:
from sklearn import svm

model_up.fit(X_train, y_train)

y_hat = model_up.predict(X_test)

In [16]:
#Accuracy scores and confusion matrix

from sklearn.svm import SVC
from sklearn import metrics as mt

acc = mt.accuracy_score(y_test, y_hat)
conf = mt.confusion_matrix(y_test, y_hat)
print(acc)
print(conf)

0.9492131377593638
[[27934   115]
 [ 1405   475]]


In [17]:
#Getting a classification report
from sklearn.metrics import classification_report

classification_report(y_test, y_hat)

'              precision    recall  f1-score   support\n\n    - 50000.       0.95      1.00      0.97     28049\n     50000+.       0.81      0.25      0.38      1880\n\n    accuracy                           0.95     29929\n   macro avg       0.88      0.62      0.68     29929\nweighted avg       0.94      0.95      0.94     29929\n'

In [53]:
#Look at support vectors
print(classifier.support_vectors_.shape)
print(classifier.support_)
print(classifier.n_support_)
print(classifier.classes_)

(13835, 322)
[    13     64     65 ... 119662 119671 119702]
[6983 6852]
[' - 50000.' ' 50000+.']


In [26]:
svm_param_scores = {}


class_weights =  [None, 'balanced']
kernals = ['linear','rbf']
gammas = ['scale', 'auto']

for cw in class_weights:
    for k in kernals:
        for gam in gammas:
            if gam == 'scale' and k == 'linear': 
                continue
            if gam == 'auto' and k == 'rbf': 
                continue
            params = (cw, k, gam)
            print(params)

            classifier = svm.SVC(class_weight=cw, k=k, gam=gam, max_iter=100000)
            model = model_up(classifier, numeric_features, categorical_features)
            model.fit(X_train,y_train)
            y_pred = model.predict(X_test)

            acc = accuracy_score(y_pred, y_test)
            svm_param_scores[params] = acc
            print(acc)

print('=' * 60)
for params, score in svm_param_scores.items():
    print(params, score)

(None, 'linear', 'auto')


TypeError: __init__() got an unexpected keyword argument 'k'

In [54]:
from sklearn.model_selection import ShuffleSplit


num_cv_iterations = 2
num_instances = len(y_train)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         test_size  = 0.2, random_state = 1)
                         
print(cv_object)
print(num_instances)

ShuffleSplit(n_splits=2, random_state=1, test_size=0.2, train_size=None)
119713


In [55]:
for train_indices, test_indices in cv_object.split(X,y):
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]

KeyError: "None of [Int64Index([ 58074, 118700,  78082,  44389, 106921,  88027,  70912,    591,\n             72329,  81793,\n            ...\n             21758,  31228, 138823,  21440, 117583,  73349, 109259,  50057,\n              5192, 128037],\n           dtype='int64', length=119713)] are in the [columns]"

In [51]:
#Add back the class

arr = X_train
testedDF = pd.DataFrame(data = arr)
supportDF = testedDF.loc[classifier.support_].copy()

supportDF['income_50k'] = y[classifier.support_]
sampleDF['income_50k'] = y

#testedDF = sampleDF.loc[y_train].copy()
#supportDF = testedDF.loc[classifier.support_ ,:].copy()
#
#supportDF['income_50k'] = y[classifier.support_]
#sampleDF['income_50k'] = y
#
#supportDF.info()
supportDF.info()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  """


ValueError: cannot reindex from a duplicate axis

In [56]:
support_vectors = classifier.support_vectors_

#visualize
plt.scatter(X_train[:,0], X_train[:,1])
plt.scatter(support_vectors[:,0], support_vectors[:,1], color='red')
plt.title('Linearly separable data with support vectors')
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()

TypeError: '(slice(None, None, None), 0)' is an invalid key