In [1]:
#Importing the libraries we need for this analysis

import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import matplotlib.style as style
import scipy.stats as stats
import seaborn as sns
%matplotlib inline
style.use('bmh') ## style for charts

In [2]:
## from: https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
header_names = [
    'age',
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'wage_per_hour',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'capital_gains',
    'capital_losses',
    'stock_dividends',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'instance_weight', ## this field is not used as a feature
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'num_emp',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'weeks_worked',
    'year',
    'income_50k',
]

In [3]:
df = pd.read_csv('/Users/password1234/Documents/Machine Learning/census-income.data.csv', header=None, names=header_names)
df_test = pd.read_csv('/Users/password1234/Documents/Machine Learning/census-income.test.csv', header=None, names=header_names)
df = pd.concat([df,df_test]) ## the test file is also labelled so they can be merged
df.drop(columns=['instance_weight']) ## not used for our analysis

Unnamed: 0,age,class_worker,det_ind_code,det_occ_code,education,wage_per_hour,hs_college,marital_stat,major_ind_code,major_occ_code,...,country_father,country_mother,country_self,citizenship,own_or_self,vet_question,vet_benefits,weeks_worked,year,income_50k
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99757,14,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,95,- 50000.
99758,61,Private,8,36,11th grade,0,Not in universe,Separated,Manufacturing-durable goods,Machine operators assmblrs & inspctrs,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,95,- 50000.
99759,24,Self-employed-not incorporated,1,43,7th and 8th grade,0,Not in universe,Married-civilian spouse present,Agriculture,Farming forestry and fishing,...,Mexico,Mexico,Mexico,Foreign born- U S citizen by naturalization,0,Not in universe,2,52,94,- 50000.
99760,30,Private,45,2,Bachelors degree(BA AB BS),0,Not in universe,Married-civilian spouse present,Other professional services,Executive admin and managerial,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,95,- 50000.


In [4]:
## Create a new variable for classification based of if the person recieved a 
## college degree
higer_degrees = [
    ' Bachelors degree(BA AB BS)', 
    ' Masters degree(MA MS MEng MEd MSW MBA)', 
    ' Prof school degree (MD DDS DVM LLB JD)',
    ' Doctorate degree(PhD EdD)',
]

df['graduated'] = 'no'
df.loc[df['education'].isin(higer_degrees), 'graduated'] = 'yes'
df.shape

(299285, 43)

In [5]:
categorical_features = [
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'year',
]
continuous_features = [
    'age', 
    'wage_per_hour',
    'capital_gains',
    'capital_losses',
    'stock_dividends',
    'num_emp',
    'weeks_worked',
  ]
df[categorical_features] = df[categorical_features].astype('category')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 299285 entries, 0 to 99761
Data columns (total 43 columns):
age                 299285 non-null int64
class_worker        299285 non-null category
det_ind_code        299285 non-null category
det_occ_code        299285 non-null category
education           299285 non-null category
wage_per_hour       299285 non-null int64
hs_college          299285 non-null category
marital_stat        299285 non-null category
major_ind_code      299285 non-null category
major_occ_code      299285 non-null category
race                299285 non-null category
hisp_origin         299285 non-null category
sex                 299285 non-null category
union_member        299285 non-null category
unemp_reason        299285 non-null category
full_or_part_emp    299285 non-null category
capital_gains       299285 non-null int64
capital_losses      299285 non-null int64
stock_dividends     299285 non-null int64
tax_filer_stat      299285 non-null category
regio

In [64]:
#Keeping only a specific subset of variables for the classification and also transforming 
#Income so it can be used in our classification model.
columnsKeep = [
    'age',
    'education',
    'race', 
    'sex', 
    'capital_gains', 
    'capital_losses', 
    'stock_dividends', 
    'tax_filer_stat', 
    'det_hh_summ', 
    'own_or_self', 
    'vet_benefits', 
    'weeks_worked',
    'income_50k'
]

updatedDF = df.loc[:, columnsKeep]

updatedDF['income_50k'] = updatedDF['income_50k'].str.replace('- 50000.','below_50k')
updatedDF['income_50k'] = updatedDF['income_50k'].str.replace(' 50000+.','above_50k')


updatedDF.head()

Unnamed: 0,age,education,race,sex,capital_gains,capital_losses,stock_dividends,tax_filer_stat,det_hh_summ,own_or_self,vet_benefits,weeks_worked,income_50k
0,73,High school graduate,White,Female,0,0,0,Nonfiler,Other relative of householder,0,2,0,below_50k
1,58,Some college but no degree,White,Male,0,0,0,Head of household,Householder,0,2,52,below_50k
2,18,10th grade,Asian or Pacific Islander,Female,0,0,0,Nonfiler,Child 18 or older,0,2,0,below_50k
3,9,Children,White,Female,0,0,0,Nonfiler,Child under 18 never married,0,0,0,below_50k
4,10,Children,White,Female,0,0,0,Nonfiler,Child under 18 never married,0,0,0,below_50k


In [65]:
ind_cols=['education', 'race', 'sex', 'tax_filer_stat', 'det_hh_summ', 'own_or_self', 'vet_benefits', 'income_50k']

# df_trunc.loc[:,ind_cols].head()

for col in ind_cols:
    updatedDF[col] = updatedDF[col].astype('category')

updatedDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 299285 entries, 0 to 99761
Data columns (total 13 columns):
age                299285 non-null int64
education          299285 non-null category
race               299285 non-null category
sex                299285 non-null category
capital_gains      299285 non-null int64
capital_losses     299285 non-null int64
stock_dividends    299285 non-null int64
tax_filer_stat     299285 non-null category
det_hh_summ        299285 non-null category
own_or_self        299285 non-null category
vet_benefits       299285 non-null category
weeks_worked       299285 non-null int64
income_50k         299285 non-null category
dtypes: category(8), int64(5)
memory usage: 16.0 MB


In [66]:
tmp_df = pd.get_dummies(updatedDF['income_50k'])

updatedDF = pd.concat((updatedDF,tmp_df),axis=1)

#updatedDF=updatedDF.drop(['income_50k', ' below_50k'], axis=1)

print(list(updatedDF.columns))

print(updatedDF.head())

['age', 'education', 'race', 'sex', 'capital_gains', 'capital_losses', 'stock_dividends', 'tax_filer_stat', 'det_hh_summ', 'own_or_self', 'vet_benefits', 'weeks_worked', 'income_50k', ' below_50k', 'above_50k.']
   age                    education                        race      sex  \
0   73         High school graduate                       White   Female   
1   58   Some college but no degree                       White     Male   
2   18                   10th grade   Asian or Pacific Islander   Female   
3    9                     Children                       White   Female   
4   10                     Children                       White   Female   

   capital_gains  capital_losses  stock_dividends      tax_filer_stat  \
0              0               0                0            Nonfiler   
1              0               0                0   Head of household   
2              0               0                0            Nonfiler   
3              0               0       

In [67]:
from sklearn.model_selection import ShuffleSplit

# we want to predict the X and y data as follows:
if 'income_50k' in updatedDF:
    y = updatedDF['income_50k'].values
    del updatedDF['income_50k']
    X = updatedDF.values
    
    
#Doing three iterations for cv
num_cv_iterations = 3
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         test_size  = 0.2)
                         
print(cv_object)

ShuffleSplit(n_splits=3, random_state=None, test_size=0.2, train_size=None)


In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt

# first we create a reusable logisitic regression object
#   here we can setup the object with different learning parameters and constants
lr_clf = LogisticRegression(penalty='l2', C=1.0, class_weight=None, solver='liblinear' ) # get object


iter_num=0
# the indices are the rows used for training and testing in each iteration
for train_indices, test_indices in cv_object.split(X,y): 
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
    
lr_clf.fit(X_train,y_train)  # train object
y_hat = lr_clf.predict(X_test) # get test set precitions
    


In [70]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV


columnsToScale = ['capital_gains', 'capital_losses', 'stock_dividends']

scaler = StandardScaler()

cat_cols = ind_cols[:-1]

one_encode = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, columnsToScale),
        ('cat', one_encode, cat_cols)])

lr_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('lr_classifier', LogisticRegression(max_iter=100000))])

lr_clf.fit(X_train, y_train)
print("model score: " + str(round((lr_clf.score(X_test, y_test) * 100),2)) + "%")

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [69]:
updatedDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 299285 entries, 0 to 99761
Data columns (total 14 columns):
age                299285 non-null int64
education          299285 non-null category
race               299285 non-null category
sex                299285 non-null category
capital_gains      299285 non-null int64
capital_losses     299285 non-null int64
stock_dividends    299285 non-null int64
tax_filer_stat     299285 non-null category
det_hh_summ        299285 non-null category
own_or_self        299285 non-null category
vet_benefits       299285 non-null category
weeks_worked       299285 non-null int64
 below_50k         299285 non-null uint8
above_50k.         299285 non-null uint8
dtypes: category(7), int64(5), uint8(2)
memory usage: 16.3 MB


In [75]:
#SVM One shot
from sklearn.preprocessing import StandardScaler

scaler.fit(X_train)

for train_indices, test_indices in cv_object.split(X,y): 
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]

X_trained_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

ValueError: could not convert string to float: ' Children'