In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.preprocessing import PolynomialFeatures

In [2]:
test_df = pd.read_csv('./data/test_data.csv')


In [3]:
df = pd.read_csv('./data/large_train_sample.csv')

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,<=50K


In [4]:
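# No NaN values. Note: missing entries are encoded as the string ' ?' (see workclass, occupation and native-country below), so isnull() does not count them.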
df.isnull().sum().sum()

0

# EDA

## Categorical Variables

In [5]:
df['workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [6]:
df['education'].unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

In [7]:
df['education-num'].unique()

array([13,  9,  7, 14,  5, 10, 12, 11,  4, 16, 15,  3,  6,  2,  1,  8],
      dtype=int64)

In [8]:
df['marital-status'].unique()

array([' Never-married', ' Married-civ-spouse', ' Divorced',
       ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
       ' Widowed'], dtype=object)

In [9]:
df['occupation'].unique()

array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
       ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
       ' Tech-support', ' ?', ' Protective-serv', ' Armed-Forces',
       ' Priv-house-serv'], dtype=object)

In [10]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,<=50K


In [11]:
df['relationship'].unique()

array([' Not-in-family', ' Husband', ' Wife', ' Own-child', ' Unmarried',
       ' Other-relative'], dtype=object)

In [12]:
df['sex'].unique()

array([' Male', ' Female'], dtype=object)

In [13]:
df['capital-gain'].unique()

array([ 2174,     0, 14084,  5178,  5013,  2407, 14344, 15024,  7688,
       34095,  4064,  4386,  7298,  1409,  3674,  1055,  3464,  2050,
        2176,   594, 20051,  6849,  4101,  1111,  8614,  3411,  2597,
       25236,  4650,  9386,  2463,  3103, 10605,  2964,  3325,  2580,
        3471,  4865, 99999,  6514,  1471,  2329,  2105,  2885, 25124,
       10520,  2202,  2961, 27828,  6767,  2228,  1506, 13550,  2635,
        5556,  4787,  3781,  3137,  3818,  3942,   914,   401,  2829,
        2977,  4934,  2062,  2354,  5455, 15020,  1424,  3273, 22040,
        4416,  3908, 10566,   991,  4931,  1086,  7430,  6497,   114,
        7896,  2346,  3418,  3432,  2907,  1151,  2414,  2290, 15831,
       41310,  4508,  2538,  3456,  6418,  1848,  3887,  5721,  9562,
        1455,  2036,  1831, 11678,  2936,  2993,  7443,  6360,  1797,
        1173,  4687,  6723,  2009,  6097,  2653,  1639, 18481,  7978,
        2387,  5060], dtype=int64)

In [14]:
df['capital-loss'].unique()

array([   0, 2042, 1408, 1902, 1573, 1887, 1719, 1762, 1564, 2179, 1816,
       1980, 1977, 1876, 1340, 2206, 1741, 1485, 2339, 2415, 1380, 1721,
       2051, 2377, 1669, 2352, 1672,  653, 2392, 1504, 2001, 1590, 1651,
       1628, 1848, 1740, 2002, 1579, 2258, 1602,  419, 2547, 2174, 2205,
       1726, 2444, 1138, 2238,  625,  213, 1539,  880, 1668, 1092, 1594,
       3004, 2231, 1844,  810, 2824, 2559, 2057, 1974,  974, 2149, 1825,
       1735, 1258, 2129, 2603, 2282,  323, 4356, 2246, 1617, 1648, 2489,
       3770, 1755, 3683, 2267, 2080, 2457,  155, 3900, 2201, 1944, 2467,
       2163, 2754, 2472, 1411], dtype=int64)

In [15]:
df['hours-per-week'].unique()

array([40, 13, 16, 45, 50, 80, 30, 35, 60, 20, 52, 44, 15, 25, 38, 43, 55,
       48, 58, 32, 70,  2, 22, 56, 41, 28, 36, 24, 46, 42, 12, 65,  1, 10,
       34, 75, 98, 33, 54,  8,  6, 64, 19, 18, 72,  5,  9, 47, 37, 21, 26,
       14,  4, 59,  7, 99, 53, 39, 62, 57, 78, 90, 66, 11, 49, 84,  3, 17,
       68, 27, 85, 31, 51, 77, 63, 23, 87, 88, 73, 89, 97, 94, 29, 96, 67,
       82, 86, 91, 81, 76, 92, 61, 74, 95], dtype=int64)

In [16]:
df['native-country'].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [17]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,<=50K


In [18]:
df['wage'].value_counts(normalize = True)

 <=50K    0.75919
 >50K     0.24081
Name: wage, dtype: float64

In [52]:
85.7-75.9

9.799999999999997

In [19]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,Male,0,0,20,United-States,<=50K


In [20]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
wage              object
dtype: object

In [21]:
df['wage'].value_counts()

 <=50K    24720
 >50K      7841
Name: wage, dtype: int64

### Binarize Wage Target Class

In [22]:
# Encode the target as 1 for ' >50K' and 0 for everything else.
# The raw labels carry a leading space, so the comparison string must too.
is_high_earner = df['wage'] == ' >50K'
df['wage'] = is_high_earner.astype('int64')
df['wage'].value_counts()

0    24720
1     7841
Name: wage, dtype: int64

In [23]:
24720-7841

16879

### Resampling to balance classes

In [24]:
# Bootstrap (sample with replacement) extra rows of the minority class
# (wage == 1) until both classes have the same number of observations.
# NOTE(review): this oversampling runs before train_test_split (later cell),
# so copies of the same row can land in both the train and the hold-out split
# and inflate the hold-out score. Consider resampling only the training split.
class_counts = df['wage'].value_counts()
n_extra = int(class_counts.loc[0] - class_counts.loc[1])  # 24720 - 7841 = 16879 for this data

wage_sample = df[df['wage'] == 1].sample(n_extra, replace = True, random_state = 42)
df2 = pd.concat([df,wage_sample])
df2.shape

(49440, 14)

In [25]:
df2['wage'].value_counts()

1    24720
0    24720
Name: wage, dtype: int64

In [26]:
df2.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,0


In [27]:
df2 = df2.drop(columns = 'education')

df2.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,0
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,0
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,0
4,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,0


In [28]:
# Categorical features are the non-numeric columns of the training frame.
# select_dtypes is the public replacement for the private _get_numeric_data();
# bool is excluded as well because _get_numeric_data() counts it as numeric.
columns = df2.columns
cat_cols = df2.select_dtypes(exclude=['number', 'bool']).columns.tolist()
cat_cols

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'sex',
 'native-country']

In [29]:
cat_cols.append('hours-per-week')

cat_cols

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'sex',
 'native-country',
 'hours-per-week']

### Before dummifying, find the categorical columns whose number of categories differs between the training and test data

In [30]:
[len(df2[i].unique()) for i in cat_cols]

[9, 7, 15, 6, 2, 42, 94]

In [31]:
[len(test_df[i].unique()) for i in cat_cols]

[9, 7, 15, 6, 2, 41, 89]

*Find the missing country in the test data*

In [32]:
# Countries that occur in the training data but never in the test data.
country1 = df2['native-country'].unique()
country2 = test_df['native-country'].unique()

list_difference = [item for item in country1 if item not in country2]

list_difference

[' Holand-Netherlands']

In [33]:
# We must add a column to the test_df to represent the missing netherlands category.

In [34]:
df2['education-num'].value_counts()

9     14161
10    10307
13    10032
14     3778
11     2181
12     1616
15     1521
7      1303
6      1090
16     1049
4       722
5       573
8       497
3       378
2       181
1        51
Name: education-num, dtype: int64

In [35]:
df2.shape

(49440, 13)

In [36]:
df2 = pd.get_dummies(df2, columns = cat_cols, drop_first = True)
#test = pd.get_dummies(test, columns = ['Sex', 'Embarked'], drop_first = True)

df2.shape

(49440, 174)

In [38]:
X = df2.drop(columns=['wage', 'fnlwgt'])
y = df2['wage']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42)

In [40]:
# Modeling pipeline: standardize the features, then fit a random forest.
# Both the forest and the earlier train/test split use random_state=42.
pipe = Pipeline([
    ('sc', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42))
])

# Hyper-parameter grid. The 'rf__' prefix routes each setting to the 'rf'
# pipeline step. 3 * 1 * 2 * 1 * 2 * 3 = 36 candidate combinations, each
# evaluated with 5-fold cross-validation on the training split only.
params = {'rf__n_estimators':      [100, 120, 140],
          'rf__max_features':      ['sqrt'],
          'rf__max_depth':         [4,5],
          'rf__min_samples_split': [4],
          'rf__min_samples_leaf':  [3,4],
          'rf__ccp_alpha':         [.0008,.001,.0012]
}
gs = GridSearchCV(pipe, param_grid=params, cv=5, n_jobs=4)
gs.fit(X_train, y_train)

# Persist the fitted search to disk, then read it straight back into gs_rf.
# gs itself stays in scope and is what later cells call score/predict on.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('./data/rf2.pkl', 'wb') as file:
     pickle.dump(gs, file)
with open('./data/rf2.pkl', 'rb') as file:
    gs_rf = pickle.load(file)

    
# Handles onto the reloaded search results, inspected in the following cells.
search_results = gs_rf.cv_results_
top_estimator = gs_rf.best_estimator_
top_score = gs_rf.best_score_
top_parameters = gs_rf.best_params_


In [41]:
sr = pd.DataFrame(search_results)
sr.head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__ccp_alpha,param_rf__max_depth,param_rf__max_features,param_rf__min_samples_leaf,param_rf__min_samples_split,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.421256,0.163408,0.217169,0.014946,0.0008,4,sqrt,3,4,100,"{'rf__ccp_alpha': 0.0008, 'rf__max_depth': 4, ...",0.800431,0.793554,0.798004,0.793554,0.800836,0.797276,0.003189,11
1,5.091183,0.142863,0.280977,0.031117,0.0008,4,sqrt,3,4,120,"{'rf__ccp_alpha': 0.0008, 'rf__max_depth': 4, ...",0.799083,0.794498,0.796926,0.792341,0.799622,0.796494,0.002753,20
2,5.578814,0.240915,0.27726,0.022018,0.0008,4,sqrt,3,4,140,"{'rf__ccp_alpha': 0.0008, 'rf__max_depth': 4, ...",0.799083,0.792476,0.796656,0.792745,0.798679,0.795928,0.002832,28


In [42]:
top_parameters

{'rf__ccp_alpha': 0.001,
 'rf__max_depth': 5,
 'rf__max_features': 'sqrt',
 'rf__min_samples_leaf': 4,
 'rf__min_samples_split': 4,
 'rf__n_estimators': 100}

In [43]:
top_score

0.798948220064725

In [44]:
gs.score(X_test, y_test)

0.7932847896440129

### Score Report

In [65]:
test_df = test_df.drop(columns = 'education')

test_df.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,7,Never-married,Machine-op-inspct,Own-child,Male,0,0,40,United-States
1,38,Private,89814,9,Married-civ-spouse,Farming-fishing,Husband,Male,0,0,50,United-States
2,28,Local-gov,336951,12,Married-civ-spouse,Protective-serv,Husband,Male,0,0,40,United-States
3,44,Private,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Male,7688,0,40,United-States
4,18,?,103497,10,Never-married,?,Own-child,Female,0,0,30,United-States


In [66]:
# Categorical features are the non-numeric columns of the test frame.
# select_dtypes is the public replacement for the private _get_numeric_data();
# bool is excluded as well because _get_numeric_data() counts it as numeric.
# NOTE(review): unlike the training cell, 'hours-per-week' is not appended
# here, so the test frame is encoded differently from the training frame.
columns = test_df.columns
cat_cols = test_df.select_dtypes(exclude=['number', 'bool']).columns.tolist()
cat_cols

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'sex',
 'native-country']

In [67]:
test_df = pd.get_dummies(test_df, columns = cat_cols, drop_first = True)
#test = pd.get_dummies(test, columns = ['Sex', 'Embarked'], drop_first = True)

test_df.shape

(16281, 80)

In [68]:
# The test data has no ' Holand-Netherlands' rows, so add its dummy column as
# all zeros. get_dummies names columns '<column>_<value>' and the raw values
# keep their leading space, hence the exact name below (it must match the
# training column for the model to recognise it).
test_df['native-country_ Holand-Netherlands'] = 0
test_df.shape

(16281, 81)

In [None]:
# Build the submission feature matrix with exactly the columns (and column
# order) the model was fitted on:
#   * 'hours-per-week' was one-hot encoded for training, so encode it here too;
#   * reindex drops columns the model never saw (e.g. 'fnlwgt', baseline dummy
#     levels) and adds any training-only dummy columns filled with 0.
# A separate name is used so the training matrix X is not overwritten.
# NOTE(review): assumes the test CSV uses the same raw category strings
# (including the leading space) as the training CSV - confirm.
X_submission = (
    pd.get_dummies(test_df, columns=['hours-per-week'])
    .reindex(columns=X_train.columns, fill_value=0)
)

test_preds = gs.predict(X_submission)

In [None]:
def post_predictions(preds, submission_num):
    """Save model predictions as a submission CSV.

    Parameters
    ----------
    preds : array-like
        Predicted labels, one per test row. They are written as the single
        ``Predictions`` column.
    submission_num : int or str
        Identifier interpolated into the output file name.

    Returns
    -------
    None
        The predictions are written to
        ``./data/submission_{submission_num}.csv`` without the index column.
        The ``./data`` directory must already exist.
    """
    submission = pd.DataFrame({'Predictions': preds})
    submission.to_csv(f'./data/submission_{submission_num}.csv', index=False)
    
post_predictions(test_preds, 2)

In [49]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,0


In [51]:
# Data dictionary for the README: one human-readable description per column
# of the training data.
data_dict_final_model = {
    'age':['The age of the person sampled'],
    # Fixed: the previous text described building materials, not employment.
    'workclass':['The employment class of the person sampled, e.g. private, self-employed, or government (categorical)'],
    'fnlwgt':['The weighted tally indicated likeness in socio-economic and demographic statistics.'],
    'education':['The highest level of education the person sampled achieved.(categorical)'],
    'education-num':['The highest level of education the person sampled achieved, ranked on a numbered scale (ordinal)'],
    'marital-status':['The marital status of the person sampled'],
    'occupation':['The job sector the person sampled works'],
    'relationship':['The relationship of the person sampled to the data given in the sample.'],
    'sex':['The gender of the person sampled'],
    'capital-gain':['Capital gains for the person sampled in dollars'],
    'capital-loss':['Capital losses for the person sampled in dollars'],
    'hours-per-week':['The average hours per week the person sampled works'],
    'native-country':['The country of origin for the person sampled'],
    'wage':['Binary outcome indicating whether the annual income for the person sampled was above or below 50k dollars.']
}


# Transpose so each column name becomes a row, then render as a markdown
# table that can be copy/pasted into the README.
dict_df = pd.DataFrame(data_dict_final_model).T.to_markdown()

# Mode "w" creates the file (or truncates an existing one). The context
# manager closes the handle even if the write raises.
with open("./data/data_dict", "w") as text_file:
    text_file.write(dict_df)

|                | 0                                                                                                          |
|:---------------|:-----------------------------------------------------------------------------------------------------------|
| age            | The age of the person sampled                                                                              |
| workclass      | The employment class of the person sampled, e.g. private, self-employed, or government (categorical)       |
| fnlwgt         | The weighted tally indicated likeness in socio-economic and demographic statistics.                        |
| education      | The highest level of education the person sampled achieved.(categorical)                                   |
| education-num  | The highest level of education the person sampled achieved, ranked on a numbered scale (ordinal)           |
| marital-status | The marital status of the person sampled                                                                   |
| occupation     | The job sector the person sampled works                                                                    |
| relationship   | The relationship of the person sampled to the data given in the sample.                                    |
| sex            | The gender of the person sampled                                                                           |
| capital-gain   | Capital gains for the person sampled in dollars                                                            |
| capital-loss   | Capital losses for the person sampled in dollars                                                           |
| hours-per-week | The average hours per week the person sampled works                                                        |
| native-country | The country of origin for the person sampled                                                               |
| wage           | Binary outcome indicating whether the annual income for the person sampled was above or below 50k dollars. |

In [None]:
# Saves the current matplotlib figure, trimming surrounding whitespace.
# NOTE(review): the file name has no stem (just ".png"), and no figure is
# created in the visible cells - presumably a placeholder; confirm the
# intended name and which plot this should export.
plt.savefig("./Images/.png", bbox_inches="tight")