In [80]:
import joblib # to save and load the models
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from collections import defaultdict
from sklearn.model_selection import cross_val_score

import seaborn as sns

In [224]:
data = pd.read_csv('data/adult.data', names=['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
                                              'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
                                              'hrs_per_week', 'native_country', 'income'])

data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [225]:
data_test = pd.read_csv('data/adult.test', names=['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
                                              'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
                                              'hrs_per_week', 'native_country', 'income'])
data_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_per_week,native_country,income
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.


In [226]:
#data.info()

In [227]:
#data_test.info()

In [228]:
data_test.dropna(inplace=True)

In [229]:
# There's no nans
#data.isna().sum()

In [230]:
data = pd.concat([data, data_test], ignore_index=True)

In [231]:
data.income.unique()

array([' <=50K', ' >50K', ' <=50K.', ' >50K.'], dtype=object)

In [232]:
data['income'] = data['income'].replace({' <=50K.': ' <=50K', ' >50K.': ' >50K'})
data.income.unique()

array([' <=50K', ' >50K'], dtype=object)

In [233]:
# in this column the order does not matter
data.workclass.unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [234]:
# we must keep the order in this encoding
data.education.unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

In [235]:
edu = [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th', ' 11th', ' 12th',
       ' HS-grad', ' Prof-school', ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' Bachelors',
       ' Masters',' Doctorate']

In [247]:
data.occupation.unique()

array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Prof-specialty', ' Other-service', ' Sales', ' Transport-moving',
       ' Farming-fishing', ' Machine-op-inspct', ' Tech-support',
       ' Craft-repair', ' Protective-serv', ' Armed-Forces',
       ' Priv-house-serv'], dtype=object)

In [237]:
#data.occupation.value_counts()

In [238]:
#sns.countplot(data=data, y='occupation', hue='income')

In [239]:
# auxiliar calculations to decide how to order the occupations columns
aux = data[['occupation', 'income']].copy()
a = pd.DataFrame(aux.groupby(['occupation', 'income']).size())
a = {'clerical': (756/4784), 'af': (4/10), 'craft': (1355/4665), 'manager': (2867/3117), 'farm': (172/1308),
     'clean': (135/1911), 'mach': (365/2605), 'other': (196/4612), 'priv': (3/229), 'prof': (2704/3304),
     'prot_serv': (307/669), 'sales': (1455/3953), 'tech':(411/1009), 'transport': (478/1838) }
# sorted by the ratio of >50k / <= 50k by profession
dict(sorted(a.items(), key=lambda item: item[1]))

{'priv': 0.013100436681222707,
 'other': 0.042497831743278404,
 'clean': 0.0706436420722135,
 'farm': 0.13149847094801223,
 'mach': 0.1401151631477927,
 'clerical': 0.15802675585284282,
 'transport': 0.2600652883569097,
 'craft': 0.2904608788853162,
 'sales': 0.36807487983809767,
 'af': 0.4,
 'tech': 0.40733399405351833,
 'prot_serv': 0.4588938714499253,
 'prof': 0.8184019370460048,
 'manager': 0.919794674366378}

In [240]:
ocupation = [' Priv-house-serv', ' Other-service', ' Handlers-cleaners', ' Farming-fishing', ' Machine-op-inspct',
             ' Adm-clerical', ' Transport-moving', ' Craft-repair', ' Sales', ' Armed-Forces', ' Tech-support',
             ' Protective-serv', ' Prof-specialty', ' Exec-managerial']

In [241]:
data.loc[data.workclass == ' ?']

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_per_week,native_country,income
27,54,?,180211.0,Some-college,10.0,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0.0,0.0,60.0,South,>50K
61,32,?,293936.0,7th-8th,4.0,Married-spouse-absent,?,Not-in-family,White,Male,0.0,0.0,40.0,?,<=50K
69,25,?,200681.0,Some-college,10.0,Never-married,?,Own-child,White,Male,0.0,0.0,40.0,United-States,<=50K
77,67,?,212759.0,10th,6.0,Married-civ-spouse,?,Husband,White,Male,0.0,0.0,2.0,United-States,<=50K
106,17,?,304873.0,10th,6.0,Never-married,?,Own-child,White,Female,34095.0,0.0,32.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48682,61,?,265201.0,Some-college,10.0,Married-civ-spouse,?,Husband,White,Male,0.0,0.0,14.0,United-States,<=50K
48769,21,?,212661.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K
48800,73,?,144872.0,HS-grad,9.0,Married-civ-spouse,?,Husband,White,Male,0.0,0.0,25.0,Canada,<=50K
48812,81,?,26711.0,Assoc-voc,11.0,Married-civ-spouse,?,Husband,White,Male,2936.0,0.0,20.0,United-States,<=50K


In [242]:
#data.loc[data.native_country == ' ?']

In [311]:
data.replace(' ?', np.NaN, inplace=True)
data.dropna(inplace=True)
data.reset_index(inplace=True, drop=True)

In [312]:
le = preprocessing.LabelEncoder()
sc = preprocessing.MaxAbsScaler()
oe = preprocessing.OrdinalEncoder(categories=[edu, ocupation])


In [61]:
def preprocess(df):
    mappings = {}
    y = df.income
    X = df.drop(columns=['income'])
    objList = X.select_dtypes(include="object").columns.to_list()
    y = le.fit_transform(y)
    
    for feat in objList:
        X[feat] = X[feat].astype('category')
        mappings[feat] = (X[feat].cat.categories, X[feat].cat.codes)
        X[feat] = X[feat].cat.codes
        
    #X = pd.DataFrame(oe.transform(X[['workclass', 'education', 'marital_status', 'occupation', 'relationship',
    #                        'race', 'sex', 'native_country']]))
    #print(X.head())
    X = sc.fit_transform(X)
    
    return X, y, mappings

In [62]:
X, y, mappings = preprocess(data)
#a.head()

In [315]:
def preprocessing_v3(df):
    y = df.income
    X = df.drop(columns=['income'])
    objList = X.select_dtypes(include="object").columns.to_list()
    objList.remove('education')
    y = le.fit_transform(y)
    #oe.fit(X[['education', 'occupation']])
    X[['education', 'occupation']] = pd.DataFrame(oe.fit_transform(X[['education', 'occupation']]))
    X = pd.get_dummies(X, columns=['workclass', 'marital_status', 'relationship',
                                   'race', 'sex', 'native_country'])
    
    
    return X, y

In [316]:
X, y = preprocessing_v3(data)

In [317]:
X.occupation.unique().shape

(14,)

In [318]:
X.occupation.unique()

array([ 5., 13.,  2., 12.,  1.,  8.,  6.,  3.,  4., 10.,  7., 11.,  9.,
        0.])

In [322]:
X.education.unique()

array([13.,  8.,  6., 14.,  4., 10., 11.,  3., 15., 12.,  9.,  2.,  5.,
        0.,  7.,  1.])

In [277]:
X

Unnamed: 0,age,fnlwgt,education,education_num,occupation,capital_gain,capital_loss,hrs_per_week,workclass_ Federal-gov,workclass_ Local-gov,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,39,77516.0,13.0,13.0,5.0,2174.0,0.0,40.0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311.0,13.0,13.0,13.0,0.0,0.0,13.0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646.0,8.0,9.0,2.0,0.0,0.0,40.0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721.0,6.0,7.0,2.0,0.0,0.0,40.0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409.0,13.0,13.0,12.0,0.0,0.0,40.0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,245211.0,,13.0,,0.0,0.0,40.0,0,0,...,0,0,0,0,0,0,0,1,0,0
48837,39,215419.0,,13.0,,0.0,0.0,36.0,0,0,...,0,0,0,0,0,0,0,1,0,0
48839,38,374983.0,,13.0,,0.0,0.0,50.0,0,0,...,0,0,0,0,0,0,0,1,0,0
48840,44,83891.0,,13.0,,5455.0,0.0,40.0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [134]:
#mappings

In [12]:
def preprocess_v2(df):
    y = df.income
    X = df.drop(columns=['income'])
    
    y = le.fit_transform(y)
    # need to change to have a consistent one_hot_encoding
    X = pd.get_dummies(X, columns=['workclass', 'education', 'marital_status', 'occupation', 'relationship',
                                   'race', 'sex', 'native_country'])
    sc = preprocessing.MaxAbsScaler()
    X = sc.fit_transform(X)
    
    return X, y

In [14]:
# may have the need to change the encoding because knn 
# does not work well with many features
X = pd.get_dummies(X, columns=['workclass', 'education', 'marital_status', 'occupation', 'relationship',
                               'race', 'sex', 'native_country'])
X.head()

ValueError: Data must be 1-dimensional

In [25]:
X_2, y_2 = preprocess_v2(data)

In [140]:
def train_knn(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    param_grid = [
        {'n_neighbors': [7, 9, 11, 13],
         'weights': ['uniform', 'distance'],
         'leaf_size': [10, 15, 20, 25]}
    ]

    knn = KNeighborsClassifier()

    gs_knn = GridSearchCV(knn,
                        param_grid=param_grid,
                        scoring='accuracy',
                        cv=5)

    gs_knn.fit(X_train, y_train)
    print(gs_knn.best_params_)
    print(gs_knn.best_score_)
    print(gs_knn.score(X_test, y_test))


In [141]:
# Using ordinal encoding
train_knn(X, y)

{'leaf_size': 10, 'n_neighbors': 9, 'weights': 'uniform'}
0.829536600733638
0.8231848823074372


In [142]:
# Using onehot encoding
train_knn(X_2, y_2)

{'leaf_size': 10, 'n_neighbors': 11, 'weights': 'uniform'}
0.8300102650067815
0.8247320145872472


In [38]:
# saving the model to disk
filename = 'knn.sav'
joblib.dump(gs_knn, filename)

['knn.sav']

In [None]:
# loading the model from disk
loaded_model = joblib.load('knn.sav')
result = loaded_model.score(X_test, y_test)

# Ramdom Forest

In [None]:
def train_rf(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)
    
    param_grid = [
        {'criterion': ['gini','entropy'],
        'max_depth': [10,30,60,None],
        'max_leaf_nodes': [10,30,60,None],
        'max_features': ['auto', 'sqrt'],
        'min_samples_leaf': [1,2,3],
        'min_samples_split': [2,4],
        'cn_estimators': (500,800,1000,1200)}
    ]

    rf = RandomForestClassifier()

    gs_rf = GridSearchCV(rf,
                          param_grid=param_grid,
                          scoring='accuracy',
                          cv=5)

    gs_rf.fit(X_train, y_train)
    print(gs_rf.best_params_)
    print(gs_rf.best_score_)
    print(gs_rf.score(X_test, y_test))


In [26]:
# Triyng with standard parameters
# With the second preprocess function
RF_classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
RF_classifier.fit(X_2, y_2) 

RandomForestClassifier(n_estimators=1000, random_state=0)

In [29]:
all_accuracies_RF = cross_val_score(estimator=RF_classifier, X=X_2, y=y_2, cv=3)
print('cross_val_score: ', all_accuracies_RF)
print('Mean: ', all_accuracies_RF.mean())
print('Standard deviation: ', all_accuracies_RF.std())

cross_val_score:  [0.84921424 0.85229759 0.85786752]
Mean:  0.8531264505006299
Standard deviation:  0.0035809713832762207


In [None]:
#y_pred = classifier_best.predict(X_test)

In [None]:
#print(confusion_matrix(y_test,y_pred))
#print(classification_report(y_test,y_pred))
#print('accuracy_score: ', accuracy_score(y_test, y_pred))