In [39]:
import joblib # to save and load the models
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from collections import defaultdict

In [4]:
# Column names for the CSV, passed explicitly to read_csv via `names=`.
COLUMN_NAMES = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hrs_per_week', 'native_country',
    'income',
]

data = pd.read_csv('data/adult.data', names=COLUMN_NAMES)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Column dtypes and non-null counts of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hrs_per_week    32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# There are no NaN values (data.info() reports every column as fully non-null).
#data.isna().sum()

In [7]:
# Distinct workclass labels. The output shows that every label carries a
# leading space and that ' ?' appears as one of the values.
data.workclass.unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [8]:
# Rows whose workclass is the ' ?' placeholder (note the leading space).
data.loc[data.workclass == ' ?']

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_per_week,native_country,income
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K
61,32,?,293936,7th-8th,4,Married-spouse-absent,?,Not-in-family,White,Male,0,0,40,?,<=50K
69,25,?,200681,Some-college,10,Never-married,?,Own-child,White,Male,0,0,40,United-States,<=50K
77,67,?,212759,10th,6,Married-civ-spouse,?,Husband,White,Male,0,0,2,United-States,<=50K
106,17,?,304873,10th,6,Never-married,?,Own-child,White,Female,34095,0,32,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,35,?,320084,Bachelors,13,Married-civ-spouse,?,Wife,White,Female,0,0,55,United-States,>50K
32531,30,?,33811,Bachelors,13,Never-married,?,Not-in-family,Asian-Pac-Islander,Female,0,0,99,United-States,<=50K
32539,71,?,287372,Doctorate,16,Married-civ-spouse,?,Husband,White,Male,0,0,10,United-States,>50K
32541,41,?,202822,HS-grad,9,Separated,?,Not-in-family,Black,Female,0,0,32,United-States,<=50K


In [9]:
#data.loc[data.native_country == ' ?']

In [10]:
# Treat the ' ?' placeholder (leading space included) as missing and drop
# every row that contains one. `np.nan` is used because the `np.NaN` alias
# was removed in NumPy 2.0. The result is rebound to `data` rather than
# mutated in place; the index is left as-is (not reset).
data = data.replace(' ?', np.nan).dropna()

In [48]:
# Notebook-level preprocessing objects shared by the functions below.
# le: refit on the income column by both preprocess() and preprocess_v2().
le = preprocessing.LabelEncoder()
# sc: refit by preprocess(); preprocess_v2() builds its own local scaler.
sc = preprocessing.MaxAbsScaler()
# oe: not used by any active code in the visible part of the notebook.
oe = preprocessing.OrdinalEncoder()

In [132]:
def preprocess(df):
    """Integer-encode the categorical features, then scale all features.

    The `income` column is split off as the target and encoded with the
    notebook-level `le`. Every object-dtype feature column is converted to
    pandas category codes, and the resulting frame is passed through the
    notebook-level `sc`. Both `le` and `sc` are refit on each call.

    Returns
    -------
    X
        Output of `sc.fit_transform` on the encoded features.
    y
        Output of `le.fit_transform` on `df.income`.
    mappings : dict
        For each encoded column, a `(categories, codes)` pair.
    """
    features = df.drop(columns=['income'])
    y = le.fit_transform(df.income)

    mappings = {}
    # The column list is computed once, before any column is overwritten.
    for feat in features.select_dtypes(include="object").columns:
        as_category = features[feat].astype('category')
        codes = as_category.cat.codes
        mappings[feat] = (as_category.cat.categories, codes)
        features[feat] = codes

    X = sc.fit_transform(features)
    return X, y, mappings

In [133]:
# Category-code encoding: scaled features, encoded target, and the
# per-column (categories, codes) lookup.
X, y, mappings = preprocess(data)
#a.head()

In [134]:
#mappings

In [135]:
def preprocess_v2(df):
    """One-hot-encode the categorical features, then scale all features.

    The `income` column is split off as the target and encoded with the
    notebook-level `le` (refit on each call). The eight categorical columns
    are expanded with `pd.get_dummies` and the result is scaled by a
    MaxAbsScaler created inside this function.

    Returns
    -------
    X
        Output of the local scaler's `fit_transform` on the one-hot frame.
    y
        Output of `le.fit_transform` on `df.income`.
    """
    categorical_cols = ['workclass', 'education', 'marital_status',
                        'occupation', 'relationship', 'race', 'sex',
                        'native_country']

    target = le.fit_transform(df.income)

    # TODO: need to change to have a consistent one_hot_encoding --
    # get_dummies derives its output columns from the values present in
    # the frame it is given.
    one_hot = pd.get_dummies(df.drop(columns=['income']),
                             columns=categorical_cols)

    scaler = preprocessing.MaxAbsScaler()
    return scaler.fit_transform(one_hot), target

In [11]:
# Preview of the one-hot encoding. The encoding may need to change, because
# KNN does not work well with many features.
#
# The frame is built from `data` and stored under its own name: `X` holds the
# scaled output of preprocess(data) that train_knn(X, y) consumes, so it must
# not be overwritten here, and it is not a DataFrame with the named columns
# that get_dummies(columns=...) needs.
X_onehot = pd.get_dummies(
    data.drop(columns=['income']),
    columns=['workclass', 'education', 'marital_status', 'occupation',
             'relationship', 'race', 'sex', 'native_country'])
X_onehot.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hrs_per_week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [136]:
# One-hot encoding: scaled features and encoded target from preprocess_v2.
X_2, y_2 = preprocess_v2(data)

In [140]:
def train_knn(X, y):
    """Grid-search a KNeighborsClassifier and report its accuracy.

    Holds out 30% of the data (random_state=0), runs a 5-fold
    cross-validated grid search on the remaining 70%, and prints the best
    parameters, the best cross-validation accuracy, and the accuracy on the
    held-out split.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn.
    y : target labels aligned with X.

    Returns
    -------
    GridSearchCV
        The fitted search object, so the caller can keep or persist the
        model (previously it was discarded when the function returned).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    # NOTE(review): per the scikit-learn docs, leaf_size affects the speed
    # and memory of the neighbour tree, not which neighbours are found, so
    # searching over it mostly multiplies fit time -- consider dropping it.
    param_grid = [
        {'n_neighbors': [7, 9, 11, 13],
         'weights': ['uniform', 'distance'],
         'leaf_size': [10, 15, 20, 25]}
    ]

    knn = KNeighborsClassifier()

    gs_knn = GridSearchCV(knn,
                          param_grid=param_grid,
                          scoring='accuracy',
                          cv=5)

    gs_knn.fit(X_train, y_train)
    print(gs_knn.best_params_)
    print(gs_knn.best_score_)              # mean CV accuracy of the best setting
    print(gs_knn.score(X_test, y_test))    # accuracy on the held-out 30%

    return gs_knn


In [None]:
def train_rf(X, y):
    """Grid-search a RandomForestClassifier and report its accuracy.

    Holds out 30% of the data (random_state=0), runs a 5-fold
    cross-validated grid search on the remaining 70%, and prints the best
    parameters, the best cross-validation accuracy, and the accuracy on the
    held-out split.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn.
    y : target labels aligned with X.

    Returns
    -------
    GridSearchCV
        The fitted search object, so the caller can keep or persist it.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    # The grid must use RandomForestClassifier parameters. The previous grid
    # (n_neighbors / weights / leaf_size) was copied from the KNN search and
    # is rejected by GridSearchCV as invalid for this estimator.
    param_grid = [
        {'n_estimators': [100, 200],
         'max_depth': [None, 10, 20],
         'min_samples_leaf': [1, 5]}
    ]

    # Seeded so repeated runs give the same forest, matching the seeded split.
    rf = RandomForestClassifier(random_state=0)

    gs_rf = GridSearchCV(rf,
                         param_grid=param_grid,
                         scoring='accuracy',
                         cv=5)

    gs_rf.fit(X_train, y_train)
    print(gs_rf.best_params_)
    print(gs_rf.best_score_)              # mean CV accuracy of the best setting
    print(gs_rf.score(X_test, y_test))    # accuracy on the held-out 30%

    return gs_rf


In [141]:
# Using ordinal encoding
# (presumably the X, y returned by preprocess(data), where categorical
# columns are replaced by integer category codes -- confirm execution order)
train_knn(X, y)

{'leaf_size': 10, 'n_neighbors': 9, 'weights': 'uniform'}
0.829536600733638
0.8231848823074372


In [142]:
# Using onehot encoding (X_2, y_2 come from preprocess_v2(data))
train_knn(X_2, y_2)

{'leaf_size': 10, 'n_neighbors': 11, 'weights': 'uniform'}
0.8300102650067815
0.8247320145872472


In [38]:
# saving the model to disk
filename = 'knn.sav'
joblib.dump(gs_knn, filename)

['knn.sav']

In [None]:
# loading the model from disk
# joblib.load unpickles the file, so only load files from a trusted source.
# NOTE(review): X_test / y_test are locals of the training functions in the
# visible notebook; no visible cell defines them at notebook level -- confirm
# where they come from before relying on this score.
loaded_model = joblib.load('knn.sav')
result = loaded_model.score(X_test, y_test)