In [2]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [37]:
def label_encoding(train, test, cols):
    """Integer-encode the given columns consistently across train and test.

    For every column in `cols`, a LabelEncoder is fitted on the values found
    in train and test together, so a given value maps to the same integer
    code in both frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain every column named in `cols`.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    (train_encoded, test_encoded) : tuple of pandas.DataFrame
        Copies of the inputs with the listed columns replaced by integer
        codes. The frames passed in are left unmodified.
    """
    # Work on copies: assigning into the caller's frames would silently change
    # them (and make re-running the calling cell operate on encoded data).
    train_encoded = train.copy()
    test_encoded = test.copy()
    for col in cols:
        # A fresh encoder per column keeps each column's fitted classes independent.
        encoder = preprocessing.LabelEncoder()
        encoder.fit(pd.concat([train[col], test[col]], ignore_index=True))
        train_encoded[col] = encoder.transform(train[col])
        test_encoded[col] = encoder.transform(test[col])
    return train_encoded, test_encoded


def one_hot_encoding(train, test, cols):
    """One-hot encode `cols` jointly over train and test, then split them apart again.

    The two frames are stacked before calling `pd.get_dummies`, so both
    returned frames carry exactly the same indicator columns even when a
    category occurs in only one of them.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames sharing the columns named in `cols`.
    cols : list of str
        Columns to expand into indicator columns (also used as prefixes).

    Returns
    -------
    (train_encoded, test_encoded) : tuple of pandas.DataFrame
        The first `len(train)` rows and the remaining `len(test)` rows of the
        stacked, encoded frame. Row labels are the positions in the stacked
        frame, i.e. the test part is labelled from `len(train)` onwards.
    """
    n_train = train.shape[0]
    # ignore_index renumbers the stacked rows 0..N-1 without materialising the
    # old index as an 'index' column that would have to be dropped afterwards.
    stacked = pd.concat([train, test], ignore_index=True)
    stacked = pd.get_dummies(stacked, columns=cols, prefix=cols)
    # Split by position. A label slice (.loc[0:n_train]) includes its end
    # label and would therefore leak the first test row into the train part.
    return stacked.iloc[:n_train], stacked.iloc[n_train:]
    
    
def print_scoring(clf, testX, testY, name):
    """Print accuracy and the four confusion-matrix rates of `clf` on a test set.

    Labels are compared against the literals 1 (positive) and 0 (negative).

    Parameters
    ----------
    clf : object with a `predict(X)` method
        The fitted classifier to evaluate.
    testX : features passed straight to `clf.predict`.
    testY : array-like with a `.shape` attribute
        True labels, aligned position-by-position with the predictions.
    name : str
        Label inserted into the accuracy line.

    Returns
    -------
    None. Results are written to stdout, followed by one empty line.
    """
    predicted = clf.predict(testX)

    # Boolean masks for the actual and the predicted class of every sample.
    actual_pos = testY == 1
    actual_neg = testY == 0
    said_pos = predicted == 1
    said_neg = predicted == 0

    n_pos = np.sum(actual_pos)
    n_neg = np.sum(actual_neg)

    accuracy = np.sum(predicted == testY) / testY.shape[0]
    # Each rate is normalised by the size of the *actual* class it refers to.
    rates = (
        ('true-positive-rate: ', np.sum(np.logical_and(said_pos, actual_pos)) / n_pos),
        ('false-positive-rate: ', np.sum(np.logical_and(said_pos, actual_neg)) / n_neg),
        ('true-negative-rate: ', np.sum(np.logical_and(said_neg, actual_neg)) / n_neg),
        ('false-negative-rate: ', np.sum(np.logical_and(said_neg, actual_pos)) / n_pos),
    )

    print('accuracy', name + ': ', accuracy)
    for caption, value in rates:
        print(caption, value)
    print('')

In [38]:
# Read the train and test CSVs; a literal '?' in any field is parsed as missing (NaN).
# `data` is bound to the very same DataFrame object as `train` (an alias, not a copy).
train = data = pd.read_csv('data/adult_train.csv', sep=',', na_values=['?'])
test = pd.read_csv('data/adult_test.csv', sep=',', na_values=['?'])

In [39]:
# Show (rows, columns) of the training frame as loaded from disk.
print(train.shape)

(32561, 15)


In [40]:
# Discard every training row that has at least one missing value,
# then renumber the remaining rows from 0 so the index has no gaps.
train = train.dropna(axis=0, how='any').reset_index(drop=True)
print(train.shape)

(30162, 15)


In [41]:
# Columns handed to the label encoder (integer codes).
bin_col = ['sex', 'income']
# Columns handed to the one-hot encoder (one indicator column per category).
cat_col = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'native-country',
]

# Both helpers look at train and test together, so the two frames end up
# with matching codes and matching columns.
train, test = label_encoding(train, test, bin_col)
train, test = one_hot_encoding(train, test, cat_col)

In [42]:
# Separate the target ('income') from the predictors.
# 'fnlwgt' is excluded from the feature set as well.
trainX, trainY = train.drop(['income', 'fnlwgt'], axis=1), train['income']
testX, testY = test.drop(['income', 'fnlwgt'], axis=1), test['income']

In [43]:
# Fit the three classifiers on the training features and labels.
# The decision tree gets a fixed random_state: scikit-learn permutes the
# candidate features at every split, so an unseeded tree (and its scores)
# can change from one run of the notebook to the next.
tree_clf = tree.DecisionTreeClassifier(random_state=0).fit(trainX, trainY)
gaus_clf = GaussianNB().fit(trainX, trainY)
knn_clf = KNeighborsClassifier(n_neighbors=10).fit(trainX, trainY)

In [44]:
# Score every fitted model on the test set, in the order the models were trained.
for fitted, label in ((tree_clf, 'tree '), (gaus_clf, 'gaus '), (knn_clf, 'knn ')):
    print_scoring(fitted, testX, testY, label)

accuracy tree :  0.81235059761
true-positive-rate:  0.605405405405
false-positive-rate:  0.120246478873
true-negative-rate:  0.879753521127
false-negative-rate:  0.394594594595

accuracy gaus :  0.809561752988
true-positive-rate:  0.799189189189
false-positive-rate:  0.187059859155
true-negative-rate:  0.812940140845
false-negative-rate:  0.200810810811

accuracy knn :  0.849535192563
true-positive-rate:  0.589189189189
false-positive-rate:  0.0656690140845
true-negative-rate:  0.934330985915
false-negative-rate:  0.410810810811

