# Income bracket

In [3]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

import matplotlib.pyplot as plt

In [4]:
# The file has no header row, so columns are labelled 0..14 positionally.
dataset = pd.read_csv(filepath_or_buffer='../data/adult.data.txt', header=None)
dataset.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


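## Balance the dataset

Cap each income class at `nb_clip` rows. Only 7,841 `>50K` rows are available, so the result (10,000 vs 7,841) is closer to balanced rather than exactly balanced.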

In [5]:
# Maximum number of rows kept per income class.
nb_clip = 10000

# Column 14 holds the income label; its values carry a leading space.
# Keep at most nb_clip rows of the ' <=50K' class.
sub_50k = dataset[dataset[14] == ' <=50K'][:nb_clip]
# Single-argument print(...) works under both Python 2 and Python 3.
print(len(sub_50k))

10000


In [6]:
# Keep at most nb_clip rows of the ' >50K' class (column 14 is the income
# label; its values carry a leading space).
plus_50k = dataset[dataset[14] == ' >50K'][:nb_clip]
# Single-argument print(...) works under both Python 2 and Python 3.
print(len(plus_50k))

7841


In [7]:
# Stack the two clipped class subsets row-wise; original row labels are kept.
frames = [sub_50k, plus_50k]
balanced_dataset = pd.concat(frames, axis=0)

In [8]:
# Number of rows in the combined frame.
balanced_dataset.shape[0]

17841

In [9]:
# Text columns such as column 3 are stored with the generic object dtype.
balanced_dataset[3].dtype

dtype('O')

## Label Encoding

In [10]:
from collections import defaultdict
# Registry of label encoders keyed by column name. A fresh LabelEncoder is
# created the first time a key is looked up, so encode_labels/transform_labels
# can index it directly with x.name.
d = defaultdict(preprocessing.LabelEncoder)

In [49]:
def encode_labels(x, encoders=None):
    """
        Fit a label encoder on a non-numeric (object dtype) column and
        return the encoded values; other columns are returned untouched.

        Meant to be passed to ``DataFrame.apply``, which hands each column
        over as a Series.

        Parameters
        ----------
        x : pandas.Series
            Column to process. ``x.name`` is the key under which the
            column's encoder is stored.
        encoders : mapping, optional
            Maps a column name to an object exposing ``fit_transform``.
            When omitted, the notebook-level registry ``d`` is used.

        Returns
        -------
        The result of ``fit_transform`` for an object column, otherwise
        ``x`` itself.

        Side effects
        ------------
        Prints the column name, its dtype and whether it was encoded, and
        (re)fits the encoder stored under ``x.name``.
    """
    if encoders is None:
        encoders = d
    # Single-argument print(...) produces the same text under the Python 2
    # print statement and the Python 3 print function.
    print('{0} {1}'.format(x.name, x.dtype))
    if x.dtype == 'object':
        print('encoded')
        return encoders[x.name].fit_transform(x)
    print('not encoded')
    return x

def transform_labels(x, encoders=None):
    """
        Encode a non-numeric (object dtype) column with the encoder that
        was already fitted for that column; other columns are returned
        untouched.

        Meant to be passed to ``DataFrame.apply`` on new data whose column
        names match those used when the encoders were fitted.

        Parameters
        ----------
        x : pandas.Series
            Column to process. ``x.name`` selects the stored encoder.
        encoders : mapping, optional
            Maps a column name to an object exposing ``transform``.
            When omitted, the notebook-level registry ``d`` is used.

        Returns
        -------
        The result of ``transform`` for an object column, otherwise ``x``
        itself.

        Notes
        -----
        Prints the column name, its dtype and whether it was encoded.
        Any error raised by the encoder's ``transform`` propagates to the
        caller (presumably this includes labels not seen during fitting;
        confirm against the encoder's documentation).
    """
    if encoders is None:
        encoders = d
    # Single-argument print(...) produces the same text under the Python 2
    # print statement and the Python 3 print function.
    print('{0} {1}'.format(x.name, x.dtype))
    if x.dtype == 'object':
        print('encoded')
        return encoders[x.name].transform(x)
    print('not encoded')
    return x

In [50]:
# Encode column by column; each object column gets its own encoder in `d`.
# NOTE(review): the log below reports column 0 twice (first as object, then
# as int64), so apply appears to call the function twice on the first column.
fit = balanced_dataset.apply(encode_labels, axis=0)

0 object
encoded
0 int64
not encoded
1 object
encoded
2 int64
not encoded
3 object
encoded
4 int64
not encoded
5 object
encoded
6 object
encoded
7 object
encoded
8 object
encoded
9 object
encoded
10 int64
not encoded
11 int64
not encoded
12 int64
not encoded
13 object
encoded
14 object
encoded


In [51]:
# First row after encoding.
fit.head(n=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,38,0


In [52]:
# Same row before encoding, for comparison.
balanced_dataset.head(n=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K


## Train-test split

In [53]:
# Features: every column except the last. Target: the last column (the
# encoded income label).
X = fit.values[:, :-1]
Y = fit.values[:, -1]

In [54]:
from sklearn.model_selection import train_test_split

In [55]:
# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    random_state=5,
    test_size=0.25
)

## Model training

In [56]:
# Fit Gaussian naive Bayes on the training split (fit returns the estimator
# itself), then predict the held-out rows.
classifier = GaussianNB().fit(X_train, Y_train)
Y_test_pred = classifier.predict(X_test)

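## Evaluation

The score below is a 5-fold cross-validated weighted F1 computed on the full encoded data (`X`, `Y`), not on the hold-out split created above.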

In [57]:
from sklearn.model_selection import cross_val_score

In [58]:
# Weighted F1 for each of 5 cross-validation folds over the full X, Y.
f1 = cross_val_score(
    classifier,
    X,
    Y,
    cv=5,
    scoring='f1_weighted'
)

In [59]:
# Report the mean fold score as a percentage.
print(''.join(['F1 score: ', str(100*f1.mean()), '%']))

F1 score: 62.4719077361%


## Test case

In [65]:
# One raw sample, in the same column order as the training features
# (columns 0..13, i.e. everything except the income label).
data = [[
    39,
    'State-gov',
    77516,
    'Bachelors',
    13,
    'Never-married',
    'Adm-clerical',
    'Not-in-family',
    'White',
    'Male',
    2174,
    0,
    40,
    'United-States',
]]

In [66]:
# Wrap the sample in a DataFrame so its columns are labelled 0..13, matching
# the keys of the fitted encoders.
data = pd.DataFrame(data=data)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States


In [71]:
# Encode the sample with the encoders fitted earlier (no refitting).
data = data.apply(transform_labels, axis=0)

0 object
encoded
0 int64
not encoded
1 object
encoded
2 int64
not encoded
3 object
encoded
4 int64
not encoded
5 object
encoded
6 object
encoded
7 object
encoded
8 object
encoded
9 object
encoded
10 int64
not encoded
11 int64
not encoded
12 int64
not encoded
13 object
encoded


In [72]:
# Predicted class index for the encoded sample.
classifier.predict(X=data)

array([0])

In [79]:
# Decode the predicted class index back to its original income label with
# the encoder fitted on column 14 (the target column). The prediction is
# decoded directly rather than hand-copying its value, and the single-argument
# print(...) works under both Python 2 and Python 3.
print(d[14].inverse_transform(classifier.predict(data)))

[' <=50K']
