In [170]:
import pandas as pd

In [171]:
train = pd.read_csv('adults/train.csv')
test = pd.read_csv('adults/test.csv')

In [172]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         30725 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        30718 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    31978 non-null object
target            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [173]:
print(train.shape)
print(test.shape)

(32561, 15)
(16281, 15)


In [174]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [175]:
print('Missing values in train set: {}'.format(train.shape[0] - train.dropna().shape[0]))
print('Missing values in test set: {}'.format(test.shape[0] - test.dropna().shape[0]))

Missing values in train set: 2399
Missing values in test set: 1221


In [176]:
train.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64

In [177]:
# O is to select columns only with categorical data(OBJECTS)
train_with_only_categorical = train.select_dtypes(include = ['O'])
train_with_only_categorical.apply(pd.Series.nunique)

workclass          8
education         16
marital.status     7
occupation        14
relationship       6
race               5
sex                2
native.country    41
target             2
dtype: int64

In [178]:
print('Mode of workclass: {}'.format(train['workclass'].mode()[0]))
print('Mode of occupation: {}'.format(train['occupation'].mode()[0]))
print('Mode of native.country: {}'.format(train['native.country'].mode()[0]))

Mode of workclass:  Private
Mode of occupation:  Prof-specialty
Mode of native.country:  United-States


In [179]:
train['workclass'].fillna('Private', inplace = True)
train['occupation'].fillna('Prof-specialty', inplace = True)
train['native.country'].fillna('United-States', inplace = True)

In [180]:
train.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64

In [181]:
train['target'].value_counts()/train.shape[0]

 <=50K    0.75919
 >50K     0.24081
Name: target, dtype: float64

In [182]:
pd.crosstab(train['education'],train['target'],margins = True)/train.shape[0] * 100

target,<=50K,>50K,All
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10th,2.674979,0.190412,2.865391
11th,3.424342,0.18427,3.608612
12th,1.228463,0.101348,1.329812
1st-4th,0.497528,0.018427,0.515955
5th-6th,0.973557,0.049139,1.022696
7th-8th,1.861122,0.122846,1.983969
9th,1.495654,0.082921,1.578576
Assoc-acdm,2.463069,0.813857,3.276926
Assoc-voc,3.135653,1.108688,4.244341
Bachelors,9.625012,6.821044,16.446055


In [183]:
from sklearn.preprocessing import LabelEncoder

for col_name in train.columns:
    if train[col_name].dtype == 'object':
        LE = LabelEncoder()
        LE.fit(train[col_name].values)
        train[col_name] = LE.transform(train[col_name].values)

In [184]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,5,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,3,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,3,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,3,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [185]:
train['target'].value_counts()

0    24720
1     7841
Name: target, dtype: int64

In [186]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score

In [187]:
y = train['target']
del train['target']
X = train


In [188]:
train_x,test_x,train_y,test_y = train_test_split(X,y,test_size = 0.3,stratify = y)

In [189]:
clf = RandomForestClassifier(n_estimators=500,
                             max_depth=6)

clf.fit(train_x,train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [190]:
prediction = clf.predict(test_x)

In [191]:
accuracy_score(test_y,prediction)

0.85382331866107075

In [192]:
test['workclass'].fillna('Private', inplace = True)
test['occupation'].fillna('Prof-specialty', inplace = True)
test['native.country'].fillna('United-States', inplace = True)

In [193]:
for col_name in test.columns:
    if test[col_name].dtype == 'object':
        LE = LabelEncoder()
        LE.fit(test[col_name].values)
        test[col_name] = LE.transform(test[col_name].values)

In [194]:
test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,25,3,226802,1,7,4,6,3,2,1,0,0,40,37,0
1,38,3,89814,11,9,2,4,0,4,1,0,0,50,37,0
2,28,1,336951,7,12,2,10,0,4,1,0,0,40,37,1
3,44,3,160323,15,10,2,6,0,2,1,7688,0,40,37,1
4,18,8,103497,15,10,4,14,3,4,0,0,0,30,37,0


In [195]:
y_test_original = test['target']

del test['target']

test = test

In [196]:
prediction = clf.predict(test)

In [197]:
score = accuracy_score(y_test_original,prediction)
print('Test set prediction accuracy: {}'.format(score * 100) )

Test set prediction accuracy: 85.283459246975
