Uses the [Census Income Data Set](http://archive.ics.uci.edu/ml/datasets/Census+Income).

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
columns = [
    'age', 'workclass', 'fnlwgt', 'education',
    'education-num', 'marital-status',
    'occupation', 'relationship', 'race',
    'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'label'
]

In [3]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult'

In [4]:
df_train = pd.read_csv(f'{url}/adult.data', names=columns, na_values='?', sep=', ')
df_test = pd.read_csv(f'{url}/adult.test', names=columns, na_values='?', sep=', ')

df_train.dropna(how='any', inplace=True)
df_test.dropna(how='any', inplace=True)

  """Entry point for launching an IPython kernel.
  


In [5]:
df_train.shape

(30162, 15)

In [6]:
df_test.shape

(15060, 15)

In [7]:
num_train = df_train.shape[0]

In [8]:
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
6,34,Private,198693.0,10th,6.0,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,30.0,United-States,<=50K.


In [10]:
df_test['label'].replace({ '<=50K.': 0, '>50K.': 1 }, inplace=True)
df_train['label'].replace({ '<=50K': 0, '>50K': 1 }, inplace=True)

In [11]:
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [12]:
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,0
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,0
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,1
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,1
6,34,Private,198693.0,10th,6.0,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,30.0,United-States,0


In [13]:
df = pd.concat([df_train, df_test])

In [14]:
df_one_hot = pd.get_dummies(df)
df_train_one_hot = df_one_hot[:num_train]
df_test_one_hot = df_one_hot[num_train:]

In [15]:
df_train_one_hot.shape[0] == num_train

True

In [16]:
X_train = df_train_one_hot.drop(columns=['label']).values
y_train = df_train_one_hot['label'].values

In [17]:
X_test = df_test_one_hot.drop(columns=['label']).values
y_test = df_test_one_hot['label'].values

In [18]:
X_train.shape[0] == num_train

True

In [19]:
parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 4, 8, 16, 32]
}
clf = GridSearchCV(DecisionTreeClassifier(), parameters, scoring='f1_weighted')
clf.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'min_samples_split': [2, 4, 8, 16, 32],
                         'splitter': ['best', 'random']},
             scoring='f1_weighted')

In [20]:
print(clf.best_score_)
print(clf.best_params_)

0.8265492929673008
{'criterion': 'gini', 'min_samples_split': 32, 'splitter': 'best'}


In [21]:
train_preds = np.where(clf.predict(X_train) > 0.5, 1, 0)
(train_preds == y_train).sum() / num_train

0.9106491611962072

In [22]:
probs = clf.predict(X_test)

In [23]:
predictions = np.where(probs > 0.5, 1, 0)

In [24]:
df_test['prediction'] = predictions

In [25]:
df_test['label'].replace({ 0: '<=50K', 1: '>50K' }, inplace=True)
df_test['prediction'].replace({ 0: '<=50K', 1: '>50K' }, inplace=True)

In [26]:
df_test.to_csv('census-income.csv', index=False, float_format='%.f')