In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
# Column labels assigned to the raw file, in file order.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'target',
]

# Explicit dtype for every column. The keys must match `column_names` exactly:
# the previous 'education_num' key (underscore) did not correspond to the
# 'education-num' column, and 'capital-loss' had no entry at all.
dtypes = {
    'age': int,
    'workclass': 'category',
    'fnlwgt': int,
    'education': 'category',
    'education-num': int,
    'marital-status': 'category',
    'occupation': 'category',
    'relationship': 'category',
    'race': 'category',
    'sex': 'category',
    'capital-gain': int,
    'capital-loss': int,
    'hours-per-week': int,
    'native-country': 'category',
    'target': 'category'
}

# Guard against the two definitions drifting apart again.
assert set(dtypes) == set(column_names), set(dtypes) ^ set(column_names)

In [3]:
# Column labels come from `column_names` (header=None); '?' — with or without
# a leading space — is parsed as a missing value.
df = pd.read_csv(
    '../data/raw/adult/adult.data',
    names=column_names,
    header=None,
    index_col=False,
    dtype=dtypes,
    na_values=[' ?', '?'],
)

In [4]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Names of the columns that contain at least one missing value, in column order
with_nans = [name for name in df.columns if df[name].isna().any()]
with_nans

['workclass', 'occupation', 'native-country']

In [6]:
# Features are every column except 'target'; 'target' itself is the label
is_feature = df.columns != 'target'
X = df.loc[:, is_feature]
y = df['target']

In [7]:
# Impute every column listed in `with_nans` with a shallow decision tree that is
# trained on the rows where that column is observed, using only the columns
# that have no missing values as predictors.
IMPUTE_SEED = 0  # fixed seed: ties between equally good splits are otherwise broken randomly
filler = DecisionTreeClassifier(max_depth=5, random_state=IMPUTE_SEED)

has_nans = X.columns.isin(with_nans)
unknown = X.loc[:, has_nans].copy()
# Fully observed columns, one-hot encoded, serve as the predictor matrix
known = pd.get_dummies(X.loc[:, ~has_nans], drop_first=True)

for col in with_nans:
    # Compute the mask once, before any value in this column is filled in
    missing = unknown[col].isna()
    filler.fit(known.loc[~missing], unknown.loc[~missing, col])
    unknown.loc[missing, col] = filler.predict(known.loc[missing])

# Every gap must have been filled by the loop above
assert not unknown.isna().any().any()

In [8]:
# Put the encoded predictors and the imputed columns side by side, then
# one-hot encode whatever categorical columns remain.
# NOTE(review): no drop_first here, unlike the encoding of `known` — confirm this is intended.
combined = pd.concat([known, unknown], axis=1)
X_filled = pd.get_dummies(combined)

In [9]:
# Inspect the column labels of the final encoded feature matrix
X_filled.columns

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'education_ 11th', 'education_ 12th',
       'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th',
       'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc',
       'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad',
       'education_ Masters', 'education_ Preschool', 'education_ Prof-school',
       'education_ Some-college', 'marital-status_ Married-AF-spouse',
       'marital-status_ Married-civ-spouse',
       'marital-status_ Married-spouse-absent',
       'marital-status_ Never-married', 'marital-status_ Separated',
       'marital-status_ Widowed', 'relationship_ Not-in-family',
       'relationship_ Other-relative', 'relationship_ Own-child',
       'relationship_ Unmarried', 'relationship_ Wife',
       'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ White',
       'sex_ Male', 'workclass_ Federal-gov', 'workclass_ Loc