In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.impute import SimpleImputer
except ImportError:
    # scikit-learn < 0.20 only ships the legacy imputer class.
    from sklearn.preprocessing import Imputer
    SimpleImputer = Imputer

%matplotlib inline

In [2]:
def build_model(dataset, test_size=0.3, random_state=17, label='Label'):
    """Train a decision tree on a hold-out split and return its test accuracy.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns plus the target column named by ``label``.
    test_size : float or int, default 0.3
        Forwarded unchanged to ``train_test_split``.
    random_state : int, default 17
        Seed used both for the split and for the tree, so repeated calls on
        the same frame give the same score.
    label : str, default 'Label'
        Name of the target column; every other column is used as a feature.

    Returns
    -------
    float
        ``accuracy_score`` of the tree's predictions on the held-out rows.
    """
    features = dataset.drop(label, axis=1)
    target = dataset[label]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)

    clf = DecisionTreeClassifier(random_state=random_state)
    clf.fit(X_train, y_train)

    return accuracy_score(y_test, clf.predict(X_test))

In [3]:
df = pd.read_csv('datasets/employee-attrition/employee-attrition-missing.csv')

# Demonstration: the raw frame still contains NaN. With the scikit-learn
# version this notebook was recorded on, fitting raises ValueError. The error
# is caught and printed so that "Restart & Run All" can continue past this
# cell; if the installed tree accepts NaN, the score is printed instead.
try:
    print(build_model(df))
except ValueError as err:
    print('ValueError: {}'.format(err))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [4]:
# Preview the first rows to see which columns contain NaN.
df.head()

Unnamed: 0,TotalWorkingYears,MonthlyIncome,Overtime,DailyRate,Label
0,,6725,0,498.0,0
1,12.0,2782,0,,0
2,9.0,2468,0,,0
3,8.0,5003,0,549.0,0
4,12.0,8578,0,,0


In [5]:
# Share of rows that hold at least one NaN: compare the row count before and
# after discarding incomplete rows.
num_orig_rows = df.shape[0]
num_full_rows = df.dropna().shape[0]

num_incomplete_rows = num_orig_rows - num_full_rows
num_incomplete_rows / float(num_orig_rows)

0.5653061224489796

### Solution 1: Remove rows or columns with NaN

In [6]:
# Keep only the rows in which every column is populated, then score the model
# on what remains.
df_droprows = df.dropna(axis=0, how='any')
build_model(df_droprows)

0.75520833333333337

In [7]:
# Drop every column that contains at least one NaN instead of naming the
# complete columns by hand, so the cell keeps working if the columns change.
# The target column must itself be free of NaN, otherwise it is dropped too
# and build_model() cannot find it.
df_dropcols = df.dropna(axis=1)
build_model(df_dropcols)

0.77324263038548757

### Solution 2: Fill NaN values with a `-1` sentinel value

In [8]:
# Replace each NaN with the sentinel -1 so that every row and column is kept.
df_sentinel = df.fillna(-1)
build_model(df_sentinel)

0.75283446712018143

### Solution 3a: Impute values using the mean

In [9]:
# Replace each NaN with the mean of its column. The imputer treats NaN as the
# missing-value marker by default and works column by column.
# NOTE: the imputer is fit on the whole frame, before build_model() splits it,
# so the fill values are computed from rows that later land in the test split.
imp = SimpleImputer(strategy='mean')

# Reuse the frame's own labels rather than retyping them: the hand-written
# list spelled the 'Overtime' column as 'OverTime'.
df_imputed = pd.DataFrame(imp.fit_transform(df),
                          columns=df.columns, index=df.index)
build_model(df_imputed)

0.79365079365079361

### Solution 3b: Impute values using the median

In [10]:
# Replace each NaN with the median of its column. The imputer treats NaN as
# the missing-value marker by default and works column by column.
# NOTE: the imputer is fit on the whole frame, before build_model() splits it,
# so the fill values are computed from rows that later land in the test split.
imp = SimpleImputer(strategy='median')

# Reuse the frame's own labels rather than retyping them: the hand-written
# list spelled the 'Overtime' column as 'OverTime'.
df_imputed = pd.DataFrame(imp.fit_transform(df),
                          columns=df.columns, index=df.index)
build_model(df_imputed)

0.78684807256235823