# Imports

In [1]:
import pandas as pd
import random
import numpy as np
import time

import sys
from sklearn import preprocessing
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest

# Data

In [2]:
#Let us create our fake dataframe
age = random.sample(range(1, 100), 20) #List of 20 random numbers representing age

gender = random.choices(['MALE','FEMALE'], k=20) #List of 20 random choices of Male or Female

income = random.sample(range(10000, 100000), 20) #List of 20 random numbers representing income

hairColor = random.choices(['BLACK','BROWN','BLONDE','RED'], k=20) #List of 20 random choices representing hair

df = pd.DataFrame(list(zip(age,gender,income,hairColor)),
               columns =['Age', 'Gender','Income','Hair_Color'])
df

Unnamed: 0,Age,Gender,Income,Hair_Color
0,60,FEMALE,35432,RED
1,78,MALE,43999,RED
2,90,MALE,70498,BLONDE
3,12,FEMALE,73017,BROWN
4,52,MALE,73130,RED
5,82,FEMALE,47581,BLONDE
6,68,FEMALE,44236,BROWN
7,99,MALE,84706,RED
8,31,FEMALE,42205,RED
9,95,MALE,53147,BROWN


In [3]:
#Let us randomly remove 20 values to create missingness in the data
for i in range(20):
    row = random.randint(0,20)
    col = random.choice(df.columns)
    
    df.loc[row,col] = np.nan

In [4]:
#Our original dataset
df

Unnamed: 0,Age,Gender,Income,Hair_Color
0,60.0,FEMALE,,RED
1,78.0,MALE,43999.0,RED
2,,MALE,70498.0,BLONDE
3,,FEMALE,73017.0,BROWN
4,52.0,MALE,73130.0,RED
5,82.0,FEMALE,,BLONDE
6,68.0,,44236.0,
7,99.0,MALE,,
8,31.0,FEMALE,42205.0,RED
9,95.0,,53147.0,


# Note to the Reader

The medium articles briefly mention a general naive method of dealing with missing data. This naive method is either to drop the rows or columns containing missingness. There is no standard approach to using this method as it depends on the specific case at hand. 


Usually I like perserving as much data as possible and so I try my best to drop as little as possible. There are times when you cannot preserve rows/columns. For the times you can though, below we take some different approaches to filling in the missing values. 


# Naive Imputation (Numerical Data)

In [5]:
#Lets see which columns are numeric
df.dtypes

Age           float64
Gender         object
Income        float64
Hair_Color     object
dtype: object

In [6]:
#Age and Income can be imputed via Mean Column Replacement 
ageMean = df['Age'].mean()
incomeMean = df['Income'].mean()

df['Age'] = df['Age'].fillna(ageMean)
df['Income'] = df['Income'].fillna(incomeMean)

In [7]:
df

Unnamed: 0,Age,Gender,Income,Hair_Color
0,60.0,FEMALE,56824.866667,RED
1,78.0,MALE,43999.0,RED
2,63.866667,MALE,70498.0,BLONDE
3,63.866667,FEMALE,73017.0,BROWN
4,52.0,MALE,73130.0,RED
5,82.0,FEMALE,56824.866667,BLONDE
6,68.0,,44236.0,
7,99.0,MALE,56824.866667,
8,31.0,FEMALE,42205.0,RED
9,95.0,,53147.0,


# Naive Imputation (Categorical Data)

In [8]:
#Let's see which columns are categorical
df.dtypes

Age           float64
Gender         object
Income        float64
Hair_Color     object
dtype: object

In [9]:
#Gender and Hair_Color can be imputed via Mode Column Replacement 
genderMode = df['Gender'].mode()
hairColorMode = df['Hair_Color'].mode()

df['Gender'] = df['Gender'].fillna(genderMode)
df['Hair_Color'] = df['Hair_Color'].fillna(hairColorMode)

In [10]:
df

Unnamed: 0,Age,Gender,Income,Hair_Color
0,60.0,FEMALE,56824.866667,RED
1,78.0,MALE,43999.0,RED
2,63.866667,MALE,70498.0,BLONDE
3,63.866667,FEMALE,73017.0,BROWN
4,52.0,MALE,73130.0,RED
5,82.0,FEMALE,56824.866667,BLONDE
6,68.0,,44236.0,
7,99.0,MALE,56824.866667,
8,31.0,FEMALE,42205.0,RED
9,95.0,,53147.0,


# MissForest Imputation

In [11]:
#Let us create an even larger dataset just to see the computational time required to fill in values 100k Rows

#Let us create our fake dataframe
ageLarge = [random.randint(0,100) for i in range(100000)]

genderLarge = random.choices(['MALE','FEMALE'], k=100000) 

incomeLarge = [random.randint(10000, 100000) for i in range(100000)] 

hairColorLarge = random.choices(['BLACK','BROWN','BLONDE','RED'], k=100000)

dfLarge = pd.DataFrame(list(zip(ageLarge,genderLarge,incomeLarge,hairColorLarge)),
               columns =['Age', 'Gender','Income','Hair_Color'])
dfLarge


Unnamed: 0,Age,Gender,Income,Hair_Color
0,67,MALE,72150,BLONDE
1,67,FEMALE,87073,RED
2,57,FEMALE,92811,BROWN
3,94,MALE,77874,BROWN
4,89,FEMALE,75981,BLONDE
...,...,...,...,...
99995,17,FEMALE,40911,BLONDE
99996,32,FEMALE,57111,BLACK
99997,88,FEMALE,22049,BLONDE
99998,24,MALE,56642,RED


In [12]:
for i in range(1000):
    row = random.randint(0,100001)
    col = random.choice(dfLarge.columns)
    
    dfLarge.loc[row,col] = np.nan

In [13]:
#For the cateogorical columns we will encode np.nan as the string 'missing'
dfLarge['Gender'].fillna('missing')
dfLarge['Hair_Color'].fillna('missing')

0        BLONDE
1           RED
2         BROWN
3         BROWN
4        BLONDE
          ...  
99995    BLONDE
99996     BLACK
99997    BLONDE
99998       RED
99999     BROWN
Name: Hair_Color, Length: 100000, dtype: object

In [14]:
#Now we encode cateogorical columns by mapping them to integer values
dfLarge['Gender'] = dfLarge.Gender.astype('category').cat.codes # 0, 1, 2 Each gender plus missing
dfLarge['Hair_Color'] = dfLarge.Hair_Color.astype('category').cat.codes # 0, 1, 2, 3, 4, 5 Each hair plus missing

In [15]:
#We will time our MissForest imputation to see how long it takes on a dataset of this size
startTime = time.time()

imputer = MissForest()

dfLarge_Imputed = imputer.fit_transform(dfLarge)


endTime = time.time()

Iteration: 0
Iteration: 1
Iteration: 2


In [16]:
#How long it took to use MissForest to impute values in seconds
print(endTime-startTime)

40.14900517463684


In [17]:
#Returns a numpy array 
dfLarge_Imputed

array([[6.7000e+01, 1.0000e+00, 7.2150e+04, 1.0000e+00],
       [6.7000e+01, 0.0000e+00, 8.7073e+04, 3.0000e+00],
       [5.7000e+01, 0.0000e+00, 9.2811e+04, 2.0000e+00],
       ...,
       [8.8000e+01, 0.0000e+00, 2.2049e+04, 1.0000e+00],
       [2.4000e+01, 1.0000e+00, 5.6642e+04, 3.0000e+00],
       [9.9000e+01, 1.0000e+00, 3.6876e+04, 2.0000e+00]])

In [18]:
#Convert back to a dataframe
dfLarge_Imputed = pd.DataFrame(dfLarge_Imputed)

In [19]:
dfLarge_Imputed

Unnamed: 0,0,1,2,3
0,67.0,1.0,72150.0,1.0
1,67.0,0.0,87073.0,3.0
2,57.0,0.0,92811.0,2.0
3,94.0,1.0,77874.0,2.0
4,89.0,0.0,75981.0,1.0
...,...,...,...,...
99995,17.0,0.0,40911.0,1.0
99996,32.0,0.0,57111.0,0.0
99997,88.0,0.0,22049.0,1.0
99998,24.0,1.0,56642.0,3.0


In [20]:
dfLarge_Imputed.isna().sum() #No missing for any of the columns

0    0
1    0
2    0
3    0
dtype: int64

In [21]:
#Now you can rename the columns and get to running a model on this data!