In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
plt.style.use('seaborn-colorblind')
%matplotlib inline

In [16]:
data = pd.read_csv('data/titanic.csv', usecols=use_cols)
data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Fare          0
dtype: int64

## Missing value checking
check the total number & percentage of missing values
per variable of a pandas Dataframe

In [17]:
def check_missing(data,output_path=None):
    """
    check the total number & percentage of missing values
    per variable of a pandas Dataframe
    """
    
    result = pd.concat([data.isnull().sum(),data.isnull().mean()],axis=1)
    result = result.rename(index=str,columns={0:'total missing',1:'proportion'})
    if output_path is not None:
        result.to_csv(output_path+'missing.csv')
        print('result saved at', output_path, 'missing.csv')
    return result
check_missing(data=data)

Unnamed: 0,total missing,proportion
Survived,0,0.0
Pclass,0,0.0
Sex,0,0.0
Age,177,0.198653
SibSp,0,0.0
Fare,0,0.0


## Listwise deletion  
excluding all cases (listwise) that have missing values

In [18]:

def drop_missing(data,axis=0):
    """
    Listwise deletion:
    excluding all cases (listwise) that have missing values

    Parameters
    ----------
    axis: drop cases(0)/columns(1),default 0

    Returns
    -------
    Pandas dataframe with missing cases/columns dropped
    """    
    
    data_copy = data.copy(deep=True)
    data_copy = data_copy.dropna(axis=axis,inplace=False)
    return data_copy
drop_missing(data=data)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.2500
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.9250
3,1,1,female,35.0,1,53.1000
4,0,3,male,35.0,0,8.0500
...,...,...,...,...,...,...
885,0,3,female,39.0,0,29.1250
886,0,2,male,27.0,0,13.0000
887,1,1,female,19.0,0,30.0000
889,1,1,male,26.0,0,30.0000


## Add a variable to denote NA
creating an additional variable indicating whether the data was missing for that observation

In [21]:
def add_var_denote_NA(data,NA_col=[]):
    """
    creating an additional variable indicating whether the data 
    was missing for that observation (1) or not (0).
    """
  
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_is_NA'] = np.where(data_copy[i].isnull(),1,0)
        else:
            warn("Column %s has no missing cases" % i)
            
    return data_copy
add_var_denote_NA(data=data)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.2500
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.9250
3,1,1,female,35.0,1,53.1000
4,0,3,male,35.0,0,8.0500
...,...,...,...,...,...,...
886,0,2,male,27.0,0,13.0000
887,1,1,female,19.0,0,30.0000
888,0,3,female,,1,23.4500
889,1,1,male,26.0,0,30.0000


In [8]:
def impute_NA_with_arbitrary(data,impute_value,NA_col=[]):
    """
    replacing NA with arbitrary values. 
    """
    
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_'+str(impute_value)] = data_copy[i].fillna(impute_value)
        else:
            warn("Column %s has no missing cases" % i)
    return data_copy
impute_NA_with_arbitrary(data=data,impute_value=-999)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.2500
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.9250
3,1,1,female,35.0,1,53.1000
4,0,3,male,35.0,0,8.0500
...,...,...,...,...,...,...
886,0,2,male,27.0,0,13.0000
887,1,1,female,19.0,0,30.0000
888,0,3,female,,1,23.4500
889,1,1,male,26.0,0,30.0000


## Mean/Median/Mode Imputation
Replacing the NA by mean/median/mode of that variable

In [22]:
def impute_NA_with_avg(data,strategy='mean',NA_col=[]):
    """
    replacing the NA with mean/median/most frequent values of that variable. 
    Note it should only be performed over training set and then propagated to test set.
    """
    
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            if strategy=='mean':
                data_copy[i+'_impute_mean'] = data_copy[i].fillna(data[i].mean())
            elif strategy=='median':
                data_copy[i+'_impute_median'] = data_copy[i].fillna(data[i].median())
            elif strategy=='mode':
                data_copy[i+'_impute_mode'] = data_copy[i].fillna(data[i].mode()[0])
        else:
            warn("Column %s has no missing" % i)
    return data_copy
impute_NA_with_avg(data=data,strategy='median',NA_col=['Age'])

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_impute_median
0,0,3,male,22.0,1,7.2500,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.9250,26.0
3,1,1,female,35.0,1,53.1000,35.0
4,0,3,male,35.0,0,8.0500,35.0
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,13.0000,27.0
887,1,1,female,19.0,0,30.0000,19.0
888,0,3,female,,1,23.4500,28.0
889,1,1,male,26.0,0,30.0000,26.0


##  End of distribution Imputation
replacing the NA by values that are at the far end of the distribution of that variable
calculated by mean + 3*std

In [23]:
def impute_NA_with_end_of_distribution(data,NA_col=[]):
    """
    replacing the NA by values that are at the far end of the distribution of that variable
    calculated by mean + 3*std
    """
    
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_impute_end_of_distri'] = data_copy[i].fillna(data[i].mean()+3*data[i].std())
        else:
            warn("Column %s has no missing" % i)
    return data_copy 
impute_NA_with_end_of_distribution(data=data,NA_col=['Age'])

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_impute_end_of_distri
0,0,3,male,22.0,1,7.2500,22.00000
1,1,1,female,38.0,1,71.2833,38.00000
2,1,3,female,26.0,0,7.9250,26.00000
3,1,1,female,35.0,1,53.1000,35.00000
4,0,3,male,35.0,0,8.0500,35.00000
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,13.0000,27.00000
887,1,1,female,19.0,0,30.0000,19.00000
888,0,3,female,,1,23.4500,73.27861
889,1,1,male,26.0,0,30.0000,26.00000


##  Random Imputation
replacing the NA with random sampling from the pool of available observations of the variable


In [24]:
def impute_NA_with_random(data,NA_col=[],random_state=0):
    """
    replacing the NA with random sampling from the pool of available observations of the variable
    """
    
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_random'] = data_copy[i]
            # extract the random sample to fill the na
            random_sample = data_copy[i].dropna().sample(data_copy[i].isnull().sum(), random_state=random_state)
            random_sample.index = data_copy[data_copy[i].isnull()].index
            data_copy.loc[data_copy[i].isnull(), str(i)+'_random'] = random_sample
        else:
            warn("Column %s has no missing" % i)
    return data_copy
impute_NA_with_random(data=data,NA_col=['Age'])

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_random
0,0,3,male,22.0,1,7.2500,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.9250,26.0
3,1,1,female,35.0,1,53.1000,35.0
4,0,3,male,35.0,0,8.0500,35.0
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,13.0000,27.0
887,1,1,female,19.0,0,30.0000,19.0
888,0,3,female,,1,23.4500,15.0
889,1,1,male,26.0,0,30.0000,26.0
