In [1]:
import pandas as pd
import os
FILE_PATH = os.path.join(os.getcwd(), "../data-files", "titanic-passengers.csv")
DF = pd.read_csv(FILE_PATH)

Analyze missing values

In [2]:
def analyze_missing_values(df):
    missing_values = df.isnull().sum() / len(df) * 100
    print(missing_values.apply(lambda x: "%.2f%%" % x).sort_values(ascending=False))

analyze_missing_values(DF)

Cabin          77.10%
Age            19.87%
Embarked        0.22%
PassengerId     0.00%
Survived        0.00%
Pclass          0.00%
Name            0.00%
Sex             0.00%
SibSp           0.00%
Parch           0.00%
Ticket          0.00%
Fare            0.00%
dtype: object


Simple imputation values by using mean of a variable

In [3]:
def impute_mean(df):
    # Choose Age column, for example
    clone_df = df.copy()
    age_mean = clone_df['Age'].mean()
    clone_df['Age'].fillna(age_mean, inplace=True)
    print(clone_df[clone_df['PassengerId'] == 889])
impute_mean(DF)

     PassengerId  Survived  Pclass                                      Name  \
888          889         0       3  Johnston, Miss. Catherine Helen "Carrie"   

        Sex        Age  SibSp  Parch      Ticket   Fare Cabin Embarked  
888  female  29.699118      1      2  W./C. 6607  23.45   NaN        S  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clone_df['Age'].fillna(age_mean, inplace=True)


Create simple imputation for DF

In [4]:
def simple_forward_backward_impute(df):
    clone_df = df.copy()
    clone_df['age_ffill'] = df['Age'].fillna(method='ffill')
    clone_df['age_bfill'] = df['Age'].fillna(method='bfill')
    print(clone_df[clone_df['PassengerId'] == 889])

simple_forward_backward_impute(DF)

     PassengerId  Survived  Pclass                                      Name  \
888          889         0       3  Johnston, Miss. Catherine Helen "Carrie"   

        Sex  Age  SibSp  Parch      Ticket   Fare Cabin Embarked  age_ffill  \
888  female  NaN      1      2  W./C. 6607  23.45   NaN        S       19.0   

     age_bfill  
888       26.0  


  clone_df['age_ffill'] = df['Age'].fillna(method='ffill')
  clone_df['age_bfill'] = df['Age'].fillna(method='bfill')


Use Interpolation for handling missing values

In [5]:
def interpolate_values(df):
    clone_df = df.copy()
    clone_df = clone_df.interpolate(method='linear', limit_direction='forward')
    print(clone_df[clone_df['PassengerId'] == 889])
interpolate_values(DF)

     PassengerId  Survived  Pclass                                      Name  \
888          889         0       3  Johnston, Miss. Catherine Helen "Carrie"   

        Sex   Age  SibSp  Parch      Ticket   Fare Cabin Embarked  
888  female  22.5      1      2  W./C. 6607  23.45   NaN        S  


  clone_df = clone_df.interpolate(method='linear', limit_direction='forward')
