In [1]:
import pandas as pd
import numpy as np
dataset = pd.read_csv('titanic.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Columns with missing values

In [2]:
missing_values_count_df = pd.DataFrame(data = np.sum(dataset.isnull())/dataset.shape[0], columns = ['Missing_Values'])

missing_values_df = missing_values_count_df.query('Missing_Values > 0')
display(missing_values_df)

Unnamed: 0,Missing_Values
Age,0.198653
Cabin,0.771044
Embarked,0.002245


### Imputing values

In [3]:
print(f'Age mean {dataset[["Age"]].mean().values}') 
print(f'Age median {dataset[["Age"]].median().values}') 
print(f'Age mode {dataset[["Age"]].mode().values}') 

Age mean [29.69911765]
Age median [28.]
Age mode [[24.]]


In [4]:
print(f'Embarked mean {dataset[["Embarked"]].mean().values}') 
print(f'Embarked median {dataset[["Embarked"]].median().values}') 
print(f'Embarked mode {dataset[["Embarked"]].mode().values}') 

Embarked mean []
Embarked median []
Embarked mode [['S']]


### Imputing grouped values

In [5]:
grouped = dataset.groupby(['Survived', 'Pclass', 'Sex'])

for i, df in grouped:
    print(f'Age Pclass {set(df["Pclass"].values)}')
    print(f'Age mean {df[["Age"]].mean().values}') 
    print(f'Age median {df[["Age"]].median().values}') 
    print(f'Age mode {df[["Age"]].mode().values}') 
    print('\n')

    
    print(f'Embarked Pclass {set(df["Pclass"].values)}')
    print(f'Embarked mean {df[["Embarked"]].mean().values}') 
    print(f'Embarked median {df[["Embarked"]].median().values}') 
    print(f'Embarked mode {df[["Embarked"]].mode().values}') 
    print('\n')

Age Pclass {1}
Age mean [38.23344086]
Age median [37.]
Age mode [[35.]
 [36.]]


Embarked Pclass {1}
Embarked mean []
Embarked median []
Embarked mode [['S']]


Age Pclass {2}
Age mean [29.87763006]
Age median [29.]
Age mode [[24.]
 [34.]]


Embarked Pclass {2}
Embarked mean []
Embarked median []
Embarked mode [['S']]


Age Pclass {3}
Age mean [25.14061972]
Age median [24.]
Age mode [[22.]]


Embarked Pclass {3}
Embarked mean []
Embarked median []
Embarked mode [['S']]




In [18]:
null_indexes = list(dataset.loc[dataset['Age'].isnull()].index)


In [17]:
dataset.iloc[null_indexes].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,26.555556,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,25.901566,0,0,244373,13.0,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,20.646118,0,0,2649,7.225,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,26.555556,0,0,2631,7.225,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,20.646118,0,0,330959,7.8792,,Q


In [10]:
from enum import Enum

class ImputeStrategy(Enum):
    MEAN = 'mean'
    MEDIAN = 'median'
    MODE = 'mode'
    CONSTANT = 'constant'
    REGRESSOR_MODEL = 'regressor_model'
    CLASSIFICATION_MODEL = 'clasification_model'
    
fill_mean = lambda col: col.fillna(col.mean())
fill_median = lambda col: col.fillna(col.median())
fill_mode = lambda col: col.fillna(col.mode()[0])

impute_strategies = {
  ImputeStrategy.MEAN: fill_mean,
  ImputeStrategy.MEDIAN: fill_median,
  ImputeStrategy.MODE: fill_mode
}
    
def impute(dataset, impute_strategy):
    if impute_strategy in [ImputeStrategy.MEAN, ImputeStrategy.MEDIAN, ImputeStrategy.MODE]:
        return dataset.apply(impute_strategies[impute_strategy], axis=0)
    else:
        return dataset

def impute_grouped(dataset, target_feature, features, impute_strategy):
    dataset[target_feature] = dataset.groupby(features)[[target_feature]].apply(lambda df: impute(df, ImputeStrategy.MEAN))
    return dataset

In [15]:
dataset_tmp = impute_grouped(dataset, 'Age', ['Survived', 'Pclass', 'Sex'], ImputeStrategy.MEAN)
dataset_tmp.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
dataset_tmp.loc[dataset_tmp['Pclass'] == 1]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.000000,0,0,17463,51.8625,E46,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.000000,0,0,113783,26.5500,C103,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.000000,0,0,113788,35.5000,A6,S
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.000000,3,2,19950,263.0000,C23 C25 C27,S
30,31,0,1,"Uruchurtu, Don. Manuel E",male,40.000000,0,0,PC 17601,27.7208,,C
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,35.368197,1,0,PC 17569,146.5208,B78,C
34,35,0,1,"Meyer, Mr. Edgar Joseph",male,28.000000,1,0,PC 17604,82.1708,,C
35,36,0,1,"Holverson, Mr. Alexander Oskar",male,42.000000,1,0,113789,52.0000,,S
