In [38]:
import pandas as pd
import numpy as np

# Load the training split and the test split from CSV files in the working directory.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
test_df = pd.read_csv(filepath_or_buffer='test.csv')

In [39]:
# Peek at the first training row to see which columns are available.
print(train_df.head(n=1))

   PassengerId  Survived  Pclass                     Name   Sex   Age  SibSp  \
0            1         0       3  Braund, Mr. Owen Harris  male  22.0      1   

   Parch     Ticket  Fare Cabin Embarked  
0      0  A/5 21171  7.25   NaN        S  


In [53]:
def _fill_missing_from_groups(frame, column, group_medians, fallback):
    """Fill NaNs in ``frame[column]`` in place from per-(Sex, Pclass) medians.

    Args:
        frame: DataFrame with 'Sex', 'Pclass' and ``column`` columns.
        column: Name of the column whose missing values are filled.
        group_medians: Series indexed by (Sex, Pclass) holding the estimate
            to use for each group.
        fallback: Value used when the group is absent from ``group_medians``
            or its median is itself missing.
    """
    missing = frame[column].isna()
    if not missing.any():
        # Nothing to impute; also avoids assigning an empty list below.
        return

    def estimate(sex, pclass):
        value = group_medians.get((sex, pclass), np.nan)
        # pd.isna (rather than np.isnan) also tolerates None / non-float scalars.
        return fallback if pd.isna(value) else value

    # Only rows that are actually missing need a lookup.
    frame.loc[missing, column] = [
        estimate(sex, pclass)
        for sex, pclass in zip(frame.loc[missing, 'Sex'], frame.loc[missing, 'Pclass'])
    ]


def preprocess_data(train=None, test=None):
    """Clean and enrich the train/test frames in place.

    Steps, applied to both frames (all statistics come from the train frame
    as it is when the step starts):
      * Age: add an ``Age_Missing`` 0/1 indicator, then fill missing ages with
        the median age of the row's (Sex, Pclass) group, falling back to the
        overall median.
      * Family_Size: ``Parch + SibSp + 1``.
      * Fare: fill missing fares the same way as Age.
      * Cabin: keep only the first character; missing becomes 'U'.
      * Embarked: missing becomes 'U'.

    The function is safe to run more than once on the same frames: an
    existing ``Age_Missing`` column is kept, because recomputing it after the
    ages were imputed would reset every flag to 0.

    Args:
        train: Training DataFrame. Defaults to the notebook-level ``train_df``.
        test: Test DataFrame. Defaults to the notebook-level ``test_df``.

    Returns:
        None. Both frames are modified in place.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if train is None:
        train = train_df
    if test is None:
        test = test_df
    frames = (train, test)

    # --- Age: record which ages were missing, then impute them ---
    for frame in frames:
        # Guard keeps the indicator intact when the cell is re-run.
        if 'Age_Missing' not in frame.columns:
            frame['Age_Missing'] = frame['Age'].isna().astype(int)

    global_age = train['Age'].median()
    group_age = train.groupby(['Sex', 'Pclass'])['Age'].median()
    for frame in frames:
        _fill_missing_from_groups(frame, 'Age', group_age, global_age)

    # --- Family_Size: siblings/spouses + parents/children + the passenger ---
    for frame in frames:
        frame['Family_Size'] = frame['Parch'] + frame['SibSp'] + 1

    # --- Fare: impute from (Sex, Pclass) medians of the train frame ---
    global_fare = train['Fare'].median()
    group_fare = train.groupby(['Sex', 'Pclass'])['Fare'].median()
    for frame in frames:
        _fill_missing_from_groups(frame, 'Fare', group_fare, global_fare)

    # --- Cabin: reduce to its first letter, 'U' when missing ---
    for frame in frames:
        frame['Cabin'] = frame['Cabin'].str[0].fillna('U')

    # --- Embarked: 'U' when missing ---
    for frame in frames:
        frame['Embarked'] = frame['Embarked'].fillna('U')


In [54]:
# Apply the in-place preprocessing to the notebook-level train_df / test_df.
preprocess_data()

# Spot-check the first ten Cabin values after preprocessing.
print(train_df['Cabin'].head(n=10))



0    U
1    C
2    U
3    C
4    U
5    U
6    E
7    U
8    U
9    U
Name: Cabin, dtype: object
