In [1]:
from typing import List
import pandas as pd
import numpy as np

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [3]:
# Locations of the train / validation / test CSV splits. All three are gs://
# URIs under the same "xoonij-titanic-mlops" bucket and are passed to
# pd.read_csv further down.
# NOTE(review): reading gs:// paths through pandas presumably needs gcsfs and
# valid credentials in the kernel environment - confirm before a fresh run.
train_output = "gs://xoonij-titanic-mlops/train.csv"
val_output = "gs://xoonij-titanic-mlops/val.csv"
test = "gs://xoonij-titanic-mlops/test.csv"

In [5]:
def drop_unused_columns(df: pd.DataFrame) -> pd.DataFrame:
  """Drops columns that are not used as features.

  Currently this removes only ``PassengerId``. The frame is modified in
  place (the rest of the pipeline relies on in-place mutation) and the same
  object is returned so the call can also be chained.

  Args:
    df: Frame containing a ``PassengerId`` column.

  Returns:
    The same ``df`` object, without ``PassengerId``.

  Raises:
    KeyError: If ``PassengerId`` is not present.
  """
  # DataFrame.drop(..., inplace=True) returns None, so its result must not be
  # returned directly; drop first, then hand back the mutated frame.
  df.drop(columns=['PassengerId'], inplace=True)
  return df

In [6]:
def create_title_feature(df: pd.DataFrame) -> pd.DataFrame:
  """Derives a grouped ``Title`` column from ``Name`` and drops ``Name``.

  The raw title is the text between the first comma and the following
  period of ``Name`` (left-stripped). Selected raw titles are then collapsed
  into broader groups; titles not listed below are kept as they are.

  Args:
    df: Frame with a ``Name`` column. Modified in place.

  Returns:
    The same ``df`` object with ``Title`` added and ``Name`` removed.
  """
  # Applied in this order. 'Mme' also appears in the last group, but by then
  # it has already been rewritten by the second group, so that entry is a no-op.
  title_groups = (
      (['Miss', 'Ms', 'Mlle'], 'Unmarried Feminine Title'),
      (['Mrs','Mme'], 'Married Feminine Title'),
      (['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Rev',], 'Military/Clergy'),
      (['Don', 'Sir'], 'Masculine Noble'),
      (['Lady', 'Mme', 'the Countess', 'Dona'], 'Feminine Noble'),
  )

  after_comma = df['Name'].str.split(',', expand=True)[1]
  titles = after_comma.str.split('.', expand=True)[0].str.lstrip()
  for raw_titles, group_label in title_groups:
    titles = titles.replace(raw_titles, group_label)

  df['Title'] = titles
  df.drop(columns=['Name'], inplace=True)
  return df

In [7]:
def fill_embarked(df):
  """Replaces missing ``Embarked`` values with ``'S'``.

  Args:
    df: pandas DataFrame with an ``Embarked`` column. Modified in place.

  Returns:
    The same ``df`` object with no missing values left in ``Embarked``.
  """
  # 'S' is used as the fill value per
  # https://www.encyclopedia-titanica.org/titanic-survivor/martha-evelyn-stone.html
  default_port = 'S'
  embarked_filled = df['Embarked'].fillna(value=default_port)
  df['Embarked'] = embarked_filled
  return df

In [8]:
def create_deck_feature(df: pd.DataFrame) -> pd.DataFrame:
  """Derives a ``Deck`` column from ``Cabin`` and drops ``Cabin``.

  ``Deck`` is the first character of ``Cabin``. Missing (or empty) cabins get
  the placeholder deck ``'M'``, and deck ``'T'`` is folded into ``'A'``.

  Args:
    df: Frame with a ``Cabin`` column. Modified in place.

  Returns:
    The same ``df`` object with ``Deck`` added and ``Cabin`` removed.
  """
  # An empty string has no first character, so treat it like a missing cabin
  # instead of letting s[0] raise IndexError.
  df['Deck'] = df['Cabin'].apply(
      lambda s: s[0] if pd.notnull(s) and len(s) > 0 else 'M')
  # Remap with a boolean mask rather than index labels: selecting by label
  # would also overwrite other rows sharing a label with a 'T' row when the
  # index is not unique.
  df.loc[df['Deck'] == 'T', 'Deck'] = 'A'
  df.drop(columns=['Cabin'], inplace=True)
  return df

In [9]:
def transform_dataset(df:pd.DataFrame, transformations: List) -> pd.DataFrame:
  """Applies each transformation to the dataset, in order.

  Every transformation is called with the current frame. If it returns a
  frame, that frame is passed to the next step; if it returns ``None`` (a
  purely in-place transformation), the current frame is carried forward.

  Args:
    df: Frame to transform. In-place transformations will mutate it.
    transformations: Callables taking a DataFrame; applied left to right.

  Returns:
    The frame produced by the last transformation (``df`` itself when the
    list is empty or every step works in place).
  """
  for transformation in transformations:
    result = transformation(df)
    # Keep the returned frame so steps that build a new DataFrame are not
    # silently discarded; fall back to df for steps that only mutate in place.
    if result is not None:
      df = result
  return df

In [15]:
class MissingAgeImputer:
  """Imputes missing ``Age`` values with a Random Forest Regressor.

  The regressor is trained on rows where ``Age`` and every predictor column
  are present, and is then used to predict ``Age`` for rows where it is
  missing. ``Sex`` and ``Embarked`` are label-encoded before being passed to
  the model.
  """

  # Predictor columns, in the order they are fed to the regressor.
  _PREDICTORS = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']
  _TARGET = 'Age'

  def __init__(self, random_state=None):
    """Constructor.

    Args:
      random_state: Optional seed forwarded to the RandomForestRegressor so
        imputed ages are reproducible across runs. ``None`` (the default)
        leaves the forest unseeded.
    """
    self.rfr = RandomForestRegressor(random_state=random_state)
    self.embarked_encoder = LabelEncoder()
    self.sex_encoder = LabelEncoder()
    # Not used by any method of this class; kept so existing attribute
    # access keeps working.
    self.title_encoder = LabelEncoder()
    self.deck_encoder = LabelEncoder()

  def _label_encode_data(self, df: pd.DataFrame, fit=False) -> pd.DataFrame:
    """Returns a copy of ``df`` with ``Embarked`` and ``Sex`` label-encoded.

    Args:
      df: Frame containing ``Embarked`` and ``Sex`` columns. Not modified.
      fit: When True, the encoders are (re)fitted on ``df`` before encoding.

    Returns:
      A new frame whose ``Embarked`` and ``Sex`` columns hold encoder output.
    """
    if fit:
      self.embarked_encoder.fit(df['Embarked'])
      self.sex_encoder.fit(df['Sex'])

    # Work on a copy and replace whole columns: this leaves the caller's
    # frame untouched and gives the encoded columns the encoder's dtype.
    encoded = df.copy()
    encoded['Embarked'] = self.embarked_encoder.transform(df['Embarked'])
    encoded['Sex'] = self.sex_encoder.transform(df['Sex'])
    return encoded

  def fit(self, df: pd.DataFrame):
    """Trains the regressor (and fits the encoders) on ``df``.

    Rows with a missing value in ``Age`` or in any predictor column are
    dropped before training.

    Args:
      df: Training frame containing ``Age`` and the predictor columns.

    Returns:
      This imputer, so calls can be chained.
    """
    data = df[self._PREDICTORS + [self._TARGET]].dropna()
    X = self._label_encode_data(data[self._PREDICTORS], fit=True)
    y = data[self._TARGET].to_numpy()
    self.rfr.fit(X, y)
    return self

  def transform(self, df:pd.DataFrame) -> pd.DataFrame:
    """Fills missing ``Age`` values with rounded model predictions.

    Args:
      df: Frame containing ``Age`` and the predictor columns. Its ``Age``
        column is updated in place; other columns are left as they are.

    Returns:
      The same ``df`` object.
    """
    # Select by boolean mask rather than index labels so that, with a
    # non-unique index, only rows whose Age is actually missing are predicted
    # and overwritten.
    missing = df[self._TARGET].isnull()
    if missing.any():
      features = self._label_encode_data(df.loc[missing, self._PREDICTORS])
      imputed_ages = self.rfr.predict(features)
      df.loc[missing, self._TARGET] = np.round(imputed_ages)
    return df

  def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Fits the imputer on ``df`` and then imputes its missing ``Age`` values."""
    return self.fit(df).transform(df)
    
    

    

In [16]:
def bin_fare(df: pd.DataFrame, n_bins: int = 13) -> pd.DataFrame:
    """Replaces ``Fare`` with quantile-based bins.

    Bin edges are computed by ``pd.qcut`` from the ``Fare`` values of the
    frame passed in, so two different frames will generally get different
    edges.

    Args:
        df: Frame with a numeric ``Fare`` column. Modified in place.
        n_bins: Number of quantile bins. Defaults to 13.

    Returns:
        The same ``df`` object with ``Fare`` replaced by interval categories.

    Raises:
        ValueError: If the quantile edges are not unique (``pd.qcut`` default
            ``duplicates='raise'``).
    """
    df['Fare'] = pd.qcut(df['Fare'], n_bins)
    return df

In [17]:
def bin_age(df: pd.DataFrame, n_bins: int = 10) -> pd.DataFrame:
    """Replaces ``Age`` with quantile-based bins.

    Bin edges are computed by ``pd.qcut`` from the ``Age`` values of the
    frame passed in, so two different frames will generally get different
    edges.

    Args:
        df: Frame with a numeric ``Age`` column. Modified in place.
        n_bins: Number of quantile bins. Defaults to 10.

    Returns:
        The same ``df`` object with ``Age`` replaced by interval categories.

    Raises:
        ValueError: If the quantile edges are not unique (``pd.qcut`` default
            ``duplicates='raise'``).
    """
    df['Age'] = pd.qcut(df['Age'], n_bins)
    return df

In [18]:
def create_family_size(df: pd.DataFrame) -> pd.DataFrame:
    """Adds a ``Family_Size`` column: ``SibSp + Parch + 1``.

    The value is the raw count, not a binned category (binning it is a
    possible follow-up).

    Concept taken from https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial

    Args:
        df: Frame with ``SibSp`` and ``Parch`` columns. Modified in place.

    Returns:
        The same ``df`` object with ``Family_Size`` added.
    """
    relatives_aboard = df['SibSp'].add(df['Parch'])
    # +1 on top of the SibSp/Parch total.
    df['Family_Size'] = relatives_aboard.add(1)
    return df

In [23]:
def create_ticket_freq(df: pd.DataFrame) -> pd.DataFrame:
    """Adds ``Ticket_Frequency``: how many rows of ``df`` share each ticket.

    Counts are computed within the frame passed in.

    Args:
        df: Frame with a ``Ticket`` column. Modified in place.

    Returns:
        The same ``df`` object with ``Ticket_Frequency`` added.
    """
    tickets_by_value = df.groupby('Ticket')['Ticket']
    rows_per_ticket = tickets_by_value.transform('count')
    df['Ticket_Frequency'] = rows_per_ticket
    return df

In [26]:
# Load the three splits from their configured locations.
train_df, val_df, test_df = (
    pd.read_csv(path) for path in (train_output, val_output, test)
)

# Fit the age imputer on the training split only, then reuse the fitted
# imputer for the validation and test splits.
imputer = MissingAgeImputer()
train_df = imputer.fit_transform(train_df)
val_df, test_df = imputer.transform(val_df), imputer.transform(test_df)

# Feature-engineering steps, applied to every split in this order.
feature_engineering = [
    create_title_feature,
    create_family_size,
    create_deck_feature,
    fill_embarked,
    drop_unused_columns,
    create_ticket_freq,
]
train_df, val_df, test_df = (
    transform_dataset(split, feature_engineering)
    for split in (train_df, val_df, test_df)
)


In [27]:
# Display the training split after age imputation and feature engineering.
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Family_Size,Deck,Ticket_Frequency
0,1,1,male,0.92,1,2,113781,151.5500,S,Master,4,C,4
1,0,3,female,41.00,0,5,3101295,39.6875,S,Married Feminine Title,6,M,5
2,1,1,female,35.00,1,0,19943,90.0000,S,Married Feminine Title,2,C,2
3,0,3,male,47.00,0,0,384461,7.7500,Q,Mr,1,M,1
4,0,3,male,44.00,0,0,364511,8.0500,S,Mr,1,M,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,0,3,female,45.00,0,1,2691,14.4542,C,Married Feminine Title,2,M,1
708,1,2,female,22.00,1,2,SC/Paris 2123,41.5792,C,Married Feminine Title,4,M,2
709,0,1,male,31.00,0,0,PC 17590,50.4958,S,Mr,1,A,1
710,1,1,female,58.00,0,0,113783,26.5500,S,Unmarried Feminine Title,1,C,1
