In [1]:
from pathlib import Path
import pandas as pd

titanic_train = pd.read_csv(Path("datasets/train.csv"))
titanic_test = pd.read_csv(Path("datasets/test.csv"))

In [2]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data Dictionary
| **Variable** | **Definition** | **Key** |
| --- | --- | --- |
| survival | Survival | 0 = No, 1 = Yes |
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex |     |
| Age | Age in years |     |
| sibsp | \# of siblings / spouses aboard the Titanic |     |
| parch | \# of parents / children aboard the Titanic |     |
| ticket | Ticket number |     |
| fare | Passenger fare |     |
| cabin | Cabin number |     |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |

In [3]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Notes

Some rows have missing values in the Age, Cabin and Embarked columns.

In [4]:
corr_matrix = titanic_train.corr(numeric_only=True)
corr_matrix["Survived"].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

## Creating new features (experiment)

Age categories

In [5]:
tt = titanic_train.copy()
tt['isKid'] = tt['Age'] < 18
tt['isAdult'] = (tt['Age'] >= 18) & (tt['Age'] < 60)
tt['isSenior'] = tt['Age'] >= 60

Additional numerical categories

In [6]:
tt['FamilySize']=tt['SibSp'] + tt['Parch']
tt['FarePerPerson']=tt['Fare']/(tt['FamilySize']+1)

Titles

In [7]:
def substrings_in_string(big_string, substrings):
    if isinstance(big_string, str):
        for substring in substrings:
            if substring in big_string:
                return substring
    return None
    
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']
tt['Title']=tt['Name'].map(lambda x: substrings_in_string(x, title_list))
 
#replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
        
tt['Title']=tt.apply(replace_titles, axis=1)

In [8]:
#Turning cabin number into Deck
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
tt['Deck']=substrings_in_string(tt['Cabin'], cabin_list)

One-hot encoding of gender and class

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Instantiate encoders for each column
encoder_sex = OneHotEncoder()
encoder_pclass = OneHotEncoder()
encoder_emb = OneHotEncoder()

# Fit and transform each categorical column
one_hot_encoded_sex = encoder_sex.fit_transform(tt[['Sex']])
one_hot_encoded_pclass = encoder_pclass.fit_transform(tt[['Pclass']])
one_hot_encoded_emb = encoder_emb.fit_transform(tt[['Embarked']])

# Convert to dense arrays if needed
one_hot_array_sex = one_hot_encoded_sex.toarray()
one_hot_array_pclass = one_hot_encoded_pclass.toarray()
one_hot_array_emb = one_hot_encoded_emb.toarray()

# Create new DataFrames with one-hot encoded columns
one_hot_df_sex = pd.DataFrame(one_hot_array_sex, columns=encoder_sex.get_feature_names_out(['Sex']))
one_hot_df_pclass = pd.DataFrame(one_hot_array_pclass, columns=encoder_pclass.get_feature_names_out(['Pclass']))
one_hot_df_emb = pd.DataFrame(one_hot_array_emb, columns=encoder_emb.get_feature_names_out(['Embarked']))

# Concatenate the new DataFrames with the original DataFrame
result_df = pd.concat([tt, one_hot_df_sex, one_hot_df_pclass, one_hot_df_emb], axis=1)

In [11]:
result_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Deck,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Data preparation

Separate the predictors and the labels

In [47]:
tt = titanic_train.drop("Survived", axis=1)
tt_labels = titanic_train["Survived"].copy()

### Handling empty values

Some rows have missing values in the Age, Cabin and Embarked columns.

If Cabin is missing, this means that the passenger is not from the 1st class, so this data should be kept.

There are only two rows without Embarked value, we can drop them.

For the Age (100+ empty values) filling in with median value will probably be a good idea.

### Creating data processing pipeline

Drop columns and rows

In [33]:
from sklearn.pipeline import FunctionTransformer, make_pipeline

# Define FunctionTransformers
dropPassIdFTransformer = FunctionTransformer(lambda df: df.drop(columns=['PassengerId'], axis=1))
dropNaEmbarkedFTransformer = FunctionTransformer(lambda df: df.dropna(subset=["Embarked"]))

Dividing columns into numerical and categorial

In [34]:
numerical_columns = ['SibSp', 'Parch', 'Fare']
categorial_columns = ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

Impute missing values

In [35]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

Create new features

In [86]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

sibsp_ix, parch_ix, fare_ix = 0, 1, 2
class CalculateRatiosAndSums(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        family_size = X[:, sibsp_ix] + X[:, parch_ix]
        fare_per_person = X[:, fare_ix]/(family_size + 1)
        return np.c_[X, family_size, fare_per_person]

In [87]:
def substrings_in_string(big_string, substrings):
    if isinstance(big_string, str):
        for substring in substrings:
            if substring in big_string:
                return substring
    return None
    
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']
 
#replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
        


class CreateNewCatColumns(BaseEstimator, TransformerMixin):
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Custom logic to create new columns
        X['Deck'] = substrings_in_string(X['Cabin'], cabin_list)
        
        X['Title'] = X['Name'].map(lambda n: substrings_in_string(n, title_list))
        X['Title']=X.apply(replace_titles, axis=1)
        
        return X[['Deck', 'Title']]

class AgeToCategory(BaseEstimator, TransformerMixin):
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):

        X['isKid'] = X['Age'] < 18
        X['isAdult'] = (X['Age'] >= 18) & (X['Age'] < 60)
        X['isSenior'] = X['Age'] >= 60
        
        return X[['isKid', 'isAdult', 'isSenior']]


One-hot encoding

In [88]:
# List of columns to one-hot encode
one_hot_columns = ['Sex', 'Pclass', 'Embarked']

Assemble the pipelines

In [89]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("calc_ratios", CalculateRatiosAndSums()),
    ("standardize", StandardScaler())
])

cat_pipeline = Pipeline([
    ('new_columns', CreateNewCatColumns()),
    ('drop_after_new_columns', FunctionTransformer(lambda df: df.drop(columns=['Deck', 'Name'], axis=1))),
    ('drop_na', FunctionTransformer(lambda df: df.dropna(subset=["Embarked"]))),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'), one_hot_columns),
    ('drop_after_one_hot', FunctionTransformer(lambda df: df.drop(columns=one_hot_columns, axis=1)))
])

preprocessing = ColumnTransformer([
    ("num", num_pipeline, numerical_columns),
    ("cat", cat_pipeline, categorial_columns),
    ("drop_passenger_id", FunctionTransformer(lambda df: df.drop(columns=['PassengerId'], axis=1)), ["PassengerId"]),
    ("age_to_category", AgeToCategory(), ['Age'])
])

In [90]:
tt.info()
titanic_prepared = preprocessing.fit_transform(tt)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


ValueError: too many values to unpack (expected 2)