In [213]:
from pathlib import Path
import pandas as pd

titanic_train = pd.read_csv(Path("datasets/train.csv"))
titanic_test = pd.read_csv(Path("datasets/test.csv"))

In [214]:
titanic_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data Dictionary
| **Variable** | **Definition** | **Key** |
| --- | --- | --- |
| survival | Survival | 0 = No, 1 = Yes |
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex |     |
| Age | Age in years |     |
| sibsp | \# of siblings / spouses aboard the Titanic |     |
| parch | \# of parents / children aboard the Titanic |     |
| ticket | Ticket number |     |
| fare | Passenger fare |     |
| cabin | Cabin number |     |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |

In [215]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Notes

Some rows have missing values in the Age, Cabin and Embarked columns.

In [216]:
corr_matrix = titanic_train.corr(numeric_only=True)
corr_matrix["Survived"].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

## Creating new features (experiment)

Age categories

In [217]:
tt = titanic_train.copy()
tt['isKid'] = tt['Age'] < 18
tt['isAdult'] = (tt['Age'] >= 18) & (tt['Age'] < 60)
tt['isSenior'] = tt['Age'] >= 60

Additional numerical categories

In [218]:
tt['FamilySize']=tt['SibSp'] + tt['Parch']
tt['FarePerPerson']=tt['Fare']/(tt['FamilySize']+1)

Titles

In [219]:
def substrings_in_string(big_string, substrings):
    if isinstance(big_string, str):
        for substring in substrings:
            if substring in big_string:
                return substring
    return None
    
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']
tt['Title']=tt['Name'].map(lambda x: substrings_in_string(x, title_list))
 
#replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
        
tt['Title']=tt.apply(replace_titles, axis=1)

In [220]:
def substrings_in_string_or_unknown(big_string, substrings):
    if isinstance(big_string, str):
        for substring in substrings:
            if substring in big_string:
                return substring
    return 'Unknown'
    
#Turning cabin number into Deck
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
tt['Deck']=tt['Cabin'].map(lambda x: substrings_in_string_or_unknown(x, cabin_list))

One-hot encoding of gender and class

In [221]:
from sklearn.preprocessing import OneHotEncoder

# Instantiate encoders for each column
encoder_sex = OneHotEncoder()
encoder_pclass = OneHotEncoder()
encoder_emb = OneHotEncoder()

# Fit and transform each categorical column
one_hot_encoded_sex = encoder_sex.fit_transform(tt[['Sex']])
one_hot_encoded_pclass = encoder_pclass.fit_transform(tt[['Pclass']])
one_hot_encoded_emb = encoder_emb.fit_transform(tt[['Embarked']])

# Convert to dense arrays if needed
one_hot_array_sex = one_hot_encoded_sex.toarray()
one_hot_array_pclass = one_hot_encoded_pclass.toarray()
one_hot_array_emb = one_hot_encoded_emb.toarray()

# Create new DataFrames with one-hot encoded columns
one_hot_df_sex = pd.DataFrame(one_hot_array_sex, columns=encoder_sex.get_feature_names_out(['Sex']))
one_hot_df_pclass = pd.DataFrame(one_hot_array_pclass, columns=encoder_pclass.get_feature_names_out(['Pclass']))
one_hot_df_emb = pd.DataFrame(one_hot_array_emb, columns=encoder_emb.get_feature_names_out(['Embarked']))

# Concatenate the new DataFrames with the original DataFrame
result_df = pd.concat([tt, one_hot_df_sex, one_hot_df_pclass, one_hot_df_emb], axis=1)

In [222]:
result_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Deck,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,Unknown,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,C,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,Unknown,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,C,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,Unknown,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Data preparation

Separate the predictors and the labels

In [223]:
tt = titanic_train.dropna(subset=["Embarked"])
tt_labels = tt["Survived"].copy()
tt = tt.drop("Survived", axis=1)

### Handling empty values

Some rows have missing values in the Age, Cabin and Embarked columns.

If Cabin is missing, this means that the passenger is not from the 1st class, so this data should be kept.

There are only two rows without Embarked value, we can drop them.

For the Age (100+ empty values) filling in with median value will probably be a good idea.

### Creating data processing pipeline

Drop columns and rows

In [224]:
from sklearn.pipeline import FunctionTransformer, make_pipeline

# Define FunctionTransformers
dropPassIdFTransformer = FunctionTransformer(lambda df: df.drop(columns=['PassengerId'], axis=1))
dropNaEmbarkedFTransformer = FunctionTransformer(lambda df: df.dropna(subset=["Embarked"]))

Dividing columns into numerical and categorial

In [225]:
numerical_columns = ['SibSp', 'Parch', 'Fare']
categorial_columns = ['Pclass', 'Name', 'Sex', 'Cabin', 'Embarked']

Impute missing values

In [226]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

Create new features

In [227]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

sibsp_ix, parch_ix, fare_ix = 0, 1, 2
class CalculateRatiosAndSums(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        family_size = X[:, sibsp_ix] + X[:, parch_ix]
        fare_per_person = X[:, fare_ix]/(family_size + 1)
        return np.c_[X, family_size, fare_per_person]

    def get_feature_names_out(self, input_features=numerical_columns):
        # Assuming input_features is a list of input feature names
        output_features = np.append(input_features, ['FamilySize', 'FarePerPerson'])
        return output_features

In [228]:
def substrings_in_string(big_string, substrings):
    if isinstance(big_string, str):
        for substring in substrings:
            if substring in big_string:
                return substring
    return None
    
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']
 
#replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
        


class CreateNewCatColumns(BaseEstimator, TransformerMixin):
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Custom logic to create new columns
        X['Deck'] = X['Cabin'].map(lambda n: substrings_in_string_or_unknown(n, cabin_list))
        
        X['Title'] = X['Name'].map(lambda n: substrings_in_string(n, title_list))
        X['Title']=X.apply(replace_titles, axis=1)
        return X
        
    def get_feature_names_out(self, input_features=categorial_columns):
        # Assuming input_features is a list of input feature names
        output_features = np.append(input_features, ['Deck', 'Title'])
        return output_features

class AgeToCategory(BaseEstimator, TransformerMixin):
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):

        X['isKid'] = X['Age'] < 18
        X['isAdult'] = (X['Age'] >= 18) & (X['Age'] < 60)
        X['isSenior'] = X['Age'] >= 60
        
        return X[['isKid', 'isAdult', 'isSenior']]


One-hot encoding

In [247]:
from sklearn.compose import ColumnTransformer
# List of columns to one-hot encode
one_hot_columns = ['Sex', 'Pclass', 'Embarked', 'Title']
# Creating a ColumnTransformer for one-hot encoding
one_hot_preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot_deck', OneHotEncoder(categories=[cabin_list], sparse_output=False), ['Deck']),
        ('one_hot', OneHotEncoder(sparse_output=False), one_hot_columns),
        ('drop_after_one_hot', FunctionTransformer(lambda df: df.drop(columns=['Sex', 'Pclass', 'Embarked', 'Title', 'Deck'], axis=1)), ['Sex', 'Pclass', 'Embarked', 'Title', 'Deck'])
    ],
    remainder='passthrough'
)

Assemble the pipelines

In [248]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("calc_ratios", CalculateRatiosAndSums()),
    ("standardize", StandardScaler())
])

cat_pipeline = Pipeline([
    ('new_columns', CreateNewCatColumns()),
    ('drop_after_new_columns', FunctionTransformer(lambda df: df.drop(columns=['Cabin', 'Name']))),
    ('one_hot', one_hot_preprocessor),
])

preprocessing = ColumnTransformer([
    ("num", num_pipeline, numerical_columns),
    ("cat", cat_pipeline, categorial_columns),
    ("drop_passenger_id", FunctionTransformer(lambda df: df.drop(columns=['PassengerId'], axis=1)), ["PassengerId"]),
    ("drop_ticket", FunctionTransformer(lambda df: df.drop(columns=['Ticket'], axis=1)), ["Ticket"]),
    ("age_to_category", AgeToCategory(), ['Age']),
    ("drop_age", FunctionTransformer(lambda df: df.drop(columns=['Age'], axis=1)), ["Age"]),
])

In [249]:
titanic_prepared = preprocessing.fit_transform(tt)

# original column names
column_names = ['num__SibSp', 'num__Parch', 'num__Fare', 'num_FamilySize', 'num_FarePerPerson', 'cat__one_hot__Sex_female', 'cat__one_hot__Sex_male', 'cat__one_hot__Pclass_1', 'cat__one_hot__Pclass_2', 'cat__one_hot__Pclass_3', 'cat__one_hot__Embarked_C', 'cat__one_hot__Embarked_Q', 'cat__one_hot__Embarked_S', 'cat__remainder__Deck', 'cat__remainder__Title']

# shortened column names
column_names = ['SibSp', 'Parch', 'Fare', 'FamilySize', 'FarePerPerson', 'female', 'male', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Deck', 'Title', 'isKid', 'isAdult', 'isSenior']
# extra code – shows that we can get a DataFrame out if we want
titanic_prepared_fr = pd.DataFrame(
    titanic_prepared,
    # columns = column_names, 
    index=tt.index)
titanic_prepared_fr.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,0.43135,-0.474326,-0.50024,0.057853,-0.45195,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.43135,-0.474326,0.788947,0.057853,0.443677,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [250]:
def prepare_data(dataset):
    cleaned_ds = dataset.dropna(subset=["Embarked"])
    labels = cleaned_ds["Survived"].copy()
    ds_features = cleaned_ds.drop("Survived", axis=1)
    ds_prepared = preprocessing.fit_transform(ds_features)
    return ds_prepared, labels

In [251]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold


titanic_train_x, titanic_train_y = prepare_data(titanic_train)
X_train, X_test, y_train, y_test = train_test_split(titanic_train_x, titanic_train_y, test_size=0.2, random_state=42)
print(X_train.shape)
print(y_train.shape)

forest_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
forest_model.fit(X_train, y_train)

y_pred = forest_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(forest_model, X_test, y_test, cv=cv_strategy, scoring='accuracy')

# Print accuracy scores
print("Accuracy scores for each fold:", scores)

# Print mean and standard deviation of accuracy
print(f"Mean Accuracy: {scores.mean():.4f}")
print(f"Standard Deviation: {scores.std():.4f}")

(711, 29)
(711,)
Accuracy: 0.8089887640449438
Accuracy scores for each fold: [0.72222222 0.91666667 0.69444444 0.82857143 0.82857143]
Mean Accuracy: 0.7981
Standard Deviation: 0.0805


In [252]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_test, y_test, cv=cv_strategy, scoring='accuracy')

# Print accuracy scores
print("Accuracy scores for each fold:", scores)

# Print mean and standard deviation of accuracy
print(f"Mean Accuracy: {scores.mean():.4f}")
print(f"Standard Deviation: {scores.std():.4f}")

Accuracy: 0.8202247191011236
Accuracy scores for each fold: [0.72222222 0.88888889 0.69444444 0.82857143 0.77142857]
Mean Accuracy: 0.7811
Standard Deviation: 0.0706


### Make prediction

In [253]:
def prepare_data(dataset):
    # ds_features = dataset.drop("Embarked", axis=1)
    ds_prepared = preprocessing.fit_transform(dataset)
    return ds_prepared
    
titanic_test_x = prepare_data(titanic_test)
print(titanic_test_x.shape)
predictions = forest_model.predict(titanic_test_x)

output = pd.DataFrame({'PassengerId': titanic_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)

(418, 29)
