In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
import re

2023-08-19 16:10:37.765801: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df_train = pd.read_csv('./train.csv')
#df_train.head(10)

In [3]:
# divide train ftrs and labels
X_train = df_train
y_train = np.array(df_train.Survived)
print(len(X_train), y_train.shape)

891 (891,)


In [4]:
# compute additional ftrs
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if big_string.find(substring) != -1:
            return substring
    return np.nan

# extract Title from name
def preprocess(df):
    title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                'Don', 'Jonkheer']
    df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))
    # extract deck and nr of cabin from Cabin
    cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    df.Cabin = df.Cabin.fillna('Unknown')
    df['Deck'] = df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
    df['CabinNr'] = df['Cabin'].apply(lambda x: re.search(r'\d+', x).group() if re.search(r'\d+', x) else 0)
    # compute family size
    df['FamilySize'] = df.SibSp * df.Parch
    return df
X_train = preprocess(X_train)
#X_train.head(15)

In [5]:
# drop a few column that i do not need anymore
cols2drop = ['Survived', 'Name', 'Cabin']
X_train = X_train.drop(labels=cols2drop, axis=1)
#X_train.head(10)

In [6]:
df_test =  pd.read_csv('./test.csv')
# should apply all the transformation also to the test set
X_test = preprocess(df_test)
X_test = X_test.drop(labels=['Name', 'Cabin'], axis=1)
X_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Deck,CabinNr,FamilySize
0,892,3,male,34.5,0,0,330911,7.8292,Q,Mr,Unknown,0,0
1,893,3,female,47.0,1,0,363272,7.0,S,Mrs,Unknown,0,0
2,894,2,male,62.0,0,0,240276,9.6875,Q,Mr,Unknown,0,0
3,895,3,male,27.0,0,0,315154,8.6625,S,Mr,Unknown,0,0
4,896,3,female,22.0,1,1,3101298,12.2875,S,Mrs,Unknown,0,1


In [7]:
# to encode categorical columns to ordinal/one-hot (sex, ticket, embarked, title, deck, cabin nr)
ord_enc = OrdinalEncoder()
simple_imp = SimpleImputer(strategy='median')

X_train['Sex'] = ord_enc.fit_transform(X_train[['Sex']])
X_test['Sex'] = ord_enc.transform(X_test[['Sex']])

X_train['Age'] = simple_imp.fit_transform(X_train[['Age']])
X_test['Age'] = simple_imp.transform(X_test[['Age']])

X_train['Embarked'] = X_train['Embarked'].fillna('Unknown')
X_test['Embarked'] = X_test['Embarked'].fillna('Unknown')

X_train['Embarked'] = ord_enc.fit_transform(X_train[['Embarked']])
X_test['Embarked'] = ord_enc.transform(X_test[['Embarked']])

X_train['Title'] = ord_enc.fit_transform(np.array(X_train.Title).reshape(-1,1))
X_test['Title'] = ord_enc.transform(np.array(X_test.Title).reshape(-1,1))

simple_imp.fit(X_train[['Fare']]) # only fit bc in train set i do not have any missing values
X_test['Fare'] = simple_imp.transform(X_test[['Fare']])

X_train['Deck'] = ord_enc.fit_transform(np.array(X_train.Deck).reshape(-1,1)) # Deck == 8 is no deck
X_test['Deck'] = ord_enc.transform(np.array(X_test.Deck).reshape(-1,1)) # Deck == 8 is no deck

X_train['CabinNr'] = X_train['CabinNr'].astype('int64')
X_test['CabinNr'] = X_test['CabinNr'].astype('int64') 
#X_train['CabinNr'] = ord_enc.fit_transform(np.array(X_train.CabinNr).reshape(-1,1)) # CabinNr == 0 is no Cabin

In [8]:
#X_train['Ticket'] = ord_enc.fit_transform(np.array(X_train.Ticket).reshape(-1,1))
#X_test['Ticket'] = ord_enc.transform(np.array(X_test.Ticket).reshape(-1,1))
# i ticket ovviamente sono diversi quindi non riesce a creare un mapping reale, quindi li droppo
X_train = X_train.drop(labels=['Ticket'], axis=1)
X_test = X_test.drop(labels=['Ticket'], axis=1)
X_test.head()


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Deck,CabinNr,FamilySize
0,892,3,1.0,34.5,0,0,7.8292,1.0,11.0,8.0,0,0
1,893,3,0.0,47.0,1,0,7.0,2.0,12.0,8.0,0,0
2,894,2,1.0,62.0,0,0,9.6875,1.0,11.0,8.0,0,0
3,895,3,1.0,27.0,0,0,8.6625,2.0,11.0,8.0,0,0
4,896,3,0.0,22.0,1,1,12.2875,2.0,12.0,8.0,0,1


In [9]:
# first we try sklearn trees
from sklearn import tree, ensemble
dtc = tree.DecisionTreeClassifier(random_state=0, max_depth=50)
dtc_cv_score = cross_val_score(dtc, X_train, y_train, cv=10).mean()
print('DecisionTreeClassifier CV score: ', dtc_cv_score)

rfc = ensemble.RandomForestClassifier(n_estimators=50, max_depth=100, random_state=0)
rfc_cv_score = cross_val_score(rfc, X_train, y_train, cv=10).mean()
print('RandonForestClassiifier: ', rfc_cv_score.mean())

gbc = ensemble.GradientBoostingClassifier(learning_rate=0.01, n_estimators=25, max_depth=25, random_state=0)
gbc_cv_score = cross_val_score(gbc, X_train, y_train, cv=10).mean()
print(f'GradientBoostingClassifier: ', gbc_cv_score)

DecisionTreeClassifier CV score:  0.7519475655430712
RandonForestClassiifier:  0.8204369538077403
GradientBoostingClassifier:  0.762059925093633


In [11]:
# RandomForest seems to works better so we fit on it
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [45]:
new_df = pd.DataFrame({'Survived': y_pred})

# Concatenate the new column DataFrame with the existing DataFrame
result_df = pd.concat([df_test['PassengerId'], new_df], axis=1)
result_df[['PassengerId', 'Survived']].to_csv('./submission.csv', index=False)

Kaggle competition **Titanic - Machine Learning from Disaster**

by submitting the results contained in 'submission.csv' i get almost 74% of accuracy