## Imports

In [39]:
import pandas as pd
import numpy as np
import seaborn as sns

In [55]:
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.svm import LinearSVC
from sklearn import set_config; set_config(display='diagram')

## Data loading

In [56]:
# Folder holding the CSV files, relative to the notebook's working directory.
# The same prefix is reused further down for the test set and the submission file.
path = '../raw_data/'

# Training set
train_csv = path + 'train.csv'
data = pd.read_csv(train_csv)


In [57]:
# Peek at the first five rows of the training set
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Exploration and cleaning

In [58]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [59]:
# Number of missing values per column
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [60]:
# Rows where the passenger's age is missing
data.loc[data['Age'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


### Cleaning

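> Let's impute the median for the missing ages, min-max scale the numerical features, ordinal encode "Pclass", one-hot encode the categorical features (after filling the missing "Embarked" values with the most frequent one), and drop the "Cabin", "Ticket", "Name" and "PassengerId" columns.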

In [61]:
# Feature groups: each list is routed to its own transformer below
num_features = ['Age', 'Fare', 'SibSp', 'Parch']
cat_features = ['Sex', 'Embarked']
label_features = ['Pclass']

# Numerical features: fill missing values with the median, then rescale with MinMaxScaler
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Pclass: ordinal encode the class labels
ordinal_transformer = OrdinalEncoder()

# Categorical features: fill missing values (e.g. "Embarked") with the most frequent
# category, then one-hot encode. Categories unseen during fit are ignored at
# transform time instead of raising (handle_unknown='ignore').
# NOTE(review): `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4 - switch the keyword when the environment is upgraded.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])

# Apply each transformer to its own column group; the outputs are concatenated in
# the order listed here (numerical, Pclass, one-hot columns).
# remainder='drop' is the default, spelled out to make explicit that every column
# not listed in the feature groups above is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, num_features),
        ('label_transformer', ordinal_transformer, label_features),
        ('cat_transformer', cat_transformer, cat_features),
    ],
    remainder='drop',
)

In [62]:
# Render the preprocessing graph (diagram display is enabled via set_config in the imports cell)
preprocessor

In [29]:
# Separate the target column from the feature columns
y = data['Survived']
X = data.drop(columns=['Survived'])

In [30]:
X = preprocessor.fit_transform(X) # fit and transform the preprocessor

In [31]:
# Get the name of the categorical columns from the one hot encoder, it's deep down there
cat_cols = list(preprocessor.transformers_[-2][1].steps[-1][-1].get_feature_names_out())

# Make a DataFrame out of the transformed X, with nice column names
X = pd.DataFrame(X, columns=list(num_features) + label_features + cat_cols)

In [32]:
X.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Pclass,x0_female,x0_male,x1_C,x1_Q,x1_S
0,0.271174,0.014151,0.125,0.0,2.0,0.0,1.0,0.0,0.0,1.0
1,0.472229,0.139136,0.125,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.321438,0.015469,0.0,0.0,2.0,1.0,0.0,0.0,0.0,1.0
3,0.434531,0.103644,0.125,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.434531,0.015713,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0


## Model selection

In [67]:
# Full modelling pipeline: preprocessing followed by the classifier.
pipe = Pipeline([
    ('preprocessing', preprocessor),
    # LinearSVC() was the estimator tried in this slot before the random forest.
    # A fixed random_state makes the cross-validation score and the grid search
    # below reproducible from one run to the next.
    ('ensemble_estimator', RandomForestClassifier(random_state=42)),
])

pipe

In [68]:
# The pipeline does its own preprocessing, so it must be fed the raw feature columns
X = data.drop(columns=['Survived'])

In [69]:
# Accuracy of the pipeline on each of the 5 cross-validation folds, then averaged
fold_accuracies = cross_val_score(pipe, X, y, scoring='accuracy', cv=5)
fold_accuracies.mean()

0.813709120582512

In [70]:
## Hyperparameter tuning

In [72]:
# List every parameter of the pipeline; nested ones are addressed as `<step name>__<parameter>`,
# which is the form used for the keys of the grid search below
pipe.get_params()

{'memory': None,
 'steps': [('preprocessing',
   ColumnTransformer(transformers=[('num_transformer',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('scaler', MinMaxScaler())]),
                                    ['Age', 'Fare', 'SibSp', 'Parch']),
                                   ('label_transformer', OrdinalEncoder(),
                                    ['Pclass']),
                                   ('cat_transformer',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('ohe',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                  

In [76]:
# Search space for the random forest step of the pipeline.
# Keys follow the `<pipeline step>__<parameter>` convention.
rf_param_grid = {
    'ensemble_estimator__ccp_alpha': [0.0, 0.1, 0.5],
    'ensemble_estimator__min_impurity_decrease': [0.0, 0.1, 0.5],
    'ensemble_estimator__min_samples_leaf': [1, 2],
    'ensemble_estimator__min_samples_split': [2, 4],
    'ensemble_estimator__n_estimators': [100, 200, 500],
}

# Instantiate the grid search: every combination is scored by 5-fold CV accuracy
grid_search = GridSearchCV(
    pipe,
    param_grid=rf_param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,  # 108 candidates x 5 folds are independent fits: run them on all cores
)

grid_search.fit(X, y)
grid_search.best_params_

{'ensemble_estimator__ccp_alpha': 0.0,
 'ensemble_estimator__min_impurity_decrease': 0.0,
 'ensemble_estimator__min_samples_leaf': 2,
 'ensemble_estimator__min_samples_split': 2,
 'ensemble_estimator__n_estimators': 200}

In [82]:
# Display the best pipeline found by the grid search
grid_search.best_estimator_

In [77]:
# Keep a handle on the best pipeline; it is used below to predict on the test set
tuned_pipe = grid_search.best_estimator_

## Generate submission file

In [79]:
# Load the test passengers, predict with the tuned pipeline, and keep only the
# identifier and the prediction for the submission file
X_test = pd.read_csv(path + 'test.csv')
X_test = X_test.assign(Survived=tuned_pipe.predict(X_test))
X_test = X_test.loc[:, ['PassengerId', 'Survived']]
X_test.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [80]:
# Write the predictions next to the raw data, without the DataFrame index column
submission_file = path + 'submission.csv'
X_test.to_csv(submission_file, index=False)

In [81]:
!kaggle competitions submit -c titanic -f ../raw_data/submission.csv -m "RandomForestClassifier"

100%|████████████████████████████████████████| 2.77k/2.77k [00:05<00:00, 557B/s]
Successfully submitted to Titanic - Machine Learning from Disaster

## Baseline tracking

In [83]:
# Scores recorded for each model tried so far (presumably the Kaggle leaderboard
# score returned after each submission - confirm). Kept in a dict so earlier
# results are not lost when a new baseline is recorded.
baseline_scores = {
    'LinearSVC': 0.76555,  # nothing fancy
    'RandomForestClassifier(min_samples_leaf=2, n_estimators=200)': 0.77272,
}

# The current baseline is the most recent model
current_baseline = 'RandomForestClassifier(min_samples_leaf=2, n_estimators=200)'
baseline_score = baseline_scores[current_baseline]
baseline_score

0.77272