In [None]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

## Read Data

In [None]:
# Load the CSV into `df`; the path is relative, so 'titanic.csv' must be in the working directory.
df = pd.read_csv('titanic.csv')

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

## Clean Data

In [None]:
# Show every row that contains at least one missing value.
df[df.isna().any(axis=1)]

In [None]:
# Give passengers without a cabin the placeholder 'Z0' so the letter/number
# split applied to 'Cabin' afterwards has something to parse.
df['Cabin'] = df['Cabin'].where(df['Cabin'].notna(), 'Z0')

In [None]:
# Split each cabin code into its leading capital letters and the digits that
# follow. str.extract only uses the first match of the pattern, and either
# group may come back as an empty string.
cabin_parts = df['Cabin'].str.extract('([A-Z]*)([0-9]*)')
df['Cabin'] = cabin_parts[0]
df['Cabin_number'] = cabin_parts[1]

In [None]:
# Turn the extracted digit strings into a real integer column. Entries with no
# digits (empty string) become 0, as before; previously the column mixed the
# int 0 with strings such as '85', leaving it as an object column.
# errors='coerce' maps anything non-numeric to NaN, which fillna(0) then zeroes.
df['Cabin_number'] = (
    pd.to_numeric(df['Cabin_number'], errors='coerce')
    .fillna(0)
    .astype(int)
)

In [None]:
# Inspect the frame after the Cabin column has been split into letter and number.
df

In [None]:
# Re-check after the Cabin clean-up: rows that still contain a missing value.
df[df.isna().any(axis=1)]

In [None]:
# Discard every row that still has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# One-hot encode the three categorical columns to preview the resulting indicator columns.
one_hot_data = pd.get_dummies(data=df.loc[:, ['Sex', 'Cabin', 'Embarked']])

In [None]:
# Preview the indicator columns produced by get_dummies.
one_hot_data.head()

## Format data

In [None]:
def get_one_hot(df,
                categorical_cols=('Cabin', 'Embarked'),
                numeric_cols=('Survived', 'Pclass', 'Age', 'SibSp', 'Parch',
                              'Fare', 'Cabin_number')):
    """Build the modelling table: pass-through columns joined with one-hot columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``numeric_cols`` and
        ``categorical_cols``.
    categorical_cols : str or sequence of str, optional
        Columns passed to ``pd.get_dummies``. The default reproduces the
        original hard-coded pair.
        NOTE(review): 'Sex' is not encoded by default, although the preview
        cell earlier in the notebook does encode it. Confirm whether that
        omission is intentional; pass ``('Sex', 'Cabin', 'Embarked')`` to
        include it.
    numeric_cols : str or sequence of str, optional
        Columns copied through unchanged, in the given order. Later cells
        slice the features with ``iloc[:, 1:]``, so the label column
        ('Survived') must stay first.

    Returns
    -------
    pandas.DataFrame
        ``df[numeric_cols]`` joined on the index with the indicator columns.

    Raises
    ------
    KeyError
        If any requested column is missing from ``df``.
    """
    # A bare string would otherwise be split into characters by list().
    if isinstance(categorical_cols, str):
        categorical_cols = [categorical_cols]
    if isinstance(numeric_cols, str):
        numeric_cols = [numeric_cols]
    categorical_cols = list(categorical_cols)
    numeric_cols = list(numeric_cols)

    missing = [col for col in numeric_cols + categorical_cols
               if col not in df.columns]
    if missing:
        raise KeyError(f"get_one_hot: columns not found in df: {missing}")

    passthrough = df[numeric_cols]
    if not categorical_cols:
        # Nothing to encode; return the pass-through columns on their own.
        return passthrough.copy()

    one_hot_data = pd.get_dummies(df[categorical_cols])
    return passthrough.join(one_hot_data)

In [None]:
# Build the modelling table from the cleaned frame.
df_hot = get_one_hot(df)

In [None]:
# Inspect the encoded table that will be split into train and test sets.
df_hot

## Train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train, test = train_test_split(
    df_hot,
    test_size=0.2,
    shuffle=True,
    random_state=7,
)

## Build model

In [None]:
# Random forest of 1000 depth-limited trees; random_state fixes the seed for repeatable fits.
model = RandomForestClassifier(
    n_estimators=1000,
    max_depth=4,
    random_state=12,
)

In [None]:
# Column 0 of `train` is the 'Survived' label; every remaining column is a feature.
model.fit(X=train.iloc[:, 1:], y=train['Survived'])

## Confusion matrix

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

try:
    # plot_confusion_matrix was removed in scikit-learn 1.2; keep the name
    # importable on older installs without breaking this cell on newer ones.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None


In [None]:
def confusion(model, df):
    """Plot and return the confusion matrix of ``model`` evaluated on ``df``.

    Parameters
    ----------
    model : fitted estimator
        Anything exposing ``predict``.
    df : pandas.DataFrame
        Frame with a 'Survived' column holding the true labels. The feature
        matrix is every column except the first (``df.iloc[:, 1:]``).

    Returns
    -------
    numpy.ndarray
        The matrix returned by ``sklearn.metrics.confusion_matrix`` with the
        true labels as the first argument and the predictions as the second.
    """
    features = df.iloc[:, 1:]
    truth = df['Survived'].values
    # Predict once and reuse the result for both the plot and the matrix.
    predicted = model.predict(features)
    M = confusion_matrix(truth, predicted)
    # ConfusionMatrixDisplay replaces plot_confusion_matrix, which was removed
    # in scikit-learn 1.2. Tick labels are the sorted labels seen in either array.
    labels = np.union1d(truth, predicted)
    ConfusionMatrixDisplay(confusion_matrix=M, display_labels=labels).plot()
    return M

In [None]:
# Confusion matrix of the full forest on the held-out rows.
M = confusion(model, test)

## F1 Score

In [None]:
# Precision: the [1, 1] cell divided by the total of column 1 (everything predicted as class 1).
precision = M[1, 1] / M.sum(axis=0)[1]
precision

In [None]:
# Recall: the [1, 1] cell divided by the total of row 1 (everything truly in class 1).
recall = M[1, 1] / M.sum(axis=1)[1]
recall

In [None]:
# F1 computed by hand as 2PR / (P + R).
F1 = 2 * precision * recall / (precision + recall)
F1

In [None]:
from sklearn.metrics import f1_score

In [None]:
def get_f1(model, df):
    """Return the F1 score of ``model`` on ``df``.

    ``df['Survived']`` supplies the true labels; the features are every column
    of ``df`` except the first.
    """
    y_true = df['Survived'].values
    y_pred = model.predict(df.iloc[:, 1:])
    return f1_score(y_true, y_pred)
    

In [None]:
# Same metric via get_f1 (scikit-learn's f1_score); this rebinds the F1 computed by hand earlier.
F1 = get_f1(model, test)
F1

In [None]:
# Index of the single tree in the forest with the highest F1 on `test`.
# NOTE(review): the tree is selected on the same `test` frame it is scored on
# afterwards, so that score is an optimistic estimate.
n = np.argmax([
    get_f1(single_tree, test)
    for single_tree in model.estimators_
])
n

In [None]:
# Confusion matrix of the selected tree; note this rebinds M, which previously held the forest's matrix.
M = confusion(model.estimators_[n], test)

In [None]:
# F1 of the selected tree on the held-out rows.
get_f1(model.estimators_[n], test)

## Best Features

In [None]:
from sklearn.inspection import permutation_importance
# Permutation importance of the selected tree on the held-out rows (10 shuffles per feature).
result = permutation_importance(
    model.estimators_[n],
    test.iloc[:, 1:],
    test['Survived'],
    n_repeats=10,
    random_state=7,
)
# Ascending order: the most important features end up last.
perm_sorted_idx = np.argsort(result.importances_mean)
# NOTE(review): this ranking comes from the whole forest (`model`), not the
# selected tree, and is not used by the cells that follow; confirm it is needed.
tree_importance_sorted_idx = model.feature_importances_.argsort()

In [None]:
# Box-plot the permutation importances of the ten highest-ranked features
# (the index is sorted ascending, so these are the last ten entries).
plt.boxplot(
    result.importances[perm_sorted_idx[-10:]].T,
    vert=False,
    labels=test.iloc[:, 1:].columns[perm_sorted_idx[-10:]],
);

## Plot tree

In [None]:
from sklearn import tree
# Draw the selected tree on a large canvas so the node text stays legible.
plt.figure(figsize=(20, 20))
tree.plot_tree(model.estimators_[n],
               # Pass a plain list: some scikit-learn releases validate
               # feature_names strictly and reject a pandas Index.
               feature_names=list(train.iloc[:, 1:].columns),
               class_names=['no', 'yes'],
               filled=True);

In [None]:
# Persist both splits to the working directory; the row index is written as the first column.
train.to_csv('train.csv', index=True)
test.to_csv('test.csv', index=True)