In [27]:
from sklearn.datasets import fetch_openml
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone, BaseEstimator
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from pathlib import Path
import pandas as pd
from pandas import DataFrame


In [19]:
ROOT = Path.cwd()

DATA_DIR = ROOT.parent.parent / 'datasets' / 'titanic'

df = pd.read_csv(DATA_DIR / 'titanic3.csv')

**Titanic** dataset dictionary:

- `pclass`: Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
- `survived`: Survival (0 = No, 1 = Yes)
- `name`: Name
- `sex`: Gender
- `age`: Age
- `sibsp`: Number of siblings/spouses aboard
- `parch`: Number of parents/children aboard
- `ticket`: Ticket number
- `fare`: Passenger fare
- `cabin`: Cabin
- `embarked`: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
- `boat`: Lifeboat
- `body`: Body Identification Number
- `home.dest`: Home/destination

In [20]:
display(df)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.00,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.00,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.00,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.00,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.50,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.50,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.00,0,0,2670,7.2250,,C,,,


In [21]:
nan_values : DataFrame = df.isnull().sum()

nan_values

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [22]:
df['ticket'].nunique()

929

In [23]:
filtered_df : DataFrame = df.drop(columns=[
    'name',
    'ticket',
    'cabin',
    'boat',
    'body',
    'home.dest'
])

In [24]:
display(
    filtered_df
)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.00,0,0,211.3375,S
1,1,1,male,0.92,1,2,151.5500,S
2,1,0,female,2.00,1,2,151.5500,S
3,1,0,male,30.00,1,2,151.5500,S
4,1,0,female,25.00,1,2,151.5500,S
...,...,...,...,...,...,...,...,...
1304,3,0,female,14.50,1,0,14.4542,C
1305,3,0,female,,1,0,14.4542,C
1306,3,0,male,26.50,0,0,7.2250,C
1307,3,0,male,27.00,0,0,7.2250,C


In [25]:
filtered_df['sex'] = filtered_df['sex'].map({'male': 0, 'female': 1})
filtered_df['embarked'] = filtered_df['embarked'].map({'S': 0, 'C': 1, 'Q': 2})

In [26]:
filtered_df.isna().sum()

pclass        0
survived      0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [28]:
si = SimpleImputer(strategy='mean')

filtered_df['age'] = si.fit_transform(filtered_df[['age']])
filtered_df['fare'] = si.fit_transform(filtered_df[['fare']])
filtered_df['embarked'] = si.fit_transform(filtered_df[['embarked']])

In [29]:
X = filtered_df.drop(columns=['survived'])
y = filtered_df['survived']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
param_grid = [
    {
        'n_estimators': [10, 100, 200],
        'max_features': [2, 4, 6, 8],
        'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
        'bootstrap': [False, True]
    }
]

In [32]:
rf_clf = RandomForestClassifier()

grid_search = GridSearchCV(rf_clf, param_grid, cv=5, scoring='accuracy', return_train_score=True)

In [33]:
grid_search.fit(X_train, y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [34]:
print(grid_search.best_params_)

{'bootstrap': True, 'max_depth': 10, 'max_features': 6, 'n_estimators': 200}


In [35]:
best_params = grid_search.best_params_

rf_clf = RandomForestClassifier(**best_params)

In [36]:
rf_clf.fit(X_train, y_train)

In [37]:
y_pred = rf_clf.predict(X_test)

In [39]:
confusion_matrix(y_test, y_pred)

array([[130,  14],
       [ 39,  79]])

In [40]:
print(
    f'precision: {precision_score(y_test, y_pred)}\n'
    f'recall: {recall_score(y_test, y_pred)}\n'
    f'f1: {f1_score(y_test, y_pred)}'
)

precision: 0.8494623655913979
recall: 0.6694915254237288
f1: 0.7488151658767772
