In [1]:
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report
from random import seed    # Set random seed for reproducible results

In [2]:
seed(110717)    # Set the seed
titanic = pd.read_csv("titanic.csv")
titanic_train, titanic_test = train_test_split(titanic)

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
forest1 = RandomForestClassifier(n_estimators=10,    # Number of trees to grow
                                 max_depth=5)        # Maximum depth of a tree
forest1.fit(X=titanic_train.replace({'Sex': {'male': 0, 'female': 1}}    # Replace strings with numbers
                                   ).drop(["Survived", "Name"], axis=1),
            y=titanic_train.Survived)

# Example prediction
forest1.predict([[2, 0, 26, 0, 0, 30]])

array([0], dtype=int64)

In [5]:
pred1 = forest1.predict(titanic_train.replace({'Sex': {'male': 0, 'female': 1}}
                                             ).drop(["Survived", "Name"], axis=1))
print(classification_report(titanic_train.Survived, pred1))

              precision    recall  f1-score   support

           0       0.85      0.94      0.89       411
           1       0.89      0.72      0.80       254

    accuracy                           0.86       665
   macro avg       0.87      0.83      0.85       665
weighted avg       0.86      0.86      0.86       665



In [6]:
n_candidate = [10, 20, 30, 40, 60, 80, 100]    # Candidate forest sizes
res1 = dict()

for n in n_candidate:
    pred3 = RandomForestClassifier(n_estimators=n, max_depth=5)
    res1[n] = cross_validate(pred3,
                            X=titanic_train.replace({'Sex': {'male': 0, 'female': 1}}    # Replace strings with numbers
                                         ).drop(["Survived", "Name"], axis=1),
                            y=titanic_train.Survived,
                            cv=10,
                            return_train_score=False,
                            scoring='accuracy')

res1df = DataFrame({(i, j): res1[i][j]
                             for i in res1.keys()
                             for j in res1[i].keys()}).T

res1df.loc[(slice(None), 'test_score'), :]

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,8,9
10,test_score,0.75,0.80597,0.820896,0.761194,0.878788,0.833333,0.863636,0.818182,0.863636,0.818182
20,test_score,0.705882,0.776119,0.80597,0.776119,0.878788,0.818182,0.833333,0.818182,0.863636,0.833333
30,test_score,0.691176,0.80597,0.791045,0.80597,0.909091,0.848485,0.818182,0.863636,0.848485,0.833333
40,test_score,0.720588,0.80597,0.820896,0.791045,0.878788,0.818182,0.833333,0.848485,0.863636,0.833333
60,test_score,0.705882,0.835821,0.731343,0.776119,0.878788,0.833333,0.818182,0.848485,0.878788,0.848485
80,test_score,0.705882,0.820896,0.820896,0.776119,0.909091,0.833333,0.848485,0.848485,0.848485,0.833333
100,test_score,0.676471,0.80597,0.791045,0.776119,0.893939,0.80303,0.848485,0.848485,0.878788,0.833333


In [7]:
res1df.loc[(slice(None), 'test_score'), :].mean(axis=1)

10   test_score    0.821382
20   test_score    0.810955
30   test_score    0.821537
40   test_score    0.821426
60   test_score    0.815523
80   test_score    0.824500
100  test_score    0.815567
dtype: float64

In [8]:
m_candidate = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]    # Candidate depths

In [9]:
res2 = dict()

for m in m_candidate:
    pred3 = RandomForestClassifier(max_depth=m, n_estimators=40)
    res2[m] = cross_validate(pred3,
                             X=titanic_train.replace({'Sex': {'male': 0, 'female': 1}}    # Replace strings with numbers
                                          ).drop(["Survived", "Name"], axis=1),
                             y=titanic_train.Survived,
                             cv=10,
                             return_train_score=False,
                             scoring='accuracy')

res2df = DataFrame({(i, j): res2[i][j]
                             for i in res2.keys()
                             for j in res2[i].keys()}).T

res2df.loc[(slice(None), 'test_score'), :]

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,8,9
1,test_score,0.705882,0.731343,0.791045,0.701493,0.757576,0.818182,0.757576,0.787879,0.848485,0.818182
2,test_score,0.676471,0.791045,0.776119,0.716418,0.818182,0.848485,0.772727,0.818182,0.848485,0.80303
3,test_score,0.720588,0.80597,0.80597,0.761194,0.80303,0.833333,0.818182,0.863636,0.893939,0.818182
4,test_score,0.705882,0.820896,0.791045,0.761194,0.893939,0.848485,0.818182,0.818182,0.878788,0.818182
5,test_score,0.691176,0.820896,0.80597,0.791045,0.893939,0.787879,0.818182,0.848485,0.893939,0.848485
6,test_score,0.735294,0.850746,0.791045,0.776119,0.893939,0.80303,0.848485,0.848485,0.848485,0.833333
7,test_score,0.720588,0.820896,0.791045,0.820896,0.893939,0.787879,0.833333,0.848485,0.878788,0.848485
8,test_score,0.720588,0.80597,0.80597,0.791045,0.909091,0.818182,0.848485,0.878788,0.848485,0.833333
9,test_score,0.720588,0.820896,0.791045,0.776119,0.893939,0.787879,0.818182,0.848485,0.848485,0.818182
10,test_score,0.735294,0.791045,0.80597,0.791045,0.893939,0.80303,0.80303,0.833333,0.833333,0.818182


In [10]:
res2df.loc[(slice(None), 'test_score'), :].mean(axis=1)

1   test_score    0.771764
2   test_score    0.786914
3   test_score    0.812403
4   test_score    0.815477
5   test_score    0.820000
6   test_score    0.822896
7   test_score    0.824433
8   test_score    0.825994
9   test_score    0.812380
10  test_score    0.810820
dtype: float64

In [11]:
forest2 = RandomForestClassifier(max_depth=9, n_estimators=40)
forest2.fit(X=titanic_train.replace({'Sex': {'male': 0, 'female': 1}}    # Replace strings with numbers
                                   ).drop(["Survived", "Name"], axis=1),
            y=titanic_train.Survived)

survived_test_predict = forest2.predict(X=titanic_test.replace(
    {'Sex': {'male': 0, 'female': 1}}
).drop(["Survived", "Name"], axis=1))

In [12]:
print(classification_report(titanic_test.Survived, survived_test_predict))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90       134
           1       0.87      0.78      0.83        88

    accuracy                           0.87       222
   macro avg       0.87      0.85      0.86       222
weighted avg       0.87      0.87      0.87       222

