# Titanic 🚢

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import joblib

# Saving and loading data
train_data = pd.read_csv('titanic_data/train.csv')
test_data = pd.read_csv('titanic_data/test.csv')

## 1. Checking data correlations

In [2]:
def rates(feature, objective):
    item = train_data.loc[train_data[feature] == objective]['Survived']
    return sum(item)/len(item)

# Pclass
for i in range(1, 4):
    rate = rates('Pclass', i)
    print(f'Rate of surviving passengers in class {i}: ', rate)

for i in ['S', 'C', 'Q']:
    rate = rates('Embarked', i)
    print(f'Rate of surviving passengers from {i}: ', rate)



Rate of surviving passengers in class 1:  0.6296296296296297
Rate of surviving passengers in class 2:  0.47282608695652173
Rate of surviving passengers in class 3:  0.24236252545824846
Rate of surviving passengers from S:  0.33695652173913043
Rate of surviving passengers from C:  0.5535714285714286
Rate of surviving passengers from Q:  0.38961038961038963


## 2. Feature engineering

In [3]:
## Age
# Filling missing values with the mean
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())

## New feature: Family
# Creating a new feature for family size
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch']
train_data['FamilySize'] = train_data['FamilySize'].apply(lambda x: 1 if x == 0 else 0)

test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch']
test_data['FamilySize'] = test_data['FamilySize'].apply(lambda x: 1 if x == 0 else 0)

rate_family = rates('FamilySize', 1)
print('Rate of surviving passengers with family: ', rate_family)

Rate of surviving passengers with family:  0.30353817504655495


## 3. Training and fitting

### Method 1

General. We use all of out training data and we test the model on the data used to train it.

Problems: 
- May lead to overestimation of accuracy
- Can't tell if it will generalize to new passengers

In [4]:
# Features and target variable
y = train_data['Survived']
features = ['Pclass', 'Sex', 'Parch']
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(train_data[features])


# Getting the best parameters
'''
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10],
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=5)
grid_search.fit(X, y)
print("Best parameters:", grid_search.best_params_)
# Max depth = 5, Min samples split = 2, N estimators = 200
'''

# Fitting the model
rf_model1 = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    min_samples_split=2,
    random_state=1
)
rf_model1.fit(X, y)
rf_predictions = rf_model1.predict(X_test)

# Performance of the prediction
rf_accuracy = accuracy_score(y, rf_model1.predict(X))
print('Accuracy of the model: ', rf_accuracy)
print(classification_report(y, rf_model1.predict(X), target_names=['Did not survive', 'Survived']))

Accuracy of the model:  0.8058361391694725
                 precision    recall  f1-score   support

Did not survive       0.80      0.91      0.85       549
       Survived       0.82      0.63      0.71       342

       accuracy                           0.81       891
      macro avg       0.81      0.77      0.78       891
   weighted avg       0.81      0.81      0.80       891



## 4. Saving the model

In [None]:
# Saving the model
joblib.dump(rf_model1, 'rf_model.pkl')

# Loading the model
rf_model2 = joblib.load('rf_model.pkl')

# Making predictions on the test set
X_test = pd.get_dummies(test_data[features])
rf_predictions = rf_model2.predict(X_test)

# Saving the predictions
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': rf_predictions
})

# Saving the submission file
submission.to_csv('submission.csv', index=False)