# Kaggle Titanic - Machine Learning from Disaster


## Library imports and setup

In [51]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

## Import and separate data

In [52]:
# Location of the labelled training set.
train_data_path = "data/train.csv"

# A single encoder instance is reused for both categorical columns; after this
# cell it holds the classes of the column it was fitted on last ('embarked').
label_encoder = LabelEncoder()

# Load the CSV and switch the column headers to camelCase in one chain.
train_data = (
    pd.read_csv(train_data_path)
    .rename(
        columns={
            "PassengerId": "passengerId",
            "Survived": "survived",
            "Pclass": "pClass",
            "Name": "name",
            "Sex": "sex",
            "Age": "age",
            "SibSp": "sibSp",
            "Parch": "parch",
            "Ticket": "ticket",
            "Fare": "fare",
            "Cabin": "cabin",
            "Embarked": "embarked",
        }
    )
)

# Prediction target.
y = train_data["survived"]

# Replace the string categories with integer codes ('sex' is fitted first,
# then 'embarked', matching the order the encoder is reused in).
train_data = train_data.assign(
    sex=label_encoder.fit_transform(train_data["sex"]),
    embarked=label_encoder.fit_transform(train_data["embarked"]),
)

# Columns fed to the model.
features = ['pClass', 'sex', 'age', 'sibSp', 'parch', 'fare', 'embarked']
X = train_data[features]

# Hold out a validation split (default split size); random_state=0 makes the
# split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

## Model training, hyperparameter selection, and validation predictions

In [53]:
# Candidate forest sizes compared on the hold-out validation split.
n_estimators_variants = [10, 50, 100, 150]

# Track the best candidate seen so far; start from the first one so that
# best_n_estimator is always defined.
best_n_estimator = n_estimators_variants[0]
best_mae = float('inf')
for n_estimator in n_estimators_variants:
    # random_state=0 makes each fit reproducible.
    model = RandomForestClassifier(n_estimators=n_estimator, random_state=0)

    model.fit(train_X, train_y)

    titanic_predictions = model.predict(val_X)
    # scikit-learn metrics take (y_true, y_pred) in that order. MAE happens to
    # be symmetric, but keeping the documented order avoids a silent error if
    # the metric is later swapped for an asymmetric one.
    # NOTE(review): assuming 'survived' is coded 0/1, this MAE equals the
    # misclassification rate (1 - accuracy) - confirm.
    titanic_predictions_mae = mean_absolute_error(val_y, titanic_predictions)

    print(f"n_estimators: {n_estimator} | Mean MAE: {titanic_predictions_mae:.4f}")

    # Strict "<" means the earliest candidate wins on ties.
    if titanic_predictions_mae < best_mae:
        best_mae = titanic_predictions_mae
        best_n_estimator = n_estimator

print(f"Best n_estimator: {best_n_estimator}")
print(f"Best mae: {best_mae}")

n_estimators: 10 | Mean MAE: 0.1973
n_estimators: 50 | Mean MAE: 0.1570
n_estimators: 100 | Mean MAE: 0.1614
n_estimators: 150 | Mean MAE: 0.1659
Best n_estimator: 50
Best mae: 0.15695067264573992


## Final model

In [54]:
# Train the final classifier with the forest size selected on the validation split.
# NOTE(review): this fits on train_X/train_y only; consider refitting on the
# full X/y before producing the submission.
final_model = RandomForestClassifier(n_estimators=best_n_estimator, random_state=0)
final_model.fit(train_X, train_y)

# Load the unlabelled test set and apply the same camelCase renaming as the
# training data (keys that are absent from the file are ignored by rename).
test_data_path = "data/test.csv"
test_data = pd.read_csv(test_data_path)

test_data = test_data.rename(columns={
    "PassengerId": "passengerId",
    "Survived": "survived",
    "Pclass": "pClass",
    "Name": "name",
    "Sex": "sex",
    "Age": "age",
    "SibSp": "sibSp",
    "Parch": "parch",
    "Ticket": "ticket",
    "Fare": "fare",
    "Cabin": "cabin",
    "Embarked": "embarked"
})

# Encode the categorical columns with encoders fitted on the TRAINING data.
# Re-fitting on the test set (fit_transform) would derive the integer codes
# from whatever categories happen to appear in the test file, so they could
# silently disagree with the codes the model was trained on. The raw training
# columns are re-read because train_data already holds the encoded integers.
# transform() raises ValueError if the test set contains an unseen category.
raw_train_data = pd.read_csv(train_data_path)
for raw_column, encoded_column in (("Sex", "sex"), ("Embarked", "embarked")):
    column_encoder = LabelEncoder().fit(raw_train_data[raw_column])
    test_data[encoded_column] = column_encoder.transform(test_data[encoded_column])

test_X = test_data[features]

final_titanic_predictions = final_model.predict(test_X)

418
418


## Generate a submission

In [55]:
# Assemble the submission table: one row per test passenger with its
# predicted label, using the column headers 'PassengerId' and 'Survived'.
output = pd.DataFrame(
    {
        "PassengerId": test_data["passengerId"],
        "Survived": final_titanic_predictions,
    }
)

# Write without the DataFrame index so the file contains only the two columns.
output.to_csv("submission.csv", index=False)