# Kaggle Titanic - Machine Learning from Disaster


## Libraries import and setup

In [27]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

## Import and separate data

In [28]:
train_data_path = "data/train.csv"
train_data = pd.read_csv(train_data_path)

train_data = train_data.rename(columns={
    "PassengerId": "passengerId",
    "Survived": "survived",
    "Pclass": "pClass",
    "Name": "name",
    "Sex": "sex",
    "Age": "age",
    "SibSp": "sibSp",
    "Parch": "parch",
    "Ticket": "ticket",
    "Fare": "fare",
    "Cabin": "cabin",
    "Embarked": "embarked"
})
y = train_data.survived

features = ['pClass', 'sex', 'age', 'sibSp', 'parch', 'fare', 'embarked']
X = train_data[features]

numeric_features = ['age', 'sibSp', 'parch', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

categorical_features = ['pClass', 'sex', 'embarked']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

X = preprocessor.fit_transform(X)

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0, test_size=0.2)

## Model training and setup. Making predictions

In [29]:
param_grid = {
    'n_estimators': [10, 50, 100, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy']
}
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=0), param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search.fit(train_X, train_y)

best_model = grid_search.best_estimator_

titanic_predictions = best_model.predict(val_X)
titanic_predictions_mae = mean_absolute_error(val_y, titanic_predictions)

print(f"Best Model Parameters: {grid_search.best_params_}")
print(f"Validation MAE: {titanic_predictions_mae:.4f}")

Best Model Parameters: {'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 150}
Validation MAE: 0.1732


## Final model

In [30]:
test_data_path = "data/test.csv"
test_data = pd.read_csv(test_data_path)

test_data = test_data.rename(columns={
    "PassengerId": "passengerId",
    "Survived": "survived",
    "Pclass": "pClass",
    "Name": "name",
    "Sex": "sex",
    "Age": "age",
    "SibSp": "sibSp",
    "Parch": "parch",
    "Ticket": "ticket",
    "Fare": "fare",
    "Cabin": "cabin",
    "Embarked": "embarked"
})


X = test_data[features]

test_X = preprocessor.transform(X)

final_titanic_predictions = best_model.predict(test_X)

## Generate a submission

In [31]:
output = pd.DataFrame({'PassengerId': test_data.passengerId,
                       'Survived': final_titanic_predictions})
output.to_csv('submission.csv', index=False)