<center><h1>Titanic: Machine Learning from Disaster</h1></center>

In [None]:
import os
import warnings
warnings.filterwarnings('ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot styling: matplotlib's 'ggplot' style sheet first, then seaborn's 'white' style.
mpl.style.use('ggplot')
sns.set_style('white')
# Default figure size used by every plot below.
plt.rcParams['figure.figsize'] = (7, 4)

In [None]:
# Read the training and testing CSV files into separate DataFrames
train_dataset = pd.read_csv('data/train.csv')
test_dataset = pd.read_csv('data/test.csv')

# Report the shape of each dataset
print(f'Training Dataset: {train_dataset.shape}, Testing Dataset: {test_dataset.shape}')

# Show every training column next to its dtype
train_dataset.dtypes.reset_index()

In [None]:
train_dataset.head(3)  # Preview the first 3 rows of the training dataset

## Exploratory Data Analysis

In [None]:
# Compare number of dead versus survived, broken down by passenger class
survivors = train_dataset[train_dataset['Survived'] == 1]['Pclass'].value_counts()
dead = train_dataset[train_dataset['Survived'] == 0]['Pclass'].value_counts()
df_survival_pclass = pd.DataFrame([survivors, dead])
# Row labels must follow the row order above: survivors first, then dead.
df_survival_pclass.index = ['Survived', 'Dead']
df_survival_pclass.plot(kind='bar', stacked=True, title='Survival Based on by Passenger Class');

# Complement of 'Survived' so both outcomes can be summed per group
train_dataset['Dead'] = 1 - train_dataset['Survived']
# Select the two outcome columns BEFORE aggregating so only they are summed
# (summing every column would also try to add up the text columns).
# The pandas/matplotlib keyword is `color`, not `colors`.
survival_by_sex = train_dataset.groupby('Sex')[['Survived', 'Dead']].sum()
survival_by_sex.plot(kind='bar', stacked=True, color=['g', 'r'], title='Survival Based on by Sex');

In [None]:
def null_check(train_dataset, test_dataset):
    """Print the per-column count of null values for the training and testing datasets."""
    # The testing header carries a leading newline to visually separate the two reports.
    reports = (
        ("Training Dataset:", train_dataset),
        ("\nTesting Dataset:", test_dataset),
    )
    for header, frame in reports:
        print(header)
        print(frame.isnull().sum())


# Report the missing-value counts per column for both datasets
null_check(train_dataset, test_dataset)

## Data Wrangling, Feature Selection

In [None]:
# Medians are computed on the TRAINING set only and reused for the test set, so
# the test data is imputed with the same statistics the models are trained on.
age_median = train_dataset['Age'].median()
fare_median = train_dataset['Fare'].median()

# Results are assigned back instead of calling fillna(..., inplace=True) on a
# column selection: that form is chained assignment, which under pandas
# copy-on-write only updates a temporary and leaves the DataFrame unchanged.
train_dataset['Age'] = train_dataset['Age'].fillna(age_median)
test_dataset['Age'] = test_dataset['Age'].fillna(age_median)
train_dataset['Fare'] = train_dataset['Fare'].fillna(fare_median)
test_dataset['Fare'] = test_dataset['Fare'].fillna(fare_median)

# Missing 'Embarked' values are filled with the port code "S"
train_dataset['Embarked'] = train_dataset['Embarked'].fillna("S")
test_dataset['Embarked'] = test_dataset['Embarked'].fillna("S")

# Drop columns 'Cabin' and 'Ticket' since they contain a lot of noise.
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
train_dataset = train_dataset.drop(columns=["Cabin", "Ticket"], errors="ignore")
test_dataset = test_dataset.drop(columns=["Cabin", "Ticket"], errors="ignore")

In [None]:
# One encoder per categorical column. Each encoder is fitted on the training
# data only and then reused for the test data, so both datasets share a single
# label -> integer mapping. (Refitting on the test data would discard the
# mapping learned from the training data.)
encoder_embarked, encoder_sex = LabelEncoder(), LabelEncoder()

train_dataset['Embarked'] = encoder_embarked.fit_transform(train_dataset['Embarked'])
train_dataset['Sex'] = encoder_sex.fit_transform(train_dataset['Sex'])

# transform() raises ValueError if the test data contains a label unseen in training
test_dataset['Embarked'] = encoder_embarked.transform(test_dataset['Embarked'])
test_dataset['Sex'] = encoder_sex.transform(test_dataset['Sex'])

# Re-check missing values after imputation and encoding
null_check(train_dataset, test_dataset)

In [None]:
# Feature columns fed to every model; a single list keeps train and test in the
# same column order.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']

# Define the scaler instance
scaler = StandardScaler()

X_train = train_dataset[feature_columns].values
y_train = train_dataset['Survived'].values

X_test = test_dataset[feature_columns].values

# Learn the mean/variance from the training data only, then apply that SAME
# transformation to the test data. Calling fit_transform on the test set would
# refit the scaler and put train and test on different scales.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Model Selection

In [None]:
# Fixed seed so the cross-validation scores (and therefore the model choice)
# are reproducible across runs; all four estimators accept `random_state`.
random_state = 42
estimators = [
    GradientBoostingClassifier(random_state=random_state),
    LinearSVC(random_state=random_state),
    RandomForestClassifier(random_state=random_state),
    XGBClassifier(random_state=random_state),
]

# Mean 10-fold cross-validated accuracy for each candidate model
for estimator in estimators:
    mean_accuracy = cross_val_score(estimator, X_train, y_train, cv=10, scoring='accuracy').mean()
    print(f'{estimator.__class__.__name__}: {mean_accuracy}')

## Hyperparameter Tuning

In [None]:
# Show the names of every hyperparameter exposed by the selected classifier
gbc_param_names = list(GradientBoostingClassifier().get_params())
print(gbc_param_names)

In [None]:
# Search space for GradientBoostingClassifier.
# NOTE: this is an exhaustive grid of 6 * 32 * 5 * 10 * 9 = 86,400 combinations,
# each fitted on 3 folds, so the search is very expensive to run in full.
parameters = {'learning_rate': [1, 0.5, 0.25, 0.1, 0.05, 0.01],
              # max_depth must be an integer; np.linspace would yield floats (1.0, 2.0, ...)
              'max_depth': list(range(1, 33)),
              # Floats here are interpreted as fractions of the number of samples
              'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
              'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
              'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200]}

# return_train_score=True is required for cv_results_['mean_train_score'] to be
# populated; it is off by default.
hyper_model = GridSearchCV(GradientBoostingClassifier(), param_grid=parameters, scoring='roc_auc', cv=3,
                           return_train_score=True)

In [None]:
# Run the grid search over `parameters` on the scaled training data
hyper_model.fit(X_train, y_train)

In [None]:
print('Best Parameters: ', hyper_model.best_params_)
# 'mean_train_score' is only present in cv_results_ when the search was created
# with return_train_score=True; report that instead of raising KeyError.
print('Mean Train Score: ', hyper_model.cv_results_.get(
    'mean_train_score', 'not recorded (create GridSearchCV with return_train_score=True)'))
print('Mean Test Score: ', hyper_model.cv_results_['mean_test_score'])