In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [3]:
def concatenate_df(df1, df2):
    """Concatenate two pandas dataframes"""
    
    return pd.concat([df1, df2], sort=True).reset_index(drop=True)

# Load data

train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# Combine data

all_data = concatenate_df(train_data, test_data)

# Data cleaning

See Version 12 for details of the following.

In [4]:
# Fill in missing ages

# SibSp seems to be as good a predictor of age as any (>= 2 indicates travelling with siblings, which likely means a child)
all_data['Age'] = all_data.groupby(['SibSp'])['Age'].apply(lambda x: x.fillna(x.median()))
# Use Pclass and Sex as a backup
all_data['Age'] = all_data.groupby(['Pclass', 'Sex'])['Age'].apply(lambda x: x.fillna(x.median()))

# Fill in missing fare

mr_thomas_fare = all_data.loc[(all_data['Pclass'] == 3) & (all_data['SibSp'] == 0) & (all_data['Embarked'] == 'S')]['Fare'].median()
all_data.loc[all_data['Fare'].isnull(), 'Fare'] = mr_thomas_fare

# Fill in missing embarkeds

all_data.loc[61, 'Embarked'] = 'S'
all_data.loc[829, 'Embarked'] = 'S'

# Keep first letter of cabin (indicating deck) only and insert 'M' for missing

all_data['Deck'] = all_data['Cabin'].apply(lambda c: c[0] if pd.notnull(c) else 'M')

# Feature engineering

## Binning

See Version 12 for details of the following.

In [5]:
# Outliers can disrupt learning, so split up the distribution for each

# For Fare, use qcut to assign approximately equal number of cases to each category
all_data['Fare'] = pd.qcut(all_data['Fare'], 5)
# For Age, use cut to split according to values
all_data['Age'] = pd.cut(all_data['Age'].astype(int), 5)

## New feature creation

See Version 12 for details of the following.

In [6]:
# Combine sibSp and Parch to create new feature Family_Size
all_data['Family_Size'] = all_data['SibSp'] + all_data['Parch'] + 1
# Bin family size to group to group family size of four of higher into a single value
all_data['Family_Size_Bin'] = all_data['Family_Size'].map(lambda fs: fs if fs <= 3 else (4 if fs > 3 else 0))

# Create Ticket_Frequency feature
all_data['Ticket_Freq'] = all_data.groupby('Ticket')['Ticket'].transform('count')

See Version 15 for details of the following

In [7]:
def title(name):
    """Get the title from a passenger's name
    
    Return False if no title is found"""
    
    parts = name.split()
    ends_with_period = [part for part in parts if part.endswith('.')]
    if ends_with_period:
        return ends_with_period[0]
    else:
        return None
    

all_data['Title'] = [title(x) for x in all_data['Name']]
rare = (all_data['Title'].value_counts() < 10)
all_data['Title'] = all_data['Title'].apply(lambda x: 'misc' if rare.loc[x] else x)

# Encoding

See Version 16 for details of the following

## Label encode non-numeric features

In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
non_numeric = ['Age', 'Embarked', 'Fare', 'Sex', 'Deck', 'Title']

for feature in non_numeric:
    all_data[feature] = label_encoder.fit_transform(all_data[feature])

## One-hot encode categorical features

In [9]:
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

categorical = ["Pclass", "Sex", "Family_Size_Bin", "Age", "Fare", "Deck", "Embarked", "Title"]

for feature in categorical:

    # Create encoder dataframe
    encoder_df = pd.DataFrame(one_hot_encoder.fit_transform(all_data[[feature]]).toarray())
    column_names = one_hot_encoder.get_feature_names([feature])
    encoder_df.columns = column_names
    # Append to all_data
    all_data = all_data.join(encoder_df)
    
# Drop columns that were one-hot encoded
all_data = all_data.drop(columns=categorical)

# Model training

## Prepare data

In [10]:
# Drop columns that will not be used as features
drop_columns = ['Cabin', 'Name', 'Parch', 'PassengerId', 'SibSp', 'Ticket']
all_data = all_data.drop(columns=drop_columns)

train_data = all_data.loc[:890]
y = train_data["Survived"]
X = train_data.drop(columns=['Survived'])

X_test = all_data.loc[891:].drop(columns=['Survived'])

## Hyperparameter optimisation

Look at default parameters

In [11]:
from pprint import pprint

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

pprint(model.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 5,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}


Split training data into 80% training and 20% test data

In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

See how RandomForestClassifier performs with current parameters

In [13]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics

def evaluate(model, test_features, test_labels):
    
    y_pred = model.predict(test_features)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    return accuracy

model.fit(x_train, y_train)

default_accuracy = evaluate(model, x_test, y_test)
print(default_accuracy)

0.8435754189944135


Tune hyperparameters using RandomizedSearchCV

In [15]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in range(100,500,50)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [16]:
model = RandomForestClassifier()

model_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid,
                                  n_iter=100, cv=3, verbose=1, n_jobs=-1, random_state=1)

model_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.1min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 150, 200, 250,
                                                         300, 350, 400, 450]},
                   random_state=1, verbose=1)

Look at the best parameters

In [18]:
model_random.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 70,
 'bootstrap': False}

Check model accuracy with best parameters from RandomizedSearchCV

In [19]:
best_random = model_random.best_estimator_
rscv_accuracy = evaluate(best_random, x_test, y_test)

print(f"Accuracy with best parameters from RandomizedSearchCV: {rscv_accuracy}")
improvement = ((rscv_accuracy - default_accuracy) / default_accuracy) * 100
print(f"Improvement from default parameters: {improvement:.2f}%")

Accuracy with best parameters from RandomizedSearchCV: 0.8659217877094972
Improvement from default parameters: 2.65%


Fine-tune using GridSearchCV

In [22]:
from sklearn.model_selection import GridSearchCV

# Create parameter grid based on results of RandomizedSearchCV, which were:

# {'n_estimators': 100,
#  'min_samples_split': 2,
#  'min_samples_leaf': 4,
#  'max_features': 'auto',
#  'max_depth': 70,
#  'bootstrap': False}

param_grid = {
    'n_estimators': [75, 100, 125, 150],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'max_features': ['auto'],
    'max_depth': [66, 68, 70, 72, 74],
    'bootstrap': [False]
}

model = RandomForestClassifier()

grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=1)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   27.1s finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [False],
                         'max_depth': [66, 68, 70, 72, 74],
                         'max_features': ['auto'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [2, 3],
                         'n_estimators': [75, 100, 125, 150]},
             verbose=1)

In [23]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 66,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'n_estimators': 75}

In [25]:
best_grid = grid_search.best_estimator_
gscv_accuracy = evaluate(best_grid, x_test, y_test)

print(f"Accuracy with best parameters from GridSearchCV: {gscv_accuracy}")
improvement_default_gscv = ((gscv_accuracy - default_accuracy) / default_accuracy) * 100
print(f"Improvement from default parameters: {improvement_default_gscv:.2f}%")
improvement_rscv_gscv = ((gscv_accuracy - rscv_accuracy) / rscv_accuracy) * 100
print(f"Improvement from RandomizedSearchCV parameters: {improvement_rscv_gscv:.2f}%")

Accuracy with best parameters from GridSearchCV: 0.8715083798882681
Improvement from default parameters: 3.31%
Improvement from RandomizedSearchCV parameters: 0.65%


## Training

Train and predict using best parameters from GridSearchCV

In [28]:
model = RandomForestClassifier(bootstrap=False,
                               max_depth=66,
                               max_features='auto',
                               min_samples_leaf=3,
                               min_samples_split=2,
                               n_estimators=75,
                               random_state=1)

model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
# Convert 'Survived' to int
output['Survived'] = output['Survived'].astype(int)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
