Objective: to predict whether passengers survived the Titanic based on their characteristics

Data: Titanic dataset from Kaggle
Columns: Passenger Id, Class (Pclass), Name, Sex, Age, # Siblings/Spouses aboard (SibSp), # Parents/Children aboard (Parch), Ticket Number, Fare, Cabin, Port of Embarkation (Cherbourg, Queenstown, or Southampton)

*titanictrain.csv contained 'Survived' column while titanictest.csv did not

1) Imported Data from csv files

2) Made adjustments to both datasets before creating any models. This allows the model to process certain variables that may not be useful otherwise
    Adjustments include:
        - Changing 'Name' to the number of letters in the passenger's name
        - Changing 'Cabin' to whether the passenger was assigned a Cabin (typically reserved for first and wealthy second class passengers)
        - Assigning dummy variables to 'Sex' and 'Embarked' (location embarked from)
        - Dropping irrelevant variables 'PassengerId' and 'Ticket'
        - Using KNN to predict and fill in missing ages in datasets

3) Trained a Random Forest classifier on 70% of the training data and computed its ROC AUC on the held-out 30%.

4) For greater accuracy, picked the model with the highest ROC AUC out of 1,000 Random Forest runs.

5) Applied the model to the testing dataset (without the Survived column) to predict whether those passengers survived.

In [1]:
import pandas as pd
import os
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Load the labelled training set and the unlabelled test set from the parent directory.
train_csv = os.path.join("..", "titanictrain.csv")
test_csv = os.path.join("..", "titanictest.csv")
df = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
# Modifying columns to be more machine readable and filling in missing ages (with KNN)

def makeAdjustments(df):
    """Turn a raw Titanic frame into an all-numeric, fully imputed feature frame.

    Transformations applied:
      * 'Name'  -> 'Name Length' (number of characters in the name)
      * 'Cabin' -> 'Had Cabin' (True when a cabin value is present)
      * 'Sex' and 'Embarked' -> one-hot dummy columns
      * 'Ticket' and 'PassengerId' are dropped
      * every remaining missing value (e.g. in 'Age') is filled by a
        5-nearest-neighbour imputer run on min-max-scaled data, after which
        the data is scaled back to its original units

    The caller's DataFrame is left untouched; a new frame is returned.

    Note: the dummy columns, the scaler and the imputer are all derived from
    the frame passed in, so separate calls (e.g. train vs. test) are processed
    independently of each other.

    Args:
        df: DataFrame containing at least the columns 'Name', 'Cabin', 'Sex',
            'Embarked', 'Ticket' and 'PassengerId'. All other columns must be
            numeric, since they are passed to the scaler as-is.

    Returns:
        A new DataFrame of floats with a fresh 0..n-1 index. Column order is:
        the untouched input columns, 'Name Length', 'Had Cabin', then the
        'Sex_*' and 'Embarked_*' dummies.
    """
    # Build the result on a new frame so the caller's raw columns survive.
    features = df.drop(
        columns=['Name', 'Cabin', 'Sex', 'Embarked', 'Ticket', 'PassengerId']
    )

    # Name --> length of name. A missing name yields NaN (instead of raising)
    # and is filled in by the imputer below.
    features['Name Length'] = df['Name'].str.len()

    # Cabin --> whether they had a cabin
    features['Had Cabin'] = df['Cabin'].notna()

    # Assigning dummies to Sex and Embarked
    features = pd.concat(
        [features, pd.get_dummies(df[['Sex', 'Embarked']])], axis=1
    )

    # KNN: Scaling -- put every column on [0, 1] so that no single feature
    # dominates the neighbour distances.
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(features)

    # KNN: Imputer
    imputer = KNNImputer(n_neighbors=5)
    filled = imputer.fit_transform(scaled)

    # Undo the scaling so the returned values are in their original units.
    return pd.DataFrame(
        scaler.inverse_transform(filled), columns=features.columns
    )

In [4]:
# Random Forest Classifier

def runRandomForest(df, n_estimators=100, random_state=None):
    """Fit a random forest on a 70/30 split of ``df`` and score it by ROC AUC.

    Args:
        df: Numeric DataFrame containing the target column 'Survived'; every
            other column is used as a feature.
        n_estimators: Number of trees in the forest (default 100).
        random_state: Seed passed to the forest. The default ``None`` leaves
            the forest unseeded, so repeated calls give different models.

    Returns:
        A ``(roc_value, model)`` tuple: the ROC AUC measured on the held-out
        30% of ``df`` and the fitted ``RandomForestClassifier``.
    """
    y = df['Survived']
    X = df.drop("Survived", axis=1)

    # The split seed is fixed, so every call trains and evaluates on the same
    # rows; only the forest itself varies between calls.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1
    )

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        bootstrap=True,
        max_features='sqrt',
        random_state=random_state,
    )
    # Fit on training data
    model.fit(X_train, y_train)

    # Probability of the second class in model.classes_ for each held-out row
    rf_probs = model.predict_proba(X_test)[:, 1]

    # Calculate roc auc
    roc_value = roc_auc_score(y_test, rf_probs)
    return roc_value, model

In [5]:
# MAIN

df = makeAdjustments(df)

# Keep the model with the highest ROC AUC out of 1,000 random-forest fits.
# NOTE: every fit is scored on the same held-out split (the split seed is
# fixed in runRandomForest), so max_roc is the best of many draws and is an
# optimistic estimate of performance on unseen data.
max_roc = float('-inf')  # any real score beats this, so `model` is always set
model = None
for _ in range(1000):
    roc_value, candidate = runRandomForest(df)
    if roc_value > max_roc:
        max_roc = roc_value
        model = candidate
print('max roc:', max_roc)

# Applying to Test set
df_test = makeAdjustments(df_test)
# Compute both outputs before adding columns, so the model only ever sees
# the feature columns it was trained on.
test_predictions = model.predict(df_test)
test_probs = model.predict_proba(df_test)[:, 1]
df_test['Survived(predict)'] = test_predictions
df_test['Prob(predict)'] = test_probs

max roc: 0.861409491332765


In [6]:
# Show the first 15 rows of the test set, including the two prediction columns.
df_test.head(15)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Name Length,Had Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived(predict),Prob(predict)
0,3.0,34.5,0.0,0.0,7.8292,16.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.03
1,3.0,47.0,1.0,0.0,7.0,32.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.36
2,2.0,62.0,0.0,0.0,9.6875,25.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.06
3,3.0,27.0,0.0,0.0,8.6625,16.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.04
4,3.0,22.0,1.0,1.0,12.2875,44.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.48
5,3.0,14.0,0.0,0.0,9.225,26.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.158333
6,3.0,30.0,0.0,0.0,7.6292,20.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.14
7,2.0,26.0,1.0,1.0,29.0,28.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.09
8,3.0,18.0,0.0,0.0,7.2292,41.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.74
9,3.0,21.0,2.0,0.0,24.15,23.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.07
