In [1]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# ensemble methods
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import GridSearchCV

# for visualisation
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz
from IPython.display import Image  
from subprocess import call

In [2]:
titanic_df = pd.read_csv("titanic.csv")
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Drop unnecessary columns
# columns to drop
drop_columns = ['Cabin','PassengerId','Name','Ticket','Fare']

# dropping colums
titanic_df.drop(drop_columns, axis = 1, inplace = True)


### One-Hot Encoding
One-hot encoding is a technique used to ensure that categorical variables are better represented in the machine. Let's take a look at the "Sex" column

In [5]:
titanic_df["Sex"].unique()

array(['male', 'female'], dtype=object)

Machine Learning classifiers don't know how to handle strings. As a result, you need to convert it into a categorical representation. There are two main ways to go about this:

Label Encoding: Assigning, for example, 0 for "male" and 1 for "female". The problem here is it intrinsically makes one category "larger than" the other category.

One-hot encoding: Assigning, for example, [1, 0] for "male" and [0, 1] for female. In this case, you have an array of size (n_categories,) and you represent a 1 in the correct index, and 0 elsewhere. In Pandas, this would show as extra columns. For example, rather than having a "Sex" column, it would be a "Sex_male" and "Sex_female" column. Then, if the person is male, it would simply show as a 1 in the "Sex_male" column and a 0 in the "Sex_female" column.

There is a nice and easy method that does this in pandas: get_dummies()

In [6]:
titanic_df = pd.get_dummies(titanic_df, prefix="Sex", columns=["Sex"])
titanic_df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Embarked,Sex_female,Sex_male
0,0,3,22.0,1,0,S,0,1
1,1,1,38.0,1,0,C,1,0
2,1,3,26.0,0,0,S,1,0
3,1,1,35.0,1,0,S,1,0
4,0,3,35.0,0,0,S,0,1


Now, we do the same to the "Embarked" column.

In [7]:
titanic_df = pd.get_dummies(titanic_df, prefix="Embarked", columns=["Embarked"])
titanic_df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,0,1,0,0,1
1,1,1,38.0,1,0,1,0,1,0,0
2,1,3,26.0,0,0,1,0,0,0,1
3,1,1,35.0,1,0,1,0,0,0,1
4,0,3,35.0,0,0,0,1,0,0,1


It looks the Age columns has missing data so I am just going to check the whole dataframe for missing data and fix it.

In [8]:
# getting the number of missing data points per column
missing_data = titanic_df.isnull().sum()

# number of missing points in the columns
missing_data = missing_data[missing_data != 0]

missing_data

Age    177
dtype: int64

As it can be seen, 20% of the Age data is missing and deleting all these rows of data would be a big lose of data. Therefore, to fix it, replacing the missing values with the average of the age column is the best solution.

In [9]:
# Get the mean of the Age values
age_mean = round(titanic_df['Age'].mean(),0)

# Replace the missing values with the mean
titanic_df['Age'].fillna(age_mean, inplace = True)

# Selecting Variables

Getting independent and dependent variables.

In [10]:
X = titanic_df.iloc[:,1:].values

y = titanic_df.iloc[:,0]


# Training, Development and Test Data

In [11]:
# Split the original dataset into training and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y,test_size=0.2)

# Split the training set further into training and development sets
X_train, X_dev, y_train, y_dev = train_test_split(X_train_full, y_train_full, test_size=0.2)

print(X_train.shape, X_dev.shape, X_test.shape)
print(y_train.shape, y_dev.shape, y_test.shape)


(569, 9) (143, 9) (179, 9)
(569,) (143,) (179,)


# Models

### Bagging

In [12]:
# create base model
base = DecisionTreeClassifier()

# create bagged tree
bagged_tree = BaggingClassifier(base_estimator = base)
bagged_tree.fit(X_train,y_train)

print("Accuracy of bagged tree:",bagged_tree.score(X_test, y_test))

Accuracy of bagged tree: 0.7374301675977654


## Random Forest

In [13]:
# create a random forest model
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

print("Accuracy of random forest:",random_forest.score(X_test, y_test))

Accuracy of random forest: 0.7597765363128491


## Boosted Tree

In [14]:
# create a boosted tree model
boosted_tree = AdaBoostClassifier()
boosted_tree.fit(X_train, y_train)

print("Accuracy of the boosted tree:",boosted_tree.score(X_test, y_test))

Accuracy of the boosted tree: 0.770949720670391


#### Determing which of the random forest's features contribute the most to the prediction of whether a passenger survives or not.

In [15]:
# determine the features importances
most_prediction = pd.Series(random_forest.feature_importances_).sort_values(ascending=False)

# display feature importance from most to least important
most_prediction

1    0.384303
4    0.173050
5    0.162187
0    0.106732
2    0.066317
3    0.059773
8    0.020626
7    0.013584
6    0.013429
dtype: float64

From the information displayed above, the features that contribute mostly to the prediction of whether a passanger survives or not are the Age, followed by the Gender and then Class which the passanger embarked in.

#### Tuning the parameters n_estimators and max_depth using GridSearchCV

In [16]:
# create random forest classifier
rf_model = RandomForestClassifier()

def tuning_func(model):
    # define parameters for tuning
    params = {

        'n_estimators': np.arange(10,101,10),
        'max_depth': np.arange(1,6)

    }

    # create random forest GridSearchCV model
    GSCV = GridSearchCV(estimator = rf_model, param_grid = params, cv = 3)

    # fitting the model
    GSCV.fit(X_train,y_train);

    # Get the tuning parameters
    param_values = GSCV.best_params_ # getting tuned parameters
    
    return param_values

param_values = tuning_func(rf_model)

param_values

{'max_depth': 4, 'n_estimators': 50}

In [17]:
# for bagging
bg_params = tuning_func(bagged_tree)

bg_params

# #get the estimator value
# est_value = bg_params['n_estimators']

# # create a bagged tree model with the tuned paramater
# tuned_bagged_tree = BaggingClassifier(base_estimator = base, n_estimators = est_value)

# tuned_bagged_tree.fit(X_train, y_train)

# print("Accuracy of the tuned bagged tree model:", tuned_bagged_tree.score(X_test, y_test))

{'max_depth': 4, 'n_estimators': 60}

In [18]:
# for boosted tree
bt_params = tuning_func(boosted_tree)
bt_params

{'max_depth': 4, 'n_estimators': 80}

#### Reporting on the accuracy of the models, their n_estimators and max_depth

The accuracy of the Bagged model is 0.7374301675977654. The Bagged model has a max_depth of 4 and 60 n_estimators.
The accuracy of the Random Forest model is 0.7597765363128491. The Random Forest model has a max_depth of 4 and 50 n_estimators.
The accuracy of the Bossted Tree model is 0.770949720670391. The Boosted Tree model has a max_depth of 4 and 80 n_estimators.

The Boosted Tree Model has the highest accuracy.

