In [102]:

import numpy as np 
import pandas as pd


# **Load the Data**

In [103]:
# Read the training and test splits from the working directory.
raw_data = pd.read_csv(filepath_or_buffer='train.csv')
raw_data_test = pd.read_csv(filepath_or_buffer='test.csv')

# Preview the first five training rows as the cell output.
raw_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# **Obtain dataset summary**

In [104]:
# Summarise every column of the training data. include='all' brings the non-numeric
# columns into the summary as well (the count / unique / top / freq rows in the output).
raw_data.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Hagland, Mr. Ingvald Olai Olsen",male,,,,CA. 2343,,G6,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


# **Preprocess the Dataset**

In [105]:
# The raw data needs preprocessing before it can be used to train a machine learning
# algorithm. The first step (next cell) drops the PassengerId, Name, Ticket and Cabin
# columns from the training data (the test copy keeps PassengerId):
#
# - PassengerId is not expected to have any predictive power, so it is not used as an input.
# - Name is left out for the same reason.
# - Ticket and Cabin have too many unique values to be turned into dummy variables,
#   so they are not used as inputs either.

# Checkpoint: work on copies so the raw frames stay untouched.
df = raw_data.copy(deep=True)
df_test = raw_data_test.copy(deep=True)
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [106]:
# Drop the columns that will not be used as model inputs.
# Each frame is derived from its raw counterpart rather than from itself, so the cell can
# be re-run without a KeyError for columns that were already removed.
# PassengerId is dropped from the training data only; the test frame keeps it.
df = raw_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
df_test = raw_data_test.drop(columns=['Name', 'Ticket', 'Cabin'])

In [107]:
# Display the training frame after the column drop.
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
5,0,3,male,,0,0,8.4583,Q
6,0,1,male,54.0,0,0,51.8625,S
7,0,3,male,2.0,3,1,21.075,S
8,1,3,female,27.0,0,2,11.1333,S
9,1,2,female,14.0,1,0,30.0708,C


In [108]:
# Display the test frame after the column drop (PassengerId is still present).
df_test

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S
5,897,3,male,14.0,0,0,9.225,S
6,898,3,female,30.0,0,0,7.6292,Q
7,899,2,male,26.0,1,1,29.0,S
8,900,3,female,18.0,0,0,7.2292,C
9,901,3,male,21.0,2,0,24.15,S


In [109]:
# Sex is a categorical variable, so it will be encoded as 0 (male) and 1 (female).
# Take fresh copies first so the frames from the previous step are left unchanged.
df_mapped = df.copy(deep=True)
df_mapped_test = df_test.copy(deep=True)

In [110]:
# Encode Sex as 0 = male, 1 = female.
# The values are read from the unmapped column in df / df_test, not from the column being
# overwritten: mapping an already-encoded column would turn every value into NaN, so
# reading from the untouched frames keeps this cell safe to re-run.
sex_codes = {'male': 0, 'female': 1}
df_mapped['Sex'] = df['Sex'].map(sex_codes)
df_mapped_test['Sex'] = df_test['Sex'].map(sex_codes)

In [111]:
# Display the training frame with Sex encoded as 0/1.
df_mapped

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S
5,0,3,0,,0,0,8.4583,Q
6,0,1,0,54.0,0,0,51.8625,S
7,0,3,0,2.0,3,1,21.075,S
8,1,3,1,27.0,0,2,11.1333,S
9,1,2,1,14.0,1,0,30.0708,C


In [112]:
# Remove every row that has a missing value in any of the remaining columns.
# NOTE(review): this also removes rows from the test frame, so those passengers will be
# absent from the preprocessed test file - confirm that is intended.
df_mapped = df_mapped.dropna(axis='index', how='any')
df_mapped_test = df_mapped_test.dropna(axis='index', how='any')

In [113]:
# The Embarked column records where the passenger boarded. It is a nominal categorical
# variable, so it is turned into dummy variables. The first level is dropped (it becomes
# the baseline embarkation point) to avoid multicollinearity during regression analysis.
#
# Both frames are encoded against one shared list of levels. Encoding each frame on its
# own would give train and test different dummy columns whenever a port is missing from
# one of them.
embarked_levels = sorted(
    pd.concat([df_mapped['Embarked'], df_mapped_test['Embarked']]).dropna().unique()
)
embarked_dtype = pd.CategoricalDtype(categories=embarked_levels)

# Casting the Series (rather than building a bare Categorical) keeps the row index, which
# the later column-wise concat relies on.
# dtype=int keeps the dummies as 0/1 integers rather than booleans on newer pandas releases.
embarked_dummies = pd.get_dummies(
    df_mapped['Embarked'].astype(embarked_dtype),
    drop_first=True, prefix='Embarked', dtype=int,
)
embarked_dummies_test = pd.get_dummies(
    df_mapped_test['Embarked'].astype(embarked_dtype),
    drop_first=True, prefix='Embarked', dtype=int,
)

In [114]:
# Show every row when a frame is displayed. The option is addressed by its full key:
# the short pattern 'max_rows' matches more than one option on recent pandas releases
# and is rejected with an OptionError.
pd.set_option('display.max_rows', None)
embarked_dummies  # The baseline (dropped) embarkation point here is Cherbourg

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
6,0,1
7,0,1
8,0,1
9,0,0
10,0,1


In [115]:

# The dummies are concatenated onto the data in the next cell; the original Embarked
# column is removed first so it is not carried along with them.
df_mapped = df_mapped.drop(columns=['Embarked'])
df_mapped_test = df_mapped_test.drop(columns=['Embarked'])
df_mapped

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,0,22.0,1,0,7.25
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.925
3,1,1,1,35.0,1,0,53.1
4,0,3,0,35.0,0,0,8.05
6,0,1,0,54.0,0,0,51.8625
7,0,3,0,2.0,3,1,21.075
8,1,3,1,27.0,0,2,11.1333
9,1,2,1,14.0,1,0,30.0708
10,1,3,1,4.0,1,1,16.7


In [116]:
# Attach the Embarked dummy columns to the right of each frame (rows are matched on the index).
df_with_dummies = pd.concat([df_mapped, embarked_dummies], axis='columns')
df_test_with_dummies = pd.concat([df_mapped_test, embarked_dummies_test], axis='columns')
df_with_dummies

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,0,1
1,1,1,1,38.0,1,0,71.2833,0,0
2,1,3,1,26.0,0,0,7.925,0,1
3,1,1,1,35.0,1,0,53.1,0,1
4,0,3,0,35.0,0,0,8.05,0,1
6,0,1,0,54.0,0,0,51.8625,0,1
7,0,3,0,2.0,3,1,21.075,0,1
8,1,3,1,27.0,0,2,11.1333,0,1
9,1,2,1,14.0,1,0,30.0708,0,0
10,1,3,1,4.0,1,1,16.7,0,1


In [117]:
# Checkpoint: continue on copies of the frames that now include the dummy columns.
df_dummies = df_with_dummies.copy(deep=True)
df_dummies_test = df_test_with_dummies.copy(deep=True)

In [118]:
# Display the checkpoint copy of the training frame.
df_dummies

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,0,1
1,1,1,1,38.0,1,0,71.2833,0,0
2,1,3,1,26.0,0,0,7.925,0,1
3,1,1,1,35.0,1,0,53.1,0,1
4,0,3,0,35.0,0,0,8.05,0,1
6,0,1,0,54.0,0,0,51.8625,0,1
7,0,3,0,2.0,3,1,21.075,0,1
8,1,3,1,27.0,0,2,11.1333,0,1
9,1,2,1,14.0,1,0,30.0708,0,0
10,1,3,1,4.0,1,1,16.7,0,1


In [119]:
# Pclass is the passenger's ticket class. It is a categorical (ordinal) variable, so its
# numbers are labels rather than quantities. It will be mapped to 0 and 1, with 0 for
# first class (Pclass 1) and 1 for second and third class (Pclass 2 and 3).

# First, check which distinct values the Pclass column holds.
df_dummies['Pclass'].unique()

array([3, 1, 2])

In [120]:
# Checkpoint: copies on which the Pclass column will be re-encoded.
df_mapped_class = df_dummies.copy(deep=True)
df_test_mapped_class = df_dummies_test.copy(deep=True)

In [121]:
# Collapse Pclass into a binary flag: 0 = first class, 1 = second or third class.
# The values are read from df_dummies / df_dummies_test (which still hold the original
# 1/2/3 codes), not from the column being overwritten: re-mapping an already-collapsed
# column would silently turn 0 into NaN and 1 into 0 on a second run.
pclass_codes = {1: 0, 2: 1, 3: 1}
df_mapped_class['Pclass'] = df_dummies['Pclass'].map(pclass_codes)
df_test_mapped_class['Pclass'] = df_dummies_test['Pclass'].map(pclass_codes)
df_mapped_class

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,1,0,22.0,1,0,7.25,0,1
1,1,0,1,38.0,1,0,71.2833,0,0
2,1,1,1,26.0,0,0,7.925,0,1
3,1,0,1,35.0,1,0,53.1,0,1
4,0,1,0,35.0,0,0,8.05,0,1
6,0,0,0,54.0,0,0,51.8625,0,1
7,0,1,0,2.0,3,1,21.075,0,1
8,1,1,1,27.0,0,2,11.1333,0,1
9,1,1,1,14.0,1,0,30.0708,0,0
10,1,1,1,4.0,1,1,16.7,0,1


# Arrange the Columns

In [122]:
# To keep the modelling step tidy, the input columns will be placed on the left and the
# target on the right. Start by listing the current column order of the training frame.
df_mapped_class.columns.to_numpy()

array(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked_Q', 'Embarked_S'], dtype=object)

In [123]:
# List the current column order of the test frame.
df_test_mapped_class.columns.to_numpy()

array(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked_Q', 'Embarked_S'], dtype=object)

In [124]:
# Input columns shared by both frames, in the order they should appear.
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_Q', 'Embarked_S']

# Training frame: inputs first, target (Survived) last.
# Test frame: PassengerId first, then the same inputs.
cols = feature_cols + ['Survived']
cols_test = ['PassengerId'] + feature_cols
data_cleaned = df_mapped_class[cols]
data_cleaned_test = df_test_mapped_class[cols_test]

In [125]:
# Display the reordered training frame (inputs on the left, Survived on the right).
data_cleaned

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Survived
0,1,0,22.0,1,0,7.25,0,1,0
1,0,1,38.0,1,0,71.2833,0,0,1
2,1,1,26.0,0,0,7.925,0,1,1
3,0,1,35.0,1,0,53.1,0,1,1
4,1,0,35.0,0,0,8.05,0,1,0
6,0,0,54.0,0,0,51.8625,0,1,0
7,1,0,2.0,3,1,21.075,0,1,0
8,1,1,27.0,0,2,11.1333,0,1,1
9,1,1,14.0,1,0,30.0708,0,0,1
10,1,1,4.0,1,1,16.7,0,1,1


In [126]:
# Final checkpoint: copy the cleaned frames, then write each one to CSV without the index column.
data_preprocessed = data_cleaned.copy(deep=True)
test_data_preprocessed = data_cleaned_test.copy(deep=True)
data_preprocessed.to_csv(path_or_buf='titanic_preprocessed.csv', index=False)
test_data_preprocessed.to_csv(path_or_buf='titanic_preprocessed_test.csv', index=False)