# Data engineering on the famous Titatnic dataset.

1. Saving and loading the dataset

In [29]:
import pandas
import matplotlib.pyplot as plt

In [30]:
import random as rd
rd.seed(0)

In [31]:
raw_data = pandas.read_csv('titanic.csv')
raw_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


2. Using Pnadas to study our dataset

In [32]:
# Examining the length of the dataset
print("The dataset has", len(raw_data), "rows")

The dataset has 891 rows


In [33]:
# Examining the columns in the dataset
print("Columns (features of the dataset)")
raw_data.columns

Columns (features of the dataset)


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [34]:
# Examining the labels
print("Labels")
raw_data["Survived"]

Labels


0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [35]:
# Examining how many passengers survived
print(sum(raw_data['Survived']),'passengers survived out of',len(raw_data))

342 passengers survived out of 891


In [36]:
# One can look at several columns together
raw_data[["Name", "Age"]]

Unnamed: 0,Name,Age
0,"Braund, Mr. Owen Harris",22.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
2,"Heikkinen, Miss. Laina",26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
4,"Allen, Mr. William Henry",35.0
...,...,...
886,"Montvila, Rev. Juozas",27.0
887,"Graham, Miss. Margaret Edith",19.0
888,"Johnston, Miss. Catherine Helen ""Carrie""",
889,"Behr, Mr. Karl Howell",26.0


3. Cleaning up our dataset: Missing values and how to deal with them.

In [37]:
raw_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [38]:
raw_data['Cabin']

0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object

In [39]:
print("The Cabin column is missing", sum(raw_data['Cabin'].isna()), "values out of",len(raw_data['Cabin']))

The Cabin column is missing 687 values out of 891


In [40]:
clean_data = raw_data.drop('Cabin', axis=1)

In [41]:
clean_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [42]:
clean_data['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [43]:
median_age = raw_data["Age"].median()
median_age

28.0

In [44]:
clean_data["Age"] = clean_data["Age"].fillna(median_age)

In [45]:
clean_data["Embarked"] = clean_data["Embarked"].fillna('U')

In [46]:
clean_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [47]:
clean_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


Saving our data for the future

In [48]:
clean_data.to_csv('./clean_titanic_data.csv', index=None)

# Feature engineering: Transforming the features in our dataset before training the models

* One-hot encoding
* Binning
* Feature selection

## One-hot encoding

In [49]:
preprocessed_data = pandas.read_csv('clean_titanic_data.csv')
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [50]:
gender_columns = pandas.get_dummies(preprocessed_data['Sex'], prefix='Sex')
print(gender_columns)
embarked_columns = pandas.get_dummies(preprocessed_data["Embarked"], prefix="Embarked")
print(embarked_columns)

     Sex_female  Sex_male
0         False      True
1          True     False
2          True     False
3          True     False
4         False      True
..          ...       ...
886       False      True
887        True     False
888        True     False
889       False      True
890       False      True

[891 rows x 2 columns]
     Embarked_C  Embarked_Q  Embarked_S  Embarked_U
0         False       False        True       False
1          True       False       False       False
2         False       False        True       False
3         False       False        True       False
4         False       False        True       False
..          ...         ...         ...         ...
886       False       False        True       False
887       False       False        True       False
888       False       False        True       False
889        True       False       False       False
890       False        True       False       False

[891 rows x 4 columns]


In [51]:
preprocessed_data = pandas.concat([preprocessed_data, gender_columns], axis=1)
preprocessed_data = pandas.concat([preprocessed_data, embarked_columns], axis=1)

In [52]:
preprocessed_data = preprocessed_data.drop(['Sex', 'Embarked'], axis=1)

In [53]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,False,True,False,False,True,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,True,False,True,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,True,False,False,False,True,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,True,False,False,False,True,False
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,False,True,False,False,True,False


A rule of thumb for when to one-hot encode or not

In [54]:
class_survived = preprocessed_data[['Pclass', 'Survived']]

first_class = class_survived[class_survived['Pclass'] == 1]
second_class = class_survived[class_survived['Pclass'] == 2]
third_class = class_survived[class_survived['Pclass'] == 3]

print("In first class", sum(first_class['Survived'])/len(first_class)*100, "% of passengers survived")
print("In second class", sum(second_class['Survived'])/len(second_class)*100, "% of passengers survived")
print("In third class", sum(third_class['Survived'])/len(third_class)*100, "% of passengers survived")

In first class 62.96296296296296 % of passengers survived
In second class 47.28260869565217 % of passengers survived
In third class 24.236252545824847 % of passengers survived


In [55]:
categorized_pclass_columns = pandas.get_dummies(preprocessed_data['Pclass'], prefix='Pclass')
preprocessed_data = pandas.concat([preprocessed_data, categorized_pclass_columns], axis=1)
preprocessed_data = preprocessed_data.drop(['Pclass'], axis=1)

In [57]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,Pclass_1,Pclass_2,Pclass_3
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,False,True,False,False,True,False,False,False,True
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,True,False,True,False,False,False,True,False,False
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,True,False,False,False,True,False,False,False,True
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,True,False,False,False,True,False,True,False,False
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,False,True,False,False,True,False,False,False,True


## Binning: Turning numerical data into categorical data (and why would we want to do this?)

In [58]:
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
categorized_age = pandas.cut(preprocessed_data['Age'], bins)
preprocessed_data['Categorized_age'] = categorized_age
preprocessed_data = preprocessed_data.drop(["Age"], axis=1)

In [59]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Name,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,Pclass_1,Pclass_2,Pclass_3,Categorized_age
0,1,0,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,False,True,False,False,True,False,False,False,True,"(20, 30]"
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,True,False,True,False,False,False,True,False,False,"(30, 40]"
2,3,1,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.925,True,False,False,False,True,False,False,False,True,"(20, 30]"
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,True,False,False,False,True,False,True,False,False,"(30, 40]"
4,5,0,"Allen, Mr. William Henry",0,0,373450,8.05,False,True,False,False,True,False,False,False,True,"(30, 40]"


In [61]:
cagegorized_age_columns = pandas.get_dummies(preprocessed_data['Categorized_age'], prefix='Categorized_age')
preprocessed_data = pandas.concat([preprocessed_data, cagegorized_age_columns], axis=1)
preprocessed_data = preprocessed_data.drop(['Categorized_age'], axis=1)

In [62]:

preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Name,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,...,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,1,0,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,False,True,False,...,False,True,False,False,True,False,False,False,False,False
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,True,False,True,...,False,False,False,False,False,True,False,False,False,False
2,3,1,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.925,True,False,False,...,False,True,False,False,True,False,False,False,False,False
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,True,False,False,...,False,False,False,False,False,True,False,False,False,False
4,5,0,"Allen, Mr. William Henry",0,0,373450,8.05,False,True,False,...,False,True,False,False,False,True,False,False,False,False


## Feature selection: Getting rid of unnecessary featutes

In [63]:
preprocessed_data = preprocessed_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)

In [None]:
# Saving for future use

preprocessed_data.to_csv('preprocessed_titanic_data.csv', index=None)

In [64]:
preprocessed_data.head()

Unnamed: 0,Survived,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,...,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,1,0,7.25,False,True,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
1,1,1,0,71.2833,True,False,True,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,1,0,0,7.925,True,False,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
3,1,1,0,53.1,True,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
4,0,0,0,8.05,False,True,False,False,True,False,...,False,True,False,False,False,True,False,False,False,False


### Saving for future use

In [65]:
preprocessed_data.to_csv('preprocessed_titanic_data.csv', index=None)

# Training Models

In [66]:
data = pandas.read_csv('./preprocessed_titanic_data.csv')
data.head()

Unnamed: 0,Survived,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,...,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,1,0,7.25,False,True,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
1,1,1,0,71.2833,True,False,True,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,1,0,0,7.925,True,False,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
3,1,1,0,53.1,True,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
4,0,0,0,8.05,False,True,False,False,True,False,...,False,True,False,False,False,True,False,False,False,False


Splitting the data into features and labels, and training and validation

In [67]:
features = data.drop(["Survived"], axis=1)
labels = data["Survived"]

In [68]:
from sklearn.model_selection import train_test_split

In [69]:
# remark: we fix random_state the end, to make sure we always get the same split
features_train, features_validation_test, labels_train, labels_validation_test = train_test_split(
    features, labels, test_size=0.4, random_state=100)

In [70]:
features_validation, features_test, labels_validation, labels_test = train_test_split(
    features_validation_test, labels_validation_test, test_size=0.5, random_state=100)

In [71]:
print(len(features_train))
print(len(features_validation))
print(len(features_test))
print(len(labels_train))
print(len(labels_validation))
print(len(labels_test))

534
178
179
534
178
179


# Training several models on our dataset

We'll train four models:

* Logistic regression (perceptron)
* Decision tree
* Naive Bayes
* Support vector machine (SVM)

In [74]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(features_train, labels_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [75]:
# Decision Tree


from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model.fit(features_train, labels_train)

In [76]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(features_train, labels_train)

In [77]:
# SVM

from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(features_train, labels_train)

In [78]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(features_train, labels_train)

In [79]:
# Gradient Boosting


from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier()
gb_model.fit(features_train, labels_train)

In [80]:
# Ada Boost
from sklearn.ensemble import AdaBoostClassifier

ab_model = AdaBoostClassifier()
ab_model.fit(features_train, labels_train)



# Evaluating the Models

## Accuracy