# The Titanic - Machine Learning from Disaster

In [40]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [41]:
# Read the training and test CSV files from the local Data/ folder
train = pd.read_csv(filepath_or_buffer="Data/train.csv")
test = pd.read_csv(filepath_or_buffer="Data/test.csv")

----
&nbsp;
### `training` set

In [42]:
# Report the (rows, columns) shape, then preview the first ten rows
print(F"Shape of the DataFrame: {train.shape}")
train.head(n=10)

Shape of the DataFrame: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [43]:
# Capture the column labels as a plain Python list and print them
train_columns = list(train.columns)
print(F"With columns:\n{train_columns}")

With columns:
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [44]:
# Convert the CamelCase column names to snake_case: insert an underscore at each
# lowercase -> uppercase boundary, then lowercase everything
# (e.g. 'PassengerId' -> 'passenger_id', 'SibSp' -> 'sib_sp').
# `regex=True` must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the names without underscores and break
# later lookups of 'passenger_id'. It also avoids the FutureWarning that older
# pandas versions emit when `regex` is left unspecified.
train.columns = train.columns.str.replace(r'([a-z])([A-Z])', r'\1_\2', regex=True).str.lower()

  train.columns = train.columns.str.replace(r'([a-z])([A-Z])', r'\1_\2').str.lower()


In [45]:
# Convert the CamelCase column names of the test set to snake_case: insert an
# underscore at each lowercase -> uppercase boundary, then lowercase everything
# (e.g. 'PassengerId' -> 'passenger_id', 'SibSp' -> 'sib_sp').
# `regex=True` must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the names without underscores and break
# later lookups of 'passenger_id'. It also avoids the FutureWarning that older
# pandas versions emit when `regex` is left unspecified.
test.columns = test.columns.str.replace(r'([a-z])([A-Z])', r'\1_\2', regex=True).str.lower()

  test.columns = test.columns.str.replace(r'([a-z])([A-Z])', r'\1_\2').str.lower()


In [46]:
# Show the training-set column labels to check the result of the renaming
print(train.columns)

Index(['passenger_id', 'survived', 'pclass', 'name', 'sex', 'age', 'sib_sp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')


In [47]:
# Show the test-set column labels to check the result of the renaming
print(test.columns)

Index(['passenger_id', 'pclass', 'name', 'sex', 'age', 'sib_sp', 'parch',
       'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')


The `test` DataFrame has no `survived` column: `survived` is the binary output label we want to predict.

Let's explicitly set the `passenger_id` column as the index column of both DataFrames.

In [48]:
# Use the passenger identifier as the row label of both frames.
# Note: this consumes the 'passenger_id' column, so the cell cannot be re-run
# without reloading the data first.
train = train.set_index(keys="passenger_id")
test = test.set_index(keys="passenger_id")

In [49]:
# Print each column's dtype and non-null count (reveals which columns have missing values)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   name      891 non-null    object 
 3   sex       891 non-null    object 
 4   age       714 non-null    float64
 5   sib_sp    891 non-null    int64  
 6   parch     891 non-null    int64  
 7   ticket    891 non-null    object 
 8   fare      891 non-null    float64
 9   cabin     204 non-null    object 
 10  embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


Note that `age` is given as a fraction when it is less than $1$.

Numerical Attributes:

In [50]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
train.describe()

Unnamed: 0,survived,pclass,age,sib_sp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


- Is our target variable binary?

In [51]:
# Count the rows per distinct value of the target column
train["survived"].value_counts()

0    549
1    342
Name: survived, dtype: int64

Variation of categorical attributes:

In [52]:
# Count the rows per distinct value of `pclass`
train["pclass"].value_counts()

3    491
1    216
2    184
Name: pclass, dtype: int64

In [53]:
# Count the rows per distinct value of `sex`
train["sex"].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [54]:
# Count the rows per port of embarkation (missing values are excluded by default)
train["embarked"].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

Where:
- C = Cherbourg
- Q = Queenstown
- S = Southampton

----
&nbsp;
## Building Preprocessing Pipelines