# Titanic Dataset - Machine Learning

In [222]:
import numpy as np 

import pandas as pd 

import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

# ignore  the warning
import warnings  
warnings.filterwarnings('ignore') 

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


## 1. Exploring data.

Dataset information:

* pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
* name 
* sex 
* age 
* sibsp - Number of Siblings/Spouses Aboard
* parch - Number of Parents/Children Aboard
* ticket - Ticket Number
* fare - Passenger Fare
* cabin - Cabin
* embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

In [223]:
# Load the train and test CSV files, then report the (rows, columns) of each.
train_data, test_data = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

for frame in (train_data, test_data):
    print(frame.shape)

(891, 12)
(418, 11)


In [224]:
# Preview the first rows of the training data.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [225]:
# Show the information of both df.

def show_info(train, test):
    """Print the ``DataFrame.info()`` summary of the train and test frames.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data.
    test : pandas.DataFrame
        Test data.

    Returns
    -------
    None
        The summaries are written to standard output.
    """
    print("Information of Train DataFrame")
    # info() prints its own report and returns None, so it must not be wrapped
    # in print(); doing so appends a stray "None" line to the output.
    train.info()
    print("")
    print("")
    print("")
    print("Information of Test DataFrame")
    test.info()
    
show_info(train_data, test_data)

Information of Train DataFrame
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None



Information of Test DataFrame
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       ----------

## 2. Feature Engineering.

In [226]:
# # Removing features.

# train_data = train_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)
# print("Shape of train_data:", train_data.shape)

# test_data = test_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)
# print("Shape of test_data:",test_data.shape)

In [227]:
# Show the first rows again before any feature is modified.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [228]:
# Age: measure how much of the column is missing.

# len() gives the row count as a plain integer. Series.shape is a 1-tuple such
# as (891,), which prints as a tuple and only works in the division below
# through implicit array coercion.
age_total = len(train_data.Age)
print("Total of age rows: ", age_total)
age_null = train_data.Age.isnull().sum()
print("Total of NULL age rows: ",age_null)

# Integer percentage of missing ages (truncated, not rounded).
perc_age = int(age_null*100/age_total)
print("Aproximated percentage of null age values:", perc_age, "%")

Total of age rows:  (891,)
Total of NULL age rows:  177
Aproximated percentage of null age values: 19 %


So almost **1 in 5 instances are missing**. That is a very significant number. Instead of deleting the rows, we'll replace the missing values with the mean age.

In [229]:
# Impute missing ages with the mean age of the TRAINING set. The same training
# statistic is applied to the test set, so preprocessing never depends on
# values computed from the test data and both frames are filled consistently.
train_age_mean = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(train_age_mean)
test_data['Age'] = test_data['Age'].fillna(train_age_mean)

# Remaining missing values per column in the training data.
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In the process we'll also impute the most frequent value **(S)** in the embarked column.

In [230]:
# Count how often each port of embarkation appears in the training data.
train_data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [231]:
# Fill missing ports of embarkation with 'S' in both frames.
for frame in (train_data, test_data):
    frame['Embarked'] = frame['Embarked'].fillna('S')

After that, we want to drop the columns we think won't affect our results.

In [232]:
# Remove the columns that will not be used as features, from both frames.
unused_columns = ['Name', 'Ticket', 'Cabin', 'PassengerId']
train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


So now we have a cleaner dataset. But we need to make more modifications before starting modeling.

In [233]:
# Encode the Sex column as integers: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}

for frame in (train_data, test_data):
    frame['Sex'] = frame['Sex'].replace(sex_codes)

train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S


In [234]:
# Encode the Embarked column as integers: C -> 1, S -> 2, Q -> 3.
embarked_codes = {'C': 1, 'S': 2, 'Q': 3}

for frame in (train_data, test_data):
    frame['Embarked'] = frame['Embarked'].replace(embarked_codes)

train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,2
1,1,1,1,38.0,1,0,71.2833,1
2,1,3,1,26.0,0,0,7.925,2
3,1,1,1,35.0,1,0,53.1,2
4,0,3,0,35.0,0,0,8.05,2


Since we are converting values to integers, we continue **by dividing the age column into ranges and working with the family members**.

In [235]:
# Cut Age into 5 equal-width intervals and compare the mean survival of each.
train_data['AgeBand'] = pd.cut(train_data['Age'], bins=5)

age_band_survival = (
    train_data[['AgeBand', 'Survived']]
    .groupby(['AgeBand'], as_index=False)
    .mean()
)
age_band_survival.sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(0.34, 16.336]",0.55
1,"(16.336, 32.252]",0.344168
2,"(32.252, 48.168]",0.404255
3,"(48.168, 64.084]",0.434783
4,"(64.084, 80.0]",0.090909


In [236]:
# And convert it into values depending on the range.

combine = [train_data, test_data]

# Right-closed bins reproducing the hand-written ranges:
# (-inf, 16] -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, (64, inf) -> 4.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]

for dataset in combine:
    # A single pd.cut call bins every raw age at once. The previous chain of
    # masked assignments wrote the codes back into the column it was still
    # testing, so its result depended on the order of the statements.
    # astype(float) keeps the column's float dtype (codes 0.0 .. 4.0).
    # NOTE: this cell overwrites Age in place, so running it a second time
    # would re-bin the codes themselves (all of them fall in the first bin).
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_edges, labels=False).astype(float)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeBand
0,0,3,0,1.0,1,0,7.25,2,"(16.336, 32.252]"
1,1,1,1,2.0,1,0,71.2833,1,"(32.252, 48.168]"
2,1,3,1,1.0,0,0,7.925,2,"(16.336, 32.252]"
3,1,1,1,2.0,1,0,53.1,2,"(32.252, 48.168]"
4,0,3,0,2.0,0,0,8.05,2,"(32.252, 48.168]"


In [237]:
# Family size: siblings/spouses plus parents/children plus one.
for frame in combine:
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch']).add(1)

# Mean survival per family size, highest first.
family_survival = (
    train_data[['FamilySize', 'Survived']]
    .groupby(['FamilySize'], as_index=False)
    .mean()
)
family_survival.sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [238]:
# Flag passengers with a family size of 1 (IsAlone = 1, otherwise 0) and check
# whether the flag separates survival rates.
for frame in combine:
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype('int64')

train_data[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


In [239]:
# Drop the 'Parch', 'SibSp' and 'FamilySize' columns from both frames.
family_columns = ['Parch', 'SibSp', 'FamilySize']

train_data = train_data.drop(columns=family_columns)
test_data = test_data.drop(columns=family_columns)

# Rebuild the list so it refers to the new frames returned by drop().
combine = [train_data, test_data]

train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,AgeBand,IsAlone
0,0,3,0,1.0,7.25,2,"(16.336, 32.252]",0
1,1,1,1,2.0,71.2833,1,"(32.252, 48.168]",0
2,1,3,1,1.0,7.925,2,"(16.336, 32.252]",1
3,1,1,1,2.0,53.1,2,"(32.252, 48.168]",0
4,0,3,0,2.0,8.05,2,"(32.252, 48.168]",1


In [240]:
# Split Fare into 4 quantile-based intervals and compare the mean survival of each.
train_data['FareBand'] = pd.qcut(train_data['Fare'], q=4)

fare_band_survival = (
    train_data[['FareBand', 'Survived']]
    .groupby(['FareBand'], as_index=False)
    .mean()
)
fare_band_survival.sort_values(by='FareBand', ascending=True)

Unnamed: 0,FareBand,Survived
0,"(-0.001, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.581081


In [241]:
# Inspect the last rows of the processed test data.
test_data.tail()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,IsAlone
413,3,0,1.0,8.05,2,1
414,1,1,2.0,108.9,1,1
415,3,0,2.0,7.25,2,1
416,3,0,1.0,8.05,2,1
417,3,0,1.0,22.3583,1,0


## 3. Modeling.

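After selecting the features we will be working with and creating a train/test split, we train the models and select the one with the best performance. Survival is a binary outcome, so strictly speaking this is a classification problem; here we nevertheless fit **regressors** to the 0/1 target, so their predictions are continuous survival scores that we compare using RMSE. The steps are: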

1. Preparing data.
2. Training models.
3. Printing the RMSE (root mean squared error) on the train and test splits.

In [242]:
# Feature matrix X and target vector y used by every model below.
feature_columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'IsAlone']
X = train_data[feature_columns]
y = train_data['Survived']

In [243]:
# Hold out part of the training data for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# The tree and the forest are randomized estimators; seeding them makes the
# reported scores reproducible across kernel restarts. KNN has no randomness.
tree_regressor = DecisionTreeRegressor(random_state=1)
knn_regressor = KNeighborsRegressor()
random_tree = RandomForestRegressor(random_state=1)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(668, 6) (223, 6) (668,) (223,)


In [244]:
# Fit the three models on the training split, in the same order as before.
for estimator in (tree_regressor, knn_regressor, random_tree):
    estimator.fit(X_train, y_train)

# fit() returns the estimator itself, so showing it gives the same cell output.
random_tree

RandomForestRegressor()

In [245]:
# Report train/test RMSE for each fitted model.
models = ['Decision Tree Regressor', 'KNeighbors Regressor', 'Random Forest Regressor']
fitted_models = [tree_regressor, knn_regressor, random_tree]

for model_name, model in zip(models, fitted_models):
    print(f'Model: {model_name}')
    print("-"*30)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    print(f'RMSE Train: {rmse_train}')
    print(f'RMSE Test: {rmse_test}')

Model: Decision Tree Regressor
------------------------------
RMSE Train: 0.197442135428784
RMSE Test: 0.47310418796769593
Model: KNeighbors Regressor
------------------------------
RMSE Train: 0.3514725225842506
RMSE Test: 0.41819599756176723
Model: Random Forest Regressor
------------------------------
RMSE Train: 0.22559498157938393
RMSE Test: 0.3952713829832438


<hr>

### 3.1. Evaluating models.

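All these models work reasonably well with their default parameters, but we may want to try to improve them by tuning one hyperparameter each (`n_neighbors` for KNN, `max_depth` for the tree and the forest).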

In [246]:
# Sweep n_neighbors for KNN and record train/test RMSE for every value.
k_neigh = [1,2,3,4,5,6,7,8,9,10,15,20,25,30,35,40,50]

rmse_train_list = []
rmse_test_list = []

for k in k_neigh:
    clf = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)

    test_rmse = np.sqrt(mean_squared_error(y_test, clf.predict(X_test)))
    train_rmse = np.sqrt(mean_squared_error(y_train, clf.predict(X_train)))

    rmse_train_list.append(train_rmse)
    rmse_test_list.append(test_rmse)


In [247]:
# Sweep max_depth for the decision tree and record train/test RMSE per depth.
rmse_train_tree_list = []
rmse_test_tree_list = []

t_depth = [1,2,3,4,5,6,7,8,9,10,15,20,25,30,35,40,50]

for t in t_depth:

    # Seed the tree so the sweep gives the same numbers on every run; without
    # it, ties between equally good splits are broken at random.
    tree = DecisionTreeRegressor(max_depth = t, random_state = 1)

    tree.fit(X_train, y_train)

    y_test_pred_tree = tree.predict(X_test)
    test_rmse_tree = np.sqrt(mean_squared_error(y_test, y_test_pred_tree))

    y_train_pred_tree = tree.predict(X_train)
    train_rmse_tree = np.sqrt(mean_squared_error(y_train, y_train_pred_tree))

    rmse_train_tree_list.append(train_rmse_tree)
    rmse_test_tree_list.append(test_rmse_tree)

In [248]:
# Sweep max_depth for the random forest and record train/test RMSE per depth.
rmse_train_rndm_list = []
rmse_test_rndm_list = []

r_depth = [1,2,3,4,5,6,7,8,9,10,15,20,25,30,35,40,50]

for r in r_depth:

    # A fixed seed means differences between depths reflect max_depth only,
    # not a different random bootstrap/feature draw on each fit.
    rndm_forest = RandomForestRegressor(max_depth = r, random_state = 1)

    rndm_forest.fit(X_train, y_train)

    y_test_pred_forest = rndm_forest.predict(X_test)
    test_rmse_forest = np.sqrt(mean_squared_error(y_test, y_test_pred_forest))

    y_train_pred_forest = rndm_forest.predict(X_train)
    train_rmse_forest = np.sqrt(mean_squared_error(y_train, y_train_pred_forest))

    rmse_train_rndm_list.append(train_rmse_forest)
    rmse_test_rndm_list.append(test_rmse_forest)


In [249]:
# Collect the RMSE curves of the three sweeps into one table. 'Value' is the
# swept hyperparameter value (n_neighbors for KNN, max_depth for the trees).
index_dfscore = [1,2,3,4,5,6,7,8,9,10,15,20,25,30,35,40,50]

df_score = pd.DataFrame({
    'Value': index_dfscore,
    'KNN TRAIN': rmse_train_list,
    'KNN TEST': rmse_test_list,
    'Decission Tree TRAIN': rmse_train_tree_list,
    'Decission Tree TEST': rmse_test_tree_list,
    'Random Forest TRAIN': rmse_train_rndm_list,
    'Random Forest TEST': rmse_test_rndm_list,
})

# Displayed with 'Value' as the index; df_score itself keeps the default index.
df_score.set_index('Value')

Unnamed: 0_level_0,KNN TRAIN,KNN TEST,Decission Tree TRAIN,Decission Tree TEST,Random Forest TRAIN,Random Forest TEST
Value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.284321,0.53572,0.406597,0.413996,0.406609,0.41393
2,0.297821,0.461524,0.371105,0.415419,0.368138,0.406
3,0.319055,0.424698,0.350126,0.392005,0.34721,0.390266
4,0.33619,0.413478,0.33547,0.391004,0.329187,0.385067
5,0.351473,0.418196,0.323713,0.397353,0.312834,0.379763
6,0.363985,0.414756,0.309593,0.400155,0.296007,0.37728
7,0.371113,0.420379,0.293704,0.424736,0.278099,0.383963
8,0.377394,0.420118,0.277263,0.419728,0.264053,0.377835
9,0.380093,0.41534,0.265996,0.4279,0.252048,0.38017
10,0.386525,0.417015,0.256002,0.424886,0.239797,0.38495


### 3.2. Updating models.

In [250]:
# Refit the three models with the hyperparameters chosen from the sweep table
# and report their train/test RMSE.
y = train_data.Survived
feature_columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'IsAlone']
X = train_data[feature_columns]

# Same seed as the earlier split, so the train/test partition is identical.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# Seed the randomized estimators so the reported RMSE values are reproducible.
tree_regressor2 = DecisionTreeRegressor(max_depth=4, random_state=1)
knn_regressor2 = KNeighborsRegressor(n_neighbors=4)
random_tree2 = RandomForestRegressor(max_depth=8, random_state=1)

tree_regressor2.fit(X_train, y_train)
knn_regressor2.fit(X_train, y_train)
random_tree2.fit(X_train, y_train)

models = ['Decision Tree Regressor', 'KNeighbors Regressor', 'Random Forest Regressor']

for i, model in enumerate([tree_regressor2, knn_regressor2, random_tree2]):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f'Model: {models[i]}')
    print("-"*30)

    rmse_train2 = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_test2 = np.sqrt(mean_squared_error(y_test, y_test_pred))
    print(f'RMSE Train: {rmse_train2}')
    print(f'RMSE Test: {rmse_test2}')

Model: Decision Tree Regressor
------------------------------
RMSE Train: 0.33547047602076574
RMSE Test: 0.39100409551135823
Model: KNeighbors Regressor
------------------------------
RMSE Train: 0.33619035098558137
RMSE Test: 0.41347808353108406
Model: Random Forest Regressor
------------------------------
RMSE Train: 0.26369129826840654
RMSE Test: 0.3797111203215149
