In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
from google.colab import files

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# **Loading the data**

In [3]:
folder_path='/content/gdrive/My Drive/Data Science journey/Kaggle competitions/Titanic - Machine Learning from Disaster/'
file_path_train=folder_path+'train.csv'
file_path_test=folder_path+'test.csv'

print(file_path_train)
print(file_path_test)

/content/gdrive/My Drive/Data Science journey/Kaggle competitions/Titanic - Machine Learning from Disaster/train.csv
/content/gdrive/My Drive/Data Science journey/Kaggle competitions/Titanic - Machine Learning from Disaster/test.csv


In [4]:
# connect to Google Drive
drive.mount('/content/gdrive')
open(file_path_train).read()
open(file_path_test).read()

Mounted at /content/gdrive


'PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked\n892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q\n893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S\n894,2,"Myles, Mr. Thomas Francis",male,62,0,0,240276,9.6875,,Q\n895,3,"Wirz, Mr. Albert",male,27,0,0,315154,8.6625,,S\n896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22,1,1,3101298,12.2875,,S\n897,3,"Svensson, Mr. Johan Cervin",male,14,0,0,7538,9.225,,S\n898,3,"Connolly, Miss. Kate",female,30,0,0,330972,7.6292,,Q\n899,2,"Caldwell, Mr. Albert Francis",male,26,1,1,248738,29,,S\n900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18,0,0,2657,7.2292,,C\n901,3,"Davies, Mr. John Samuel",male,21,2,0,A/4 48871,24.15,,S\n902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S\n903,1,"Jones, Mr. Charles Cresson",male,46,0,0,694,26,,S\n904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23,1,0,21228,82.2667,B45,S\n905,2,"Howard, Mr. Benjamin",male,63,1,0,24065,26,,S\n906,1,"Chaff

In [5]:
df_train=pd.read_csv(file_path_train)
df_test=pd.read_csv(file_path_test)

# **Exploring the data**

In [6]:
df_train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [7]:
df_test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [8]:
df_train.set_index('PassengerId', inplace=True)
df_test.set_index('PassengerId', inplace=True)

In [9]:
df_train.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0
Cabin,687


In [10]:
df_train.shape

(891, 11)

# **Feature engineering**

In [11]:
df_train.drop(columns=['Cabin'], inplace=True)
df_test.drop(columns=['Cabin'], inplace=True)

In [12]:
# extract Title from Name
df_train['Title'] = df_train.Name.str.extract(' ([A-Za-z]+)\.', expand=False) #Creating new column name Title
df_test['Title'] = df_test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

In [13]:
df_train['Title'].value_counts()

Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
Mr,517
Miss,182
Mrs,125
Master,40
Dr,7
Rev,6
Col,2
Mlle,2
Major,2
Ms,1


In [14]:
#classify common titles and group them.
df_train["Title"] = df_train["Title"].replace('Master', 'Master')
df_train["Title"] = df_train["Title"].replace('Mlle', 'Miss')
df_train["Title"] = df_train["Title"].replace(['Mme', 'Dona', 'Ms'], 'Mrs')
df_train["Title"] = df_train["Title"].replace(['Don','Jonkheer'],'Mr')
df_train["Title"] = df_train["Title"].replace(['Capt','Rev','Major', 'Col','Dr'], 'Professional')
df_train["Title"] = df_train["Title"].replace(['Lady', 'Countess','Sir'], 'Noble')

In [15]:
df_test["Title"] = df_test["Title"].replace('Master', 'Master')
df_test["Title"] = df_test["Title"].replace('Mlle', 'Miss')
df_test["Title"] = df_test["Title"].replace(['Mme', 'Dona', 'Ms'], 'Mrs')
df_test["Title"] = df_test["Title"].replace(['Don','Jonkheer'],'Mr')
df_test["Title"] = df_test["Title"].replace(['Capt','Rev','Major', 'Col','Dr'], 'Professional')
df_test["Title"] = df_test["Title"].replace(['Lady', 'Countess','Sir'], 'Noble')

In [16]:
X_train = df_train.drop(columns=['Survived'])
y_train = df_train['Survived']

In [17]:
# determine the type of columns in X_train
numerical_cols=[col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64']]
categorical_cols=[col for col in X_train.columns if X_train[col].dtype=='object']
print(numerical_cols)
print(categorical_cols)

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
['Name', 'Sex', 'Ticket', 'Embarked', 'Title']


# **Preprocessing num/cat columns**

In [18]:
# Preprocessing for numerical data
numerical_transformer = Pipeline (steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [19]:
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder (handle_unknown='ignore'))
])

In [20]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# **Pipeline for Logistic Regression**

In [None]:
# Define model
model_1 = LogisticRegression(max_iter=1000)

In [None]:
# Bundle preprocessing and modeling code in a pipeline
pipe_1= Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model_1)
])

In [None]:
scores =cross_val_score(pipe_1, X_train, y_train, cv=3, scoring='accuracy')
print(np.round(scores, decimals=3))
print("The mean accuracy from 3-fold validation is", np.round(scores.mean(), decimals=3))

[0.815 0.822 0.838]
The mean accuracy from 3-fold validation is 0.825


In [None]:
pipe_1.fit(X_train, y_train)

# **Predictions on test data (Logistic)**

In [None]:
preds_1 = pipe_1.predict(df_test)

In [None]:
preds_1

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [None]:
final_sub = pd.DataFrame()
final_sub['PassengerId'] = df_test.index
final_sub['Survived'] = preds_1

# Write DataFrame to a CSV file without index
file_name='final_submission.csv'
final_sub.to_csv(file_name, index=False)

In [None]:
# download CSV file
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Pipeline for Random Forest Classifier**

In [None]:
model_2 = RandomForestClassifier(n_estimators=100, random_state=0)

In [None]:
# Bundle preprocessing and modeling code in a pipeline
pipe_2= Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model_2)
])

In [None]:
scores =cross_val_score(pipe_2, X_train, y_train, cv=3, scoring='accuracy')
print(np.round(scores, decimals=3))
print("The mean accuracy from 3-fold validation is", np.round(scores.mean(), decimals=3))

[0.822 0.835 0.818]
The mean accuracy from 3-fold validation is 0.825


In [None]:
pipe_2.fit(X_train, y_train)

# **Predictions on test data (Random Forest)**

In [None]:
preds_2 = pipe_2.predict(df_test)

In [None]:
preds_2

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [None]:
final_sub = pd.DataFrame()
final_sub['PassengerId'] = df_test.index
final_sub['Survived'] = preds_2

# Write DataFrame to a CSV file without index
file_name='final_submission.csv'
final_sub.to_csv(file_name, index=False)

In [None]:
# download CSV file
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Grid search (Random Forest Classifier)**

In [21]:
# redefine model and pipeline to avoid conflict with previous fitting
model_rfc= RandomForestClassifier(random_state=42)

pipe_rfc= Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model_rfc)
])

In [22]:
param_grid = {
    # the arguments should follow the prefix of the model name in the pipeline
    'model__n_estimators': [100, 200, 500, 1000],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10]
}

In [23]:
cv = StratifiedKFold(n_splits=3, shuffle=True)

In [24]:
grid_rfc=GridSearchCV(estimator=pipe_rfc, param_grid=param_grid , cv=cv, scoring='accuracy', verbose=2)

In [25]:
grid_rfc.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100; total time=   0.4s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100; total time=   0.9s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100; total time=   0.6s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=200; total time=   0.7s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=200; total time=   1.1s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=200; total time=   1.1s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=500; total time=   2.2s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=500; total time=   2.2s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=500; total time=   6.3s
[C

In [26]:
print("\nBest parameters found: ", grid_rfc.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_rfc.best_score_))


Best parameters found:  {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Best cross-validation score: 0.84


In [27]:
# the grid search finds the model with the best-performing hyperparameters
grid_rfc_preds = grid_rfc.predict(df_test)

In [28]:
grid_rfc_preds

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [29]:
final_sub = pd.DataFrame()
final_sub['PassengerId'] = df_test.index
final_sub['Survived'] = grid_rfc_preds

# Write DataFrame to a CSV file without index
file_name='rfc_grid_submission.csv'
final_sub.to_csv(file_name, index=False)

In [30]:
# download CSV file
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>