In this assignment, you are going to measure the performance of the model you created with the Titanic dataset in the previous lesson. To complete this assignment, send a link to a Jupyter notebook containing solutions to the following tasks.

- Evaluate your model's performance with cross validation and using different metrics.
- Determine the model with the most appropriate parameters by hyperparameter tuning.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_validate

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('train.csv')

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
df['Age'] = df['Age'].fillna(int(df['Age'].mean())).astype('int64')
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].value_counts().index[0])
df.drop('PassengerId', axis=1, inplace=True)
df.drop('Cabin', axis=1, inplace=True)
df.drop('Name', axis=1, inplace=True)
df.drop('Ticket', axis=1, inplace=True)
df['Sex'] = pd.get_dummies(df['Sex'], drop_first=True).rename(columns={'male':'Sex'})

df = pd.concat([df.drop('Embarked', axis=1), pd.get_dummies(df['Embarked'], drop_first=True).rename({})], axis=1)

In [5]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Q,S
0,0,3,1,22,1,0,7.25,0,1
1,1,1,0,38,1,0,71.2833,0,0
2,1,3,0,26,0,0,7.925,0,1
3,1,1,0,35,1,0,53.1,0,1
4,0,3,1,35,0,0,8.05,0,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    uint8  
 3   Age       891 non-null    int64  
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Q         891 non-null    uint8  
 8   S         891 non-null    uint8  
dtypes: float64(1), int64(5), uint8(3)
memory usage: 44.5 KB


In [7]:
X = df.drop('Survived', axis=1)
y = df['Survived']

# Cross Validation

In [8]:
lr = LogisticRegression(max_iter=1000)

cv = cross_validate(estimator=lr,
                    X=X,
                    y=y,
                    cv=10,
                    return_train_score=True,
                    scoring = ['accuracy', 'precision', 'r2']
                   )

print('Train Set Mean Accuracy  : {:.2f}  '.format(cv['train_accuracy'].mean()))
print('Train Set Mean R-square  : {:.2f}  '.format(cv['train_r2'].mean()))
print('Train Set Mean Precision : {:.2f}\n'.format(cv['train_precision'].mean()))

print('Test Set Mean Accuracy   : {:.2f}  '.format(cv['test_accuracy'].mean()))
print('Test Set Mean R-square   : {:.2f}  '.format(cv['test_r2'].mean()))
print('Test Set Mean Precision  : {:.2f}  '.format(cv['test_precision'].mean()))

Train Set Mean Accuracy  : 0.80  
Train Set Mean R-square  : 0.17  
Train Set Mean Precision : 0.76

Test Set Mean Accuracy   : 0.79  
Test Set Mean R-square   : 0.13  
Test Set Mean Precision  : 0.75  


# Hyperparameter Tuning

## Grid Search

In [9]:
parameters = {"C": [10 ** x for x in range (-5, 5, 1)],
              "penalty": ['l1', 'l2']} 

In [10]:
grid_cv = GridSearchCV(estimator=lr,
                       param_grid = parameters,
                       cv = 10,
                       verbose = 2)

grid_cv.fit(X, y)

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] C=1e-05, penalty=l1 .............................................
[CV] .............................. C=1e-05, penalty=l1, total=   0.0s
[CV] C=1e-05, penalty=l1 .............................................
[CV] .............................. C=1e-05, penalty=l1, total=   0.0s
[CV] C=1e-05, penalty=l1 .............................................
[CV] .............................. C=1e-05, penalty=l1, total=   0.0s
[CV] C=1e-05, penalty=l1 .............................................
[CV] .............................. C=1e-05, penalty=l1, total=   0.0s
[CV] C=1e-05, penalty=l1 .............................................
[CV] .............................. C=1e-05, penalty=l1, total=   0.0s
[CV] C=1e-05, penalty=l1 .............................................
[CV] .............................. C=1e-05, penalty=l1, total=   0.0s
[CV] C=1e-05, penalty=l1 .............................................
[CV] .........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s



[CV] C=1e-05, penalty=l2 .............................................
[CV] .............................. C=1e-05, penalty=l2, total=   0.0s
[CV] C=1e-05, penalty=l2 .............................................
[CV] .............................. C=1e-05, penalty=l2, total=   0.0s
[CV] C=1e-05, penalty=l2 .............................................
[CV] .............................. C=1e-05, penalty=l2, total=   0.0s
[CV] C=1e-05, penalty=l2 .............................................
[CV] .............................. C=1e-05, penalty=l2, total=   0.0s
[CV] C=0.0001, penalty=l1 ............................................
[CV] ............................. C=0.0001, penalty=l1, total=   0.0s
[CV] C=0.0001, penalty=l1 ............................................
[CV] ............................. C=0.0001, penalty=l1, total=   0.0s
[CV] C=0.0001, penalty=l1 ............................................
[CV] ............................. C=0.0001, penalty=l1, total=   0.0s
[CV] 

[CV] ............................... C=0.01, penalty=l2, total=   0.1s
[CV] C=0.01, penalty=l2 ..............................................
[CV] ............................... C=0.01, penalty=l2, total=   0.0s
[CV] C=0.01, penalty=l2 ..............................................
[CV] ............................... C=0.01, penalty=l2, total=   0.1s
[CV] C=0.01, penalty=l2 ..............................................
[CV] ............................... C=0.01, penalty=l2, total=   0.1s
[CV] C=0.01, penalty=l2 ..............................................
[CV] ............................... C=0.01, penalty=l2, total=   0.1s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ................................ C=0.1, penalty=l1, total=   0.0s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ................................ C=0.1, penalty=l1, total=   0.0s
[CV] C=0.1, penalty=l1 ...............................................
[CV] .

[CV] ................................. C=10, penalty=l2, total=   0.1s
[CV] C=10, penalty=l2 ................................................
[CV] ................................. C=10, penalty=l2, total=   0.1s
[CV] C=10, penalty=l2 ................................................
[CV] ................................. C=10, penalty=l2, total=   0.1s
[CV] C=10, penalty=l2 ................................................
[CV] ................................. C=10, penalty=l2, total=   0.1s
[CV] C=10, penalty=l2 ................................................
[CV] ................................. C=10, penalty=l2, total=   0.1s
[CV] C=10, penalty=l2 ................................................
[CV] ................................. C=10, penalty=l2, total=   0.1s
[CV] C=100, penalty=l1 ...............................................
[CV] ................................ C=100, penalty=l1, total=   0.0s
[CV] C=100, penalty=l1 ...............................................
[CV] .

[CV] .............................. C=10000, penalty=l2, total=   0.2s
[CV] C=10000, penalty=l2 .............................................
[CV] .............................. C=10000, penalty=l2, total=   0.1s
[CV] C=10000, penalty=l2 .............................................
[CV] .............................. C=10000, penalty=l2, total=   0.1s
[CV] C=10000, penalty=l2 .............................................
[CV] .............................. C=10000, penalty=l2, total=   0.1s
[CV] C=10000, penalty=l2 .............................................
[CV] .............................. C=10000, penalty=l2, total=   0.1s
[CV] C=10000, penalty=l2 .............................................
[CV] .............................. C=10000, penalty=l2, total=   0.2s
[CV] C=10000, penalty=l2 .............................................
[CV] .............................. C=10000, penalty=l2, total=   0.1s
[CV] C=10000, penalty=l2 .............................................
[CV] .

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   10.6s finished


GridSearchCV(cv=10, estimator=LogisticRegression(max_iter=1000),
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                               1000, 10000],
                         'penalty': ['l1', 'l2']},
             verbose=2)

In [11]:
print("Best Parameters : ", grid_cv.best_params_)
print("Best Score      : ", grid_cv.best_score_)

Best Parameters :  {'C': 0.1, 'penalty': 'l2'}
Best Score      :  0.7968913857677904


In [12]:
display(pd.DataFrame(grid_cv.cv_results_)[['param_penalty','param_C', 'mean_test_score']].sort_values(by='mean_test_score', ascending = False).head())

Unnamed: 0,param_penalty,param_C,mean_test_score
9,l2,0.1,0.796891
15,l2,100.0,0.79573
17,l2,1000.0,0.79573
11,l2,1.0,0.794632
13,l2,10.0,0.794619


## Random Search

In [13]:
parameters = {"C": [10 ** x for x in range (-5, 5, 1)],
              "penalty": ['l1', 'l2']
             }

In [14]:
rs_cv = RandomizedSearchCV(estimator=lr,
                           param_distributions = parameters,
                           cv = 10,
                           n_iter = 10,
                           random_state = 42,
                           scoring = 'precision',
                           verbose = 2
                      )

rs_cv.fit(X, y)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] penalty=l1, C=1e-05 .............................................
[CV] .............................. penalty=l1, C=1e-05, total=   0.0s
[CV] penalty=l1, C=1e-05 .............................................
[CV] .............................. penalty=l1, C=1e-05, total=   0.0s
[CV] penalty=l1, C=1e-05 .............................................
[CV] .............................. penalty=l1, C=1e-05, total=   0.0s
[CV] penalty=l1, C=1e-05 .............................................
[CV] .............................. penalty=l1, C=1e-05, total=   0.0s
[CV] penalty=l1, C=1e-05 .............................................
[CV] .............................. penalty=l1, C=1e-05, total=   0.0s
[CV] penalty=l1, C=1e-05 .............................................
[CV] .............................. penalty=l1, C=1e-05, total=   0.0s
[CV] penalty=l1, C=1e-05 .............................................
[CV] .........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ............................... penalty=l2, C=1000, total=   0.1s
[CV] penalty=l2, C=1000 ..............................................
[CV] ............................... penalty=l2, C=1000, total=   0.1s
[CV] penalty=l2, C=1000 ..............................................
[CV] ............................... penalty=l2, C=1000, total=   0.1s
[CV] penalty=l2, C=1000 ..............................................
[CV] ............................... penalty=l2, C=1000, total=   0.1s
[CV] penalty=l2, C=1000 ..............................................
[CV] ............................... penalty=l2, C=1000, total=   0.1s
[CV] penalty=l2, C=1000 ..............................................
[CV] ............................... penalty=l2, C=1000, total=   0.2s
[CV] penalty=l2, C=1000 ..............................................
[CV] ............................... penalty=l2, C=1000, total=   0.1s
[CV] penalty=l2, C=1000 ..............................................
[CV] .

[CV] ............................. penalty=l2, C=0.0001, total=   0.0s
[CV] penalty=l2, C=0.0001 ............................................
[CV] ............................. penalty=l2, C=0.0001, total=   0.0s
[CV] penalty=l2, C=0.0001 ............................................
[CV] ............................. penalty=l2, C=0.0001, total=   0.1s
[CV] penalty=l2, C=0.0001 ............................................
[CV] ............................. penalty=l2, C=0.0001, total=   0.1s
[CV] penalty=l2, C=0.0001 ............................................
[CV] ............................. penalty=l2, C=0.0001, total=   0.1s
[CV] penalty=l2, C=0.0001 ............................................
[CV] ............................. penalty=l2, C=0.0001, total=   0.0s
[CV] penalty=l2, C=0.0001 ............................................
[CV] ............................. penalty=l2, C=0.0001, total=   0.0s
[CV] penalty=l2, C=0.0001 ............................................
[CV] .

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    6.4s finished


RandomizedSearchCV(cv=10, estimator=LogisticRegression(max_iter=1000),
                   param_distributions={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1,
                                              1, 10, 100, 1000, 10000],
                                        'penalty': ['l1', 'l2']},
                   random_state=42, scoring='precision', verbose=2)

In [15]:
print("Best parameters      : ", rs_cv.best_params_)
print("Best precision value : ", rs_cv.best_score_)

Best parameters      :  {'penalty': 'l2', 'C': 0.001}
Best precision value :  0.7565770486823118


In [16]:
display(pd.DataFrame(rs_cv.cv_results_)[['param_penalty','param_C', 'mean_test_score']].sort_values(by='mean_test_score', ascending = False).head())

Unnamed: 0,param_penalty,param_C,mean_test_score
5,l2,0.001,0.756577
1,l2,1000.0,0.753069
2,l2,100.0,0.753069
6,l2,1.0,0.751608
3,l2,1e-05,0.711389
