In this assignment, you are going to measure the performance of the model you created with the Titanic dataset in the previous lesson. To complete this assignment, send a link to a Jupyter notebook containing solutions to the following tasks.

   -  Evaluate your model's performance with cross validation and using different metrics.
   -  Determine the model with the most appropriate parameters by hyperparameter tuning.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_validate, cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

import re

In [2]:
df = pd.read_csv('titanic_final.csv', index_col=0)

df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Sex_male,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royal
0,0,3,22.0,1,0,7.25,0,1,1,0,1,0,0,0
1,1,1,38.0,1,0,71.2833,0,0,0,0,0,1,0,0
2,1,3,26.0,0,0,7.925,0,1,0,1,0,0,0,0
3,1,1,35.0,1,0,53.1,0,1,0,0,0,1,0,0
4,0,3,35.0,0,0,8.05,0,1,1,0,1,0,0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Survived       891 non-null    int64  
 1   Pclass         891 non-null    int64  
 2   Age            891 non-null    float64
 3   SibSp          891 non-null    int64  
 4   Parch          891 non-null    int64  
 5   Fare           891 non-null    float64
 6   Embarked_Q     891 non-null    int64  
 7   Embarked_S     891 non-null    int64  
 8   Sex_male       891 non-null    int64  
 9   Title_Miss     891 non-null    int64  
 10  Title_Mr       891 non-null    int64  
 11  Title_Mrs      891 non-null    int64  
 12  Title_Officer  891 non-null    int64  
 13  Title_Royal    891 non-null    int64  
dtypes: float64(2), int64(12)
memory usage: 104.4 KB


In [4]:
X = df.drop('Survived', axis=1)
y = df['Survived']

In [5]:
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
log = LogisticRegression(max_iter=1000, C=10000, solver = 'newton-cg')
log_model = log.fit(X_train, y_train)

In [7]:
y_pred_test = log_model.predict(X_test)
y_pred_train = log_model.predict(X_train)

In [8]:
print('Accuracy score :',log_model.score(X_test, y_test))

Accuracy score : 0.8283582089552238


### Cross Validation

##### We have 10 different training and test sets. 

In [9]:
log = LogisticRegression(max_iter=1000)

cv = cross_validate(estimator=log,
                    X=X,
                    y=y,
                    cv=10,
                    return_train_score=True,
                    scoring = ['accuracy', 'precision', 'recall']
                   )

print('Train Set Mean Accuracy  : {:.2f}  '.format(cv['train_accuracy'].mean()))
print('Train Set Mean Recall    : {:.2f}  '.format(cv['train_recall'].mean()))
print('Train Set Mean Precision : {:.2f}\n'.format(cv['train_precision'].mean()))

print('Test Set Mean Accuracy   : {:.2f}  '.format(cv['test_accuracy'].mean()))
print('Test Set Mean Precision  : {:.2f}  '.format(cv['test_precision'].mean()))
print('Test Set Mean Recall     : {:.2f}  '.format(cv['test_recall'].mean()))

Train Set Mean Accuracy  : 0.83  
Train Set Mean Recall    : 0.75  
Train Set Mean Precision : 0.80

Test Set Mean Accuracy   : 0.83  
Test Set Mean Precision  : 0.80  
Test Set Mean Recall     : 0.75  


In [10]:
cv = cross_val_score(estimator=log,
                     X=X,
                     y=y,
                     cv=10,
                    )

print('Model Scores : \n', cv)

Model Scores : 
 [0.82222222 0.84269663 0.7752809  0.87640449 0.82022472 0.78651685
 0.80898876 0.82022472 0.86516854 0.86516854]


## Hyperparameter Tuning

### GridSearch

In [11]:
params = {"C": [10 ** x for x in range (-5, 5, 1)],
              "penalty": ['l1', 'l2']
             }

In [12]:
grid_cv = GridSearchCV(estimator=log,
                       param_grid=params,
                       cv=10
                      ).fit(X,y)

In [13]:
print("Best Parameters : ", grid_cv.best_params_)
print("Best Score      : ", grid_cv.best_score_)

Best Parameters :  {'C': 10, 'penalty': 'l2'}
Best Score      :  0.8305493133583022


In [14]:
results = grid_cv.cv_results_

df = pd.DataFrame(results)
display(df.head().iloc[:,:6])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty
0,0.000645,0.000199,0.0,0.0,1e-05,l1
1,0.007551,0.001094,0.001521,0.000221,1e-05,l2
2,0.000419,5.6e-05,0.0,0.0,0.0001,l1
3,0.008093,0.000755,0.001434,0.000107,0.0001,l2
4,0.000396,3.5e-05,0.0,0.0,0.001,l1


In [15]:
df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_C', 'param_penalty', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'split5_test_score', 'split6_test_score',
       'split7_test_score', 'split8_test_score', 'split9_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score'],
      dtype='object')

In [16]:
df = df[['param_penalty','param_C', 'mean_test_score']]
df = df.sort_values(by='mean_test_score', ascending = False)
df[:10]

Unnamed: 0,param_penalty,param_C,mean_test_score
13,l2,10.0,0.830549
15,l2,100.0,0.828315
11,l2,1.0,0.82829
17,l2,1000.0,0.827191
19,l2,10000.0,0.827191
9,l2,0.1,0.817079
7,l2,0.01,0.800262
5,l2,0.001,0.682422
3,l2,0.0001,0.662247
1,l2,1e-05,0.656642


### Random Search 

In [17]:
params = {"C": [10 ** x for x in range (-5, 5, 1)],
              "penalty": ['l1', 'l2']
             }

In [18]:
rs_cv = RandomizedSearchCV(estimator=log,
                           param_distributions=params,
                           cv=10,
                           n_iter=10,
                           random_state=42,
                           scoring='precision'
                      )

rs_cv.fit(X, y)

RandomizedSearchCV(cv=10, estimator=LogisticRegression(max_iter=1000),
                   param_distributions={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1,
                                              1, 10, 100, 1000, 10000],
                                        'penalty': ['l1', 'l2']},
                   random_state=42, scoring='precision')

In [19]:
print("Best parameters      : ", rs_cv.best_params_, "\n")
print("All precision values : ", rs_cv.cv_results_['mean_test_score'], "\n")
print("Best precision value : ", rs_cv.best_score_)

Best parameters      :  {'penalty': 'l2', 'C': 1} 

All precision values :  [       nan 0.79218561 0.79304583 0.7052381         nan 0.72873822
 0.79592061 0.66405559        nan        nan] 

Best precision value :  0.7959206113834063


In [20]:
results_rs = rs_cv.cv_results_

df_rs = pd.DataFrame(results_rs)
df_rs = df_rs[['param_penalty','param_C', 'mean_test_score']]
df_rs = df_rs.sort_values(by='mean_test_score', ascending = False)

df_rs[:5]

Unnamed: 0,param_penalty,param_C,mean_test_score
6,l2,1.0,0.795921
2,l2,100.0,0.793046
1,l2,1000.0,0.792186
5,l2,0.001,0.728738
3,l2,1e-05,0.705238
