## Looking at Model Building Metrics

This notebook works through a basic logistic regression problem, using sklearn to calculate evaluation metrics and then re-deriving the same metrics manually from the confusion matrix.

In [2]:
import pandas as pd

In [3]:
# Load the two halves of the training data; they share the passenger_id key.
df1 = pd.read_csv('./Data/Train1.csv')
df2 = pd.read_csv('./Data/Train2.csv')
print(df1.shape)
print(df2.shape)

# Inner join keeps only passengers that appear in both files.
titanic_df = df1.merge(df2, on='passenger_id', how='inner')

# NOTE(review): missing labels are imputed as 0 here; confirm this is intended
# rather than dropping unlabeled rows.
titanic_df['survived'] = titanic_df['survived'].fillna(0)

# Fill missing embarkation values with 'S'. This must operate on titanic_df:
# no frame named `df` is defined anywhere in this notebook.
titanic_df['embarked'] = titanic_df['embarked'].fillna('S')

# 'loc' is the first character of the cabin value, or 'X' when cabin is missing.
titanic_df['loc'] = titanic_df['cabin'].apply(lambda x: x[0] if pd.notnull(x) else 'X')
titanic_df.head()

(917, 6)
(917, 8)


Unnamed: 0,passenger_id,fare,cabin,embarked,home.dest,survived,pclass,name,sex,age,sibsp,parch,ticket,loc
0,501,8.05,,S,,0.0,3.0,"Webber, Mr. James",male,,0.0,0.0,SOTON/OQ 3101316,X
1,588,21.0,,S,"Ilfracombe, Devon",0.0,2.0,"Phillips, Mr. Escott Robert",male,43.0,0.0,1.0,S.O./P.P. 2,X
2,402,24.15,,S,,0.0,3.0,"Van Impe, Miss. Catharina",female,10.0,0.0,2.0,345773,X
3,1193,15.5,,Q,,0.0,3.0,"McEvoy, Mr. Michael",male,,0.0,0.0,36568,X
4,686,211.3375,B3,S,"St Louis, MO",1.0,1.0,"Robert, Mrs. Edward Scott (Elisabeth Walton Mc...",female,43.0,0.0,1.0,24160,B


In [4]:
# One-hot encode the categorical columns, then discard the free-text columns
# that are not used as model features.
titanic_df = (
    pd.get_dummies(titanic_df, columns=['embarked', 'sex', 'loc'])
    .drop(columns=['name', 'ticket', 'cabin', 'home.dest'])
)
titanic_df.head()

Unnamed: 0,passenger_id,fare,survived,pclass,age,sibsp,parch,embarked_C,embarked_Q,embarked_S,...,sex_male,loc_A,loc_B,loc_C,loc_D,loc_E,loc_F,loc_G,loc_T,loc_X
0,501,8.05,0.0,3.0,,0.0,0.0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
1,588,21.0,0.0,2.0,43.0,0.0,1.0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
2,402,24.15,0.0,3.0,10.0,0.0,2.0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,1193,15.5,0.0,3.0,,0.0,0.0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
4,686,211.3375,1.0,1.0,43.0,0.0,1.0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [5]:
# Impute missing ages with the median age of the passenger's class.
# transform('median') returns one value per row, aligned to titanic_df's own
# index, so it can be passed straight to fillna. groupby(...).apply(...) can
# return a group-keyed MultiIndex on recent pandas, which does not align.
titanic_df['age'] = titanic_df['age'].fillna(
    titanic_df.groupby('pclass')['age'].transform('median')
)

In [6]:
# Count the remaining missing values per column.
titanic_df.isna().sum()

passenger_id    0
fare            0
survived        0
pclass          0
age             0
sibsp           0
parch           0
embarked_C      0
embarked_Q      0
embarked_S      0
sex_female      0
sex_male        0
loc_A           0
loc_B           0
loc_C           0
loc_D           0
loc_E           0
loc_F           0
loc_G           0
loc_T           0
loc_X           0
dtype: int64

In [7]:
# Dimensions (rows, columns) of the frame after encoding and age imputation.
titanic_df.shape

(917, 21)

In [8]:
from sklearn.model_selection import train_test_split

# Features are every column except the label.
# NOTE(review): X still contains passenger_id, which looks like a row
# identifier rather than a predictive feature; consider dropping it.
X = titanic_df.drop('survived', axis=1)
Y = titanic_df['survived']

# Hold out 20% of the rows for evaluation. The seed is fixed so that the split,
# and every metric computed from it, is reproducible on Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [9]:
# Preview the training labels.
y_train

364    0.0
214    0.0
182    1.0
130    1.0
478    0.0
      ... 
384    0.0
139    1.0
157    0.0
693    1.0
112    1.0
Name: survived, Length: 733, dtype: float64

In [10]:
# Training features and labels; the row counts should match.
x_train.shape, y_train.shape

((733, 20), (733,))

In [11]:
# Held-out features and labels; the row counts should match.
x_test.shape, y_test.shape

((184, 20), (184,))

### Logistic regression for classification

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [12]:
from sklearn.linear_model import LogisticRegression

# Fit an L2-penalised logistic regression (C=1.0) with the liblinear solver
# on the training split; fit() returns the fitted estimator.
logistic_model = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    C=1.0,
).fit(x_train, y_train)

In [13]:
# Predicted class labels for the held-out test features.
y_pred = logistic_model.predict(x_test)

### Confusion matrix

In [14]:
# Pair each held-out label with the model's prediction, row for row.
pred_results = pd.DataFrame(dict(y_test=y_test, y_pred=y_pred))

In [15]:
# Peek at the first few actual-vs-predicted pairs.
pred_results.head()

Unnamed: 0,y_test,y_pred
280,0.0,0.0
30,0.0,0.0
238,0.0,0.0
67,0.0,0.0
897,0.0,0.0


In [16]:
# Confusion table: rows are the actual labels (y_test), columns are the
# predicted labels (y_pred).
titanic_crosstab = pd.crosstab(
    index=pred_results['y_test'],
    columns=pred_results['y_pred'],
)

titanic_crosstab

y_pred,0.0,1.0
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,92,21
1.0,19,52


### Precision-recall scores

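When we use these for multiclass classification, we need to specify an averaging method to determine how the precision and recall scores for the different labels should be weighted. For a binary problem like this one, the default (`average='binary'`) reports the scores for the positive class only.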

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [18]:
# Headline metrics from sklearn, computed on the held-out split.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print one "<label> <value>" line per metric.
for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
):
    print(label, value)

accuracy_score :  0.782608695652174
precision_score :  0.7123287671232876
recall_score :  0.7323943661971831


In [19]:
# Re-display the crosstab for reference while reading off TP/TN/FP/FN below.
titanic_crosstab

y_pred,0.0,1.0
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,92,21
1.0,19,52


In [31]:
# Read each cell as .loc[actual, predicted]: the crosstab's rows are y_test
# and its columns are y_pred, with 1.0 as the positive (survived) class.
TP = titanic_crosstab.loc[1.0, 1.0]  # actual 1, predicted 1
TN = titanic_crosstab.loc[0.0, 0.0]  # actual 0, predicted 0
FP = titanic_crosstab.loc[0.0, 1.0]  # actual 0, predicted 1
FN = titanic_crosstab.loc[1.0, 0.0]  # actual 1, predicted 0

for count in (TP, TN, FP, FN):
    print(count)

52
92
21
19


In [32]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name. Assigning it to `confusion_matrix`
# would shadow the imported function, so any later call to it would fail.
# Layout: rows are actual labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[92 21]
 [19 52]]


In [33]:
from sklearn.metrics import classification_report
# classification_report returns a pre-formatted string, so print() is needed
# to render its line breaks; it lists precision/recall/f1/support per class.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.83      0.81      0.82       113
         1.0       0.71      0.73      0.72        71

    accuracy                           0.78       184
   macro avg       0.77      0.77      0.77       184
weighted avg       0.78      0.78      0.78       184



In [27]:
# Accuracy = correctly classified samples / all samples.
accuracy_score_verified = (TN + TP) / sum((TP, FP, TN, FN))

accuracy_score_verified

0.782608695652174

In [28]:
# Precision for the positive class: of everything predicted positive
# (TP + FP), the fraction that was actually positive.
predicted_positive = TP + FP
precision_score_survived = TP / predicted_positive

precision_score_survived

0.7123287671232876

In [29]:
# Recall for the positive class: of everything actually positive
# (TP + FN), the fraction the model found.
actual_positive = TP + FN
recall_score_survived = TP / actual_positive

recall_score_survived

0.7323943661971831