In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
# Display tweaks: show up to 20 columns and allow 350 characters of width when printing frames.
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 350)

### Understanding the output of a confusion matrix!

In [5]:
# Toy labels: y_true holds the actual classes, y_pred the predicted ones (1 = positive).
y_true = [0, 1, 0, 1, 0, 1, 0]
y_pred = [1, 1, 1, 0, 1, 0, 1]

# Counting the (actual, predicted) pairs position by position:
# TN = 0  (actual 0, predicted 0)
# FP = 4  (actual 0, predicted 1)
# FN = 2  (actual 1, predicted 0)
# TP = 1  (actual 1, predicted 1)

In [6]:
# Rows are the actual classes and columns the predicted classes, both ordered 0 then 1.
confusion_matrix(y_true, y_pred)

array([[0, 4],
       [2, 1]])

In [7]:
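# Layout of the 2x2 matrix returned by confusion_matrix
# (rows = actual class, columns = predicted class):
# TN, FP
# FN, TP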

In [8]:
#set the global numpy seed to be able to reproduce the results
np.random.seed(4684)

#load the email dataset (the path is relative to the working directory)
data = pd.read_csv('emails.csv')

#one-hot encode the categorical columns, then discard the email_id column
data_dummy = pd.get_dummies(data, drop_first=True).drop(columns='email_id')

#hold out 34% of the rows as a test set
train, test = train_test_split(data_dummy, test_size=0.34)

#fit the model. We use a random forest, but the issue discussed here applies to pretty much all models
train_features = train.drop(columns='clicked')
rf = RandomForestClassifier(n_estimators=50, oob_score=True)
rf.fit(train_features, train['clicked'])

#OOB confusion matrix: the OOB probability of class 1 is rounded to a 0/1 prediction
oob_click_prob = rf.oob_decision_function_[:, 1]
pd.DataFrame(confusion_matrix(train['clicked'], oob_click_prob.round(), labels=[0, 1]))

Unnamed: 0,0,1
0,64342,269
1,1342,14


In [9]:
#confusion matrix on the held-out test set (rows = actual class, columns = predicted class)
test_pred = rf.predict(test.drop(columns='clicked'))
pd.DataFrame(confusion_matrix(test['clicked'], test_pred, labels=[0, 1]))

Unnamed: 0,0,1
0,33126,144
1,706,7


In [10]:
#test-set confusion matrix as a DataFrame (rows = actual class, columns = predicted class)
test_features = test.drop(columns='clicked')
conf_matrix = pd.DataFrame(confusion_matrix(test['clicked'], rf.predict(test_features), labels=[0, 1]))

#per-class error = 1 - (correctly classified events / all events that truly belong to that class)
class_totals = conf_matrix.sum(axis=1)
class0_error = 1 - conf_matrix.loc[0, 0] / class_totals.loc[0]
class1_error = 1 - conf_matrix.loc[1, 1] / class_totals.loc[1]

pd.DataFrame({
    'test_class0_error': [class0_error],
    'test_class1_error': [class1_error],
})

Unnamed: 0,test_class0_error,test_class1_error
0,0.004328,0.990182


### Playing with classification thresholds

In [13]:
from sklearn.metrics import roc_curve

#get test set predictions as a probability of class 1 (second column of predict_proba)
pred_prob = rf.predict_proba(test.drop('clicked', axis=1))[:, 1]

#get false positive rate and true positive rate for different cut-off points.
#The first cut-off returned by roc_curve is an artificial value above every predicted
#probability, so its row corresponds to predicting no event as class 1.
fpr, tpr, thresholds = roc_curve(test['clicked'], pred_prob)

#For consistency with R, we will focus on class errors, defined as
# class0_error = fpr and class1_error = 1 - tpr
error_cutoff = pd.DataFrame({
    'cutoff': thresholds,
    'class0_error': fpr,
    'class1_error': 1 - tpr,
})

#let's also add accuracy, i.e. the share of correctly classified events:
# (tnr * negatives in the test set + tpr * positives in the test set) / total events in the test set
#where tnr = 1 - class0_error and tpr = 1 - class1_error
n_negative = (test['clicked'] == 0).sum()
n_positive = (test['clicked'] == 1).sum()
error_cutoff['accuracy'] = (
    (1 - error_cutoff['class0_error']) * n_negative
    + (1 - error_cutoff['class1_error']) * n_positive
) / len(test['clicked'])

error_cutoff

Unnamed: 0,cutoff,class0_error,class1_error,accuracy
0,1.960000,0.000000,1.000000,0.979019
1,0.960000,0.000030,1.000000,0.978989
2,0.944000,0.000090,1.000000,0.978931
3,0.900000,0.000120,1.000000,0.978901
4,0.860000,0.000180,1.000000,0.978842
...,...,...,...,...
1178,0.001429,0.188127,0.674614,0.801666
1179,0.001176,0.188157,0.674614,0.801636
1180,0.001053,0.188218,0.674614,0.801577
1181,0.000909,0.188248,0.674614,0.801548


In [16]:
#score every cut-off by how well it balances the two class errors:
#optimal_value = 1 - class1_error - class0_error, which equals tpr - fpr (higher is better)
error_cutoff['optimal_value'] = (1 - error_cutoff['class1_error']).sub(error_cutoff['class0_error'])

#show the 15 best-scoring cut-offs
error_cutoff.sort_values(by='optimal_value', ascending=False).head(15)

Unnamed: 0,cutoff,class0_error,class1_error,accuracy,optimal_value
1172,0.0025,0.186444,0.676017,0.803284,0.137539
1178,0.001429,0.188127,0.674614,0.801666,0.137258
1179,0.001176,0.188157,0.674614,0.801636,0.137228
1180,0.001053,0.188218,0.674614,0.801577,0.137168
1181,0.000909,0.188248,0.674614,0.801548,0.137138
1173,0.002222,0.186865,0.676017,0.802872,0.137118
1174,0.002,0.187136,0.676017,0.802607,0.136848
1171,0.002857,0.185993,0.677419,0.803696,0.136587
1175,0.001818,0.187647,0.676017,0.802107,0.136337
1176,0.001667,0.187767,0.676017,0.801989,0.136216


In [29]:
#pred_prob was computed in an earlier step and holds the test-set probability of class 1, i.e.
#pred_prob=rf.predict_proba(test.drop('clicked', axis=1))[:,1]

#pick the cut-off with the highest optimal_value; it is computed, not hard-coded
#(0.0025 in the run shown here)
ranked_cutoffs = error_cutoff.sort_values('optimal_value', ascending=False)
best_cutoff = ranked_cutoffs['cutoff'].iloc[0]

#turn probabilities into a 0/1 vector: 1 whenever the probability reaches the cut-off
predictions = (pred_prob >= best_cutoff).astype(int)

#confusion matrix on the test set for the re-thresholded predictions
#NOTE(review): the cut-off was selected on this same test set, so this matrix is an optimistic estimate
conf_matrix_new = pd.DataFrame(confusion_matrix(test['clicked'], predictions, labels=[0, 1]))
conf_matrix_new

Unnamed: 0,0,1
0,27067,6203
1,482,231


In [30]:
# original results (test-set confusion matrix from rf.predict, before changing the cut-off)
conf_matrix

Unnamed: 0,0,1
0,33126,144
1,706,7


In [27]:
# cut-off with the highest optimal_value in error_cutoff
best_cutoff

0.0025

In [28]:
# 0/1 test-set predictions obtained by thresholding pred_prob at best_cutoff
predictions

array([0, 0, 1, ..., 1, 0, 0])