In [2]:
import pandas as pd
import numpy as np
import graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from graphviz import Source
from scipy import stats
  
#widen pandas' console rendering: show up to 20 columns and allow 350 characters per line
pd.options.display.max_columns = 20
pd.options.display.width = 350

In [4]:
#seed NumPy's global random number generator so the results can be reproduced
RANDOM_SEED = 4684
np.random.seed(RANDOM_SEED)

#load the email dataset; the path is relative to the notebook's working directory
EMAILS_CSV = 'emails.csv'
data = pd.read_csv(EMAILS_CSV)

In [6]:
#get dummy variables from categorical ones (first level of each is dropped).
#email_id is removed *before* encoding: if it were ever read as a string column,
#get_dummies would expand it into dummy columns and a later drop('email_id') would raise KeyError.
data_dummy = pd.get_dummies(data.drop('email_id', axis=1), drop_first=True)

#hold out 34% of the rows as a test set so the model is evaluated on data it was not fitted on.
#no random_state is passed, so the shuffle is drawn from NumPy's global RNG.
train, test = train_test_split(data_dummy, test_size = 0.34)

#separate predictors from the 'clicked' label once, instead of re-dropping the column at every use
train_features = train.drop('clicked', axis=1)
train_labels = train['clicked']

#build the model. We choose a single decision tree here, but this issue might apply to all models.
#a node is split only if impurity (think about it as loss) improves by at least 0.001, which is a very
#low threshold, forcing the tree to split pretty much always except for clearly useless splits
tree = DecisionTreeClassifier(min_impurity_decrease=0.001)
tree.fit(train_features, train_labels)

#visualize it.
#feature_names is passed as a plain list rather than a pandas Index, which some scikit-learn
#releases reject during parameter validation.
export_graphviz(tree, out_file="tree.dot", feature_names=list(train_features.columns), proportion=True, rotate=True)
#dot_graph is not used in this cell; it is kept in case later cells rely on it
with open("tree.dot") as f:
    dot_graph = f.read()
s = Source.from_file("tree.dot")
#renders the graph and opens it in the default viewer; the rendered file path is the cell output
s.view()

'tree.dot.pdf'

In [8]:
#predicted probabilities on the held-out rows; keep only the second probability column
test_features = test.drop('clicked', axis=1)
test_proba = tree.predict_proba(test_features)
tree_probabilities = test_proba[:, 1]

#summary statistics (count, min/max, mean, variance, skewness, kurtosis) of those probabilities
stats.describe(tree_probabilities)

DescribeResult(nobs=33983, minmax=(0.020555732411660376, 0.020555732411660376), mean=0.020555732411660376, variance=0.0, skewness=0.0, kurtosis=-3.0)

In [12]:
#same probabilities summarised with pandas, shown as a single row (one column per statistic)
proba_frame = pd.DataFrame(tree_probabilities)
proba_frame.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,33983.0,0.020556,8.0874e-15,0.020556,0.020556,0.020556,0.020556,0.020556


### Changing class weights

In [15]:
# RF weights can be passed as class weights inside RandomForestClassifier.
# For each weight configuration we record both per-class error rates and the overall accuracy,
# all measured on `test`, and finally pick the best combination of both class errors.
# NOTE(review): the weight is chosen using the same `test` rows that are used to report performance;
# consider selecting it on a separate validation split (or via cross-validation) instead.

#weights for class 1 to try: 10 to 200 with 10 as a step (20 models).
#defined once so the loop and the summary table cannot drift apart.
minority_class_weights = list(range(10, 210, 10))

#predictors/labels do not change between iterations, so build them once outside the loop
rf_train_features = train.drop('clicked', axis=1)
rf_train_labels = train['clicked']
rf_test_features = test.drop('clicked', axis=1)
rf_test_labels = test['clicked']

class0_error = []
class1_error = []
accuracy = []

for weight in minority_class_weights:
    #class 0 keeps weight 1; class 1 is up-weighted by `weight`.
    #oob_score=True is kept from the original although the OOB score is not read in this cell.
    rf = RandomForestClassifier(n_estimators=50, oob_score=True, class_weight={0:1,1:weight})
    rf.fit(rf_train_features, rf_train_labels)
    #confusion matrix with rows = true class, columns = predicted class, both ordered [0, 1]
    conf_matrix = pd.DataFrame(confusion_matrix(rf_test_labels, rf.predict(rf_test_features), labels=[0, 1]))
    #row-major unpacking: (true 0, pred 0), (true 0, pred 1), (true 1, pred 0), (true 1, pred 1)
    tn, fp, fn, tp = conf_matrix.to_numpy().ravel()
    #class0/1 errors are 1 - (correctly classified events/total events belonging to that class)
    class0_error.append(1 - tn/(tn+fp))
    class1_error.append(1 - tp/(fn+tp))
    accuracy.append((tp+tn)/(tn+fp+fn+tp))

#one row per weight, in the order the weights were tried
dataset_weights = pd.DataFrame({'minority_class_weight': minority_class_weights,
                                'class0_error': class0_error,
                                'class1_error': class1_error,
                                'accuracy': accuracy
                               })

dataset_weights

Unnamed: 0,minority_class_weight,class0_error,class1_error,accuracy
0,10,0.030268,0.964937,0.950122
1,20,0.050526,0.931276,0.930995
2,30,0.063781,0.907433,0.918518
3,40,0.075894,0.889201,0.907042
4,50,0.080944,0.890603,0.902069
5,60,0.084521,0.889201,0.898596
6,70,0.087887,0.886396,0.895359
7,80,0.087827,0.88359,0.895477
8,90,0.085242,0.884993,0.897978
9,100,0.090201,0.880785,0.893211


In [17]:
#Calculate the trade-off between class errors:
#optimal_value = 1 - class1_error - class0_error, i.e. (class-1 recall + class-0 recall) - 1,
#so higher is better
class1_err = dataset_weights['class1_error']
class0_err = dataset_weights['class0_error']
dataset_weights['optimal_value'] = 1 - class1_err - class0_err

#Rank by optimal_value, best first, and show only the top row
ranked_weights = dataset_weights.sort_values(by='optimal_value', ascending=False)
ranked_weights.iloc[:1]

Unnamed: 0,minority_class_weight,class0_error,class1_error,accuracy,optimal_value
3,40,0.075894,0.889201,0.907042,0.034905
