In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 350)
  
#set seed to be able to reproduce the results
np.random.seed(4684)

In [5]:
data = pd.read_csv('emails.csv')

In [6]:
data.shape

(99950, 8)

In [7]:
data.head()

Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases,clicked
0,8,short_email,generic,9,Thursday,US,3,0
1,33,long_email,personalized,6,Monday,US,0,0
2,46,short_email,generic,14,Tuesday,US,3,0
3,49,long_email,personalized,11,Thursday,US,10,0
4,65,short_email,generic,8,Wednesday,UK,3,0


### Prepare the data

In [8]:
#Bin continuous variables

#Hour
data['hour_binned']= pd.cut(data['hour'], bins=[1,5, 13, 21, 24], include_lowest=True, labels=['night', 'morning', 'afternoon', 'night2'])

#replace night2 with night
data['hour_binned']= data['hour_binned'].replace('night2', 'night').cat.remove_unused_categories() # Pandas function for removing unused categories

In [9]:
#Bin purchases
data['purchase_binned']= pd.cut(data['user_past_purchases'], bins=[0,1, 4, 8, 23], include_lowest=True, right=False, labels=['None', 'Low', 'Medium', 'High'])

In [10]:
data.head()

Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases,clicked,hour_binned,purchase_binned
0,8,short_email,generic,9,Thursday,US,3,0,morning,Low
1,33,long_email,personalized,6,Monday,US,0,0,morning,
2,46,short_email,generic,14,Tuesday,US,3,0,afternoon,Low
3,49,long_email,personalized,11,Thursday,US,10,0,morning,High
4,65,short_email,generic,8,Wednesday,UK,3,0,morning,Low


In [11]:
#prepare the data for the model
data_dummy = pd.get_dummies(data, drop_first=True)
data_dummy.drop(['email_id', 'hour', 'user_past_purchases'], axis=1)

Unnamed: 0,clicked,email_text_short_email,email_version_personalized,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,user_country_FR,user_country_UK,user_country_US,hour_binned_morning,hour_binned_afternoon,purchase_binned_Low,purchase_binned_Medium,purchase_binned_High
0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0
1,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0
3,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,1
4,0,1,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99945,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0
99946,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0
99947,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0
99948,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0


In [12]:
#split into train and test to avoid overfitting
train, test = train_test_split(data_dummy, test_size = 0.34)

### Build the model

In [13]:
#build the model. We choose a RF, but this personalization approach works with any kinds of models
rf = RandomForestClassifier(class_weight={0:0.05,1:0.95}, n_estimators=50, oob_score=True) # oob_score is a validation method which test s DTs using samples not included in training.

rf.fit(train.drop('clicked', axis=1), train['clicked'])

RandomForestClassifier(class_weight={0: 0.05, 1: 0.95}, n_estimators=50,
                       oob_score=True)

In [14]:
rf.oob_decision_function_[:,:] # what does this mean? It is givig me the probability of being in each class

array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       ...,
       [0.92857143, 0.07142857],
       [1.        , 0.        ],
       [1.        , 0.        ]])

In [15]:
#let's print OOB confusion matrix
pd.DataFrame(confusion_matrix(train['clicked'], rf.oob_decision_function_[:,1].round(), labels=[0, 1]))

Unnamed: 0,0,1
0,64472,139
1,1350,6


- the model correctly predicted clicks 6 / 145 (4% of the time)
- the model correctly predicted no-clicks 64472 / 65822 (~98% of the time)

In [16]:
# current, base predictions for clcking on an email. ~2% CTR
train['clicked'].value_counts(normalize=True)

0    0.979444
1    0.020556
Name: clicked, dtype: float64

### Predict CTR for each segment

In [17]:
#We remove the label, we don't need it here
data_unique = data_dummy.drop(['clicked'], axis=1)
  
#We create all unique combinations of our features
data_unique = data_unique.drop_duplicates()
  
#Now we feed this into our model and get a prediction for each row
predictions = rf.predict_proba(data_unique)
  
#Finally, we add these predictions (for clicking) to the dataset
data_unique['prediction'] = [x[1] for x in predictions]
  
#And this is how it looks like
data_unique.head()

Unnamed: 0,email_id,hour,user_past_purchases,email_text_short_email,email_version_personalized,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,user_country_FR,user_country_UK,user_country_US,hour_binned_morning,hour_binned_afternoon,purchase_binned_Low,purchase_binned_Medium,purchase_binned_High,prediction
0,8,9,3,1,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0.04
1,33,6,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0.0
2,46,14,3,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0.02
3,49,11,10,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0.1
4,65,8,3,1,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0.1


### Identify the best email characteristics for each user

In [18]:
#Sort by prediction. This way highest predictions will be at the top of the dataset 
data_unique = data_unique.sort_values('prediction', ascending=False)
  
#Remove duplicates for country and purchase binned. This way, for each unique combination of country and purchase,
#we will only have the top 1 value, which means the highest prediction
best_segment = data_unique.drop_duplicates(subset=['user_country_FR', 'user_country_UK', 'user_country_US', 
                                         'purchase_binned_Low', 'purchase_binned_Medium', 'purchase_binned_High'
                                         ]).copy()

In [19]:
                                ################### "Unbin"" - Start  ######################
#This is not strictly needed. However, it is pretty hard to read that dataset cause we have all the dummy variables
#So let's reconstruct manually the original categorical varibles. It will be so much clearer that way
  
#Country
best_segment['user_country'] = np.where(best_segment['user_country_UK'] == 1, "UK", 
                                   np.where(best_segment['user_country_US'] == 1, "US", 
                                      np.where(best_segment['user_country_FR'] == 1, "FR",
                                     "ES"
)))
best_segment = best_segment.drop([e for e in list(data_unique) if e.startswith('user_country_')], axis=1)
  
#Number_purchases
best_segment['purchase_binned'] = np.where(best_segment['purchase_binned_High'] == 1, "High", 
                                   np.where(best_segment['purchase_binned_Medium'] == 1, "Medium", 
                                    np.where(best_segment['purchase_binned_Low'] == 1, "Low",
                                     "None"
)))
best_segment = best_segment.drop([e for e in list(data_unique) if e.startswith('purchase_binned_')], axis=1)
  
#Email Text
best_segment['email_text'] = np.where(best_segment['email_text_short_email'] == 1, "short_email", "long_email")
best_segment = best_segment.drop('email_text_short_email', axis=1)
  
#Email version
best_segment['email_version'] = np.where(best_segment['email_version_personalized'] == 1, "personalized", "generic")
best_segment = best_segment.drop('email_version_personalized', axis=1)
  
#Weekday
best_segment['weekday'] = np.where(best_segment['weekday_Monday'] == 1, "Monday", 
                                    np.where(best_segment['weekday_Saturday'] == 1, "Saturday", 
                                       np.where(best_segment['weekday_Sunday'] == 1, "Sunday",
                                          np.where(best_segment['weekday_Thursday'] == 1, "Thursday", 
                                              np.where(best_segment['weekday_Tuesday'] == 1, "Tuesday",
                                                   np.where(best_segment['weekday_Wednesday'] == 1, "Wednesday",
                                                      "Friday"
))))))
best_segment = best_segment.drop([e for e in list(data_unique) if e.startswith('weekday_')], axis=1)      
  
#Hour
best_segment['hour_binned'] = np.where(best_segment['hour_binned_afternoon'] == 1, "afternoon", 
                                   np.where(best_segment['hour_binned_morning'] == 1, "morning", 
                                     "night"
))
best_segment = best_segment.drop([e for e in list(data_unique) if e.startswith('hour_binned_')], axis=1) 
                              ################### "Unbin"" - End ######################
best_segment     

Unnamed: 0,email_id,hour,user_past_purchases,prediction,user_country,purchase_binned,email_text,email_version,weekday,hour_binned
87702,876592,5,10,0.88,UK,High,short_email,personalized,Friday,night
21130,208925,12,11,0.88,US,High,long_email,personalized,Friday,morning
1300,12407,9,6,0.88,US,Medium,short_email,personalized,Tuesday,morning
42089,418758,13,1,0.86,US,Low,long_email,personalized,Wednesday,morning
11199,111186,9,14,0.84,FR,High,short_email,personalized,Monday,morning
15275,151053,9,3,0.82,UK,Low,short_email,personalized,Thursday,morning
89203,892395,10,3,0.8,ES,Low,short_email,generic,Monday,morning
53319,532174,3,6,0.8,UK,Medium,long_email,personalized,Wednesday,night
11296,112142,5,1,0.78,FR,Low,short_email,personalized,Sunday,night
62898,628756,11,5,0.76,ES,Medium,long_email,personalized,Wednesday,morning


### Estimate A/B test gains

In [21]:
#Firstly let's get count by group. We need this for the weighted average at the end
count_segment = data[['user_country','purchase_binned']].groupby(['user_country','purchase_binned']).size().reset_index(name='counts')
  
#Get the proportion instead of the counts. Just easier to deal with to later get weighted average
count_segment['weight'] = count_segment['counts'].div(count_segment['counts'].sum())
  
#Merge it, so in our final dataset we also have weight
best_segment = pd.merge(best_segment, count_segment).sort_values('prediction',ascending=False)
  
best_segment

Unnamed: 0,email_id,hour,user_past_purchases,prediction,user_country,purchase_binned,email_text,email_version,weekday,hour_binned,counts,weight
0,876592,5,10,0.88,UK,High,short_email,personalized,Friday,night,2712,0.027134
1,208925,12,11,0.88,US,High,long_email,personalized,Friday,morning,8325,0.083292
2,12407,9,6,0.88,US,Medium,short_email,personalized,Tuesday,morning,20008,0.20018
3,418758,13,1,0.86,US,Low,long_email,personalized,Wednesday,morning,23364,0.233757
4,111186,9,14,0.84,FR,High,short_email,personalized,Monday,morning,1444,0.014447
5,151053,9,3,0.82,UK,Low,short_email,personalized,Thursday,morning,7803,0.078069
6,892395,10,3,0.8,ES,Low,short_email,generic,Monday,morning,3785,0.037869
7,532174,3,6,0.8,UK,Medium,long_email,personalized,Wednesday,night,6622,0.066253
8,112142,5,1,0.78,FR,Low,short_email,personalized,Sunday,night,3890,0.038919
9,628756,11,5,0.76,ES,Medium,long_email,personalized,Wednesday,morning,3389,0.033907


In [23]:
count_segment.head()

Unnamed: 0,user_country,purchase_binned,counts,weight
0,ES,,1368,0.013687
1,ES,Low,3785,0.037869
2,ES,Medium,3389,0.033907
3,ES,High,1422,0.014227
4,FR,,1341,0.013417


In [25]:
rf.predict(test.drop('clicked', axis=1)) # returns the predicted class

array([0, 0, 0, ..., 0, 0, 0])

In [28]:
#Now let's add class1 and class 0 errors to the dataset. We will take it from the test error confusion matrix
conf_matrix = pd.DataFrame(confusion_matrix(test['clicked'], rf.predict(test.drop('clicked', axis=1)), labels=[0, 1]))
  
#We define positive predictive value (ppv) as the proportion of times the model is right when it predicts 1, this is also called precision 
ppv = conf_matrix.loc[1,1]/(conf_matrix.loc[1,1]+conf_matrix.loc[0,1])
  
# The proportion of times class 0 (non-click) is incorrectly predicted as a click
forate = conf_matrix.loc[1,0]/(conf_matrix.loc[1,0]+conf_matrix.loc[0,0])
  
#Adjusted predicted click-rate for each segment
best_segment['adjusted_prediction'] = best_segment['prediction'] * ppv + (1-best_segment['prediction']) * forate
  
#Finally, let's multiply this by the weight of each segment in the dataset and compare it with the starting click-rate
CTR_comparison = pd.DataFrame( {'predicted_click_rate':[(best_segment['adjusted_prediction']*best_segment['weight']).sum()],
                                    'old_click_rate':[data['clicked'].mean()]
                                    })

In [29]:
CTR_comparison

Unnamed: 0,predicted_click_rate,old_click_rate
0,0.039937,0.0207


In [38]:
CTR_comparison.iloc[0,0] / CTR_comparison.iloc[0,1]

1.929287971133639

In [30]:
conf_matrix

Unnamed: 0,0,1
0,33228,42
1,711,2


In [31]:
forate # 711 / 33939. % of times class 0 was incorrectly prediction as class 1. (False positive - type 1 error)

0.020949350304958897

In [32]:
ppv # 2 / 44 in confusion matrix. Correct predictions of class 1

0.045454545454545456

In [33]:
best_segment

Unnamed: 0,email_id,hour,user_past_purchases,prediction,user_country,purchase_binned,email_text,email_version,weekday,hour_binned,counts,weight,adjusted_prediction
0,876592,5,10,0.88,UK,High,short_email,personalized,Friday,night,2712,0.027134,0.042514
1,208925,12,11,0.88,US,High,long_email,personalized,Friday,morning,8325,0.083292,0.042514
2,12407,9,6,0.88,US,Medium,short_email,personalized,Tuesday,morning,20008,0.20018,0.042514
3,418758,13,1,0.86,US,Low,long_email,personalized,Wednesday,morning,23364,0.233757,0.042024
4,111186,9,14,0.84,FR,High,short_email,personalized,Monday,morning,1444,0.014447,0.041534
5,151053,9,3,0.82,UK,Low,short_email,personalized,Thursday,morning,7803,0.078069,0.041044
6,892395,10,3,0.8,ES,Low,short_email,generic,Monday,morning,3785,0.037869,0.040554
7,532174,3,6,0.8,UK,Medium,long_email,personalized,Wednesday,night,6622,0.066253,0.040554
8,112142,5,1,0.78,FR,Low,short_email,personalized,Sunday,night,3890,0.038919,0.040063
9,628756,11,5,0.76,ES,Medium,long_email,personalized,Wednesday,morning,3389,0.033907,0.039573
