In [2]:
import pandas as pandas
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from rulefit import RuleFit

pandas.set_option('display.max_columns', 10)
pandas.set_option('display.width', 350)
np.random.seed(4684)

In [3]:
#read from google drive
data = pandas.read_csv('emails.csv')
# make dummy variables from categorical ones. Using one-hot encoding and drop_first=True 
data = pandas.get_dummies(data, drop_first=True)
#drop the label
train_cols = data.drop('clicked', axis=1)
  
#Extract rules from Random Forest
#set tree forest parameters
rf=RandomForestClassifier(max_depth=2, n_estimators=10, class_weight={0:0.05,1:0.95})
  
#set RuleFit parameters. We are keeping RuleFit pretty small here to make it faster. Increasing max_depth, n_estimators, and setting exp_rand_tree_size = True will generate way more rules and make it somewhat more reliable. As always, there is a trade-off between accuracy and processing time, which should be considered on a case-by-case basis. Note that RuleFit is really slow, so this trade-off is pretty significant here.     
rufi=RuleFit(rfmode="classify", tree_generator=rf, exp_rand_tree_size=False, lin_standardise=False)
#fit RuleFit
rufi.fit(train_cols.values, data['clicked'].values, feature_names = train_cols.columns)
print("We have extracted", rufi.transform(train_cols.values).shape[1], "rules")

We have extracted 38 rules


In [4]:
#These are a few of the rules we have extracted
output=rufi.get_rules()
print(output[output['type']=="rule"]['rule'].head().values)

['user_country_FR > 0.5 & weekday_Tuesday <= 0.5'
 'weekday_Wednesday <= 0.5 & user_country_FR <= 0.5'
 'email_version_personalized <= 0.5 & hour <= 9.5'
 'email_version_personalized > 0.5 & weekday_Saturday > 0.5'
 'weekday_Sunday > 0.5 & user_country_FR > 0.5']


In [6]:
output.head()

Unnamed: 0,rule,type,coef,support,importance
0,email_id,linear,-5e-06,1.0,1.337103
1,hour,linear,-0.182941,1.0,0.789066
2,user_past_purchases,linear,0.0,1.0,0.0
3,email_text_short_email,linear,0.0,1.0,0.0
4,email_version_personalized,linear,0.0,1.0,0.0


In [7]:
#X_concat is the new dataset given by the original variables (train_cols.values)
#as well as the new rules extracted from the trees (rufi.transform(train_cols.values))
X_concat = np.concatenate((train_cols, rufi.transform(train_cols.values)), axis=1)
#Build the logistic regression with penalty. This will set low coefficients to zero, so only the relevant ones will survive
log = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
log.fit(X_concat, data['clicked'])
#get the full output with variables, coefficients, and support
output.iloc[:,2] = np.transpose(log.coef_)
output[output['coef']!=0].sort_values('coef', ascending = False )

Unnamed: 0,rule,type,coef,support,importance
12,user_country_UK,linear,0.9004922,1.0,0.0
8,weekday_Thursday,linear,0.3973608,1.0,0.0
5,weekday_Monday,linear,0.3193081,1.0,0.0
2,user_past_purchases,linear,0.1420547,1.0,0.0
50,email_version_personalized > 0.5 & email_text_...,rule,0.1272775,0.24818,0.0
51,user_country_US > 0.5 & email_version_personal...,rule,0.106657,0.299551,0.0
24,email_text_short_email > 0.5 & weekday_Tuesday...,rule,0.05185828,0.427504,0.0
6,weekday_Saturday,linear,0.05047285,1.0,0.0
37,user_past_purchases > 3.5 & weekday_Wednesday ...,rule,0.01713831,0.06604,0.0
0,email_id,linear,-6.724911e-08,1.0,1.337103
