In [1]:
import pandas
import statsmodels.api as sm
# Display settings: show at most 10 columns and allow output lines up to 350 characters wide.
pandas.options.display.max_columns = 10
pandas.options.display.width = 350

In [5]:
# Load the email dataset (the same dataset described in the previous section).
# NOTE(review): the original comment said this is read from Google Drive, but the path below
# is a relative local file -- confirm that emails.csv is available in the working directory.
data = pandas.read_csv('emails.csv')

In [6]:
# Preview the first rows of the raw data to check that the file was parsed as expected.
data.head()

Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases,clicked
0,8,short_email,generic,9,Thursday,US,3,0
1,33,long_email,personalized,6,Monday,US,0,0
2,46,short_email,generic,14,Tuesday,US,3,0
3,49,long_email,personalized,11,Thursday,US,10,0
4,65,short_email,generic,8,Wednesday,UK,3,0


In [7]:
# Inspect column dtypes: the object (string) columns are the categorical variables that
# are dummy-encoded before fitting the regression.
data.dtypes

email_id                int64
email_text             object
email_version          object
hour                    int64
weekday                object
user_country           object
user_past_purchases     int64
clicked                 int64
dtype: object

In [8]:
# Coefficients of a categorical variable are interpreted relative to its reference level,
# so identify those levels before building the regression.

# Keep only the string (object) columns and convert them to the pandas categorical dtype,
# which exposes the distinct values of each column through `.cat.categories`.
data_categorical = data.select_dtypes(include=['object']).astype('category')

# The reference level of each variable is its first category.
reference_levels = data_categorical.apply(lambda column: column.cat.categories[0])
print(reference_levels)

email_text       long_email
email_version       generic
weekday              Friday
user_country             ES
dtype: object


In [9]:
# Preview the categorical-only frame built in the previous cell.
data_categorical.head()

Unnamed: 0,email_text,email_version,weekday,user_country
0,short_email,generic,Thursday,US
1,long_email,personalized,Monday,US
2,short_email,generic,Tuesday,US
3,long_email,personalized,Thursday,US
4,short_email,generic,Wednesday,UK


In [10]:
# One-hot encode the categorical (object) columns. drop_first=True omits the first level of
# each categorical column, so that level becomes the reference level of the regression.
# dtype=int keeps the indicators as 0/1 integers: since pandas 2.0 get_dummies returns
# boolean columns by default, and the resulting mixed int/bool frame is cast to an object
# array that statsmodels refuses to fit.
data = pandas.get_dummies(data, drop_first=True, dtype=int)

In [11]:
# Preview the encoded frame: the categorical columns are now 0/1 indicator columns.
data.head()

Unnamed: 0,email_id,hour,user_past_purchases,clicked,email_text_short_email,...,weekday_Tuesday,weekday_Wednesday,user_country_FR,user_country_UK,user_country_US
0,8,9,3,0,1,...,0,0,0,0,1
1,33,6,0,0,0,...,0,0,0,0,1
2,46,14,3,0,1,...,1,0,0,0,1
3,49,11,10,0,0,...,0,0,0,0,1
4,65,8,3,0,1,...,0,1,0,1,0


In [12]:
# List every column after encoding; display.max_columns truncates the preview above.
data.columns

Index(['email_id', 'hour', 'user_past_purchases', 'clicked', 'email_text_short_email', 'email_version_personalized', 'weekday_Monday', 'weekday_Saturday', 'weekday_Sunday', 'weekday_Thursday', 'weekday_Tuesday', 'weekday_Wednesday', 'user_country_FR', 'user_country_UK', 'user_country_US'], dtype='object')

In [13]:
# Append a constant column of ones so the model estimates an intercept term.
data['intercept'] = 1

# Predictors are every column except the outcome `clicked`.
# NOTE(review): this keeps `email_id` among the predictors, which looks like a row identifier
# rather than a feature -- confirm whether it should be excluded from the model.
train_cols = data.drop(columns=['clicked'])

# Fit the logistic regression of `clicked` on the predictors.
logit = sm.Logit(endog=data['clicked'], exog=train_cols)
output = logit.fit()

Optimization terminated successfully.
         Current function value: 0.092770
         Iterations 9


In [14]:
# Gather the per-variable estimates of the fitted model into a single table:
# coefficient, standard error, test statistic and p-value.
output_table = pandas.DataFrame({
    'coefficients': output.params,
    'SE': output.bse,
    'z': output.tvalues,
    'p_values': output.pvalues,
})
print(output_table)

                            coefficients            SE          z       p_values
email_id                   -3.848609e-08  7.780379e-08  -0.494656   6.208432e-01
hour                        1.670684e-02  5.005879e-03   3.337445   8.455247e-04
user_past_purchases         1.878107e-01  5.725787e-03  32.800855  5.725039e-236
email_text_short_email      2.793085e-01  4.530477e-02   6.165101   7.043829e-10
email_version_personalized  6.387251e-01  4.691461e-02  13.614631   3.277989e-42
weekday_Monday              5.410326e-01  9.341014e-02   5.792011   6.954864e-09
weekday_Saturday            2.828638e-01  9.777629e-02   2.892969   3.816190e-03
weekday_Sunday              1.836278e-01  1.001194e-01   1.834088   6.664099e-02
weekday_Thursday            6.254040e-01  9.233999e-02   6.772839   1.262790e-11
weekday_Tuesday             6.162222e-01  9.237223e-02   6.671077   2.539336e-11
weekday_Wednesday           7.554637e-01  9.084515e-02   8.315950   9.102053e-17
user_country_FR            -

In [33]:
# Keep only the variables whose p-value is below 0.05, ordered from the largest
# coefficient to the smallest.
is_significant = output_table['p_values'] < 0.05
print(output_table[is_significant].sort_values(by='coefficients', ascending=False))

                            coefficients        SE          z       p_values
user_country_UK                 1.155255  0.122060   9.464618   2.946372e-21
user_country_US                 1.141360  0.115963   9.842487   7.386228e-23
weekday_Wednesday               0.755464  0.090845   8.315950   9.102053e-17
email_version_personalized      0.638725  0.046915  13.614631   3.277989e-42
weekday_Thursday                0.625404  0.092340   6.772839   1.262790e-11
weekday_Tuesday                 0.616222  0.092372   6.671077   2.539336e-11
weekday_Monday                  0.541033  0.093410   5.792011   6.954864e-09
weekday_Saturday                0.282864  0.097776   2.892969   3.816190e-03
email_text_short_email          0.279308  0.045305   6.165101   7.043829e-10
user_past_purchases             0.187811  0.005726  32.800855  5.725039e-236
hour                            0.016707  0.005006   3.337445   8.455247e-04
intercept                      -6.880922  0.156067 -44.089646   0.000000e+00