In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes, so edits to project source files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Description:
This notebook shows how to build GBM models with the covariate adjustment method, using the code in this repository (the `sto` package imported below).

In [2]:
import os
import sys

# Directories (relative to the notebook's working directory) to make importable.
# Presumably '..' is the repository root that holds the `sto` package imported below.
add_paths = ['..']
for add_path in add_paths:
    module_path = os.path.abspath(add_path)
    if module_path in sys.path:
        # Already importable; skip so re-running the cell adds no duplicates.
        continue
    sys.path.append(module_path)

In [3]:
import pandas as pd
import numpy as np
import h2o

from sto.utils.mask_data import generate_masked_data
from sto.variants.gbm.config import DELIVERY_ID_COL, TIME_ID_COL
from sto.variants.gbm.prior_gbm import GBM
from sto.variants.gbm.config import GbmHyperParas

from sto.utils.back_test import generate_train_test_range, select_tr_te_dat
from dateutil.relativedelta import relativedelta
from datetime import date

In [4]:
# Connect to an H2O server on localhost; the log below shows that a local
# server is started when none is already running.
h2o.init()
# Turn off H2O's progress bars so later cell outputs stay compact.
h2o.no_progress()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "11.0.3" 2019-04-16 LTS; Java(TM) SE Runtime Environment 18.9 (build 11.0.3+12-LTS); Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.3+12-LTS, mixed mode)
  Starting server from /anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/5v/zvxsqfsx2zx9qdndlwyczdrm0000gn/T/tmp7hp6vli2
  JVM stdout: /var/folders/5v/zvxsqfsx2zx9qdndlwyczdrm0000gn/T/tmp7hp6vli2/h2o_chanliu_started_from_python.out
  JVM stderr: /var/folders/5v/zvxsqfsx2zx9qdndlwyczdrm0000gn/T/tmp7hp6vli2/h2o_chanliu_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.2
H2O cluster version age:,12 days
H2O cluster name:,H2O_from_python_chanliu_sbbp9r
H2O cluster total nodes:,1
H2O cluster free memory:,2 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


# Training with different hyper parameters

## Load data

In [5]:
# Load the control dataset used in the first part of this walkthrough.
# The path is relative to the notebook's working directory.
df = pd.read_csv('../data/unilever_sto_control.csv')

### Get a low coverage training data

In [6]:
# Settings passed positionally to generate_masked_data as (df, n_repeat, p).
# NOTE(review): p = 0.8 is presumably the proportion used when masking rows;
# confirm its exact meaning against sto.utils.mask_data.
n_repeat = 1
p = 0.8
# Columns the mask is defined over: the delivery id and time id columns.
mask_cols = [DELIVERY_ID_COL, TIME_ID_COL]
# Returns two sequences that are indexed by position in the next cell:
# the selected frames (used for training) and the unselected frames.
# A fixed seed is passed, presumably to make the masking reproducible.
df_selected, df_unselected = generate_masked_data(df,
                                                  n_repeat,
                                                  p,
                                                  mask_cols=mask_cols,
                                                  max_try=1000,
                                                  seed=2,
                                                  training_prop=False)

0 (1)


In [7]:
# Position of the masked replicate to work with. Index with `i` rather than a
# hard-coded 0 so that changing it here actually selects another replicate.
# Presumably there is one entry per repeat, and n_repeat was set to 1 above.
i = 0
# Copy both frames so later steps cannot modify the objects held in the lists.
df_selected_i = df_selected[i].copy()
df_test = df_unselected[i].copy()

In [8]:
# Preview the first rows of the selected subset used for training.
df_selected_i.head()

Unnamed: 0,COUNTRY,DELIVERY_ID,TIME_ID,sent_count,response_count
0,DE,DM30589,0,6886,341
1,DE,DM30589,1,7003,349
2,DE,DM30589,2,7001,319
3,DE,DM30589,4,7240,358
4,DE,DM30589,5,7042,351


In [9]:
# Preview the first rows of the unselected subset kept aside as test data.
df_test.head()

Unnamed: 0,COUNTRY,DELIVERY_ID,TIME_ID,sent_count,response_count
0,DE,DM30589,3,7109,362
1,DE,DM30589,22,7048,337
2,DE,DM30730,26,6692,282
3,DE,DM30730,28,6496,246
4,DE,DM30730,35,6553,267


## Train with default parameters

In [10]:
# Build the GBM configuration, overriding only split_method and tuning_times;
# every other field keeps its default value.
gbm_hyperparameter = GbmHyperParas(split_method='mask', tuning_times=3)

In [11]:
# Display the resolved configuration, including all default values.
gbm_hyperparameter

GbmHyperParas(label_col='label', delivery_id_col='DELIVERY_ID', time_id_col='TIME_ID', column_types={'TIME_ID': 'factor', 'label': 'factor'}, weight_column='weight_count', sent_count_col='sent_count', response_count_col='response_count', response_rate_col='response_rate', score_col='pred_rate', extra_attributes=[], use_time=True, alpha=None, delivery_id_transform_method='category_mean', ntrees=200, learn_rate=0.2, max_depth=10, categorical_encoding='enum', split_method='mask', attr_to_match=['DELIVERY_ID', 'TIME_ID'], tuning_times=3, max_try=1000, seed=1, split_proportion=0.7)

In [12]:
# Wrap the configuration in a GBM model and fit it on the selected subset.
gbm = GBM(gbm_hyperparameter)
# train() is the last expression of the cell; its return value is the GBM
# object whose repr is displayed below.
gbm.train(df_selected_i)

<sto.variants.gbm.prior_gbm.GBM at 0x1a2639bbe0>

In [13]:
# Get predicted rates for the entire dataset; the preview below shows them in
# the `pred_rate` column (the configured score_col).
df_pred = gbm.predict(df)

In [14]:
# Preview the first rows of the predictions.
df_pred.head()

Unnamed: 0,COUNTRY,DELIVERY_ID,TIME_ID,sent_count,response_count,pred_rate
0,DE,DM30589,0,6886,341,0.049875
1,DE,DM30589,1,7003,349,0.048228
2,DE,DM30589,2,7001,319,0.048342
3,DE,DM30589,3,7109,362,0.051572
4,DE,DM30589,4,7240,358,0.048171


In [15]:
# Evaluate on the entire dataset.
# NOTE(review): `df` presumably still contains the rows used for training
# (df_selected_i); use df_test instead if a held-out metric is wanted.
# Confirm which is intended.
metric = gbm.evaluate(df)
metric

OrderedDict([('ap_at_3_lift_del_mean', 0.05771903130418751),
             ('ap_at_3_lift_del_std', 0.03284796966278112),
             ('top_1_lift_del_mean', 0.052776587912613714),
             ('top_1_lift_del_std', 0.04857966164702544)])

## Change hyper parameters and retrain the model

###### One option

In [16]:
# Option 1: pass the overrides as constructor keyword arguments.
# Note that this object is replaced by the next cell before any model is trained.
gbm_hyperparameter = GbmHyperParas(split_method='mask', tuning_times=3, 
                                   ntrees=50, learn_rate=0.1)

###### The other option

In [17]:
# Option 2: start from the defaults, then overwrite attributes on the instance.
gbm_hyperparameter = GbmHyperParas(split_method='mask', tuning_times=3)
gbm_hyperparameter.ntrees = 500
gbm_hyperparameter.learn_rate = 0.01
# Display the configuration to confirm that the overrides took effect.
gbm_hyperparameter

GbmHyperParas(label_col='label', delivery_id_col='DELIVERY_ID', time_id_col='TIME_ID', column_types={'TIME_ID': 'factor', 'label': 'factor'}, weight_column='weight_count', sent_count_col='sent_count', response_count_col='response_count', response_rate_col='response_rate', score_col='pred_rate', extra_attributes=[], use_time=True, alpha=None, delivery_id_transform_method='category_mean', ntrees=500, learn_rate=0.01, max_depth=10, categorical_encoding='enum', split_method='mask', attr_to_match=['DELIVERY_ID', 'TIME_ID'], tuning_times=3, max_try=1000, seed=1, split_proportion=0.7)

In [18]:
# Train a new model with the hyperparameters set above, then evaluate it on
# the entire dataset (the same data as the default-parameter run).
gbm = GBM(gbm_hyperparameter)
gbm.train(df_selected_i)

metric = gbm.evaluate(df)
metric

OrderedDict([('ap_at_3_lift_del_mean', 0.062351085850487625),
             ('ap_at_3_lift_del_std', 0.03149674165052149),
             ('top_1_lift_del_mean', 0.06690678411886386),
             ('top_1_lift_del_std', 0.043154608431337775)])

## Tuning the GBM hyperparameters

In [19]:
# Starting configuration for the tuning demo. The ntrees and learn_rate values
# set here are the ones the grid search below is expected to replace.
gbm_hyperparameter = GbmHyperParas(split_method='mask', tuning_times=3, 
                                   ntrees=10, learn_rate=0.5)
gbm = GBM(gbm_hyperparameter)

In [20]:
# Original hyperparameters, before tuning.
gbm.hyperparams

GbmHyperParas(label_col='label', delivery_id_col='DELIVERY_ID', time_id_col='TIME_ID', column_types={'TIME_ID': 'factor', 'label': 'factor'}, weight_column='weight_count', sent_count_col='sent_count', response_count_col='response_count', response_rate_col='response_rate', score_col='pred_rate', extra_attributes=[], use_time=True, alpha=None, delivery_id_transform_method='category_mean', ntrees=10, learn_rate=0.5, max_depth=10, categorical_encoding='enum', split_method='mask', attr_to_match=['DELIVERY_ID', 'TIME_ID'], tuning_times=3, max_try=1000, seed=1, split_proportion=0.7)

In [21]:
# Grid of candidate values to tune over: 2 x 2 x 2 = 8 combinations.
paras_dict = {'ntrees': [20, 30], 'learn_rate': [0.1, 0.2], 'max_depth': [10, 20]}
# With tuning=True, train() iterates over the combinations and prints the
# selected one (see the log below).
gbm.train(df_selected_i, para_dict=paras_dict, tuning=True)

Tuning parameters ...
0 (1)1 (1)2 (2)
Total para combinations: 8
Current iter: 0,1,2,3,4,5,6,7,Complete.
{'learn_rate': 0.1, 'max_depth': 20, 'ntrees': 30}


<sto.variants.gbm.prior_gbm.GBM at 0x1a26dc50b8>

In [22]:
# The hyperparams held by gbm have been set to the best combination found,
# and the model has been retrained with them.
gbm.hyperparams

GbmHyperParas(label_col='label', delivery_id_col='DELIVERY_ID', time_id_col='TIME_ID', column_types={'TIME_ID': 'factor', 'label': 'factor'}, weight_column='weight_count', sent_count_col='sent_count', response_count_col='response_count', response_rate_col='response_rate', score_col='pred_rate', extra_attributes=[], use_time=True, alpha=None, delivery_id_transform_method='category_mean', ntrees=30, learn_rate=0.1, max_depth=20, categorical_encoding='enum', split_method='mask', attr_to_match=['DELIVERY_ID', 'TIME_ID'], tuning_times=3, max_try=1000, seed=1, split_proportion=0.7)

In [23]:
# Evaluate the tuned model on the entire dataset.
# NOTE(review): the recorded output is identical, digit for digit, to the
# default-parameter run earlier in this notebook. Confirm that the tuned model
# is the one being evaluated and that this output is not stale.
gbm.evaluate(df)

OrderedDict([('ap_at_3_lift_del_mean', 0.05771903130418751),
             ('ap_at_3_lift_del_std', 0.03284796966278112),
             ('top_1_lift_del_mean', 0.052776587912613714),
             ('top_1_lift_del_std', 0.04857966164702544)])

# Train model with extra attributes

In [24]:
# Load the Adobe.com delivery counts and normalise them in one chained
# expression, with no in-place mutation: rename the count columns to the
# lower-case names used elsewhere in this notebook, and parse DATE into
# datetimes so it can be compared against the Timestamp ranges defined below.
dat_adobecom = (
    pd.read_csv('../data/Adobecom_delivery_counts.csv')
    .rename(columns={"SENT_COUNT": "sent_count", "RESPONSE_COUNT": "response_count"})
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE']))
)

In [25]:
# Preview the first rows, including the extra attribute columns.
dat_adobecom.head()

Unnamed: 0,PRODUCT_PROMOTED,INDUSTRY,REGION,PURPOSE,DATE,DELIVERY_ID,TIME_ID,sent_count,response_count
0,Creative Cloud,Commercial,North America,Other,2019-04-09,DM337552,6,8979,1012.0
1,Creative Cloud,Commercial,EMEA,Promo,2019-05-22,DM346463,13,986,48.0
2,Document Cloud,Commercial,North America,Conversion,2019-06-18,DM351255,14,98579,15525.0
3,Creative Cloud,Creative Cloud Membership,Japan,Direct/Channel Sales,2019-07-09,DM355416,14,1356,68.0
4,Creative Cloud,Commercial,EMEA,Conversion,2019-07-02,DM353699,9,28923,6900.0


In [26]:
# Train and test date ranges, each given as [start, end].
train_date_range = [
    pd.Timestamp(year=2019, month=4, day=1),
    pd.Timestamp(year=2019, month=6, day=15),
]
test_date_range = [
    pd.Timestamp(year=2019, month=6, day=16),
    pd.Timestamp(year=2019, month=7, day=3),
]
# Sent-count thresholds, passed to select_tr_te_dat as sent_count_thred.
thred_list = [100]

In [27]:
# Additional descriptive columns supplied to the model on top of the
# delivery id and time id.
extra_attributes = [
    'PRODUCT_PROMOTED',
    'PURPOSE',
    'INDUSTRY',
    'REGION',
]

In [28]:
# Aliases named after the keyword arguments of select_tr_te_dat.
training_time_range, testing_time_range = train_date_range, test_date_range
sent_count_thred = thred_list
# Column names for the delivery date and the sent count.
date_col = 'DATE'
sent_count_col = "sent_count"

In [29]:
# Columns to match on: the extra attributes plus the time id.
cols_to_match = extra_attributes + ['TIME_ID']
# Split into an in-time (training) part and an out-of-time (test) part.
# The arguments reuse the aliases defined in the preceding cell instead of
# repeating the same literals, so each setting lives in one place.
df_intime_whole, df_oot_whole = select_tr_te_dat(
    df=dat_adobecom,
    training_time_range=training_time_range,
    testing_time_range=testing_time_range,
    sent_count_thred=sent_count_thred,
    date_col=date_col,
    sent_count_col=sent_count_col,
    cols_to_match=cols_to_match,
)

In [30]:
# Pass extra_attributes to GbmHyperParas (it defaults to an empty list) so the
# model uses those columns; this run uses split_method='split'.
gbm_hyperparameter = GbmHyperParas(split_method='split', tuning_times=3,
                                  extra_attributes=extra_attributes)
GBM_extra_attr_main = GBM(gbm_hyperparameter)
# Small grid: only ntrees varies, giving 2 combinations.
paras_dict = {'ntrees': [20, 30], 'learn_rate': [0.1], 'max_depth': [10]}

# NOTE(review): df_intime_whole is passed as returned, while df_oot_whole is
# indexed with [0] below. Confirm the return shapes of select_tr_te_dat.
GBM_extra_attr_main.train(df_intime_whole, para_dict=paras_dict, tuning=True)

# Evaluate the tuned model on the first out-of-time frame.
GBM_extra_attr_main.evaluate(df_oot_whole[0])

Tuning parameters ...
0,(1)1,(1)2,(1)
Total para combinations: 2
Current iter: 0,1,Complete.
{'learn_rate': 0.1, 'max_depth': 10, 'ntrees': 20}


OrderedDict([('ap_at_3_lift_del_mean', 0.036498556493208024),
             ('ap_at_3_lift_del_std', 0.2014212540289617),
             ('top_1_lift_del_mean', 0.1410067163447342),
             ('top_1_lift_del_std', 0.4058157250600699)])

In [31]:
# Shut down the H2O cluster now that the walkthrough is finished.
h2o.cluster().shutdown()

H2O session _sid_bf12 closed.
