In [1]:
import gc
import pandas as pd
import numpy as np
from datetime import datetime

from xgboost import XGBClassifier
from sklift.models import ClassTransformation

from sklift.metrics import uplift_at_k
import matplotlib.pyplot as plt
import catboost as cb
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import optuna
import pandas as pd
from pandas import read_csv

%matplotlib inline

In [21]:
# Columns kept for modelling. 'group' (treatment flag) and 'response_att'
# (target) stay in the frame on purpose: later cells read them from
# X_train / X_test and drop them right before fitting and predicting.
features_list = [
    'group',
    'response_att',
    'perdelta_days_between_visits_15_30d',
    'k_var_days_between_visits_1m',
    'k_var_days_between_visits_3m',
    'k_var_days_between_visits_15d',
    'cheque_count_6m_g48',
    'response_sms',
    'cheque_count_6m_g40',
    'k_var_count_per_cheque_6m_g27',
    'k_var_disc_share_6m_g27',
    'response_viber',
    'k_var_discount_depth_1m',
    'k_var_sku_price_6m_g48',
    'cheque_count_6m_g25',
    'cheque_count_12m_g41',
    'cheque_count_6m_g41',
    'cheque_count_12m_g32',
    'sale_count_12m_g54',
]

# Load the raw table and keep the target as a separate series for the split.
data = read_csv('dataset.csv')
y = data['response_att']

# filter() keeps only the listed columns that actually exist in the file;
# names missing from the CSV are skipped silently rather than raising.
X = data.copy().filter(items=features_list)

# Widen the display limit so every selected column shows up in the preview.
pd.set_option('display.max_columns', 500)
X.head(5)

Unnamed: 0,group,response_att,perdelta_days_between_visits_15_30d,k_var_days_between_visits_1m,k_var_days_between_visits_3m,k_var_days_between_visits_15d,cheque_count_6m_g48,response_sms,cheque_count_6m_g40,k_var_count_per_cheque_6m_g27,k_var_disc_share_6m_g27,response_viber,k_var_discount_depth_1m,k_var_sku_price_6m_g48,cheque_count_6m_g25,cheque_count_12m_g41,cheque_count_6m_g41,cheque_count_12m_g32,sale_count_12m_g54
0,1,0,1.3393,0.6479,0.824,0.4554,6.0,0.923077,5.0,0.4845,0.6366,0.071429,0.4864,0.6718,9.0,6.0,1.0,3.0,16.0
1,1,0,0.0,0.0,1.0027,0.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0
2,0,0,0.0,0.0,0.5559,0.0,6.0,1.0,7.0,0.5762,0.856,0.25,0.1201,0.2407,9.0,14.0,8.0,4.0,109.0
3,1,0,0.0,0.0,0.7432,0.0,3.0,0.909091,4.0,0.3295,0.578,0.0,0.0,0.1028,11.0,8.0,2.0,2.0,39.0
4,1,0,0.0,0.4714,0.998,0.4714,4.0,1.0,8.0,0.7526,0.9058,0.1,0.4903,0.2195,2.0,3.0,2.0,0.0,25.0


In [22]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# partition identical between kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [4]:
import azureml.core
from azureml.core import Experiment, Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

# Report the installed SDK version so the run environment is visible in the output.
print(f"You are currently using version {azureml.core.VERSION} of the Azure ML SDK")
print()

# Interactive login against the given tenant, then load the workspace from
# the local Azure ML config file.
interactive_auth = InteractiveLoginAuthentication(
    tenant_id="76f90eb1-fb9a-4446-9875-4d323d6455ad"
)
ws = Workspace.from_config(auth=interactive_auth)

print(f'Workspace name: {ws.name}')

You are currently using version 1.5.0 of the Azure ML SDK

Workspace name: team19


In [5]:
def custom_metric(answers, take_top_ratio=0.25):
    """Uplift in response rate (percentage points) within the top-ranked rows.

    Rows are ranked by the 'uplift' column in descending order, the top
    ``take_top_ratio`` share is kept (row count rounded up), and the score is
    the response rate of the treated rows (group == 1) minus the response
    rate of the control rows (group == 0), multiplied by 100.

    Parameters
    ----------
    answers : pandas.DataFrame
        Must contain the columns 'uplift', 'group' and 'response_att'.
        The frame is not modified.
    take_top_ratio : float, default 0.25
        Share of rows, counted from the highest predicted uplift, to score.

    Returns
    -------
    float
        Difference of the two response rates times 100. NaN when the
        selected slice has no treated rows or no control rows.
    """
    # Sort into a new frame: an in-place sort would silently reorder the
    # caller's DataFrame as a side effect of computing a metric.
    ranked = answers.sort_values(by='uplift', ascending=False)

    n_samples = int(np.ceil(ranked.shape[0] * take_top_ratio))
    top = ranked.iloc[:n_samples]

    # Build each group mask once instead of re-evaluating it per use.
    treated = top.loc[top['group'] == 1, 'response_att']
    control = top.loc[top['group'] == 0, 'response_att']

    # mean() equals sum / count and yields NaN for an empty group.
    return (treated.mean() - control.mean()) * 100

In [15]:
# Get an experiment object from Azure Machine Learning

In [16]:
#i = 0

In [26]:
import numpy as np
from tqdm import tqdm
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from random import randrange

# Hyperparameter grid for the XGBoost base estimator.
# NOTE: np.arange(0.2, 0.25, 1) produces the single value 0.2, so only one
# learning rate is evaluated; widen the range / shrink the step to sweep more.
lrs = list(np.arange(0.2, 0.25, 1))
depth = list(np.arange(2, 5, 1))
estimators = list(np.arange(50, 150, 5))

# Every grid point is recorded as its own run of this Azure ML experiment.
experiment = Experiment(workspace=ws, name="more_cycling")

# 'group' (treatment flag) and 'response_att' (target) are carried inside
# X_train / X_test but must not be fed to the model as features. The feature
# matrices do not depend on the hyperparameters, so build them once.
service_columns = ['group', 'response_att']
X_train_features = X_train.drop(columns=service_columns)
X_test_features = X_test.drop(columns=service_columns)

# Start below any attainable score so that the best run so far is tagged
# even when every score is zero or negative.
max_score = float('-inf')

for max_depth in depth:
    for n_estimators in estimators:
        for lr in lrs:
            # A fresh random seed per run; it is logged below so the run can
            # be reproduced from the experiment record.
            seed = randrange(10000)

            print('lr', lr)
            print('depth', max_depth)
            print('n_estimators', n_estimators)
            print('seed', seed)

            with experiment.start_logging(snapshot_directory=None) as run:
                run.log(name='lr', value=lr)
                run.log(name='depth', value=max_depth)
                run.log(name='n_estimators', value=n_estimators)
                run.log(name='seed', value=seed)

                # NOTE(review): 'nthread', 'n_gpus' and 'seed' look like legacy
                # XGBoost parameter names - confirm the installed version
                # still honours them.
                xgb_est_params = {
                    'max_depth': max_depth,
                    'learning_rate': lr,
                    'n_estimators': n_estimators,
                    'nthread': 64,
                    'n_gpus': 1,
                    'seed': seed,
                }
                estimator = XGBClassifier(**xgb_est_params)

                # Class-transformation uplift model wrapped around the classifier.
                uplift_model_cl_tr = ClassTransformation(estimator=estimator)
                uplift_model_cl_tr.fit(
                    X=X_train_features,
                    y=X_train['response_att'],
                    treatment=X_train['group'],
                )

                uplift_ts = uplift_model_cl_tr.predict(X_test_features)

                # Scoring frame: predicted uplift plus the columns custom_metric reads.
                df_submit = X_test.assign(uplift=uplift_ts)[['uplift']]
                print(f'Submit data shape: {df_submit.shape}\n')

                df_submit['group'] = X_test['group']
                df_submit['response_att'] = X_test['response_att']

                score = custom_metric(df_submit)
                print('score ', score)
                run.log(name='score', value=score)

                # Tag the run whenever it beats everything seen so far in this sweep.
                if score > max_score:
                    max_score = score
                    run.tag("Best cur")

             #   x = []
              #  answers = []
               # num = 100
              #  for i in range(20, num + 1):
               #     x.append(1.0 * i / num)
                #    answers.append(custom_metric(df_submit, take_top_ratio=1.0 * i/ num))

              #  plt.plot(x, answers)
              #  run.log_image(name="Score plot", plot=plt)
              #  plt.show()
        
        #joblib.dump(value=uplift_model_cl_tr, filename='outputs/model' + str(i) + '.pkl')
        #i += 1

lr 0.2
depth 2
n_estimators 50
seed 768



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.315880298100263
lr 0.2
depth 2
n_estimators 55
seed 625



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.326713948177434
lr 0.2
depth 2
n_estimators 60
seed 5786



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.354368078385147
lr 0.2
depth 2
n_estimators 65
seed 7229



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.362267802758323
lr 0.2
depth 2
n_estimators 70
seed 9835



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.331017105137812
lr 0.2
depth 2
n_estimators 75
seed 6504



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.3438188407680896
lr 0.2
depth 2
n_estimators 80
seed 9494



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.3318038749922225
lr 0.2
depth 2
n_estimators 85
seed 2480



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.329021333748452
lr 0.2
depth 2
n_estimators 90
seed 4751



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.353910982779179
lr 0.2
depth 2
n_estimators 95
seed 5974



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.352701547107656
lr 0.2
depth 2
n_estimators 100
seed 6028



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.342565971382241
lr 0.2
depth 2
n_estimators 105
seed 5747



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.299686177172275
lr 0.2
depth 2
n_estimators 110
seed 524



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.314724849211373
lr 0.2
depth 2
n_estimators 115
seed 6797



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.333710095886319
lr 0.2
depth 2
n_estimators 120
seed 585



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.348700732073532
lr 0.2
depth 2
n_estimators 125
seed 4322



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.386955095589897
lr 0.2
depth 2
n_estimators 130
seed 986



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.361967550078037
lr 0.2
depth 2
n_estimators 135
seed 2598



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.336498213163033
lr 0.2
depth 2
n_estimators 140
seed 8591



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.329768823430304
lr 0.2
depth 2
n_estimators 145
seed 2013



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.327707036904109
lr 0.2
depth 3
n_estimators 50
seed 7831



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.405612542543765
lr 0.2
depth 3
n_estimators 55
seed 2573



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.483177759532782
lr 0.2
depth 3
n_estimators 60
seed 1262



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.464058482673382
lr 0.2
depth 3
n_estimators 65
seed 6702



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.473809381187437
lr 0.2
depth 3
n_estimators 70
seed 3796



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.45391567583925
lr 0.2
depth 3
n_estimators 75
seed 2010



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.453545328862523
lr 0.2
depth 3
n_estimators 80
seed 9401



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.493360590659955
lr 0.2
depth 3
n_estimators 85
seed 7314



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.466692493589388
lr 0.2
depth 3
n_estimators 90
seed 1093



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.494294573861328
lr 0.2
depth 3
n_estimators 95
seed 8213



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.490016957639683
lr 0.2
depth 3
n_estimators 100
seed 3126



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.4721934149845675
lr 0.2
depth 3
n_estimators 105
seed 8911



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.464351328760867
lr 0.2
depth 3
n_estimators 110
seed 5673



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.485878286666874
lr 0.2
depth 3
n_estimators 115
seed 2169



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.500094793664696
lr 0.2
depth 3
n_estimators 120
seed 3071



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.4933529972188815
lr 0.2
depth 3
n_estimators 125
seed 2257



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.415596300057191
lr 0.2
depth 3
n_estimators 130
seed 8728



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.438986636789975
lr 0.2
depth 3
n_estimators 135
seed 7093



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.424854657277301
lr 0.2
depth 3
n_estimators 140
seed 6058



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.4485879389837795
lr 0.2
depth 3
n_estimators 145
seed 5676



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.444815953439886
lr 0.2
depth 4
n_estimators 50
seed 6666



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.195421904723378
lr 0.2
depth 4
n_estimators 55
seed 789



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.217662862758286
lr 0.2
depth 4
n_estimators 60
seed 759



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.214326011681842
lr 0.2
depth 4
n_estimators 65
seed 4924



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.189341787669409
lr 0.2
depth 4
n_estimators 70
seed 3909



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.2713677498010325
lr 0.2
depth 4
n_estimators 75
seed 2216



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.2518557136155755
lr 0.2
depth 4
n_estimators 80
seed 7689



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.255356134828044
lr 0.2
depth 4
n_estimators 85
seed 3676



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.208170652904569
lr 0.2
depth 4
n_estimators 90
seed 7399



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.230247401104824
lr 0.2
depth 4
n_estimators 95
seed 1706



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.254551295881602
lr 0.2
depth 4
n_estimators 100
seed 4356



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.241700779354612
lr 0.2
depth 4
n_estimators 105
seed 7816



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.2645444082930855
lr 0.2
depth 4
n_estimators 110
seed 4887



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.325009433870762
lr 0.2
depth 4
n_estimators 115
seed 4502



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.363795051962862
lr 0.2
depth 4
n_estimators 120
seed 4180



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.30357005058258
lr 0.2
depth 4
n_estimators 125
seed 5339



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.322250145318472
lr 0.2
depth 4
n_estimators 130
seed 3025



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.341507146555592
lr 0.2
depth 4
n_estimators 135
seed 3914



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.331374353768812
lr 0.2
depth 4
n_estimators 140
seed 4321



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.333901059714686
lr 0.2
depth 4
n_estimators 145
seed 9092



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.304052335840576


In [11]:
runs = {}
run_metrics = {}

# Collect every run of the experiment that logged a 'score' metric,
# keyed by run id.
for r in tqdm(experiment.get_runs()):
    metrics = r.get_metrics()
    if 'score' in metrics:
        runs[r.id] = r
        run_metrics[r.id] = metrics

# max() on an empty mapping fails with an opaque message; say what is missing.
if not run_metrics:
    raise ValueError("No runs with a 'score' metric were found in the experiment")

# A higher score is better here, so the best run is the one with the
# maximum logged score.
best_run_id = max(run_metrics, key=lambda run_id: run_metrics[run_id]['score'])
best_run = runs[best_run_id]
print('Best run is:', best_run_id)
print('Metrics:', run_metrics[best_run_id])

# Tag the best run for identification later
best_run.tag("Best Run")

22it [00:05,  4.05it/s]


Best run is: fb524d86-f5f6-4a3d-a3cb-4e8d66e11f6a
Metrics: {'lr': 0.2, 'score': 6.087121555088201}
