In [3]:
import gc
import pandas as pd
import numpy as np
from datetime import datetime

from xgboost import XGBClassifier
from sklift.models import ClassTransformation

from sklift.metrics import uplift_at_k
import matplotlib.pyplot as plt
import catboost as cb
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import optuna
import pandas as pd
from pandas import read_csv

%matplotlib inline

In [4]:
data = read_csv('dataset.csv')
y = data['response_att']
X = data.copy()

features_list = ['group', 'response_att',
 'perdelta_days_between_visits_15_30d',
 'k_var_days_between_visits_1m',
 'k_var_days_between_visits_3m',
 'k_var_days_between_visits_15d',
 'cheque_count_6m_g48',
 'response_sms',
 'cheque_count_6m_g40',
 'k_var_count_per_cheque_6m_g27',
 'k_var_disc_share_6m_g27',
 'response_viber',
 'k_var_discount_depth_1m',
 'k_var_sku_price_6m_g48',
 'cheque_count_6m_g25',
 'cheque_count_12m_g41',
 'cheque_count_6m_g41']

X = X.filter(features_list)
pd.set_option('display.max_columns', 500)
X.head(5)

Unnamed: 0,group,response_att,perdelta_days_between_visits_15_30d,k_var_days_between_visits_1m,k_var_days_between_visits_3m,k_var_days_between_visits_15d,cheque_count_6m_g48,response_sms,cheque_count_6m_g40,k_var_count_per_cheque_6m_g27,k_var_disc_share_6m_g27,response_viber,k_var_discount_depth_1m,k_var_sku_price_6m_g48,cheque_count_6m_g25,cheque_count_12m_g41,cheque_count_6m_g41
0,1,0,1.3393,0.6479,0.824,0.4554,6.0,0.923077,5.0,0.4845,0.6366,0.071429,0.4864,0.6718,9.0,6.0,1.0
1,1,0,0.0,0.0,1.0027,0.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0
2,0,0,0.0,0.0,0.5559,0.0,6.0,1.0,7.0,0.5762,0.856,0.25,0.1201,0.2407,9.0,14.0,8.0
3,1,0,0.0,0.0,0.7432,0.0,3.0,0.909091,4.0,0.3295,0.578,0.0,0.0,0.1028,11.0,8.0,2.0
4,1,0,0.0,0.4714,0.998,0.4714,4.0,1.0,8.0,0.7526,0.9058,0.1,0.4903,0.2195,2.0,3.0,2.0


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [6]:
import azureml.core
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core import Experiment, Workspace

# Check core SDK version number
print("You are currently using version", azureml.core.VERSION, "of the Azure ML SDK")
print("")

# Log In to Azure ML Workspace
interactive_auth = InteractiveLoginAuthentication(tenant_id="76f90eb1-fb9a-4446-9875-4d323d6455ad")

ws = Workspace.from_config(auth=interactive_auth)
print('Workspace name: ' + ws.name, sep='\n')

You are currently using version 1.5.0 of the Azure ML SDK

Workspace name: team19


In [7]:
def custom_metric(answers, take_top_ratio=0.25):
    answers.sort_values(by='uplift', inplace=True, ascending=False)
    
    n_samples = int(np.ceil(answers.shape[0] * take_top_ratio))
    answers = answers.iloc[:n_samples, :]
    answers_test = answers[answers['group'] == 1]['response_att'].sum() / \
                   answers[answers['group'] == 1].shape[0]
    answers_control = answers[answers['group'] == 0]['response_att'].sum() / \
                      answers[answers['group'] == 0].shape[0]
    return (answers_test - answers_control) * 100

In [8]:
# Get an experiment object from Azure Machine Learning

In [9]:
#i = 0

In [29]:
import numpy as np
from tqdm import tqdm
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from random import randrange


%matplotlib inline


# list of numbers from 0 to 1.0 with a 0.05 interval
lrs = list(np.arange(0.169, 0.171, 0.001))
depth = list(np.arange(5, 6, 1))
estimators = list(np.arange(57, 63, 1))

# try a bunch of alpha values in a Linear Regression (Ridge) model
experiment = Experiment(workspace=ws, name="random_try")

max_score = 0

for max_depth in depth:
    for n_estimators in estimators:
        for lr in lrs:
            print('lr', lr)
            print('depth', max_depth)
            print('n_estimators', n_estimators)
            seed = randrange(10000)
            print('seed', seed)
            with experiment.start_logging(snapshot_directory=None) as run:
                run.log(name='lr', value=lr)
                run.log(name='depth', value=max_depth)
                run.log(name='n_estimators', value=n_estimators)

                xgb_est_params = {
                    'max_depth':max_depth,
                    'learning_rate': lr, 
                    'n_estimators': n_estimators,
                    'nthread':64,
                    'n_gpus':1,
                    'seed':seed
                }

                estimator = XGBClassifier(
                    **xgb_est_params
                )


                uplift_model_cl_tr = ClassTransformation(
                    estimator=estimator
                )

                uplift_model_cl_tr.fit(
                    X=X_train.drop(columns=['group', 'response_att']),
                    y=X_train['response_att'],
                    treatment=X_train['group']
                )

                uplift_ts = uplift_model_cl_tr.predict(
                    X_test.drop(columns=['group', 'response_att'])
                )

                df_submit = X_test.assign(uplift=uplift_ts)[['uplift']]

                print(f'Submit data shape: {df_submit.shape}\n')
                df_submit.head(2)

                df_submit['group'] = X_test['group']
                df_submit['response_att'] = X_test['response_att']
                score = custom_metric(df_submit)
                print('score ', score)
                run.log(name='score', value=score)
                
                if score > max_score:
                    max_score = score
                    run.tag("Best cur")

#                 x = []
#                 answers = []
#                 num = 100
#                 for i in range(20, num + 1):
#                     x.append(1.0 * i / num)
#                     answers.append(custom_metric(df_submit, take_top_ratio=1.0 * i/ num))

#                 plt.plot(x, answers)
#                 run.log_image(name="Score plot", plot=plt)
#                 plt.show()
        
        #joblib.dump(value=uplift_model_cl_tr, filename='outputs/model' + str(i) + '.pkl')
        #i += 1

lr 0.169
depth 5
n_estimators 57
seed 5008



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.291440871616369
lr 0.17
depth 5
n_estimators 57
seed 9231



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.232487566798015
lr 0.171
depth 5
n_estimators 57
seed 9578



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.241373961972929
lr 0.169
depth 5
n_estimators 58
seed 8845



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.291440871616369
lr 0.17
depth 5
n_estimators 58
seed 4865



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.250983628535517
lr 0.171
depth 5
n_estimators 58
seed 825



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.247141166347594
lr 0.169
depth 5
n_estimators 59
seed 7273



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.298851238600256
lr 0.17
depth 5
n_estimators 59
seed 1335



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.244762371313414
lr 0.171
depth 5
n_estimators 59
seed 5195



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.23189230406489
lr 0.169
depth 5
n_estimators 60
seed 3913



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.298610305493882
lr 0.17
depth 5
n_estimators 60
seed 3116



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.204577068310016
lr 0.171
depth 5
n_estimators 60
seed 8401



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.289123183248818
lr 0.169
depth 5
n_estimators 61
seed 2995



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.296284134886621
lr 0.17
depth 5
n_estimators 61
seed 7443



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.210119383889465
lr 0.171
depth 5
n_estimators 61
seed 6590



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.291165442424656
lr 0.169
depth 5
n_estimators 62
seed 9395



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.2361684313893555
lr 0.17
depth 5
n_estimators 62
seed 9420



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.272788311599617
lr 0.171
depth 5
n_estimators 62
seed 3225



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.284941055177814


In [11]:
runs = {}
run_metrics = {}

# Create dictionaries containing the runs and the metrics for all runs containing the 'mse' metric
for r in tqdm(experiment.get_runs()):
    metrics = r.get_metrics()
    if 'score' in metrics.keys():
        runs[r.id] = r
        run_metrics[r.id] = metrics

# Find the run with the best (lowest) mean squared error and display the id and metrics
best_run_id = max(run_metrics, key = lambda k: run_metrics[k]['score'])
best_run = runs[best_run_id]
print('Best run is:', best_run_id)
print('Metrics:', run_metrics[best_run_id])

# Tag the best run for identification later
best_run.tag("Best Run")

22it [00:05,  4.05it/s]


Best run is: fb524d86-f5f6-4a3d-a3cb-4e8d66e11f6a
Metrics: {'lr': 0.2, 'score': 6.087121555088201}
