In [1]:
import gc
import pandas as pd
import numpy as np
from datetime import datetime

from xgboost import XGBClassifier
from sklift.models import ClassTransformation

from sklift.metrics import uplift_at_k
import matplotlib.pyplot as plt
import catboost as cb
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import optuna
import pandas as pd
from pandas import read_csv

%matplotlib inline

In [3]:
# Load the raw dataset; the target column is kept separately as `y`.
data = read_csv('dataset.csv')
y = data['response_att']

# Columns kept in the modelling frame. 'group' (treatment flag) and
# 'response_att' (target) are deliberately retained in X because later cells
# read them back from X_train / X_test and drop them before fitting.
features_list = ['group', 'response_att',
 'perdelta_days_between_visits_15_30d',
 'k_var_days_between_visits_1m',
 'k_var_days_between_visits_3m',
 'k_var_days_between_visits_15d',
 'cheque_count_6m_g48',
 'response_sms',
 'response_viber'
]

# `filter` already returns a new DataFrame, so no prior `data.copy()` of the
# whole raw frame is needed.
X = data.filter(items=features_list)
pd.set_option('display.max_columns', 500)
X.head(5)

Unnamed: 0,group,response_att,perdelta_days_between_visits_15_30d,k_var_days_between_visits_1m,k_var_days_between_visits_3m,k_var_days_between_visits_15d,cheque_count_6m_g48,response_sms,response_viber
0,1,0,1.3393,0.6479,0.824,0.4554,6.0,0.923077,0.071429
1,1,0,0.0,0.0,1.0027,0.0,1.0,1.0,0.0
2,0,0,0.0,0.0,0.5559,0.0,6.0,1.0,0.25
3,1,0,0.0,0.0,0.7432,0.0,3.0,0.909091,0.0
4,1,0,0.0,0.4714,0.998,0.4714,4.0,1.0,0.1


In [4]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [5]:
import os

import azureml.core
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core import Experiment, Workspace

# Check core SDK version number
print("You are currently using version", azureml.core.VERSION, "of the Azure ML SDK")
print("")

# Log in to the Azure ML workspace.
# The tenant can be overridden through the AZURE_TENANT_ID environment
# variable; when it is not set, the original tenant id is used as a fallback
# so the notebook keeps working unchanged.
tenant_id = os.environ.get("AZURE_TENANT_ID", "76f90eb1-fb9a-4446-9875-4d323d6455ad")
interactive_auth = InteractiveLoginAuthentication(tenant_id=tenant_id)

# Workspace details are read from the local Azure ML config file.
ws = Workspace.from_config(auth=interactive_auth)
print('Workspace name: ' + ws.name)

You are currently using version 1.5.0 of the Azure ML SDK

Workspace name: team19


In [6]:
def custom_metric(answers, take_top_ratio=0.25):
    """Uplift in response rate (in percentage points) among the top-ranked rows.

    Rows are ranked by the 'uplift' column in descending order and the top
    ``take_top_ratio`` share (rounded up) is kept. Within that slice the
    response rate of ``group == 1`` minus the response rate of ``group == 0``
    is returned, multiplied by 100.

    Parameters
    ----------
    answers : pandas.DataFrame
        Must contain the columns 'uplift', 'group' and 'response_att'.
        The frame is NOT modified (the ranking is done on a sorted copy).
    take_top_ratio : float, default 0.25
        Share of rows to keep after ranking.

    Returns
    -------
    float
        ``(rate_group_1 - rate_group_0) * 100`` over the selected rows.
        If one of the groups has no rows in the selected slice the rate is a
        0/0 division, so the result is not a finite number.
    """
    # Sort into a new frame: an in-place sort would silently reorder the
    # caller's DataFrame as a side effect of computing a metric.
    ranked = answers.sort_values(by='uplift', ascending=False)

    n_samples = int(np.ceil(ranked.shape[0] * take_top_ratio))
    top = ranked.iloc[:n_samples, :]

    treated = top[top['group'] == 1]['response_att']
    control = top[top['group'] == 0]['response_att']

    treated_rate = treated.sum() / treated.shape[0]
    control_rate = control.sum() / control.shape[0]
    return (treated_rate - control_rate) * 100

In [None]:
# Get an experiment object from Azure Machine Learning

In [None]:
#i = 0

In [10]:
import numpy as np
from tqdm import tqdm
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from random import randrange

# Hyper-parameter grid for the XGBoost base estimator.
# NOTE: np.arange(0.21, 0.25, 1) uses a step of 1, so `lrs` holds the single
# value 0.21, and `depth` holds the single value 2. Only n_estimators
# (25, 30, ..., 195) is actually swept.
lrs = list(np.arange(0.21, 0.25, 1))
depth = list(np.arange(2, 3, 1))
estimators = list(np.arange(25, 200, 5))

# Every grid point is tracked as its own run of this Azure ML experiment.
experiment = Experiment(workspace=ws, name="7-features")

# The feature matrices do not depend on the hyper-parameters, so build them
# once instead of re-dropping the columns on every iteration.
non_feature_columns = ['group', 'response_att']
train_features = X_train.drop(columns=non_feature_columns)
test_features = X_test.drop(columns=non_feature_columns)

# Best score seen so far in this sweep; runs that beat it get tagged.
max_score = 0

for max_depth in depth:
    for n_estimators in estimators:
        for lr in lrs:
            print('lr', lr)
            print('depth', max_depth)
            print('n_estimators', n_estimators)
            # A fresh random seed is drawn for every run.
            seed = randrange(10000)
            print('seed', seed)

            with experiment.start_logging(snapshot_directory=None) as run:
                run.log(name='lr', value=lr)
                run.log(name='depth', value=max_depth)
                run.log(name='n_estimators', value=n_estimators)
                # Log the seed as well; otherwise a run cannot be reproduced
                # from its recorded metrics alone.
                run.log(name='seed', value=seed)

                xgb_est_params = {
                    'max_depth': max_depth,
                    'learning_rate': lr,
                    'n_estimators': n_estimators,
                    'nthread': 64,
                    'n_gpus': 1,
                    'seed': seed
                }

                estimator = XGBClassifier(**xgb_est_params)

                # Uplift model wrapping the classifier (sklift ClassTransformation).
                uplift_model_cl_tr = ClassTransformation(estimator=estimator)

                uplift_model_cl_tr.fit(
                    X=train_features,
                    y=X_train['response_att'],
                    treatment=X_train['group']
                )

                uplift_ts = uplift_model_cl_tr.predict(test_features)

                # Start from the predicted uplift only (shape printed below),
                # then attach the columns custom_metric needs.
                df_submit = X_test.assign(uplift=uplift_ts)[['uplift']]

                print(f'Submit data shape: {df_submit.shape}\n')

                df_submit['group'] = X_test['group']
                df_submit['response_att'] = X_test['response_att']
                score = custom_metric(df_submit)
                print('score ', score)
                run.log(name='score', value=score)

                # Tag every run that improves on the best score so far.
                if score > max_score:
                    max_score = score
                    run.tag("Best cur")

             #   x = []
              #  answers = []
               # num = 100
              #  for i in range(20, num + 1):
               #     x.append(1.0 * i / num)
                #    answers.append(custom_metric(df_submit, take_top_ratio=1.0 * i/ num))

              #  plt.plot(x, answers)
              #  run.log_image(name="Score plot", plot=plt)
              #  plt.show()
        
        #joblib.dump(value=uplift_model_cl_tr, filename='outputs/model' + str(i) + '.pkl')
        #i += 1

lr 0.21
depth 2
n_estimators 25
seed 7538



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.2775684180707305
lr 0.21
depth 2
n_estimators 30
seed 1879



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.295756646765765
lr 0.21
depth 2
n_estimators 35
seed 5788



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.333309993954394
lr 0.21
depth 2
n_estimators 40
seed 5374



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.355778946639534
lr 0.21
depth 2
n_estimators 45
seed 9213



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.368567824667254
lr 0.21
depth 2
n_estimators 50
seed 3682



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.355111949853079
lr 0.21
depth 2
n_estimators 55
seed 3146



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.374623418188436
lr 0.21
depth 2
n_estimators 60
seed 3340



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.369763714971049
lr 0.21
depth 2
n_estimators 65
seed 8510



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.41592497242009
lr 0.21
depth 2
n_estimators 70
seed 7725



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.416613967682617
lr 0.21
depth 2
n_estimators 75
seed 9366



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.437748779826599
lr 0.21
depth 2
n_estimators 80
seed 6038



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.405400979763384
lr 0.21
depth 2
n_estimators 85
seed 3985



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.393136486716367
lr 0.21
depth 2
n_estimators 90
seed 9148



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.369084264945904
lr 0.21
depth 2
n_estimators 95
seed 4096



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.332651603214876
lr 0.21
depth 2
n_estimators 100
seed 5201



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.298090962433308
lr 0.21
depth 2
n_estimators 105
seed 6105



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.31565301638852
lr 0.21
depth 2
n_estimators 110
seed 2642



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.310720966959743
lr 0.21
depth 2
n_estimators 115
seed 7297



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.325512368335906
lr 0.21
depth 2
n_estimators 120
seed 5325



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.32889914481071
lr 0.21
depth 2
n_estimators 125
seed 9012



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.345182375175687
lr 0.21
depth 2
n_estimators 130
seed 5720



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.341388310090731
lr 0.21
depth 2
n_estimators 135
seed 6824



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.344309995958916
lr 0.21
depth 2
n_estimators 140
seed 5678



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.348443889332186
lr 0.21
depth 2
n_estimators 145
seed 2171



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.374118016493046
lr 0.21
depth 2
n_estimators 150
seed 496



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.345758286611764
lr 0.21
depth 2
n_estimators 155
seed 6873



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.334675805244183
lr 0.21
depth 2
n_estimators 160
seed 7009



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.333381020856982
lr 0.21
depth 2
n_estimators 165
seed 1365



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.321160180432534
lr 0.21
depth 2
n_estimators 170
seed 4552



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.291628230502619
lr 0.21
depth 2
n_estimators 175
seed 4914



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.257843199068125
lr 0.21
depth 2
n_estimators 180
seed 4098



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.269659257705992
lr 0.21
depth 2
n_estimators 185
seed 4722



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.289743099492378
lr 0.21
depth 2
n_estimators 190
seed 9010



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.294080362299787
lr 0.21
depth 2
n_estimators 195
seed 1342



It is recommended to use this approach on treatment balanced data. Current sample size is unbalanced.



Submit data shape: (206109, 1)

score  6.2934462985490995


In [11]:
runs = {}
run_metrics = {}

# Collect every run of the experiment that logged a 'score' metric,
# keyed by run id.
for r in tqdm(experiment.get_runs()):
    metrics = r.get_metrics()
    if 'score' in metrics:
        runs[r.id] = r
        run_metrics[r.id] = metrics

# Fail with a clear message instead of max()'s "empty sequence" error.
if not run_metrics:
    raise ValueError("No runs with a 'score' metric were found in the experiment")

# Higher score is better: pick the run with the maximum 'score'
# and display its id and metrics.
best_run_id = max(run_metrics, key=lambda k: run_metrics[k]['score'])
best_run = runs[best_run_id]
print('Best run is:', best_run_id)
print('Metrics:', run_metrics[best_run_id])

# Tag the best run for identification later
best_run.tag("Best Run")

22it [00:05,  4.05it/s]


Best run is: fb524d86-f5f6-4a3d-a3cb-4e8d66e11f6a
Metrics: {'lr': 0.2, 'score': 6.087121555088201}
