# Libs

In [19]:
import pandas as pd; pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from datetime import datetime
import time

import warnings
import numpy as np

from uplift.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Data extraction

In [2]:
%matplotlib inline


# Чтение данных
df_clients = pd.read_csv('data/clients.csv', index_col='client_id')
df_train = pd.read_csv('data/uplift_train.csv', index_col='client_id')
df_test = pd.read_csv('data/uplift_test.csv', index_col='client_id')

# Извлечение признаков

df_clients['first_issue_time'] = \
    (pd.to_datetime(df_clients['first_issue_date'])
     - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
df_clients['first_redeem_time'] = \
    (pd.to_datetime(df_clients['first_redeem_date'])
     - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
df_clients['issue_redeem_delay'] = df_clients['first_redeem_time'] \
    - df_clients['first_issue_time']
df_clients = df_clients.drop(['first_issue_date', 'first_redeem_date'], axis=1)

df_features = pd.DataFrame({
    'gender_M': (df_clients['gender'] == 'M').astype(int),
    'gender_F': (df_clients['gender'] == 'F').astype(int),
    'gender_U': (df_clients['gender'] == 'U').astype(int),
    'age': df_clients['age'],
    'first_issue_time': df_clients['first_issue_time'],
    'first_redeem_time': df_clients['first_redeem_time'],
    'issue_redeem_delay': df_clients['issue_redeem_delay'],
}).fillna(0)

indices_train = df_train.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=123)

In [7]:
print(indices_train)

Index(['000012768d', '000036f903', '00010925a5', '0001f552b0', '00020e7b18',
       '000220a0a7', '00022fd34f', '0002ce2217', '00031cbbe6', '00035a21d9',
       ...
       'fffb52b456', 'fffb9a1eaa', 'fffbdaf8ed', 'fffc2e37ab', 'fffc2eadcb',
       'fffcb91f10', 'fffd5cd0c6', 'fffd63dfe3', 'fffd8c9d7d', 'fffe0abb97'],
      dtype='object', name='client_id', length=200035)


In [10]:
X_train = df_features.loc[indices_train, :]
y_train = df_train.loc[indices_train, 'target']
treat_train = df_train.loc[indices_train, 'treatment_flg']

X_val = df_features.loc[indices_valid, :]
y_val = df_train.loc[indices_valid, 'target']
treat_val =  df_train.loc[indices_valid, 'treatment_flg']

X_train_full = df_features.loc[indices_train, :]
y_train_full = df_train.loc[:, 'target']
treat_train_full = df_train.loc[:, 'treatment_flg']



print(df_features.head())
cat_features = ['gender']

models_results = {
    'approach': [],
    'uplift@30%': []
}
print(models_results)

            gender_M  gender_F  gender_U  age  first_issue_time  \
client_id                                                         
000012768d         0         0         1   45        1501947648   
000036f903         0         1         0   72        1491832463   
000048b7a6         0         1         0   68        1544880791   
000073194a         0         1         0   60        1495544174   
00007c7133         0         0         1   67        1495469828   

            first_redeem_time  issue_redeem_delay  
client_id                                          
000012768d       1.515094e+09          13146559.0  
000036f903       1.492951e+09           1118613.0  
000048b7a6       0.000000e+00                 0.0  
000073194a       1.511522e+09          15978107.0  
00007c7133       1.546277e+09          50806825.0  
{'approach': [], 'uplift@30%': []}


# Validation RF

In [6]:
from pprint import pprint


n_estimators = [int(x) for x in np.linspace(start = 1, stop = 45, num = 5)]
# Number of features to consider at every split
max_features = [0.1, 0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
#min_samples_split = [500, 1000, 2000, 5000, 10000]
# Minimum number of samples required at each leaf node
min_samples_leaf = [500, 1000, 2000, 5000, 10000]
# Method of selecting samples for training each tree
#bootstrap = [True, False]


param_grid = {
              'max_features':max_features,
               'min_samples_leaf': min_samples_leaf}

pprint(param_grid)

{'max_features': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
 'min_samples_leaf': [500, 1000, 2000, 5000, 10000]}


In [8]:
from sklearn.model_selection import ParameterGrid

grid_list = list(ParameterGrid(param_grid))
grid_size = len(grid_list)

In [26]:
num_folds = 5
val_count = (df_train.shape[0]) // num_folds #volume of validation set
train_count = (df_train.shape[0]) - val_count
print(train_count)
print(df_train.shape[0])

160028
200035


In [69]:
from uplift.metrics import uplift_at_k,qini_auc_score
accuracy_auqc = {}
accuracy = {}
all_accuracy = []
all_accuracy_auqc = []
time = []
best_param = {}
best_param_auqc = {}

indices_all_folds = np.array_split(indices_learn, num_folds)
#cross validation
for i in range(num_folds):
    
    x_val = df_features.loc[indices_all_folds[i]].values
    y_val = df_train.loc[indices_all_folds[i],'target'].values
    treatment_val = df_train.loc[indices_all_folds[i],'treatment_flg'].values
    
    
    united = np.vstack(indices_all_folds[0:i]+indices_all_folds[i+1:])
    united = united.reshape(train_count)
    
    x_train = df_features.loc[united].values
    y_train = df_train.loc[united, 'target'].values
    treatment_train = df_train.loc[united, 'treatment_flg'].values
    best_acc = 0.0
    best_auqc = 0.0
    for params in grid_list:
        
        rfc = RandomForestClassifier(n_estimators = 25, min_samples_leaf= params["min_samples_leaf"], 
                                     max_features = params["max_features"], max_depth = None, criterion='uplift_gini')
        start_time = datetime.now()
        rfc.fit(x_train,y_train,treatment_train)
        time.append(datetime.now() - start_time)
        pred = rfc.predict_uplift(x_val)
        #acc = np.mean(pred==y_val)
        acc = uplift_at_k(y_true=y_val, uplift=pred, treatment=treatment_val,strategy= "by_group", k=0.3)
        auqc = qini_auc_score(y_true = y_val, uplift=pred, treatment=treatment_val)
        all_accuracy.append(acc)
        all_accuracy_auqc.append(auqc)
        if acc > best_acc:
            accuracy[i] = acc
            best_param[i] = [params["max_features"],params["min_samples_leaf"],  'uplift_gini']
            print("fold {}, acc {}, params {}".format(i,acc,[params["max_features"],params["min_samples_leaf"],  'uplift_gini']))
            best_acc = acc
        if auqc > best_auqc:
            accuracy_auqc[i] = auqc
            best_param_auqc[i] =  [params["max_features"],params["min_samples_leaf"],  'uplift_gini']
            print("fold {}, auqc {}, params {}".format(i,auqc,[params["max_features"],params["min_samples_leaf"],  'uplift_gini']))
            best_auqc = auqc
            
    
print("best accuracies in folds:")
print(accuracy)
print(accuracy_auqc)


fold 0, acc 0.06630080277392303, params [0.1, 500, 'uplift_gini']
fold 0, auqc 8725646.445880946, params [0.1, 500, 'uplift_gini']
fold 0, acc 0.06633935215832043, params [0.1, 2000, 'uplift_gini']
fold 0, auqc 8829313.020955099, params [0.1, 2000, 'uplift_gini']
fold 0, auqc 9058460.834605347, params [0.1, 5000, 'uplift_gini']
fold 0, acc 0.07556064835756093, params [0.2, 1000, 'uplift_gini']
fold 0, auqc 9523323.62691804, params [0.2, 1000, 'uplift_gini']
fold 0, acc 0.07579971123109674, params [0.3, 500, 'uplift_gini']
fold 0, auqc 9642599.279244933, params [0.3, 5000, 'uplift_gini']
fold 0, acc 0.0762527798783098, params [0.4, 1000, 'uplift_gini']
fold 0, auqc 9712394.182328422, params [0.5, 5000, 'uplift_gini']


KeyboardInterrupt: 

In [68]:
ac_30  = np.fromiter(accuracy.values(), dtype=float)
ac_auqc = np.fromiter(accuracy_auqc.values(), dtype=float)

print("mean_30%: {}, mean_auqc: {}, std_30%: {}, std_auqc:{}, mean_time{}".format(np.mean(ac_30),
                    np.mean(ac_auqc),np.std(ac_30),np.std(ac_auqc),np.mean(time)))

mean_30%: 0.08020208888281581, mean_auqc: 10009090.72082971, std_30%: 0.00747535236162106, std_auqc:1041006.2876469067, mean_time0:00:00.829575


# Solo model

In [34]:
from pprint import pprint

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 9, num = 5)]



param_grid = {
              'max_depth':max_depth
             }

pprint(param_grid)

from sklearn.model_selection import ParameterGrid

grid_list = list(ParameterGrid(param_grid))
grid_size = len(grid_list)

indices_learn = indices_learn[0:140020]
num_folds = 5
val_count = (len(indices_learn) ) // num_folds #volume of validation set
train_count = (len(indices_learn)) - val_count
print(val_count)
print(len(indices_learn))

{'max_depth': [1, 3, 5, 7, 9]}
28004
140020


In [33]:
indices_all_folds = np.array_split(indices_train, num_folds)
print(len(indices_learn[0:140020]))

140020


In [35]:
from uplift.models import SoloModel
from uplift.metrics import uplift_at_k,qini_auc_score
accuracy_auqc = {}
accuracy = {}
eval_x = df_features.loc[indices_valid].values
eval_y = df_train.loc[indices_valid,'target'].values
eval_treat = df_train.loc[indices_valid,'treatment_flg'].values
eval_set = [(np.column_stack((eval_x,eval_treat)), eval_y)]
best_param = {}
best_param_auqc = {}
indices_all_folds = np.array_split(indices_learn, num_folds)
#cross validation

for i in range(num_folds):
    
    x_val = df_features.loc[indices_all_folds[i]].values
    y_val =df_train.loc[indices_all_folds[i],'target'].values
    treatment_val = df_train.loc[indices_all_folds[i],'treatment_flg'].values
    
    
    united = np.vstack(indices_all_folds[0:i]+indices_all_folds[i+1:])
    united = united.reshape(train_count)
    
    x_train = df_features.loc[united].values
    y_train = df_train.loc[united, 'target'].values
    treatment_train = df_train.loc[united, 'treatment_flg'].values
    best_acc = 0.0
    best_auqc = 0.0
    for params in grid_list:
        
        sm = SoloModel(XGBClassifier(max_depth = params["max_depth"]))
        start_time = datetime.now()
        
        sm = sm.fit(x_train, y_train, treatment_train, estimator_fit_params={'early_stopping_rounds': 10,
                                                'eval_metric':"auc", "eval_set":eval_set})
        time_fit = datetime.now() - start_time

        uplift_sm = sm.predict(x_val)
        
        acc = uplift_at_k(y_true=y_val, uplift=uplift_sm, treatment=treatment_val,strategy= "by_group", k=0.3)
        auqc = qini_auc_score(y_true = y_val, uplift=uplift_sm, treatment=treatment_val)
        if acc > best_acc:
            accuracy[i] = acc
            best_param[i] = [params["max_depth"]]
            print("fold {}, acc {}, params {}".format(i,acc,[params["max_depth"]]))
            best_acc = acc
            
        if auqc > best_auqc:
            accuracy_auqc[i] = auqc
            best_param_auqc[i] = [params["max_depth"]]
            print("fold {}, auqc {}, params {}".format(i,auqc,[params["max_depth"]]))
            best_auqc = auqc
            
    
print("best accuracies in folds:")
print(accuracy)
print(accuracy_auqc)
print(best_param)
print(best_param_auqc)

[0]	validation_0-auc:0.54653
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.54653
[2]	validation_0-auc:0.54655
[3]	validation_0-auc:0.56717
[4]	validation_0-auc:0.56715
[5]	validation_0-auc:0.56772
[6]	validation_0-auc:0.58075
[7]	validation_0-auc:0.59201
[8]	validation_0-auc:0.59323
[9]	validation_0-auc:0.59323
[10]	validation_0-auc:0.59646
[11]	validation_0-auc:0.59555
[12]	validation_0-auc:0.59899
[13]	validation_0-auc:0.60033
[14]	validation_0-auc:0.60033
[15]	validation_0-auc:0.60035
[16]	validation_0-auc:0.60310
[17]	validation_0-auc:0.60270
[18]	validation_0-auc:0.60276
[19]	validation_0-auc:0.60551
[20]	validation_0-auc:0.60530
[21]	validation_0-auc:0.60529
[22]	validation_0-auc:0.60574
[23]	validation_0-auc:0.60710
[24]	validation_0-auc:0.60686
[25]	validation_0-auc:0.60764
[26]	validation_0-auc:0.60781
[27]	validation_0-auc:0.60941
[28]	validation_0-auc:0.60952
[29]	validation_0-auc:0.61085
[30]	validation_0-auc:0.61094
[31]	validation_

[61]	validation_0-auc:0.61467
[62]	validation_0-auc:0.61467
[63]	validation_0-auc:0.61531
[64]	validation_0-auc:0.61534
[65]	validation_0-auc:0.61533
[66]	validation_0-auc:0.61536
[67]	validation_0-auc:0.61535
[68]	validation_0-auc:0.61527
[69]	validation_0-auc:0.61526
[70]	validation_0-auc:0.61514
[71]	validation_0-auc:0.61519
[72]	validation_0-auc:0.61523
[73]	validation_0-auc:0.61530
[74]	validation_0-auc:0.61531
[75]	validation_0-auc:0.61531
[76]	validation_0-auc:0.61533
Stopping. Best iteration:
[66]	validation_0-auc:0.61536

fold 1, acc 0.00037415042343624805, params [1]
fold 1, auqc 2380726.366007466, params [1]
[0]	validation_0-auc:0.57761
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.57678
[2]	validation_0-auc:0.60184
[3]	validation_0-auc:0.60562
[4]	validation_0-auc:0.61160
[5]	validation_0-auc:0.61256
[6]	validation_0-auc:0.61343
[7]	validation_0-auc:0.61535
[8]	validation_0-auc:0.61638
[9]	validation_0-auc:0.61664
[10]	validation_0-a

[28]	validation_0-auc:0.61838
[29]	validation_0-auc:0.61851
Stopping. Best iteration:
[19]	validation_0-auc:0.61897

fold 2, acc 0.046291428427632564, params [3]
fold 2, auqc 7603525.945829059, params [3]
[0]	validation_0-auc:0.60150
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.60523
[2]	validation_0-auc:0.60657
[3]	validation_0-auc:0.61131
[4]	validation_0-auc:0.61459
[5]	validation_0-auc:0.61559
[6]	validation_0-auc:0.61711
[7]	validation_0-auc:0.61712
[8]	validation_0-auc:0.61753
[9]	validation_0-auc:0.61747
[10]	validation_0-auc:0.61725
[11]	validation_0-auc:0.61784
[12]	validation_0-auc:0.61774
[13]	validation_0-auc:0.61749
[14]	validation_0-auc:0.61736
[15]	validation_0-auc:0.61780
[16]	validation_0-auc:0.61768
[17]	validation_0-auc:0.61768
[18]	validation_0-auc:0.61767
[19]	validation_0-auc:0.61755
[20]	validation_0-auc:0.61738
[21]	validation_0-auc:0.61731
Stopping. Best iteration:
[11]	validation_0-auc:0.61784

fold 2, acc 0.0576804841

Stopping. Best iteration:
[7]	validation_0-auc:0.61713

[0]	validation_0-auc:0.60630
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.60984
[2]	validation_0-auc:0.61186
[3]	validation_0-auc:0.61207
[4]	validation_0-auc:0.61253
[5]	validation_0-auc:0.61339
[6]	validation_0-auc:0.61380
[7]	validation_0-auc:0.61343
[8]	validation_0-auc:0.61342
[9]	validation_0-auc:0.61370
[10]	validation_0-auc:0.61338
[11]	validation_0-auc:0.61301
[12]	validation_0-auc:0.61297
[13]	validation_0-auc:0.61254
[14]	validation_0-auc:0.61232
[15]	validation_0-auc:0.61186
[16]	validation_0-auc:0.61168
Stopping. Best iteration:
[6]	validation_0-auc:0.61380

fold 3, acc 0.04804956496463986, params [9]
fold 3, auqc 7811857.3524293415, params [9]
[0]	validation_0-auc:0.54653
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.54655
[2]	validation_0-auc:0.54655
[3]	validation_0-auc:0.56773
[4]	validation_0-auc:0.56773
[5]	validation_0-auc:0.5678

In [44]:
ac_30  = np.fromiter(accuracy.values(), dtype=float)
ac_auqc = np.fromiter(accuracy_auqc.values(), dtype=float)

print("mean_30%: {}, mean_auqc: {}, std_30%: {}, std_auqc:{}".format(np.mean(ac_30),
                    np.mean(ac_auqc),np.std(ac_30),np.std(ac_auqc)))

print(time_fit)

mean_30%: 0.0607829747316551, mean_auqc: 8979591.456489269, std_30%: 0.016835659776562713, std_auqc:1836127.686022348
0:00:02.615963


# Two models independent

In [55]:
from uplift.models import TwoModels
from uplift.metrics import uplift_at_k,qini_auc_score
accuracy_auqc = {}
accuracy = {}
eval_x = df_features.loc[indices_valid].values
eval_y = df_train.loc[indices_valid,'target'].values
eval_treat = df_train.loc[indices_valid,'treatment_flg'].values
eval_set = [(eval_x, eval_y)]
best_param = {}
time = []
best_param_auqc = {}
indices_all_folds = np.array_split(indices_learn, num_folds)
#cross validation

for i in range(num_folds):
    
    x_val = df_features.loc[indices_all_folds[i]].values
    y_val =df_train.loc[indices_all_folds[i],'target'].values
    treatment_val = df_train.loc[indices_all_folds[i],'treatment_flg'].values
    
    
    united = np.vstack(indices_all_folds[0:i]+indices_all_folds[i+1:])
    united = united.reshape(train_count)
    
    x_train = df_features.loc[united].values
    y_train = df_train.loc[united, 'target'].values
    treatment_train = df_train.loc[united, 'treatment_flg'].values
    best_acc = 0.0
    best_auqc = 0.0
    for params in grid_list:
        
        tm = TwoModels(
                estimator_trmnt=XGBClassifier(max_depth = params["max_depth"]), 
                estimator_ctrl=XGBClassifier(max_depth = params["max_depth"]), 
                method='vanilla')
        
        start_time = datetime.now()
        
        tm = tm.fit(x_train, y_train, treatment_train, estimator_trmnt_fit_params={'early_stopping_rounds': 10,
                                                'eval_metric':"auc", "eval_set":eval_set},
                                  estimator_ctrl_fit_params ={'early_stopping_rounds': 10,
                                                'eval_metric':"auc", "eval_set":eval_set})
        time.append(datetime.now() - start_time)

        uplift_tm = tm.predict(x_val)
        
        acc = uplift_at_k(y_true=y_val, uplift=uplift_tm, treatment=treatment_val,strategy= "by_group", k=0.3)
        auqc = qini_auc_score(y_true = y_val, uplift=uplift_tm, treatment=treatment_val)
        
        if acc > best_acc:
            accuracy[i] = acc
            best_param[i] = [params["max_depth"]]
            print("fold {}, acc {}, params {}".format(i,acc,[params["max_depth"]]))
            best_acc = acc
            
        if auqc > best_auqc:
            accuracy_auqc[i] = auqc
            best_param_auqc[i] = [params["max_depth"]]
            print("fold {}, auqc {}, params {}".format(i,auqc,[params["max_depth"]]))
            best_auqc = auqc
            
    
print("best accuracies in folds:")
print(accuracy)
print(accuracy_auqc)
print(best_param)
print(best_param_auqc)




[0]	validation_0-auc:0.54651
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.54646
[2]	validation_0-auc:0.54646
[3]	validation_0-auc:0.56755
[4]	validation_0-auc:0.56758
[5]	validation_0-auc:0.57985
[6]	validation_0-auc:0.58294
[7]	validation_0-auc:0.58294
[8]	validation_0-auc:0.58290
[9]	validation_0-auc:0.58939
[10]	validation_0-auc:0.59015
[11]	validation_0-auc:0.58982
[12]	validation_0-auc:0.59013
[13]	validation_0-auc:0.59013
[14]	validation_0-auc:0.59096
[15]	validation_0-auc:0.59104
[16]	validation_0-auc:0.59151
[17]	validation_0-auc:0.59184
[18]	validation_0-auc:0.59258
[19]	validation_0-auc:0.59258
[20]	validation_0-auc:0.59514
[21]	validation_0-auc:0.59551
[22]	validation_0-auc:0.59529
[23]	validation_0-auc:0.59524
[24]	validation_0-auc:0.59600
[25]	validation_0-auc:0.59685
[26]	validation_0-auc:0.59685
[27]	validation_0-auc:0.59818
[28]	validation_0-auc:0.59978
[29]	validation_0-auc:0.59979
[30]	validation_0-auc:0.59917
[31]	validation_

[6]	validation_0-auc:0.61309
[7]	validation_0-auc:0.61334
[8]	validation_0-auc:0.61297
[9]	validation_0-auc:0.61290
[10]	validation_0-auc:0.61210
[11]	validation_0-auc:0.61199
[12]	validation_0-auc:0.61201
[13]	validation_0-auc:0.61136
[14]	validation_0-auc:0.61125
[15]	validation_0-auc:0.61092
[16]	validation_0-auc:0.61101
[17]	validation_0-auc:0.61074
Stopping. Best iteration:
[7]	validation_0-auc:0.61334

[0]	validation_0-auc:0.60134
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.60544
[2]	validation_0-auc:0.60893
[3]	validation_0-auc:0.61079
[4]	validation_0-auc:0.61181
[5]	validation_0-auc:0.61321
[6]	validation_0-auc:0.61343
[7]	validation_0-auc:0.61412
[8]	validation_0-auc:0.61401
[9]	validation_0-auc:0.61425
[10]	validation_0-auc:0.61441
[11]	validation_0-auc:0.61446
[12]	validation_0-auc:0.61407
[13]	validation_0-auc:0.61427
[14]	validation_0-auc:0.61417
[15]	validation_0-auc:0.61419
[16]	validation_0-auc:0.61452
[17]	validation_0-auc:0.

[43]	validation_0-auc:0.61011
[44]	validation_0-auc:0.61143
[45]	validation_0-auc:0.61144
[46]	validation_0-auc:0.61142
[47]	validation_0-auc:0.61145
[48]	validation_0-auc:0.61144
[49]	validation_0-auc:0.61146
[50]	validation_0-auc:0.61145
[51]	validation_0-auc:0.61146
[52]	validation_0-auc:0.61145
[53]	validation_0-auc:0.61146
[54]	validation_0-auc:0.61145
[55]	validation_0-auc:0.61146
[56]	validation_0-auc:0.61192
[57]	validation_0-auc:0.61185
[58]	validation_0-auc:0.61184
[59]	validation_0-auc:0.61183
[60]	validation_0-auc:0.61185
[61]	validation_0-auc:0.61184
[62]	validation_0-auc:0.61185
[63]	validation_0-auc:0.61195
[64]	validation_0-auc:0.61281
[65]	validation_0-auc:0.61295
[66]	validation_0-auc:0.61307
[67]	validation_0-auc:0.61306
[68]	validation_0-auc:0.61307
[69]	validation_0-auc:0.61306
[70]	validation_0-auc:0.61307
[71]	validation_0-auc:0.61306
[72]	validation_0-auc:0.61307
[73]	validation_0-auc:0.61307
[74]	validation_0-auc:0.61308
[75]	validation_0-auc:0.61307
[76]	valid

[9]	validation_0-auc:0.59980
[10]	validation_0-auc:0.59951
[11]	validation_0-auc:0.59959
[12]	validation_0-auc:0.59961
[13]	validation_0-auc:0.59965
Stopping. Best iteration:
[3]	validation_0-auc:0.60187

[0]	validation_0-auc:0.60454
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.60757
[2]	validation_0-auc:0.60811
[3]	validation_0-auc:0.60835
[4]	validation_0-auc:0.60860
[5]	validation_0-auc:0.60888
[6]	validation_0-auc:0.60904
[7]	validation_0-auc:0.60904
[8]	validation_0-auc:0.60927
[9]	validation_0-auc:0.60940
[10]	validation_0-auc:0.60940
[11]	validation_0-auc:0.60899
[12]	validation_0-auc:0.60930
[13]	validation_0-auc:0.60899
[14]	validation_0-auc:0.60867
[15]	validation_0-auc:0.60862
[16]	validation_0-auc:0.60815
[17]	validation_0-auc:0.60771
[18]	validation_0-auc:0.60779
[19]	validation_0-auc:0.60726
[20]	validation_0-auc:0.60721
Stopping. Best iteration:
[10]	validation_0-auc:0.60940

[0]	validation_0-auc:0.54653
Will train until validati

[1]	validation_0-auc:0.60575
[2]	validation_0-auc:0.60906
[3]	validation_0-auc:0.61037
[4]	validation_0-auc:0.61187
[5]	validation_0-auc:0.61370
[6]	validation_0-auc:0.61396
[7]	validation_0-auc:0.61406
[8]	validation_0-auc:0.61460
[9]	validation_0-auc:0.61509
[10]	validation_0-auc:0.61503
[11]	validation_0-auc:0.61479
[12]	validation_0-auc:0.61503
[13]	validation_0-auc:0.61507
[14]	validation_0-auc:0.61524
[15]	validation_0-auc:0.61506
[16]	validation_0-auc:0.61480
[17]	validation_0-auc:0.61464
[18]	validation_0-auc:0.61436
[19]	validation_0-auc:0.61433
[20]	validation_0-auc:0.61429
[21]	validation_0-auc:0.61421
[22]	validation_0-auc:0.61422
[23]	validation_0-auc:0.61400
[24]	validation_0-auc:0.61400
Stopping. Best iteration:
[14]	validation_0-auc:0.61524

[0]	validation_0-auc:0.60121
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.60577
[2]	validation_0-auc:0.60741
[3]	validation_0-auc:0.60690
[4]	validation_0-auc:0.60727
[5]	validation_0-auc:0.

[79]	validation_0-auc:0.61410
[80]	validation_0-auc:0.61458
[81]	validation_0-auc:0.61459
[82]	validation_0-auc:0.61458
[83]	validation_0-auc:0.61458
[84]	validation_0-auc:0.61458
[85]	validation_0-auc:0.61458
[86]	validation_0-auc:0.61458
[87]	validation_0-auc:0.61458
[88]	validation_0-auc:0.61455
[89]	validation_0-auc:0.61455
[90]	validation_0-auc:0.61455
[91]	validation_0-auc:0.61455
Stopping. Best iteration:
[81]	validation_0-auc:0.61459

fold 3, acc 0.06368573434040226, params [1]
fold 3, auqc 8447861.165370245, params [1]
[0]	validation_0-auc:0.58160
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.58525
[2]	validation_0-auc:0.59473
[3]	validation_0-auc:0.59834
[4]	validation_0-auc:0.60131
[5]	validation_0-auc:0.60265
[6]	validation_0-auc:0.60335
[7]	validation_0-auc:0.60986
[8]	validation_0-auc:0.61016
[9]	validation_0-auc:0.61121
[10]	validation_0-auc:0.61207
[11]	validation_0-auc:0.61221
[12]	validation_0-auc:0.61251
[13]	validation_0-auc:

[40]	validation_0-auc:0.60190
[41]	validation_0-auc:0.60177
[42]	validation_0-auc:0.60201
[43]	validation_0-auc:0.60201
[44]	validation_0-auc:0.60214
[45]	validation_0-auc:0.60277
[46]	validation_0-auc:0.60307
[47]	validation_0-auc:0.60361
[48]	validation_0-auc:0.60390
[49]	validation_0-auc:0.60439
[50]	validation_0-auc:0.60467
[51]	validation_0-auc:0.60496
[52]	validation_0-auc:0.60574
[53]	validation_0-auc:0.60602
[54]	validation_0-auc:0.60600
[55]	validation_0-auc:0.60620
[56]	validation_0-auc:0.60665
[57]	validation_0-auc:0.60689
[58]	validation_0-auc:0.60735
[59]	validation_0-auc:0.60728
[60]	validation_0-auc:0.60682
[61]	validation_0-auc:0.60685
[62]	validation_0-auc:0.60701
[63]	validation_0-auc:0.60701
[64]	validation_0-auc:0.60724
[65]	validation_0-auc:0.60766
[66]	validation_0-auc:0.60794
[67]	validation_0-auc:0.60826
[68]	validation_0-auc:0.60851
[69]	validation_0-auc:0.60889
[70]	validation_0-auc:0.60914
[71]	validation_0-auc:0.60933
[72]	validation_0-auc:0.60958
[73]	valid

[1]	validation_0-auc:0.60472
[2]	validation_0-auc:0.60989
[3]	validation_0-auc:0.61110
[4]	validation_0-auc:0.61217
[5]	validation_0-auc:0.61389
[6]	validation_0-auc:0.61501
[7]	validation_0-auc:0.61511
[8]	validation_0-auc:0.61459
[9]	validation_0-auc:0.61487
[10]	validation_0-auc:0.61467
[11]	validation_0-auc:0.61500
[12]	validation_0-auc:0.61496
[13]	validation_0-auc:0.61505
[14]	validation_0-auc:0.61509
[15]	validation_0-auc:0.61493
[16]	validation_0-auc:0.61481
[17]	validation_0-auc:0.61499
Stopping. Best iteration:
[7]	validation_0-auc:0.61511

[0]	validation_0-auc:0.59717
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.60601
[2]	validation_0-auc:0.60802
[3]	validation_0-auc:0.60907
[4]	validation_0-auc:0.60894
[5]	validation_0-auc:0.60971
[6]	validation_0-auc:0.61006
[7]	validation_0-auc:0.60991
[8]	validation_0-auc:0.60985
[9]	validation_0-auc:0.60972
[10]	validation_0-auc:0.60887
[11]	validation_0-auc:0.60855
[12]	validation_0-auc:0.60853

In [57]:
ac_30  = np.fromiter(accuracy.values(), dtype=float)
ac_auqc = np.fromiter(accuracy_auqc.values(), dtype=float)

print("mean_30%: {}, mean_auqc: {}, std_30%: {}, std_auqc:{},mean_time{}".format(np.mean(ac_30),
                    np.mean(ac_auqc),np.std(ac_30),np.std(ac_auqc),np.mean(time)))



mean_30%: 0.07131610374366079, mean_auqc: 9723091.368305746, std_30%: 0.008928243855634618, std_auqc:1516337.7264928604,mean_time0:00:03.694198


# Two models control

In [51]:
from uplift.models import TwoModels
from uplift.metrics import uplift_at_k,qini_auc_score
accuracy_auqc = {}
accuracy = {}
eval_x = df_features.loc[indices_valid].values
eval_y = df_train.loc[indices_valid,'target'].values

eval_treat = df_train.loc[indices_valid,'treatment_flg'].values
eval_set_cntrl = [(eval_x, eval_y)]
eval_set_trmnt = [(np.column_stack((eval_x,eval_treat)), eval_y)]  
best_param = {}
time = []
best_param_auqc = {}
indices_all_folds = np.array_split(indices_learn, num_folds)
#cross validation

for i in range(num_folds):
    
    x_val = df_features.loc[indices_all_folds[i]].values
    y_val =df_train.loc[indices_all_folds[i],'target'].values
    treatment_val = df_train.loc[indices_all_folds[i],'treatment_flg'].values
    
    
    united = np.vstack(indices_all_folds[0:i]+indices_all_folds[i+1:])
    united = united.reshape(train_count)
    
    x_train = df_features.loc[united].values
    y_train = df_train.loc[united, 'target'].values
    treatment_train = df_train.loc[united, 'treatment_flg'].values
    
    best_acc = 0.0
    best_auqc = 0.0
    for params in grid_list:
        
        ctrl = TwoModels(
                estimator_trmnt=XGBClassifier(max_depth = params["max_depth"]), 
                estimator_ctrl=XGBClassifier(max_depth = params["max_depth"]), 
                method='ddr_control')
        
        start_time = datetime.now()
        
        ctrl = ctrl.fit(x_train, y_train, treatment_train, 
                                  estimator_ctrl_fit_params ={'early_stopping_rounds': 10,
                                                'eval_metric':"auc", "eval_set":eval_set_cntrl},
                       estimator_trmnt_fit_params={'early_stopping_rounds': 10, 'eval_metric':"auc", "eval_set":eval_set_trmnt}) #estimator_trmnt_fit_params={'early_stopping_rounds': 10, 'eval_metric':"auc", "eval_set":eval_set_trmnt},
        time.append(datetime.now() - start_time)

        uplift_ctrl = ctrl.predict(x_val)
        
        acc = uplift_at_k(y_true=y_val, uplift=uplift_ctrl, treatment=treatment_val,strategy= "by_group", k=0.3)
        auqc = qini_auc_score(y_true = y_val, uplift=uplift_ctrl, treatment=treatment_val)
       
        if acc > best_acc:
            accuracy[i] = acc
            best_param[i] = [params["max_depth"]]
            print("fold {}, acc {}, params {}".format(i,acc,[params["max_depth"]]))
            best_acc = acc
            
        if auqc > best_auqc:
            accuracy_auqc[i] = auqc
            best_param_auqc[i] = [params["max_depth"]]
            print("fold {}, auqc {}, params {}".format(i,auqc,[params["max_depth"]]))
            best_auqc = auqc
            
    
print("best accuracies in folds:")
print(accuracy)
print(accuracy_auqc)
print(best_param)
print(best_param_auqc)





[0]	validation_0-auc:0.54651
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.54646
[2]	validation_0-auc:0.54646
[3]	validation_0-auc:0.56755
[4]	validation_0-auc:0.56758
[5]	validation_0-auc:0.57985
[6]	validation_0-auc:0.58294
[7]	validation_0-auc:0.58294
[8]	validation_0-auc:0.58290
[9]	validation_0-auc:0.58939
[10]	validation_0-auc:0.59015
[11]	validation_0-auc:0.58982
[12]	validation_0-auc:0.59013
[13]	validation_0-auc:0.59013
[14]	validation_0-auc:0.59096
[15]	validation_0-auc:0.59104
[16]	validation_0-auc:0.59151
[17]	validation_0-auc:0.59184
[18]	validation_0-auc:0.59258
[19]	validation_0-auc:0.59258
[20]	validation_0-auc:0.59514
[21]	validation_0-auc:0.59551
[22]	validation_0-auc:0.59529
[23]	validation_0-auc:0.59524
[24]	validation_0-auc:0.59600
[25]	validation_0-auc:0.59685
[26]	validation_0-auc:0.59685
[27]	validation_0-auc:0.59818
[28]	validation_0-auc:0.59978
[29]	validation_0-auc:0.59979
[30]	validation_0-auc:0.59917
[31]	validation_

[3]	validation_0-auc:0.52100
[4]	validation_0-auc:0.52720
[5]	validation_0-auc:0.52764
[6]	validation_0-auc:0.53042
[7]	validation_0-auc:0.53047
[8]	validation_0-auc:0.53078
[9]	validation_0-auc:0.52981
[10]	validation_0-auc:0.53012
[11]	validation_0-auc:0.53010
[12]	validation_0-auc:0.52971
[13]	validation_0-auc:0.52941
[14]	validation_0-auc:0.52882
[15]	validation_0-auc:0.52882
[16]	validation_0-auc:0.52833
[17]	validation_0-auc:0.52731
[18]	validation_0-auc:0.52864
Stopping. Best iteration:
[8]	validation_0-auc:0.53078

fold 0, acc 0.0636190694960933, params [7]
fold 0, auqc 9391877.96443657, params [7]
[0]	validation_0-auc:0.60121
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.60315
[2]	validation_0-auc:0.60476
[3]	validation_0-auc:0.60461
[4]	validation_0-auc:0.60491
[5]	validation_0-auc:0.60491
[6]	validation_0-auc:0.60497
[7]	validation_0-auc:0.60416
[8]	validation_0-auc:0.60383
[9]	validation_0-auc:0.60388
[10]	validation_0-auc:0.60381
[1

[15]	validation_0-auc:0.61061
[16]	validation_0-auc:0.61048
[17]	validation_0-auc:0.61020
[18]	validation_0-auc:0.61011
[19]	validation_0-auc:0.61042
[20]	validation_0-auc:0.61045
Stopping. Best iteration:
[10]	validation_0-auc:0.61150

[0]	validation_0-auc:0.52976
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.53148
[2]	validation_0-auc:0.53698
[3]	validation_0-auc:0.53564
[4]	validation_0-auc:0.53533
[5]	validation_0-auc:0.53635
[6]	validation_0-auc:0.53727
[7]	validation_0-auc:0.54052
[8]	validation_0-auc:0.54083
[9]	validation_0-auc:0.54053
[10]	validation_0-auc:0.53834
[11]	validation_0-auc:0.53917
[12]	validation_0-auc:0.53011
[13]	validation_0-auc:0.52943
[14]	validation_0-auc:0.53032
[15]	validation_0-auc:0.53037
[16]	validation_0-auc:0.52873
[17]	validation_0-auc:0.52798
[18]	validation_0-auc:0.52845
Stopping. Best iteration:
[8]	validation_0-auc:0.54083

fold 1, acc 0.04639431236934499, params [5]
fold 1, auqc 7264345.047750281, params 

[17]	validation_0-auc:0.61299
[18]	validation_0-auc:0.61303
[19]	validation_0-auc:0.61292
[20]	validation_0-auc:0.61281
[21]	validation_0-auc:0.61278
[22]	validation_0-auc:0.61280
[23]	validation_0-auc:0.61247
[24]	validation_0-auc:0.61247
[25]	validation_0-auc:0.61253
[26]	validation_0-auc:0.61246
Stopping. Best iteration:
[16]	validation_0-auc:0.61320

[0]	validation_0-auc:0.52068
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.51988
[2]	validation_0-auc:0.52066
[3]	validation_0-auc:0.52108
[4]	validation_0-auc:0.52109
[5]	validation_0-auc:0.52109
[6]	validation_0-auc:0.52436
[7]	validation_0-auc:0.52484
[8]	validation_0-auc:0.52415
[9]	validation_0-auc:0.52536
[10]	validation_0-auc:0.53190
[11]	validation_0-auc:0.53024
[12]	validation_0-auc:0.53004
[13]	validation_0-auc:0.53102
[14]	validation_0-auc:0.53275
[15]	validation_0-auc:0.52879
[16]	validation_0-auc:0.52915
[17]	validation_0-auc:0.52829
[18]	validation_0-auc:0.52842
[19]	validation_0-a

[6]	validation_0-auc:0.51696
[7]	validation_0-auc:0.51696
[8]	validation_0-auc:0.51696
[9]	validation_0-auc:0.51696
[10]	validation_0-auc:0.51696
Stopping. Best iteration:
[0]	validation_0-auc:0.51696

fold 3, acc 0.010204541538873846, params [1]
fold 3, auqc 2956643.873915474, params [1]
[0]	validation_0-auc:0.58160
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.58525
[2]	validation_0-auc:0.59473
[3]	validation_0-auc:0.59834
[4]	validation_0-auc:0.60131
[5]	validation_0-auc:0.60265
[6]	validation_0-auc:0.60335
[7]	validation_0-auc:0.60986
[8]	validation_0-auc:0.61016
[9]	validation_0-auc:0.61121
[10]	validation_0-auc:0.61207
[11]	validation_0-auc:0.61221
[12]	validation_0-auc:0.61251
[13]	validation_0-auc:0.61262
[14]	validation_0-auc:0.61254
[15]	validation_0-auc:0.61255
[16]	validation_0-auc:0.61323
[17]	validation_0-auc:0.61317
[18]	validation_0-auc:0.61319
[19]	validation_0-auc:0.61336
[20]	validation_0-auc:0.61339
[21]	validation_0-auc:0.61

[75]	validation_0-auc:0.60974
[76]	validation_0-auc:0.60999
[77]	validation_0-auc:0.61013
[78]	validation_0-auc:0.61037
[79]	validation_0-auc:0.61032
[80]	validation_0-auc:0.61031
[81]	validation_0-auc:0.61045
[82]	validation_0-auc:0.61064
[83]	validation_0-auc:0.61078
[84]	validation_0-auc:0.61055
[85]	validation_0-auc:0.61055
[86]	validation_0-auc:0.61052
[87]	validation_0-auc:0.61077
[88]	validation_0-auc:0.61092
[89]	validation_0-auc:0.61110
[90]	validation_0-auc:0.61123
[91]	validation_0-auc:0.61142
[92]	validation_0-auc:0.61125
[93]	validation_0-auc:0.61123
[94]	validation_0-auc:0.61113
[95]	validation_0-auc:0.61122
[96]	validation_0-auc:0.61142
[97]	validation_0-auc:0.61146
[98]	validation_0-auc:0.61162
[99]	validation_0-auc:0.61169
[0]	validation_0-auc:0.51696
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.51696
[2]	validation_0-auc:0.51696
[3]	validation_0-auc:0.51696
[4]	validation_0-auc:0.51696
[5]	validation_0-auc:0.51696
[6]	validati

In [54]:
ac_30  = np.fromiter(accuracy.values(), dtype=float)
ac_auqc = np.fromiter(accuracy_auqc.values(), dtype=float)

print("mean_30%: {}, mean_auqc: {}, std_30%: {}, std_auqc:{},mean_time{}".format(np.mean(ac_30),
                    np.mean(ac_auqc),np.std(ac_30),np.std(ac_auqc),np.mean(time)))


mean_30%: 0.06700121936495973, mean_auqc: 9266011.931098882, std_30%: 0.008668268543418973, std_auqc:1113792.9579186535,mean_time0:00:02.904528


# Two models treat

In [58]:
from uplift.models import TwoModels
from uplift.metrics import uplift_at_k,qini_auc_score
accuracy_auqc = {}
accuracy = {}
eval_x = df_features.loc[indices_valid].values
eval_y = df_train.loc[indices_valid,'target'].values

eval_treat = df_train.loc[indices_valid,'treatment_flg'].values
eval_set_cntrl = [(np.column_stack((eval_x,eval_treat)), eval_y)] 
eval_set_trmnt =[(eval_x, eval_y)] 
best_param = {}
time = []
best_param_auqc = {}
indices_all_folds = np.array_split(indices_learn, num_folds)
#cross validation

for i in range(num_folds):
    
    x_val = df_features.loc[indices_all_folds[i]].values
    y_val =df_train.loc[indices_all_folds[i],'target'].values
    treatment_val = df_train.loc[indices_all_folds[i],'treatment_flg'].values
    
    
    united = np.vstack(indices_all_folds[0:i]+indices_all_folds[i+1:])
    united = united.reshape(train_count)
    
    x_train = df_features.loc[united].values
    y_train = df_train.loc[united, 'target'].values
    treatment_train = df_train.loc[united, 'treatment_flg'].values
    
    best_acc = 0.0
    best_auqc = 0.0
    for params in grid_list:
        
        ctrl = TwoModels(
                estimator_trmnt=XGBClassifier(max_depth = params["max_depth"]), 
                estimator_ctrl=XGBClassifier(max_depth = params["max_depth"]), 
                method='ddr_treatment')
        
        start_time = datetime.now()
        
        ctrl = ctrl.fit(x_train, y_train, treatment_train, 
                                  estimator_ctrl_fit_params ={'early_stopping_rounds': 10,
                                                'eval_metric':"auc", "eval_set":eval_set_cntrl},
                       estimator_trmnt_fit_params={'early_stopping_rounds': 10, 'eval_metric':"auc", "eval_set":eval_set_trmnt}) #estimator_trmnt_fit_params={'early_stopping_rounds': 10, 'eval_metric':"auc", "eval_set":eval_set_trmnt},
        time.append(datetime.now() - start_time)

        uplift_ctrl = ctrl.predict(x_val)
        
        acc = uplift_at_k(y_true=y_val, uplift=uplift_ctrl, treatment=treatment_val,strategy= "by_group", k=0.3)
        auqc = qini_auc_score(y_true = y_val, uplift=uplift_ctrl, treatment=treatment_val)
       
        if acc > best_acc:
            accuracy[i] = acc
            best_param[i] = [params["max_depth"]]
            print("fold {}, acc {}, params {}".format(i,acc,[params["max_depth"]]))
            best_acc = acc
            
        if auqc > best_auqc:
            accuracy_auqc[i] = auqc
            best_param_auqc[i] = [params["max_depth"]]
            print("fold {}, auqc {}, params {}".format(i,auqc,[params["max_depth"]]))
            best_auqc = auqc
            
    
print("best accuracies in folds:")
print(accuracy)
print(accuracy_auqc)
print(best_param)
print(best_param_auqc)



[0]	validation_0-auc:0.54653
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.54655
[2]	validation_0-auc:0.56717
[3]	validation_0-auc:0.56717
[4]	validation_0-auc:0.56774
[5]	validation_0-auc:0.56772
[6]	validation_0-auc:0.58335
[7]	validation_0-auc:0.58335
[8]	validation_0-auc:0.58466
[9]	validation_0-auc:0.58464
[10]	validation_0-auc:0.58918
[11]	validation_0-auc:0.59515
[12]	validation_0-auc:0.60100
[13]	validation_0-auc:0.60099
[14]	validation_0-auc:0.60320
[15]	validation_0-auc:0.60327
[16]	validation_0-auc:0.60229
[17]	validation_0-auc:0.60234
[18]	validation_0-auc:0.60233
[19]	validation_0-auc:0.60346
[20]	validation_0-auc:0.60624
[21]	validation_0-auc:0.60747
[22]	validation_0-auc:0.60750
[23]	validation_0-auc:0.60813
[24]	validation_0-auc:0.60813
[25]	validation_0-auc:0.61045
[26]	validation_0-auc:0.61097
[27]	validation_0-auc:0.61096
[28]	validation_0-auc:0.61114
[29]	validation_0-auc:0.61097
[30]	validation_0-auc:0.61004
[31]	validation_

fold 0, acc 0.06259926820796557, params [5]
fold 0, auqc 9419714.00993551, params [5]
[0]	validation_0-auc:0.60350
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.61183
[2]	validation_0-auc:0.61232
[3]	validation_0-auc:0.61341
[4]	validation_0-auc:0.61350
[5]	validation_0-auc:0.61272
[6]	validation_0-auc:0.61306
[7]	validation_0-auc:0.61184
[8]	validation_0-auc:0.61174
[9]	validation_0-auc:0.61075
[10]	validation_0-auc:0.61065
[11]	validation_0-auc:0.61044
[12]	validation_0-auc:0.61035
[13]	validation_0-auc:0.61011
[14]	validation_0-auc:0.60958
Stopping. Best iteration:
[4]	validation_0-auc:0.61350

[0]	validation_0-auc:0.58278
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.57432
[2]	validation_0-auc:0.57666
[3]	validation_0-auc:0.58242
[4]	validation_0-auc:0.57580
[5]	validation_0-auc:0.56496
[6]	validation_0-auc:0.56228
[7]	validation_0-auc:0.56186
[8]	validation_0-auc:0.56198
[9]	validation_0-auc:0.56203


[14]	validation_0-auc:0.61497
[15]	validation_0-auc:0.61511
[16]	validation_0-auc:0.61520
[17]	validation_0-auc:0.61503
[18]	validation_0-auc:0.61492
[19]	validation_0-auc:0.61487
[20]	validation_0-auc:0.61475
[21]	validation_0-auc:0.61471
Stopping. Best iteration:
[11]	validation_0-auc:0.61522

[0]	validation_0-auc:0.52789
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.53272
[2]	validation_0-auc:0.53805
[3]	validation_0-auc:0.53789
[4]	validation_0-auc:0.54690
[5]	validation_0-auc:0.54696
[6]	validation_0-auc:0.54882
[7]	validation_0-auc:0.54885
[8]	validation_0-auc:0.54949
[9]	validation_0-auc:0.54934
[10]	validation_0-auc:0.54808
[11]	validation_0-auc:0.54797
[12]	validation_0-auc:0.54844
[13]	validation_0-auc:0.54825
[14]	validation_0-auc:0.54641
[15]	validation_0-auc:0.54605
[16]	validation_0-auc:0.54600
[17]	validation_0-auc:0.54647
[18]	validation_0-auc:0.54646
Stopping. Best iteration:
[8]	validation_0-auc:0.54949

[0]	validation_0-auc:0.

[10]	validation_0-auc:0.52610
[11]	validation_0-auc:0.53125
[12]	validation_0-auc:0.53414
[13]	validation_0-auc:0.53440
[14]	validation_0-auc:0.53429
[15]	validation_0-auc:0.53481
[16]	validation_0-auc:0.53797
[17]	validation_0-auc:0.53800
[18]	validation_0-auc:0.53882
[19]	validation_0-auc:0.53943
[20]	validation_0-auc:0.53948
[21]	validation_0-auc:0.53895
[22]	validation_0-auc:0.53890
[23]	validation_0-auc:0.53895
[24]	validation_0-auc:0.53665
[25]	validation_0-auc:0.53572
[26]	validation_0-auc:0.53591
[27]	validation_0-auc:0.53347
[28]	validation_0-auc:0.53357
[29]	validation_0-auc:0.53353
[30]	validation_0-auc:0.53279
Stopping. Best iteration:
[20]	validation_0-auc:0.53948

fold 2, acc 0.04570403474396234, params [3]
fold 2, auqc 6836483.044492745, params [3]
[0]	validation_0-auc:0.60012
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.60575
[2]	validation_0-auc:0.60906
[3]	validation_0-auc:0.61037
[4]	validation_0-auc:0.61187
[5]	validation_0-

[11]	validation_0-auc:0.61604
[12]	validation_0-auc:0.61604
[13]	validation_0-auc:0.61649
[14]	validation_0-auc:0.61658
[15]	validation_0-auc:0.61650
[16]	validation_0-auc:0.61644
[17]	validation_0-auc:0.61663
[18]	validation_0-auc:0.61695
[19]	validation_0-auc:0.61685
[20]	validation_0-auc:0.61705
[21]	validation_0-auc:0.61727
[22]	validation_0-auc:0.61733
[23]	validation_0-auc:0.61733
[24]	validation_0-auc:0.61744
[25]	validation_0-auc:0.61759
[26]	validation_0-auc:0.61750
[27]	validation_0-auc:0.61731
[28]	validation_0-auc:0.61723
[29]	validation_0-auc:0.61716
[30]	validation_0-auc:0.61721
[31]	validation_0-auc:0.61723
[32]	validation_0-auc:0.61711
[33]	validation_0-auc:0.61706
[34]	validation_0-auc:0.61709
[35]	validation_0-auc:0.61702
Stopping. Best iteration:
[25]	validation_0-auc:0.61759

[0]	validation_0-auc:0.51696
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.51768
[2]	validation_0-auc:0.52146
[3]	validation_0-auc:0.52523
[4]	validatio

[72]	validation_0-auc:0.61280
[73]	validation_0-auc:0.61283
[74]	validation_0-auc:0.61280
[75]	validation_0-auc:0.61305
[76]	validation_0-auc:0.61308
[77]	validation_0-auc:0.61305
[78]	validation_0-auc:0.61308
[79]	validation_0-auc:0.61305
[80]	validation_0-auc:0.61308
[81]	validation_0-auc:0.61305
[82]	validation_0-auc:0.61308
[83]	validation_0-auc:0.61307
[84]	validation_0-auc:0.61341
[85]	validation_0-auc:0.61339
[86]	validation_0-auc:0.61341
[87]	validation_0-auc:0.61340
[88]	validation_0-auc:0.61351
[89]	validation_0-auc:0.61351
[90]	validation_0-auc:0.61351
[91]	validation_0-auc:0.61353
[92]	validation_0-auc:0.61354
[93]	validation_0-auc:0.61353
[94]	validation_0-auc:0.61355
[95]	validation_0-auc:0.61388
[96]	validation_0-auc:0.61412
[97]	validation_0-auc:0.61420
[98]	validation_0-auc:0.61403
[99]	validation_0-auc:0.61404
[0]	validation_0-auc:0.51696
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.51696
[2]	validation_0-auc:0.51696
[3]	valid

In [59]:
ac_30  = np.fromiter(accuracy.values(), dtype=float)
ac_auqc = np.fromiter(accuracy_auqc.values(), dtype=float)

print("mean_30%: {}, mean_auqc: {}, std_30%: {}, std_auqc:{},mean_time{}".format(np.mean(ac_30),
                    np.mean(ac_auqc),np.std(ac_30),np.std(ac_auqc),np.mean(time)))

mean_30%: 0.0688019156464239, mean_auqc: 9312881.741546579, std_30%: 0.007533510203998691, std_auqc:1096101.5127293775,mean_time0:00:02.896441


# Class transformation

In [61]:
from uplift.models import ClassTransformation
from uplift.metrics import uplift_at_k,qini_auc_score
accuracy_auqc = {}
accuracy = {}
eval_x = df_features.loc[indices_valid].values
eval_y = df_train.loc[indices_valid,'target'].values

eval_treat = df_train.loc[indices_valid,'treatment_flg'].values
eval_set = [(eval_x, eval_y)]
 
best_param = {}
time = []
best_param_auqc = {}
indices_all_folds = np.array_split(indices_learn, num_folds)
#cross validation


for i in range(num_folds):
    
    x_val = df_features.loc[indices_all_folds[i]].values
    y_val =df_train.loc[indices_all_folds[i],'target'].values
    treatment_val = df_train.loc[indices_all_folds[i],'treatment_flg'].values
    
    
    united = np.vstack(indices_all_folds[0:i]+indices_all_folds[i+1:])
    united = united.reshape(train_count)
    
    x_train = df_features.loc[united].values
    y_train = df_train.loc[united, 'target'].values
    treatment_train = df_train.loc[united, 'treatment_flg'].values
    best_acc = 0.0
    best_auqc = 0.0
    for params in grid_list:
        
        ct = ClassTransformation(XGBClassifier(max_depth = params["max_depth"]))
        start_time = datetime.now()
        
        ct = ct.fit(x_train, y_train, treatment_train, estimator_fit_params={'early_stopping_rounds': 10,
                                                'eval_metric':"auc", "eval_set":eval_set})
        time.append(datetime.now() - start_time)

        uplift_ct = ct.predict(x_val)
        
        acc = uplift_at_k(y_true=y_val, uplift=uplift_ct, treatment=treatment_val,strategy= "by_group", k=0.3)
        auqc = qini_auc_score(y_true = y_val, uplift=uplift_ct, treatment=treatment_val)
       
        if acc > best_acc:
            accuracy[i] = acc
            best_param[i] = [params["max_depth"]]
            print("fold {}, acc {}, params {}".format(i,acc,[params["max_depth"]]))
            best_acc = acc
            
        if auqc > best_auqc:
            accuracy_auqc[i] = auqc
            best_param_auqc[i] = [params["max_depth"]]
            print("fold {}, auqc {}, params {}".format(i,auqc,[params["max_depth"]]))
            best_auqc = auqc
            
    
print("best accuracies in folds:")
print(accuracy)
print(accuracy_auqc)
print(best_param)
print(best_param_auqc)


[0]	validation_0-auc:0.51845
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.51845


  ct = ct.fit(x_train, y_train, treatment_train, estimator_fit_params={'early_stopping_rounds': 10,


[2]	validation_0-auc:0.52931
[3]	validation_0-auc:0.52931
[4]	validation_0-auc:0.52965
[5]	validation_0-auc:0.52965
[6]	validation_0-auc:0.52965
[7]	validation_0-auc:0.52965
[8]	validation_0-auc:0.52965
[9]	validation_0-auc:0.52965
[10]	validation_0-auc:0.52965
[11]	validation_0-auc:0.52965
[12]	validation_0-auc:0.52965
[13]	validation_0-auc:0.52965
[14]	validation_0-auc:0.52965
Stopping. Best iteration:
[4]	validation_0-auc:0.52965

fold 0, acc 0.06881286207013249, params [1]
fold 0, auqc 8061494.639677377, params [1]
[0]	validation_0-auc:0.54772
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.55108
[2]	validation_0-auc:0.53939
[3]	validation_0-auc:0.54650
[4]	validation_0-auc:0.55090
[5]	validation_0-auc:0.55006
[6]	validation_0-auc:0.54996
[7]	validation_0-auc:0.54613
[8]	validation_0-auc:0.54592
[9]	validation_0-auc:0.54559
[10]	validation_0-auc:0.55050
[11]	validation_0-auc:0.54920
Stopping. Best iteration:
[1]	validation_0-auc:0.55108

fold 

[4]	validation_0-auc:0.53023
[5]	validation_0-auc:0.52980
[6]	validation_0-auc:0.53429
[7]	validation_0-auc:0.53387
[8]	validation_0-auc:0.53158
[9]	validation_0-auc:0.53143
[10]	validation_0-auc:0.53013
[11]	validation_0-auc:0.52947
[12]	validation_0-auc:0.52968
[13]	validation_0-auc:0.53014
[14]	validation_0-auc:0.53053
[15]	validation_0-auc:0.53080
[16]	validation_0-auc:0.53013
Stopping. Best iteration:
[6]	validation_0-auc:0.53429

[0]	validation_0-auc:0.51657
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.51680
[2]	validation_0-auc:0.52682
[3]	validation_0-auc:0.52682
[4]	validation_0-auc:0.52682
[5]	validation_0-auc:0.52682
[6]	validation_0-auc:0.52682
[7]	validation_0-auc:0.52682
[8]	validation_0-auc:0.52684
[9]	validation_0-auc:0.52684
[10]	validation_0-auc:0.52684
[11]	validation_0-auc:0.52684
[12]	validation_0-auc:0.52684
[13]	validation_0-auc:0.52684
[14]	validation_0-auc:0.52636
[15]	validation_0-auc:0.52636
[16]	validation_0-auc:0.52

In [62]:
ac_30  = np.fromiter(accuracy.values(), dtype=float)
ac_auqc = np.fromiter(accuracy_auqc.values(), dtype=float)

print("mean_30%: {}, mean_auqc: {}, std_30%: {}, std_auqc:{},mean_time{}".format(np.mean(ac_30),
                    np.mean(ac_auqc),np.std(ac_30),np.std(ac_auqc),np.mean(time)))

mean_30%: 0.07124388790624306, mean_auqc: 9497763.45889017, std_30%: 0.005232098848638294, std_auqc:1174267.5697795071,mean_time0:00:01.443718


# Stat test

In [63]:
from pprint import pprint


n_estimators = [int(x) for x in np.linspace(start = 1, stop = 45, num = 5)]
# Number of features to consider at every split
max_features = [0.1, 0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
#min_samples_split = [500, 1000, 2000, 5000, 10000]
# Minimum number of samples required at each leaf node
min_samples_leaf = [500, 1000, 2000, 5000, 10000]
# Method of selecting samples for training each tree
#bootstrap = [True, False]


param_grid = {
              'max_features':max_features,
               'min_samples_leaf': min_samples_leaf}

pprint(param_grid)

{'max_features': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
 'min_samples_leaf': [500, 1000, 2000, 5000, 10000]}


In [64]:
from sklearn.model_selection import ParameterGrid

grid_list = list(ParameterGrid(param_grid))
grid_size = len(grid_list)

In [65]:
from uplift.metrics import uplift_at_k,qini_auc_score
accuracy_auqc = {}
accuracy = {}
all_accuracy = []
all_accuracy_auqc = []
time = []
best_param = {}
best_param_auqc = {}

indices_all_folds = np.array_split(indices_learn, num_folds)
#cross validation
for i in range(num_folds):
    
    x_val = df_features.loc[indices_all_folds[i]].values
    y_val =df_train.loc[indices_all_folds[i],'target'].values
    treatment_val = df_train.loc[indices_all_folds[i],'treatment_flg'].values
    
    
    united = np.vstack(indices_all_folds[0:i]+indices_all_folds[i+1:])
    united = united.reshape(train_count)
    
    x_train = df_features.loc[united].values
    y_train = df_train.loc[united, 'target'].values
    treatment_train = df_train.loc[united, 'treatment_flg'].values
    best_acc = 0.0
    best_auqc = 0.0
    for params in grid_list:
        
        rfc = RandomForestClassifier(n_estimators = 1, min_samples_leaf= params["min_samples_leaf"], 
                                     max_features = params["max_features"], max_depth = None, criterion='stat_test')
        start_time = datetime.now()
        rfc.fit(x_train,y_train,treatment_train)
        time.append(datetime.now() - start_time)

        pred = rfc.predict_uplift(x_val)
        #acc = np.mean(pred==y_val)
        acc = uplift_at_k(y_true=y_val, uplift=pred, treatment=treatment_val,strategy= "by_group", k=0.3)
        auqc = qini_auc_score(y_true = y_val, uplift=pred, treatment=treatment_val)
        if acc > best_acc:
            accuracy[i] = acc
            best_param[i] = [params["max_features"],params["min_samples_leaf"],  'stat_test']
            print("fold {}, acc {}, params {}".format(i,acc,[params["max_features"],params["min_samples_leaf"],  'stat_test']))
            best_acc = acc
        if auqc > best_auqc:
            accuracy_auqc[i] = auqc
            best_param_auqc[i] =  [params["max_features"],params["min_samples_leaf"],  'stat_test']
            print("fold {}, auqc {}, params {}".format(i,auqc,[params["max_features"],params["min_samples_leaf"],  'stat_test']))
            best_auqc = auqc
            
    
print("best accuracies in folds:")
print(accuracy)
print(accuracy_auqc)

fold 0, acc 0.06686759210510873, params [0.1, 500, 'stat_test']
fold 0, auqc 8694300.23164066, params [0.1, 500, 'stat_test']
fold 0, auqc 8845827.618876977, params [0.3, 1000, 'stat_test']
fold 0, auqc 8848572.064730577, params [0.5, 5000, 'stat_test']
fold 0, auqc 9650182.961078396, params [0.6, 1000, 'stat_test']
fold 0, acc 0.0784062165190802, params [0.8, 1000, 'stat_test']
fold 0, auqc 9804725.13981018, params [0.8, 1000, 'stat_test']
fold 1, acc 0.05431711765398162, params [0.1, 500, 'stat_test']
fold 1, auqc 7689448.349625971, params [0.1, 500, 'stat_test']
fold 1, auqc 7709436.589817772, params [0.1, 1000, 'stat_test']
fold 1, auqc 7753662.899641318, params [0.4, 1000, 'stat_test']
fold 1, acc 0.056537870485450004, params [0.4, 2000, 'stat_test']
fold 1, auqc 8116604.128349746, params [0.4, 2000, 'stat_test']
fold 1, auqc 8124581.195801681, params [0.7, 1000, 'stat_test']
fold 1, auqc 8680323.212434174, params [0.8, 1000, 'stat_test']
fold 1, acc 0.06072223276646138, params [0

In [66]:
ac_30  = np.fromiter(accuracy.values(), dtype=float)
ac_auqc = np.fromiter(accuracy_auqc.values(), dtype=float)

print("mean_30%: {}, mean_auqc: {}, std_30%: {}, std_auqc:{},mean_time{}".format(np.mean(ac_30),
                    np.mean(ac_auqc),np.std(ac_30),np.std(ac_auqc),np.mean(time)))

mean_30%: 0.07181441907224147, mean_auqc: 9672102.217400705, std_30%: 0.007653994327337734, std_auqc:1178074.657914627,mean_time0:00:00.113349
