In [1]:
import pandas as pd

In [9]:
def read_train_test():
    X_train = pd.read_csv(
        "../data/processed/two_models/X_train.csv", index_col="client_id"
    )
    y_train = pd.read_csv(
        "../data/processed/two_models/y_train.csv",
        header=None,
        names=["client_id", "target"],
        index_col="client_id"
    )["target"]
    train_is_treatment = pd.read_csv(
        "../data/processed/two_models/X_train_is_treatment.csv",
        header=None,
        names=["client_id", "is_treatment"],
        index_col="client_id"
    )["is_treatment"]

    X_valid = pd.read_csv("../data/processed/two_models/X_valid.csv", index_col="client_id")
    y_valid = pd.read_csv(
        "../data/processed/two_models/y_valid.csv",
        header=None,
        names=["client_id", "target"],
        index_col="client_id"
    )["target"]
    valid_is_treatment = pd.read_csv(
        "../data/processed/two_models/X_valid_is_treatment.csv",
        header=None,
        names=["client_id", "is_treatment"],
        index_col="client_id"
    )["is_treatment"]

    X_test = pd.read_csv("../data/processed/two_models/X_test.csv", index_col="client_id")

    return X_train, y_train, train_is_treatment, X_valid, y_valid, valid_is_treatment, X_test


def join_train_validation(X_train, X_valid, y_train, y_valid):
    X_train = pd.concat([X_train, X_valid], ignore_index=False)
    y_train = pd.concat([y_train, y_valid], ignore_index=False)
    return X_train, y_train


def split_control_treatment(X, y, is_treatment):
    X_control = X[is_treatment == 0]
    X_treatment = X[is_treatment == 1]
    y_control = y[is_treatment == 0]
    y_treatment = y[is_treatment == 1]
    return X_control, X_treatment, y_control, y_treatment

In [10]:
X_train, y_train, train_is_treatment, X_valid, y_valid, valid_is_treatment, X_test = read_train_test()

In [4]:
df_clients = pd.read_csv('../data/raw/clients.csv', index_col='client_id',parse_dates=['first_issue_date','first_redeem_date'])
df_train = pd.read_csv('../data/raw/uplift_train.csv', index_col='client_id')
df_test = pd.read_csv('../data/raw/uplift_test.csv', index_col='client_id')
df_products = pd.read_csv('../data/raw/products.csv', index_col='product_id')
df_purchases = pd.read_csv('../data/raw/purchases.csv',parse_dates=['transaction_datetime'])

In [5]:
last_cols = ['regular_points_received', 'express_points_received','regular_points_spent', 'express_points_spent', 'purchase_sum','store_id']
all_hist = df_purchases.groupby(['client_id','transaction_id'])[last_cols].last()
last_month = df_purchases[df_purchases['transaction_datetime'] > '2019-02-18'].groupby(['client_id','transaction_id'])[last_cols].last()

In [6]:
features =  pd.concat([all_hist.groupby('client_id')['purchase_sum'].count(),
                       last_month.groupby('client_id')['purchase_sum'].count(),
                       all_hist.groupby('client_id').sum(),
                       all_hist.groupby('client_id')[['store_id']].nunique(),
                       last_month.groupby('client_id').sum(),
                       last_month.groupby('client_id')[['store_id']].nunique(),
                      ],axis = 1)
features.columns = ['total_trans_count','last_month_trans_count']+list(c+"_sum_all" for c in last_cols)+list(c+"_sum_last_month" for c in last_cols)

In [7]:
features.head()

Unnamed: 0_level_0,total_trans_count,last_month_trans_count,regular_points_received_sum_all,express_points_received_sum_all,regular_points_spent_sum_all,express_points_spent_sum_all,purchase_sum_sum_all,store_id_sum_all,regular_points_received_sum_last_month,express_points_received_sum_last_month,regular_points_spent_sum_last_month,express_points_spent_sum_last_month,purchase_sum_sum_last_month,store_id_sum_last_month
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
000012768d,4,2,25.7,0.0,0.0,0.0,2803.0,3,10.0,0.0,0.0,0.0,1222.0,1
000036f903,32,8,54.9,60.0,0.0,0.0,9805.0,5,13.7,0.0,0.0,0.0,2784.0,4
000048b7a6,8,1,26.5,0.0,0.0,0.0,3772.0,2,1.2,0.0,0.0,0.0,342.0,1
000073194a,17,6,74.9,0.0,-96.0,0.0,9601.4,1,25.0,0.0,0.0,0.0,3393.3,1
00007c7133,11,1,56.6,0.0,-240.0,0.0,6719.84,2,1.9,0.0,0.0,0.0,380.0,1


In [13]:
X_train.columns.tolist()

['age',
 'n_alchohol_products',
 'avg_alchohol_products_in_purchase',
 'n_alchohol_products_quantity',
 'avg_alchohol_products_in_purchase_quantity',
 'pct_alcohol_products',
 'n_own_trademark_products',
 'avg_onw_trademark_in_purchase',
 'n_own_trademark_products_quantity',
 'avg_onw_trademark_in_purchase_quantity',
 'pct_own_trademark_products',
 'avg_unique_brands',
 'avg_unique_vendors',
 'avg_unique_segments',
 'avg_unique_level_1',
 'avg_unique_level_2',
 'avg_unique_level_3',
 'pct_unique_brands',
 'pct_unique_vendors',
 'pct_unique_segments',
 'pct_unique_level_1',
 'pct_unique_level_2',
 'pct_unique_level_3',
 'sum_sum_netto',
 'avg_sum_netto',
 'stddev_sum_netto',
 'avg_avg_netto',
 'stddev_avg_netto',
 'n_transactions',
 'avg_transaction_time',
 'stddev_transaction_time',
 'mode_transaction_weekday',
 'sum_regular_points_received',
 'sum_express_points_received',
 'sum_regular_points_spent',
 'sum_express_points_spent',
 'avg_regular_points_received',
 'avg_express_points_re

In [21]:
ids = X_train.sample(5).index

In [22]:
features.loc[ids]

Unnamed: 0_level_0,total_trans_count,last_month_trans_count,regular_points_received_sum_all,express_points_received_sum_all,regular_points_spent_sum_all,express_points_spent_sum_all,purchase_sum_sum_all,store_id_sum_all,regular_points_received_sum_last_month,express_points_received_sum_last_month,regular_points_spent_sum_last_month,express_points_spent_sum_last_month,purchase_sum_sum_last_month,store_id_sum_last_month
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
fc972c0bf5,19,4,45.6,0.0,0.0,0.0,7237.28,9,8.7,0.0,0.0,0.0,1088.2,3
f1f4e19712,21,8,66.7,0.0,0.0,0.0,10047.0,1,32.3,0.0,0.0,0.0,4458.0,1
1b3f5c3aa6,20,6,33.1,0.0,0.0,0.0,4599.0,1,7.2,0.0,0.0,0.0,1474.0,1
5ae9fa3a5c,14,4,36.7,0.0,-32.0,0.0,5054.0,2,8.6,0.0,-19.0,0.0,1218.0,1
8e424027fe,36,8,339.2,0.0,-863.0,0.0,36062.0,1,73.6,0.0,0.0,0.0,7682.0,1


In [24]:
X_train.loc[
    ids,
    [
        "n_transactions", 
        "last_month_n_transactions", 
        'sum_regular_points_received',
        'sum_express_points_received',
        'sum_regular_points_spent',
        'sum_express_points_spent',
        'sum_purchase_sum',
        'n_stores',
        'last_month_sum_regular_points_received',
        'last_month_sum_express_points_received',
        'last_month_sum_regular_points_spent',
        'last_month_sum_express_points_spent',
        'last_month_sum_purchase_sum',
        'last_month_n_stores',
    ]
]

Unnamed: 0_level_0,n_transactions,last_month_n_transactions,sum_regular_points_received,sum_express_points_received,sum_regular_points_spent,sum_express_points_spent,sum_purchase_sum,n_stores,last_month_sum_regular_points_received,last_month_sum_express_points_received,last_month_sum_regular_points_spent,last_month_sum_express_points_spent,last_month_sum_purchase_sum,last_month_n_stores
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
fc972c0bf5,19,4,45.6,0.0,0.0,0.0,7237.28,9,8.7,0.0,0.0,0.0,1088.2,3
f1f4e19712,21,8,66.7,0.0,0.0,0.0,10047.0,1,32.3,0.0,0.0,0.0,4458.0,1
1b3f5c3aa6,20,6,33.1,0.0,0.0,0.0,4599.0,1,7.2,0.0,0.0,0.0,1474.0,1
5ae9fa3a5c,14,4,36.7,0.0,-32.0,0.0,5054.0,2,8.6,0.0,-19.0,0.0,1218.0,1
8e424027fe,36,8,339.2,0.0,-863.0,0.0,36062.0,1,73.6,0.0,0.0,0.0,7682.0,1


In [25]:
merged_train = pd.concat([df_train,df_clients,features],axis = 1,sort = True)
merged_train = merged_train[~merged_train['target'].isnull()].copy()
merged_train['first_issue_date_weekday'] = merged_train['first_issue_date'].dt.weekday
merged_train['first_redeem_date_weekday'] = merged_train['first_redeem_date'].dt.weekday
merged_train['first_issue_date_hour'] = merged_train['first_issue_date'].dt.hour
merged_train['first_redeem_date_hour'] = merged_train['first_redeem_date'].dt.hour
merged_train['first_issue_date'] = merged_train['first_issue_date'].astype(int)/10**9
merged_train['first_redeem_date'] = merged_train['first_redeem_date'].astype(int)/10**9
merged_train['diff'] = merged_train['first_redeem_date']-merged_train['first_issue_date']
merged_train['gender'] = list(ord(v[0]) for v in merged_train['gender'].values)

In [53]:
merged_train

Unnamed: 0,treatment_flg,target,first_issue_date,first_redeem_date,age,gender,total_trans_count,last_month_trans_count,regular_points_received_sum_all,express_points_received_sum_all,...,express_points_received_sum_last_month,regular_points_spent_sum_last_month,express_points_spent_sum_last_month,purchase_sum_sum_last_month,store_id_sum_last_month,first_issue_date_weekday,first_redeem_date_weekday,first_issue_date_hour,first_redeem_date_hour,diff
000012768d,0.0,1.0,1.501948e+09,1.515094e+09,45,85,4,2,25.7,0.0,...,0.0,0.0,0.0,1222.00,1,5,3.0,15,19.0,1.314656e+07
000036f903,1.0,1.0,1.491832e+09,1.492951e+09,72,70,32,8,54.9,60.0,...,0.0,0.0,0.0,2784.00,4,0,6.0,13,12.0,1.118613e+06
00010925a5,1.0,1.0,1.532449e+09,1.536942e+09,83,85,18,8,31.8,0.0,...,0.0,0.0,0.0,2858.00,2,1,4.0,16,16.0,4.492280e+06
0001f552b0,1.0,1.0,1.498850e+09,1.535461e+09,33,70,15,7,78.9,0.0,...,0.0,0.0,0.0,2211.37,2,4,1.0,19,12.0,3.661075e+07
00020e7b18,1.0,1.0,1.511783e+09,1.515607e+09,73,85,18,5,286.1,0.0,...,0.0,-76.0,-10.0,6096.27,1,0,2.0,11,17.0,3.823700e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffe0abb97,0.0,0.0,1.511773e+09,1.518341e+09,35,70,9,3,22.2,0.0,...,0.0,-15.0,-60.0,820.72,1,0,6.0,8,9.0,6.568154e+06
fffe0ed719,0.0,1.0,1.505466e+09,1.513090e+09,69,85,30,19,69.9,0.0,...,0.0,0.0,0.0,8551.48,1,4,1.0,8,14.0,7.624608e+06
fffea1204c,0.0,1.0,1.517418e+09,1.520874e+09,73,70,17,6,26.1,0.0,...,0.0,-40.0,0.0,1746.41,1,2,0.0,16,17.0,3.456170e+06
fffeca6d22,1.0,0.0,1.514462e+09,-9.223372e+09,77,70,16,9,47.4,0.0,...,0.0,0.0,0.0,1251.86,1,3,,11,,-1.073783e+10


In [26]:
treatment = merged_train[merged_train['treatment_flg'] == 1].drop('treatment_flg',axis = 1)
treatment_x = treatment.drop('target',axis = 1)
treatment_y = treatment['target']
control = merged_train[merged_train['treatment_flg'] == 0].drop('treatment_flg',axis = 1)
control_x = control.drop('target',axis = 1)
control_y = control['target']

In [34]:
X_train_control, X_train_treatment, y_train_control, y_train_treatment = split_control_treatment(
    X_train, y_train, train_is_treatment
)
X_valid_control, X_valid_treatment, y_valid_control, y_valid_treatment = split_control_treatment(
    X_valid, y_valid, valid_is_treatment
)

In [35]:
treatment_x_train = treatment_x.loc[X_train_treatment.index]
treatment_x_valid = treatment_x.loc[X_valid_treatment.index]
treatment_y_train = treatment_y.loc[y_train_treatment.index]
treatment_y_valid = treatment_y.loc[y_valid_treatment.index]

control_x_train = control_x.loc[X_train_control.index]
control_x_valid = control_x.loc[X_valid_control.index]
control_y_train = control_y.loc[y_train_control.index]
control_y_valid = control_y.loc[y_valid_control.index]

In [36]:
import lightgbm as lgbm
params = {'learning_rate':0.03,'max_depth':4,'num_leaves':20,
             'min_data_in_leaf':3, 'application':'binary',
             'subsample':0.8, 'colsample_bytree': 0.8,
             'reg_alpha':0.01,'data_random_seed':42,'metric':'binary_logloss',
             'max_bin':416,'bagging_freq':3,'reg_lambda':0.01,'num_leaves':20             
    }
matrix = lgbm.Dataset(treatment_x_train, label=treatment_y_train)
cv_result = lgbm.cv(params, matrix, num_boost_round=5000,nfold=5, stratified=True, 
                              early_stopping_rounds=50, seed=42, verbose_eval=50)


[50]	cv_agg's binary_logloss: 0.552697 + 0.00300076
[100]	cv_agg's binary_logloss: 0.538496 + 0.00398536
[150]	cv_agg's binary_logloss: 0.534558 + 0.00413736
[200]	cv_agg's binary_logloss: 0.532816 + 0.00421507
[250]	cv_agg's binary_logloss: 0.532031 + 0.00411974
[300]	cv_agg's binary_logloss: 0.53181 + 0.00414129
[350]	cv_agg's binary_logloss: 0.531795 + 0.00418921
[400]	cv_agg's binary_logloss: 0.53167 + 0.00420979
[450]	cv_agg's binary_logloss: 0.531533 + 0.00421051
[500]	cv_agg's binary_logloss: 0.531508 + 0.00412138


In [48]:
treatment_model = lgbm.LGBMClassifier(n_estimators = len(cv_result['binary_logloss-mean']),**params)
treatment_model.fit(treatment_x_train, treatment_y_train)
control_model = lgbm.LGBMClassifier(n_estimators = len(cv_result['binary_logloss-mean']),**params)
control_model.fit(control_x_train, control_y_train)

LGBMClassifier(application='binary', bagging_freq=3, boosting_type='gbdt',
               class_weight=None, colsample_bytree=0.8, data_random_seed=42,
               importance_type='split', learning_rate=0.03, max_bin=416,
               max_depth=4, metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_data_in_leaf=3, min_split_gain=0.0,
               n_estimators=345, n_jobs=-1, num_leaves=20, objective=None,
               random_state=None, reg_alpha=0.01, reg_lambda=0.01, silent=True,
               subsample=0.8, subsample_for_bin=200000, subsample_freq=0)

In [50]:
(
    treatment_model.score(merged_train.loc[X_valid.index].drop(['treatment_flg', 'target'],axis = 1), y_valid),
    control_model.score(merged_train.loc[X_valid.index].drop(['treatment_flg', 'target'],axis = 1), y_valid)
)

(0.7197810437912417, 0.7209558088382324)

In [29]:
def uplift_score(prediction, treatment, target, rate=0.3):
    """
    Подсчет Uplift Score
    """
    order = np.argsort(-prediction)
    treatment_n = int((treatment == 1).sum() * rate)
    print(f"    number of treatment users: {treatment_n}")
    treatment_p = target[order][treatment[order] == 1][:treatment_n].mean()
    print(f"    treatment p: {treatment_p}")
    control_n = int((treatment == 0).sum() * rate)
    print(f"    number of control users: {treatment_n}")
    control_p = target[order][treatment[order] == 0][:control_n].mean()
    print(f"    control p: {control_p}")
    score = treatment_p - control_p
    return score

In [43]:
import numpy as np

In [44]:
preds_pos = treatment_model.predict_proba(merged_train.loc[X_valid.index].drop(['treatment_flg', 'target'],axis = 1))[:,1]
preds_neg = control_model.predict_proba(merged_train.loc[X_valid.index].drop(['treatment_flg', 'target'],axis = 1))[:,1]
uplift_score(preds_pos - preds_neg, valid_is_treatment, y_valid)

    number of treatment users: 6038
    treatment p: 0.6074859224908911
    number of control users: 6038
    control p: 0.5310194500335346


0.0764664724573565

In [45]:
import lightgbm as lgbm
params = {'learning_rate':0.03,'max_depth':4,'num_leaves':20,
             'min_data_in_leaf':3, 'application':'binary',
             'subsample':0.8, 'colsample_bytree': 0.8,
             'reg_alpha':0.01,'data_random_seed':42,'metric':'binary_logloss',
             'max_bin':416,'bagging_freq':3,'reg_lambda':0.01,'num_leaves':20             
    }
matrix = lgbm.Dataset(X_train_treatment, label=y_train_treatment)
cv_result = lgbm.cv(params, matrix, num_boost_round=5000,nfold=5, stratified=True, 
                              early_stopping_rounds=50, seed=42, verbose_eval=50)


[50]	cv_agg's binary_logloss: 0.555815 + 0.00280809
[100]	cv_agg's binary_logloss: 0.545328 + 0.00340326
[150]	cv_agg's binary_logloss: 0.543137 + 0.00354846
[200]	cv_agg's binary_logloss: 0.542269 + 0.00359253
[250]	cv_agg's binary_logloss: 0.541843 + 0.00356861
[300]	cv_agg's binary_logloss: 0.541665 + 0.00359192
[350]	cv_agg's binary_logloss: 0.541616 + 0.00362086


In [51]:
treatment_model = lgbm.LGBMClassifier(n_estimators = len(cv_result['binary_logloss-mean']),**params)
treatment_model.fit(X_train_treatment, y_train_treatment)
control_model = lgbm.LGBMClassifier(n_estimators = len(cv_result['binary_logloss-mean']),**params)
control_model.fit(X_train_control, y_train_control)

LGBMClassifier(application='binary', bagging_freq=3, boosting_type='gbdt',
               class_weight=None, colsample_bytree=0.8, data_random_seed=42,
               importance_type='split', learning_rate=0.03, max_bin=416,
               max_depth=4, metric='binary_logloss', min_child_samples=20,
               min_child_weight=0.001, min_data_in_leaf=3, min_split_gain=0.0,
               n_estimators=345, n_jobs=-1, num_leaves=20, objective=None,
               random_state=None, reg_alpha=0.01, reg_lambda=0.01, silent=True,
               subsample=0.8, subsample_for_bin=200000, subsample_freq=0)

In [47]:
preds_pos = treatment_model.predict_proba(X_valid)[:,1]
preds_neg = control_model.predict_proba(X_valid)[:,1]
uplift_score(preds_pos - preds_neg, valid_is_treatment, y_valid)

    number of treatment users: 6038
    treatment p: 0.5756873136800265
    number of control users: 6038
    control p: 0.5068745808182428


0.06881273286178369

In [52]:
treatment_model.score(X_valid, y_valid), control_model.score(X_valid, y_valid)

(0.7153319336132773, 0.7148570285942811)