In [36]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)
import datetime
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold

In [37]:
from tree_uplift import Boosting

In [38]:
from tree_uplift import UpliftTree

In [4]:
df_x5 = pd.read_csv('retail_hero_final_model_train_data.csv', index_col='client_id')
df_features, w, y = df_x5[['first_redeem_date', 'express_spent_mean',
       'express_points_spent_sum', 'regular_points_received_sum_last_m',
       'avg_spent_perday', 'after_redeem_sum_perday']], df_x5['treatment_flg'], df_x5['target']

def x5_coding(x):
    return int(str(int(x['treatment_flg']))+str(int(x['target'])), 2)

df_x5['coding'] = df_x5.apply(x5_coding, axis=1)
c = df_x5['coding']

from sklearn.model_selection import train_test_split
indices_learn, indices_valid = train_test_split(df_x5.index, test_size=0.3,random_state=0)
X_train, w_train, y_train, c_train = df_features.loc[indices_learn].fillna(0), w.loc[indices_learn], y.loc[indices_learn], c.loc[indices_learn]
X_valid, w_valid, y_valid, c_valid = df_features.loc[indices_valid].fillna(0), w.loc[indices_valid], y.loc[indices_valid], c.loc[indices_valid]

In [24]:
random_state=4
rng = np.random.RandomState(random_state)
ada_boost = Boosting(n_estimators=50,
         base_classifier=UpliftTree(max_depth=5, scoring='ED', min_samples_leaf=100, 
                                    min_samples_treatment=10, n_rand_features=5,random_state=0),
         random_state=None)

In [35]:
adaboost = Boosting(**params).fit(X_train.to_numpy().copy(), y_train, w_train)
class_ = ada_boost.predict(X_valid)

In [96]:
# Uplift at k best
uplift_k = uplift_at_k(y_true=y_valid, uplift=class_, treatment=w_valid, strategy='overall', k=0.3)
# Area Under Qini Curve
tm_qini_auc = qini_auc_score(y_true=y_valid, uplift=class_, treatment=w_valid)

# Area Under Uplift Curve
tm_uplift_auc = uplift_auc_score(y_true=y_valid, uplift=class_, treatment=w_valid)

# Weighted average uplift
tm_wau = weighted_average_uplift(y_true=y_valid, uplift=class_,  treatment=w_valid)

print("Boosting: ", tm_qini_auc, tm_uplift_auc, tm_wau, uplift_k)

Boosting:  0.000812790586958195 0.0011521789658458688 0.03695267609316616 0.03811788141016348


In [97]:
from sklearn.preprocessing import OneHotEncoder

def kuusisto_coding(x):
    return int(str(x['w'])+str(x['target']), 2)

def kuusisto_preprocessing(df):
    
    df = df.drop(['customer_type'],axis=1)
    df['target'] = df['outcome'].apply(lambda x: 1 if x=='positive' else 0)
    df['w'] = df['target_control'].apply(lambda x: 1 if x=='target' else 0) 
    df['coding'] = df.apply(kuusisto_coding, axis=1)  
    df = df.drop(['outcome','target_control'],axis=1)
    
    ohe = OneHotEncoder(sparse=False)
    encoded_features = pd.DataFrame(ohe.fit_transform(df[['Node1','Node2','Node3','Node4',
                                                          'Node5','Node6','Node7','Node8',
                                                          'Node9','Node10','Node11','Node12',
                                                          'Node13','Node14','Node15','Node17',
                                                          'Node18','Node19','Node20']]))
    
    df = df.drop(['Node1','Node2','Node3','Node4','Node5','Node6','Node7',
                  'Node8','Node9','Node10','Node11','Node12','Node13','Node14',
                  'Node15','Node17','Node18','Node19','Node20'],axis=1)
    
    df = pd.concat([df,encoded_features],axis=1)
    df = df.dropna()
    c = df['coding']
    y = df['target']
    w = df['w']
    X = df.drop(['target','w','coding'],axis=1)
        
    return X,y,w,c

In [98]:
df = pd.read_csv('stereotypical_customer_simulation.csv',index_col='customer_id')
X, y, w, c  = kuusisto_preprocessing(df)
from sklearn.model_selection import train_test_split
indices_learn, indices_valid = train_test_split(X.index, test_size=0.3,random_state=0)
X_train, w_train, y_train, c_train = X.loc[indices_learn], w.loc[indices_learn], y.loc[indices_learn], c.loc[indices_learn]
X_valid, w_valid, y_valid, c_valid = X.loc[indices_valid], w.loc[indices_valid], y.loc[indices_valid], c.loc[indices_valid]

In [99]:
random_state=4
rng = np.random.RandomState(random_state)
ada_boost = Boosting(n_estimators=50,
         base_classifier=UpliftTree(max_depth=5, scoring='Chi', min_samples_leaf=100, 
                                    min_samples_treatment=10, n_rand_features=20,rng=rng),
         random_state=0)
ada_boost.fit(X_train.to_numpy().astype(int), y_train, w_train)
class_ = ada_boost.predict(X_valid.to_numpy().astype(int))

In [101]:
# Uplift at k best
uplift_k = uplift_at_k(y_true=y_valid, uplift=class_, treatment=w_valid, strategy='overall', k=0.3)
# Area Under Qini Curve
tm_qini_auc = qini_auc_score(y_true=y_valid, uplift=class_, treatment=w_valid)

# Area Under Uplift Curve
tm_uplift_auc = uplift_auc_score(y_true=y_valid, uplift=class_, treatment=w_valid)

# Weighted average uplift
tm_wau = weighted_average_uplift(y_true=y_valid, uplift=class_,  treatment=w_valid)

print("Boosting: ", tm_qini_auc, tm_uplift_auc, tm_wau, uplift_k)

Boosting:  0.00035103711264467127 5.435472853509591e-06 0.024207523529786168 0.018346175326568792


In [39]:
from sklift.datasets import fetch_hillstrom
from sklearn.preprocessing import OneHotEncoder
def mine_coding(dataset):
    return int(str(int(dataset['segment']))+str(int(dataset['visit'])), 2)

def mine_preproccessing():  
    bunch = fetch_hillstrom(target_col='visit')

    dataset, target, treatment = bunch['data'], bunch['target'], bunch['treatment']

    # Selecting two segments
    dataset = dataset[treatment!='Mens E-Mail']
    target = target[treatment!='Mens E-Mail']
    treatment = treatment[treatment!='Mens E-Mail'].map({
        'Womens E-Mail': 1,
        'No E-Mail': 0
    })
    cat_cols = dataset.select_dtypes(include='object').columns.tolist()
    ohe = OneHotEncoder(sparse=False)
    encoded_features = pd.DataFrame(ohe.fit_transform(dataset[cat_cols]))

    dataset = pd.concat([dataset,target,treatment],axis=1)
    dataset = dataset.drop(cat_cols,axis=1)
    dataset = pd.concat([dataset,encoded_features],axis=1)
    dataset = dataset.dropna()

    dataset['coding'] = dataset.apply(mine_coding, axis=1) 

    c = dataset['coding']
    y = dataset['visit'].astype(int)
    w = dataset['segment'].astype(int)
    X = dataset.drop(['visit','segment','coding'],axis=1)

    return X,y,w,c

In [40]:
X_mine,y_mine,w_mine,c_mine = mine_preproccessing()
X_mine[['mens',  'womens',  'newbie',         0,
               1,         2,         3,         4,         5,         6,
               7,         8,         9,        10,        11,        12]] = X_mine[['mens',  'womens',  'newbie',         0,
               1,         2,         3,         4,         5,         6,
               7,         8,         9,        10,        11,        12]].astype(int)

In [41]:
indices_learn, indices_valid = train_test_split(X_mine.index, test_size=0.3)
X_train, w_train, y_train, c_train = X_mine.loc[indices_learn], w_mine.loc[indices_learn], y_mine.loc[indices_learn], c_mine.loc[indices_learn]
X_valid, w_valid, y_valid, c_valid = X_mine.loc[indices_valid], w_mine.loc[indices_valid], y_mine.loc[indices_valid], c_mine.loc[indices_valid]

In [43]:
random_state=6
rng = np.random.RandomState(random_state)
ada_boost = Boosting(n_estimators=50,
         base_classifier=UpliftTree(max_depth=5, scoring='ED', min_samples_leaf=100, 
                                    min_samples_treatment=10, n_rand_features=20, norm=False, rng=rng),
         random_state=0)
ada_boost.fit(X_train.to_numpy().astype(int), y_train, w_train)
class_ = ada_boost.predict(X_valid.to_numpy().astype(int))

In [59]:
# Uplift at k best
uplift_k = uplift_at_k(y_true=y_valid, uplift=class_, treatment=w_valid, strategy='overall', k=0.3)
# Area Under Qini Curve
tm_qini_auc = qini_auc_score(y_true=y_valid, uplift=class_, treatment=w_valid)

# Area Under Uplift Curve
tm_uplift_auc = uplift_auc_score(y_true=y_valid, uplift=class_, treatment=w_valid)

# Weighted average uplift
tm_wau = weighted_average_uplift(y_true=y_valid, uplift=class_,  treatment=w_valid)

print("Boosting: ", tm_qini_auc, tm_uplift_auc, tm_wau, uplift_k)

Boosting:  0.020286239920493734 0.009853374826839702 0.04833342223387788 0.052080696664654674
