In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import clone
import xgboost

In [2]:
# Read the client table; the two date columns are parsed as datetimes so the
# `.dt` accessor can be used on them further down.
clients_df = pd.read_csv(
    "../../data/raw/clients.csv",
    parse_dates=['first_issue_date', 'first_redeem_date'],
)
# The product and purchase tables are left disabled here:
# products_df = pd.read_csv("../../data/raw/products.csv")
# purchases_df = pd.read_csv("../../data/raw/purchases.csv", parse_dates=['transaction_datetime'])

In [3]:
# Calendar features derived from `first_issue_date`.
issue_ts = clients_df['first_issue_date']

clients_df['issue_dayofyear'] = issue_ts.dt.dayofyear
clients_df['issue_hour'] = issue_ts.dt.hour
clients_df['issue_weekday'] = issue_ts.dt.weekday
clients_df['issue_dayofmonth'] = issue_ts.dt.day
clients_df['issue_year'] = issue_ts.dt.year
clients_df['issue_month'] = issue_ts.dt.month

# `Series.dt.weekofyear` and `Series.dt.week` were deprecated in pandas 1.1 and
# removed in pandas 2.0; `dt.isocalendar().week` is the documented replacement
# and yields the same ISO-8601 week number. It returns a nullable UInt32
# column, so it is cast to int64 to match what the old accessors produced.
# NOTE(review): the cast assumes `first_issue_date` has no missing values
# (the integer-typed calendar columns shown by `df.head()` suggest none) — confirm.
issue_iso_week = issue_ts.dt.isocalendar().week.astype('int64')
clients_df['issue_weekofyear'] = issue_iso_week
# Same values as `issue_weekofyear`; kept because the feature list names both columns.
clients_df['issue_week'] = issue_iso_week

clients_df['issue_quarter'] = issue_ts.dt.quarter

In [4]:
# Time between issue and first redemption, expressed in seconds.
issue_to_redeem = clients_df['first_redeem_date'] - clients_df['first_issue_date']
clients_df['redeem_issue_diff'] = issue_to_redeem.dt.total_seconds()

# Frequency encoding: each category is replaced by (rows in that category) /
# (total number of rows in clients_df).
cat_col = "gender"
encoding = clients_df.groupby(cat_col).size().div(len(clients_df))
clients_df[f"{cat_col}_freq_enc"] = clients_df[cat_col].map(encoding)

In [5]:
# 0/1 flag for ages strictly below the 1st percentile or strictly above the
# 99th percentile of the `age` column.
client_age = clients_df['age']
age_floor = client_age.quantile(.01)
age_ceiling = client_age.quantile(.99)
clients_df['strange_age'] = (client_age.lt(age_floor) | client_age.gt(age_ceiling)).astype(int)

In [6]:
# Add one 0/1 indicator column, named `gender_<value>`, per distinct gender value.
gender = clients_df['gender']
for level in gender.unique():
    clients_df[f'gender_{level}'] = gender.eq(level).astype(int)

In [7]:
# Model input columns, grouped by where they come from. The concatenation
# order below is the column order of the feature matrix.
demographic_cols = ["age", "gender_U", "gender_F", "gender_M"]
issue_date_cols = [
    "issue_dayofyear",
    "issue_hour",
    "issue_weekday",
    "issue_dayofmonth",
    "issue_year",
    "issue_month",
    "issue_weekofyear",
    "issue_week",
    "issue_quarter",
]
derived_cols = ["redeem_issue_diff", "gender_freq_enc", "strange_age"]
features = demographic_cols + issue_date_cols + derived_cols

# Feature matrix indexed by client_id, restricted to the selected columns.
df = clients_df.set_index('client_id').loc[:, features]

In [8]:
# Load the uplift train and test tables, both indexed by client_id.
train_df, test_df = (
    pd.read_csv(f"../../data/raw/uplift_{split}.csv", index_col='client_id')
    for split in ("train", "test")
)

In [9]:
# Preview the first rows of the uplift training table.
train_df.head()

Unnamed: 0_level_0,treatment_flg,target
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000012768d,0,1
000036f903,1,1
00010925a5,1,1
0001f552b0,1,1
00020e7b18,1,1


In [10]:
# Transformed label: 1 when `treatment_flg` and `target` are both 1 or both 0,
# otherwise 0.
both_one = train_df['treatment_flg'].eq(1) & train_df['target'].eq(1)
both_zero = train_df['treatment_flg'].eq(0) & train_df['target'].eq(0)
train_df['new_target'] = (both_one | both_zero).astype(int)
train_df.head()

Unnamed: 0_level_0,treatment_flg,target,new_target
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000012768d,0,1,0
000036f903,1,1,1
00010925a5,1,1,1
0001f552b0,1,1,1
00020e7b18,1,1,1


In [11]:
# Client-id indexes of the labelled and unlabelled tables.
indices_train, indices_test = train_df.index, test_df.index

# 80/20 split of the labelled ids into a fitting part and a validation part;
# the seed is fixed so the split is reproducible.
indices_learn, indices_valid = train_test_split(
    indices_train,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Preview the feature matrix (indexed by client_id, one column per entry of `features`).
df.head()

Unnamed: 0_level_0,age,gender_U,gender_F,gender_M,issue_dayofyear,issue_hour,issue_weekday,issue_dayofmonth,issue_year,issue_month,issue_weekofyear,issue_week,issue_quarter,redeem_issue_diff,gender_freq_enc,strange_age
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
000012768d,45,1,0,0,217,15,5,5,2017,8,31,31,3,13146559.0,0.464077,0
000036f903,72,0,1,0,100,13,0,10,2017,4,15,15,2,1118613.0,0.368973,0
000048b7a6,68,0,1,0,349,13,5,15,2018,12,50,50,4,,0.368973,0
000073194a,60,0,1,0,143,12,1,23,2017,5,21,21,2,15978107.0,0.368973,0
00007c7133,67,1,0,0,142,16,0,22,2017,5,21,21,2,50806825.0,0.464077,0


In [22]:
# Features and transformed labels for the fitting split ...
X_train, y_train = df.loc[indices_learn], train_df.loc[indices_learn, 'new_target']
# ... and for the held-out validation split (named *_test here, but taken from
# `indices_valid`, not from the unlabelled test table).
X_test, y_test = df.loc[indices_valid], train_df.loc[indices_valid, 'new_target']

# XGBoost classifier with default hyper-parameters, trained on `new_target`.
clf = xgboost.XGBClassifier()
clf.fit(X_train, y_train)

# Probability of new_target == 1, mapped to an uplift score as 2 * p - 1.
validation_proba = clf.predict_proba(X_test)
predict_new_target = validation_proba[:, 1]
valid_uplift = 2 * predict_new_target - 1

# Score of the classifier on the validation split (cell output).
clf.score(X_test, y_test)

0.5170965806838632

In [19]:
# Size of the top-30% slice of the validation split.
n_users = int(len(valid_uplift) * 0.3)

# Validation rows in the same order used to produce `valid_uplift`.
valid_rows = train_df.loc[indices_valid]

# Predicted uplift per validation client, highest first.
df_valid_uplift = (
    pd.DataFrame({'uplift': valid_uplift}, index=valid_rows.index)
    .sort_values(by='uplift', ascending=False)
)
df_valid_uplift['new_target'] = valid_rows['new_target']
df_valid_uplift['diff'] = df_valid_uplift['uplift'] - df_valid_uplift['new_target']

# Mean of (uplift - new_target) over the top `n_users` rows (cell output).
# NOTE(review): this subtracts a 0/1 label from a score in [-1, 1]; confirm
# that this is the intended validation metric.
df_valid_uplift['diff'].iloc[:n_users].mean()

-0.46572473645210266

In [27]:
# Score the unlabelled test clients with the fitted classifier.
# `predict_new_target` is reassigned here and now holds test-set probabilities.
X_submit = df.loc[indices_test]
predict_new_target = clf.predict_proba(X_submit)[:, 1]
test_uplift = 2 * predict_new_target - 1

In [28]:
# One uplift score per test client_id, written out as the submission CSV.
df_submission = pd.DataFrame(
    data={'uplift': test_uplift},
    index=test_df.index,
)
df_submission.to_csv('../../data/submissions/baseline_class_transformation.csv')

In [29]:
# Display the submission frame (client_id index, single `uplift` column).
df_submission

Unnamed: 0_level_0,uplift
client_id,Unnamed: 1_level_1
000048b7a6,0.003469
000073194a,0.039504
00007c7133,0.056646
00007f9014,0.014655
0000a90cf7,0.031699
0000b59cec,0.039774
0000bb4e4e,0.021165
0000bcec9c,0.017207
0000eecb82,0.007698
0000f0ecdb,0.000435


In [44]:
# Show the 30% of scored test clients with the highest predicted uplift.
# The slice size is taken from `df_submission` itself: `df` holds every client
# in the feature matrix, so sizing from it would select more than 30% of the
# rows being sliced.
n_users = int(len(df_submission) * 0.3)

# The original expression ended in a dangling `- df_`, an undefined name that
# raises NameError when the cell is run; the ranked slice alone is displayed.
df_submission.sort_values(by='uplift', ascending=False).iloc[:n_users]

Unnamed: 0_level_0,uplift
client_id,Unnamed: 1_level_1
a0c9132ca7,0.812842
9ffa63f10c,0.716443
ec2b70c3cf,0.706744
2eff5d3eef,0.678377
b534894551,0.651506
efd61e43fa,0.643815
fb68fb0e4a,0.640801
6ba86fa404,0.618017
ce231d4afa,0.605029
8cde72c289,0.577585
