In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import clone
import xgboost

In [2]:
clients_df = pd.read_csv("../../data/raw/clients.csv", parse_dates=['first_issue_date', 'first_redeem_date'])
# products_df = pd.read_csv("../../data/raw/products.csv")
# purchases_df = pd.read_csv("../../data/raw/purchases.csv", parse_dates=['transaction_datetime'])

In [3]:
clients_df['issue_dayofyear'] = clients_df['first_issue_date'].dt.dayofyear
clients_df['issue_hour'] = clients_df['first_issue_date'].dt.hour
clients_df['issue_weekday'] = clients_df['first_issue_date'].dt.weekday
clients_df['issue_dayofmonth'] = clients_df['first_issue_date'].dt.day
clients_df['issue_year'] = clients_df['first_issue_date'].dt.year
clients_df['issue_month'] = clients_df['first_issue_date'].dt.month
clients_df['issue_weekofyear'] = clients_df['first_issue_date'].dt.weekofyear
clients_df['issue_week'] = clients_df['first_issue_date'].dt.week
clients_df['issue_quarter'] = clients_df['first_issue_date'].dt.quarter

In [4]:
clients_df['redeem_issue_diff'] = (clients_df['first_redeem_date'] - clients_df['first_issue_date']).dt.total_seconds()
cat_col = "gender"
encoding = clients_df.groupby(f"{cat_col}").size()
encoding = encoding / clients_df.shape[0]
clients_df[f"{cat_col}_freq_enc"] = clients_df[f"{cat_col}"].map(encoding)

In [5]:
clients_df['strange_age'] = (clients_df['age'] < clients_df['age'].quantile(.01)) | (clients_df['age'] > clients_df['age'].quantile(.99))
clients_df['strange_age'] = clients_df['strange_age'].astype(int)

In [6]:
for value in clients_df['gender'].unique():
    clients_df[f'gender_{value}'] = (clients_df['gender'] == value).astype(int)

In [7]:
features = [
    "age",
    "gender_U",
    "gender_F",
    "gender_M",
    "issue_dayofyear",
    "issue_hour",
    "issue_weekday",
    "issue_dayofmonth",
    "issue_year",
    "issue_month",
    "issue_weekofyear",
    "issue_week",
    "issue_quarter",
    "redeem_issue_diff",
    "gender_freq_enc",
    "strange_age",
]
df = clients_df.set_index('client_id')[features]

In [8]:
train_df = pd.read_csv("../../data/raw/uplift_train.csv", index_col='client_id')
test_df = pd.read_csv("../../data/raw/uplift_test.csv", index_col='client_id')

In [10]:
train_df.head()

Unnamed: 0_level_0,treatment_flg,target
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000012768d,0,1
000036f903,1,1
00010925a5,1,1
0001f552b0,1,1
00020e7b18,1,1


In [18]:
train_df['new_target'] = ((train_df['treatment_flg'] == 1) & (train_df['target'] ==  1)) | ((train_df['treatment_flg'] == 0) & (train_df['target'] == 0))
train_df['new_target'] = train_df['new_target'].astype(int)
train_df.head()

Unnamed: 0_level_0,treatment_flg,target,new_target
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000012768d,0,1,0
000036f903,1,1,1
00010925a5,1,1,1
0001f552b0,1,1,1
00020e7b18,1,1,1


In [9]:
indices_train = train_df.index
indices_test = test_df.index
indices_learn, indices_valid = train_test_split(train_df.index, test_size=0.2, random_state=42)

In [38]:
X_train = df.loc[indices_learn, :].values
y_train = train_df.loc[indices_learn, 'new_target'].values

X_test = df.loc[indices_valid, :].values
y_test = train_df.loc[indices_valid, 'new_target'].values

clf = xgboost.XGBClassifier()
clf.fit(X_train, y_train)
predict_new_target = clf.predict_proba(X_test)[:, 1]
valid_uplift = (2 * predict_new_target) - 1
clf.score(X_test, y_test)

0.5170965806838632

In [48]:
df_valid_uplift = pd.DataFrame({'uplift': valid_uplift}, index=train_df.loc[indices_valid].index).sort_values(by='uplift', ascending=False)
df_valid_uplift - train_df.loc[indices_valid]['new_target']

MemoryError: 

In [39]:
predict_new_target = clf.predict_proba(df.loc[indices_test, :].values)[:, 1]
test_uplift = (2 * predict_new_target) - 1

In [41]:
df_submission = pd.DataFrame({'uplift': test_uplift}, index=test_df.index)
df_submission.to_csv('../../data/submissions/baseline_class_transformation.csv')

In [44]:
n_users = int(df.shape[0] * 0.3)
df_submission.sort_values(by='uplift', ascending=False)[:n_users] - df_

Unnamed: 0_level_0,uplift
client_id,Unnamed: 1_level_1
a0c9132ca7,0.812842
9ffa63f10c,0.716443
ec2b70c3cf,0.706744
2eff5d3eef,0.678377
b534894551,0.651506
efd61e43fa,0.643815
fb68fb0e4a,0.640801
6ba86fa404,0.618017
ce231d4afa,0.605029
8cde72c289,0.577585
