In [None]:
# Dependent and Shared Data Representations improve Uplift Prediction in Imbalanced Treatment Conditions

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import clone
import xgboost

In [2]:
# Client master data; both date columns are parsed into datetimes on load.
clients_df = pd.read_csv(
    "../../data/raw/clients.csv",
    parse_dates=['first_issue_date', 'first_redeem_date'],
)
# The product and purchase tables are not used below; loaders kept for reference.
# products_df = pd.read_csv("../../data/raw/products.csv")
# purchases_df = pd.read_csv("../../data/raw/purchases.csv", parse_dates=['transaction_datetime'])

In [3]:
# Calendar features derived from the card issue timestamp.
issue_dt = clients_df['first_issue_date'].dt
clients_df['issue_dayofyear'] = issue_dt.dayofyear
clients_df['issue_hour'] = issue_dt.hour
clients_df['issue_weekday'] = issue_dt.weekday
clients_df['issue_dayofmonth'] = issue_dt.day
clients_df['issue_year'] = issue_dt.year
clients_df['issue_month'] = issue_dt.month

# `Series.dt.week` / `Series.dt.weekofyear` were deprecated in pandas 1.1 and
# removed in pandas 2.0. Prefer `isocalendar().week` when it is available and
# fall back to the legacy accessor on older pandas versions.
if hasattr(issue_dt, 'isocalendar'):
    # isocalendar() yields a nullable UInt32 column; cast to a plain float
    # dtype so the downstream model receives an ordinary numeric column.
    issue_week = issue_dt.isocalendar().week.astype('float64')
else:
    issue_week = issue_dt.week

# Both column names are kept because the feature list selects each of them;
# they carry the same ISO week number.
clients_df['issue_weekofyear'] = issue_week
clients_df['issue_week'] = issue_week
clients_df['issue_quarter'] = issue_dt.quarter

In [4]:
# Seconds elapsed between card issue and first redemption.
issue_to_redeem = clients_df['first_redeem_date'] - clients_df['first_issue_date']
clients_df['redeem_issue_diff'] = issue_to_redeem.dt.total_seconds()

# Frequency encoding: map every category to its share of all client rows.
cat_col = "gender"
encoding = clients_df.groupby(cat_col).size() / len(clients_df)
clients_df[f"{cat_col}_freq_enc"] = clients_df[cat_col].map(encoding)

In [5]:
# Flag ages that fall outside the 1st..99th percentile band as 1, others as 0.
age = clients_df['age']
age_low = age.quantile(.01)
age_high = age.quantile(.99)
clients_df['strange_age'] = ((age < age_low) | (age > age_high)).astype(int)

In [6]:
# One indicator column (0/1) per distinct value found in the gender column.
gender = clients_df['gender']
for gender_code in gender.unique():
    clients_df[f'gender_{gender_code}'] = gender.eq(gender_code).astype(int)

In [7]:
# Model inputs, assembled group by group; the resulting order is the column
# order of the feature matrix.
features = (
    ["age"]
    + [f"gender_{code}" for code in ("U", "F", "M")]
    + [
        f"issue_{part}"
        for part in (
            "dayofyear",
            "hour",
            "weekday",
            "dayofmonth",
            "year",
            "month",
            "weekofyear",
            "week",
            "quarter",
        )
    ]
    + ["redeem_issue_diff", "gender_freq_enc", "strange_age"]
)

# Feature matrix keyed by client_id so it can be sliced with the split indices.
clients_by_id = clients_df.set_index('client_id')
df = clients_by_id[features]

In [8]:
# Labelled clients (treatment flag and target) and the clients to be scored,
# both indexed by client_id.
train_df = pd.read_csv(
    "../../data/raw/uplift_train.csv",
    index_col='client_id',
)
test_df = pd.read_csv(
    "../../data/raw/uplift_test.csv",
    index_col='client_id',
)

In [9]:
indices_train, indices_test = train_df.index, test_df.index
# Hold out 30% of the labelled clients for validation; the fixed seed keeps
# the split reproducible.
indices_learn, indices_valid = train_test_split(
    indices_train,
    test_size=0.3,
    random_state=42,
)

In [None]:
# 1. Train model on control data.

In [45]:
# Learning-split clients that did NOT receive the treatment.
learn_labels = train_df.loc[indices_learn]
is_control = learn_labels['treatment_flg'] == 0
X_train_control = df.loc[indices_learn][is_control]
y_train_control = learn_labels[is_control]['target']
assert len(X_train_control) == len(y_train_control)

In [50]:
# Control-group response model, fitted with the library's default hyperparameters.
clf = xgboost.XGBClassifier()
clf.fit(X=X_train_control, y=y_train_control)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [46]:
# 2. Use model prediction on treatment data as feature.

In [51]:
# Learning-split clients that DID receive the treatment.
learn_labels = train_df.loc[indices_learn]
is_treated = learn_labels['treatment_flg'] == 1
X_train_treatment = df.loc[indices_learn][is_treated]
y_train_treatment = learn_labels[is_treated]['target']
assert len(X_train_treatment) == len(y_train_treatment)

In [52]:
# Append the control model's predicted response probability as an extra
# feature for the treatment model.
# `assign` builds a new frame instead of writing into a slice of `df`
# (avoids SettingWithCopyWarning), and selecting `features` explicitly keeps
# the cell re-runnable: the control model never sees `prediction_1`.
X_train_treatment = X_train_treatment.assign(
    prediction_1=clf.predict_proba(X_train_treatment[features])[:, 1]
)

In [54]:
# Treatment-group response model; its inputs include the control model's
# prediction as the `prediction_1` column.
clf2 = xgboost.XGBClassifier()
clf2.fit(X=X_train_treatment, y=y_train_treatment)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [43]:
# Hold-out features and targets, aligned on the validation client ids.
X_valid = df.loc[indices_valid]
y_valid = train_df['target'].loc[indices_valid]

In [55]:
# Score the hold-out set with both models.
# The control model is evaluated once, on the base features only, so the cell
# can be re-run after `prediction_1` has been added to `X_valid`.
predict_control = clf.predict_proba(X_valid[features])[:, 1]

# `assign` returns a new frame rather than mutating a slice of `df`
# (avoids SettingWithCopyWarning) and reuses the prediction computed above.
X_valid = X_valid.assign(prediction_1=predict_control)

predict_treatment = clf2.predict_proba(X_valid)[:, 1]

# Uplift = predicted response if treated minus predicted response if not treated.
predict_uplift = predict_treatment - predict_control

In [33]:
def uplift_score(prediction, treatment, target, rate=0.3):
    """
    Compute the uplift score on the top `rate` share of ranked observations.

    Observations are ranked by `prediction` in descending order. Within the
    treated group and within the control group separately, the first
    `int(group_size * rate)` observations of that ranking are selected; the
    score is the mean `target` of the treated selection minus the mean
    `target` of the control selection.

    Parameters
    ----------
    prediction : array-like
        Predicted uplift per observation (higher ranks first).
    treatment : array-like
        Treatment indicator per observation; 1 means treated, 0 means control.
    target : array-like
        Observed outcome per observation.
    rate : float, default 0.3
        Share of each group to keep from the top of the ranking.

    Returns
    -------
    float
        Difference of the two group means. NaN when either selection is empty.
    """
    # Coerce to NumPy arrays so that `x[order]` is always positional. With a
    # pandas Series, integer-array indexing is label-based and can silently
    # reorder by index label instead of by rank.
    prediction = np.asarray(prediction)
    treatment = np.asarray(treatment)
    target = np.asarray(target)

    order = np.argsort(-prediction)
    ranked_target = target[order]
    ranked_treatment = treatment[order]

    treatment_n = int((treatment == 1).sum() * rate)
    treatment_p = ranked_target[ranked_treatment == 1][:treatment_n].mean()

    control_n = int((treatment == 0).sum() * rate)
    control_p = ranked_target[ranked_treatment == 0][:control_n].mean()

    return treatment_p - control_p

In [56]:
# Evaluate the predicted uplift against the hold-out treatment flags and targets.
valid_labels = train_df.loc[indices_valid]
valid_score = uplift_score(
    predict_uplift,
    treatment=valid_labels['treatment_flg'].values,
    target=valid_labels['target'].values,
)

In [57]:
# Show the hold-out uplift score (computed with uplift_score's default rate of 0.3).
valid_score

0.06555880976314943

In [58]:
# Feature rows for the clients that need to be scored.
X_test = df.loc[indices_test]

In [59]:
# Score the test clients with both models.
# The control model is evaluated once, on the base features only, so the cell
# can be re-run after `prediction_1` has been added to `X_test`.
predict_control = clf.predict_proba(X_test[features])[:, 1]

# `assign` returns a new frame rather than mutating a slice of `df`
# (avoids SettingWithCopyWarning) and reuses the prediction computed above.
X_test = X_test.assign(prediction_1=predict_control)

# The treatment model is named `clf2`; the previous `clf_2` was an undefined
# name and raised NameError on a fresh kernel.
predict_treatment = clf2.predict_proba(X_test)[:, 1]

# Uplift = predicted response if treated minus predicted response if not treated.
predict_uplift = predict_treatment - predict_control

In [60]:
# One predicted-uplift value per test client, keyed by the test frame's index.
df_submission = pd.DataFrame(
    data={'uplift': predict_uplift},
    index=test_df.index,
)

In [61]:
# Persist the submission; the index is written as the first CSV column.
df_submission.to_csv(path_or_buf='../../data/submissions/baseline_ddr.csv')