In [120]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import clone
import xgboost

In [3]:
# Load the raw clients, products and purchases tables.
# Timestamp columns are parsed at read time so the `.dt` accessor works on them later.
clients_df = pd.read_csv(
    "../../data/raw/clients.csv",
    parse_dates=['first_issue_date', 'first_redeem_date'],
)
products_df = pd.read_csv("../../data/raw/products.csv")
purchases_df = pd.read_csv(
    "../../data/raw/purchases.csv",
    parse_dates=['transaction_datetime'],
)

In [5]:
# Calendar features derived from the `first_issue_date` timestamp.
clients_df['issue_dayofyear'] = clients_df['first_issue_date'].dt.dayofyear
clients_df['issue_hour'] = clients_df['first_issue_date'].dt.hour
clients_df['issue_weekday'] = clients_df['first_issue_date'].dt.weekday
clients_df['issue_dayofmonth'] = clients_df['first_issue_date'].dt.day
clients_df['issue_year'] = clients_df['first_issue_date'].dt.year
clients_df['issue_month'] = clients_df['first_issue_date'].dt.month

# `Series.dt.weekofyear` / `Series.dt.week` were deprecated in pandas 1.1 and removed in
# pandas 2.0. `dt.isocalendar().week` is the supported replacement and yields the same
# ISO-8601 week number. Fall back to the old accessor only on pandas < 1.1.
if hasattr(clients_df['first_issue_date'].dt, 'isocalendar'):
    issue_iso_week = clients_df['first_issue_date'].dt.isocalendar().week
    # isocalendar() returns the nullable UInt32 extension dtype. Convert it back to a
    # plain NumPy dtype (int64, or float64 with NaN when dates are missing) so that a
    # later `.values` on the feature frame stays numeric instead of becoming `object`.
    issue_iso_week = issue_iso_week.astype(
        'float64' if issue_iso_week.isna().any() else 'int64'
    )
else:
    issue_iso_week = clients_df['first_issue_date'].dt.weekofyear

# Both columns are kept (they hold the same values) because later cells select both by name.
clients_df['issue_weekofyear'] = issue_iso_week
clients_df['issue_week'] = issue_iso_week
clients_df['issue_quarter'] = clients_df['first_issue_date'].dt.quarter

In [11]:
# Seconds between the first issue and the first redeem timestamps
# (NaN where a timestamp is missing).
clients_df['redeem_issue_diff'] = (
    clients_df['first_redeem_date']
    .sub(clients_df['first_issue_date'])
    .dt.total_seconds()
)

# Frequency encoding: map each category to its share of all client rows.
cat_col = "gender"
encoding = clients_df.groupby(cat_col).size() / len(clients_df)
clients_df[f"{cat_col}_freq_enc"] = clients_df[cat_col].map(encoding)

In [12]:
# Flag ages that fall strictly outside the 1st-99th percentile range as 0/1 integers.
clients_df['strange_age'] = (
    clients_df['age'].lt(clients_df['age'].quantile(.01))
    | clients_df['age'].gt(clients_df['age'].quantile(.99))
).astype(int)

In [19]:
# Build one 0/1 indicator column per distinct value found in `gender`.
for value in clients_df['gender'].unique():
    clients_df[f'gender_{value}'] = clients_df['gender'].eq(value).astype(int)

In [20]:
# Preview the first rows of the client table, including the columns added above.
clients_df.head()

Unnamed: 0,client_id,first_issue_date,first_redeem_date,age,gender,issue_dayofyear,issue_hour,issue_weekday,issue_dayofmonth,issue_year,issue_month,issue_weekofyear,issue_week,issue_quarter,redeem_issue_diff,gender_freq_enc,strange_age,gender_U,gender_F,gender_M
0,000012768d,2017-08-05 15:40:48,2018-01-04 19:30:07,45,U,217,15,5,5,2017,8,31,31,3,13146559.0,0.464077,0,1,0,0
1,000036f903,2017-04-10 13:54:23,2017-04-23 12:37:56,72,F,100,13,0,10,2017,4,15,15,2,1118613.0,0.368973,0,0,1,0
2,000048b7a6,2018-12-15 13:33:11,NaT,68,F,349,13,5,15,2018,12,50,50,4,,0.368973,0,0,1,0
3,000073194a,2017-05-23 12:56:14,2017-11-24 11:18:01,60,F,143,12,1,23,2017,5,21,21,2,15978107.0,0.368973,0,0,1,0
4,00007c7133,2017-05-22 16:17:08,2018-12-31 17:17:33,67,U,142,16,0,22,2017,5,21,21,2,50806825.0,0.464077,0,1,0,0


In [30]:
# Columns fed to the model; the order here fixes the column order of the feature matrix.
features = [
    # raw attribute and gender indicators
    "age",
    "gender_U",
    "gender_F",
    "gender_M",
    # calendar parts of the first issue timestamp
    "issue_dayofyear",
    "issue_hour",
    "issue_weekday",
    "issue_dayofmonth",
    "issue_year",
    "issue_month",
    "issue_weekofyear",
    "issue_week",
    "issue_quarter",
    # derived features
    "redeem_issue_diff",
    "gender_freq_enc",
    "strange_age",
]
# Feature frame keyed by client_id so rows can be selected by client id labels.
df = clients_df.set_index('client_id').loc[:, features]

In [31]:
# Train/test client lists, indexed by client_id.
train_df = pd.read_csv(
    "../../data/raw/uplift_train.csv",
    index_col='client_id',
)
test_df = pd.read_csv(
    "../../data/raw/uplift_test.csv",
    index_col='client_id',
)

In [34]:
# Preview the first rows of the training table.
train_df.head()

Unnamed: 0_level_0,treatment_flg,target
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000012768d,0,1
000036f903,1,1
00010925a5,1,1
0001f552b0,1,1
00020e7b18,1,1


In [118]:
# Client ids for the full train and test sets, plus a reproducible 80/20
# learn/validation split of the train ids.
indices_train = train_df.index
indices_test = test_df.index
indices_learn, indices_valid = train_test_split(
    indices_train,
    test_size=0.2,
    random_state=42,
)

In [117]:
def uplift_fit_predict(model, X_train, treatment_train, target_train, X_test):
    """
    Simple two-model uplift baseline.

    Two binary classifiers estimate the probability of `target` for a client:
    1. one fitted on rows where communication took place (treatment == 1)
    2. one fitted on rows without communication (treatment == 0)

    The uplift estimate for a new client is the difference of the two
    probability estimates:
    Predicted Uplift = P(target | treatment=1) - P(target | treatment=0)

    Each classifier is a separate `clone(model)`; the returned array is
    `predict_proba(X_test)[:, 1]` of the treatment model minus that of the
    control model.
    """
    treated_rows = treatment_train == 1
    control_rows = treatment_train == 0

    treated_model = clone(model).fit(X_train[treated_rows, :], target_train[treated_rows])
    control_model = clone(model).fit(X_train[control_rows, :], target_train[control_rows])

    proba_treated = treated_model.predict_proba(X_test)[:, 1]
    proba_control = control_model.predict_proba(X_test)[:, 1]
    return proba_treated - proba_control


def uplift_score(prediction, treatment, target, rate=0.3):
    """
    Compute the Uplift Score.

    Rows are ranked by descending `prediction`. Within each group
    (treatment == 1 and treatment == 0) the mean of `target` is taken over the
    first `int(group_size * rate)` ranked rows; the score is the treatment
    mean minus the control mean.
    """
    ranking = np.argsort(-prediction)
    ranked_target = target[ranking]
    ranked_treatment = treatment[ranking]

    def _top_mean(flag):
        # Mean target over the top-ranked share of rows belonging to one group.
        top_n = int((treatment == flag).sum() * rate)
        return ranked_target[ranked_treatment == flag][:top_n].mean()

    return _top_mean(1) - _top_mean(0)

In [121]:
# Fit on the learn split, predict uplift for the validation split, then score it.
# Missing feature values are replaced with 0 before converting to arrays.
valid_uplift = uplift_fit_predict(
    model=xgboost.XGBClassifier(),
    X_train=df.loc[indices_learn].fillna(0).values,
    treatment_train=train_df['treatment_flg'].loc[indices_learn].values,
    target_train=train_df['target'].loc[indices_learn].values,
    X_test=df.loc[indices_valid].fillna(0).values,
)
valid_score = uplift_score(
    valid_uplift,
    treatment=train_df['treatment_flg'].loc[indices_valid].values,
    target=train_df['target'].loc[indices_valid].values,
)
print('Validation score:', valid_score)

Validation score: 0.05081166028966111


In [122]:
# Prepare predictions for the test clients: fit on the full train set,
# predict uplift for every test client id.

test_uplift = uplift_fit_predict(
    model=xgboost.XGBClassifier(),
    X_train=df.loc[indices_train].fillna(0).values,
    treatment_train=train_df['treatment_flg'].loc[indices_train].values,
    target_train=train_df['target'].loc[indices_train].values,
    X_test=df.loc[indices_test].fillna(0).values,
)

NameError: name 'pandas' is not defined

In [124]:
# Write the per-client uplift predictions, indexed like the test table, to CSV.
df_submission = pd.DataFrame(
    {'uplift': test_uplift},
    index=test_df.index,
)
df_submission.to_csv(
    '../../data/submissions/baseline_two_models.csv'
)