In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import clone
import xgboost

In [2]:
# Client master table; the two card-lifecycle columns are parsed as dates on load.
client_date_columns = ['first_issue_date', 'first_redeem_date']
clients_df = pd.read_csv("../../data/raw/clients.csv", parse_dates=client_date_columns)
# The product and purchase tables are not loaded by this notebook; loaders kept for reference.
# products_df = pd.read_csv("../../data/raw/products.csv")
# purchases_df = pd.read_csv("../../data/raw/purchases.csv", parse_dates=['transaction_datetime'])

In [3]:
# Calendar features derived from the card issue timestamp.
issue_dt = clients_df['first_issue_date'].dt

clients_df['issue_dayofyear'] = issue_dt.dayofyear
clients_df['issue_hour'] = issue_dt.hour
clients_df['issue_weekday'] = issue_dt.weekday
clients_df['issue_dayofmonth'] = issue_dt.day
clients_df['issue_year'] = issue_dt.year
clients_df['issue_month'] = issue_dt.month

# `.dt.week` / `.dt.weekofyear` were deprecated in pandas 1.1 and removed in 2.0.
# Use the ISO calendar accessor when it exists and fall back to the legacy
# attribute on older pandas so the cell runs on either.
if hasattr(issue_dt, 'isocalendar'):
    iso_week = issue_dt.isocalendar()['week']
    # isocalendar() yields a nullable UInt32 column; convert to a plain numpy
    # dtype (int64, or float64 with NaN when dates are missing) like the legacy accessor.
    iso_week = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
else:
    iso_week = issue_dt.week

# Both names were aliases of the same value; both columns are kept because
# the feature list in this notebook references each of them.
clients_df['issue_weekofyear'] = iso_week
clients_df['issue_week'] = iso_week
clients_df['issue_quarter'] = issue_dt.quarter

In [4]:
# Seconds elapsed between card issue and first redemption.
issue_to_redeem = clients_df['first_redeem_date'] - clients_df['first_issue_date']
clients_df['redeem_issue_diff'] = issue_to_redeem.dt.total_seconds()

# Frequency encoding: each category is replaced by its share of all client rows.
cat_col = "gender"
encoding = clients_df.groupby(cat_col).size().div(len(clients_df))
clients_df[f"{cat_col}_freq_enc"] = clients_df[cat_col].map(encoding)

In [5]:
# Flag ages in the extreme tails: strictly below the 1st percentile or
# strictly above the 99th percentile of the observed age distribution.
client_age = clients_df['age']
age_lower_cut = client_age.quantile(.01)
age_upper_cut = client_age.quantile(.99)
age_is_extreme = (client_age < age_lower_cut) | (client_age > age_upper_cut)
clients_df['strange_age'] = age_is_extreme.astype(int)

In [6]:
# One 0/1 indicator column per distinct value found in the gender column.
gender = clients_df['gender']
for value in gender.unique():
    clients_df[f'gender_{value}'] = gender.eq(value).astype(int)

In [7]:
# Model inputs, grouped by origin. The concatenation order below is the
# column order of the feature matrix.
demographic_features = ["age", "gender_U", "gender_F", "gender_M"]
issue_date_features = [
    "issue_dayofyear",
    "issue_hour",
    "issue_weekday",
    "issue_dayofmonth",
    "issue_year",
    "issue_month",
    "issue_weekofyear",
    "issue_week",
    "issue_quarter",
]
derived_features = ["redeem_issue_diff", "gender_freq_enc", "strange_age"]
features = demographic_features + issue_date_features + derived_features

# Feature matrix keyed by client_id so rows can be selected by label later.
df = clients_df.set_index('client_id')[features]

In [8]:
# Train and test client lists, both indexed by client_id.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='client_id')
    for csv_path in ("../../data/raw/uplift_train.csv", "../../data/raw/uplift_test.csv")
)

In [9]:
# Preview the first rows of the training table loaded from uplift_train.csv.
train_df.head()

Unnamed: 0_level_0,treatment_flg,target
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000012768d,0,1
000036f903,1,1
00010925a5,1,1
0001f552b0,1,1
00020e7b18,1,1


In [10]:
# Transformed label for the single-classifier uplift approach:
# 1 when the client was treated and has target 1, or was in the control group
# and has target 0; otherwise 0.
treated_with_target = (train_df['treatment_flg'] == 1) & (train_df['target'] == 1)
control_without_target = (train_df['treatment_flg'] == 0) & (train_df['target'] == 0)
train_df['new_target'] = (treated_with_target | control_without_target).astype(int)
train_df.head()

Unnamed: 0_level_0,treatment_flg,target,new_target
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000012768d,0,1,0
000036f903,1,1,1
00010925a5,1,1,1
0001f552b0,1,1,1
00020e7b18,1,1,1


In [11]:
# Client ids of the labelled set and of the set to be scored for submission.
indices_train, indices_test = train_df.index, test_df.index

# Hold out 30% of the labelled clients for validation; fixed seed for a repeatable split.
indices_learn, indices_valid = train_test_split(
    indices_train,
    test_size=0.3,
    random_state=42,
)

In [12]:
# Learn / validation matrices. Note that X_test and y_test hold the validation
# split (indices_valid), not the clients scored for submission.
X_train, y_train = df.loc[indices_learn, :], train_df.loc[indices_learn, 'new_target']
X_test, y_test = df.loc[indices_valid, :], train_df.loc[indices_valid, 'new_target']

# Baseline: XGBoost with library defaults, trained on the transformed label.
clf = xgboost.XGBClassifier()
clf.fit(X_train, y_train)

# Second predict_proba column is taken as the score for new_target == 1,
# then rescaled from [0, 1] to [-1, 1] as the uplift estimate.
predict_new_target = clf.predict_proba(X_test)[:, 1]
valid_uplift = 2 * predict_new_target - 1

# Accuracy on the validation split.
clf.score(X_test, y_test)

0.5170799173498634

In [18]:
import scipy.stats as st
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

In [21]:
clf = xgboost.XGBClassifier()

# Sampling distributions for the random search (scipy frozen distributions).
# scipy's uniform(loc, scale) covers [loc, loc + scale], so learning_rate is
# drawn from [0.06, 0.18] and gamma from [0, 10].
params_grid = dict(
    n_estimators=st.randint(100, 500),
    colsample_bytree=st.beta(10, 1),
    subsample=st.beta(10, 1),
    gamma=st.uniform(0, 10),
    reg_alpha=st.expon(0, 50),
    min_child_weight=st.expon(0, 50),
    learning_rate=st.uniform(0.06, 0.12),
    max_depth=st.randint(6, 30),
)

# 20 sampled configurations, each evaluated with 5-fold stratified cross-validation.
cv = StratifiedKFold(n_splits=5)
search_sk = RandomizedSearchCV(
    estimator=clf,
    param_distributions=params_grid,
    n_iter=20,
    cv=cv,
    random_state=42,
)
search_sk.fit(X_train, y_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                   error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=N...
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000027E12D446D8>,
                                        'reg_alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000027E125B1F60>,
                             

In [22]:
# best parameters
print("best parameters:", search_sk.best_params_)
print("best score:", search_sk.best_score_)

best parameters: {'colsample_bytree': 0.9303737761527869, 'gamma': 5.926967238793935, 'learning_rate': 0.06970239915992582, 'max_depth': 29, 'min_child_weight': 13.566658616790289, 'n_estimators': 354, 'reg_alpha': 81.26306126260499, 'subsample': 0.8074961064961306}
best score: 0.5157648167853343


In [23]:
# Learn / validation matrices (X_test and y_test hold the validation split).
X_train = df.loc[indices_learn, :]
y_train = train_df.loc[indices_learn, 'new_target']

X_test = df.loc[indices_valid, :]
y_test = train_df.loc[indices_valid, 'new_target']

# RandomizedSearchCV was built with the default refit=True, so after the search
# it already holds the best configuration re-trained on the full X_train/y_train.
# Reuse that fitted model instead of training the same configuration a second time.
clf = search_sk.best_estimator_

# Second predict_proba column is taken as the score for new_target == 1,
# then rescaled from [0, 1] to [-1, 1] as the uplift estimate.
predict_new_target = clf.predict_proba(X_test)[:, 1]
valid_uplift = (2 * predict_new_target) - 1

# Accuracy of the tuned model on the validation split.
clf.score(X_test, y_test)

0.518512964073852

In [19]:
# Validation clients ranked by predicted uplift, highest first.
df_valid_uplift = pd.DataFrame(
    {'uplift': valid_uplift},
    index=train_df.loc[indices_valid].index,
).sort_values(by='uplift', ascending=False)

# Attach the transformed label (aligned on client_id) and its gap to the score.
df_valid_uplift['new_target'] = train_df.loc[indices_valid, 'new_target']
df_valid_uplift['diff'] = df_valid_uplift['uplift'] - df_valid_uplift['new_target']

# Mean gap over the top 30% of ranked clients.
# NOTE(review): this averages (uplift score - transformed label); it is not a
# treated-vs-control response difference. Confirm it is the intended validation metric.
n_users = int(len(valid_uplift) * 0.3)
df_valid_uplift['diff'].iloc[:n_users].mean()

-0.46572473645210266

In [27]:
# Score the clients listed in the test file with the current classifier and
# rescale the class-1 score from [0, 1] to [-1, 1].
X_submission = df.loc[indices_test, :]
predict_new_target = clf.predict_proba(X_submission)[:, 1]
test_uplift = 2 * predict_new_target - 1

In [28]:
# One 'uplift' column indexed by the test client ids, written out as the submission file.
submission_path = '../../data/submissions/baseline_class_transformation.csv'
df_submission = pd.Series(test_uplift, index=test_df.index, name='uplift').to_frame()
df_submission.to_csv(submission_path)