In [1]:
import pandas as pd
import os
import numpy as np
import itertools
import xgboost as xgb
import joblib
import datetime

pd.set_option('display.max_columns', None)

# Load Dataset

In [2]:
dsdir = 'dataset/coupon-purchase-prediction'

submission = pd.read_csv(os.path.join(dsdir,'sample_submission.csv'))

train = joblib.load('world/train.pkl').sample(frac=1, random_state=0).reset_index(drop=True)
test = joblib.load('world/test.pkl')

train.SEX_ID = train.SEX_ID.astype(np.uint8)
test.SEX_ID = test.SEX_ID.astype(np.uint8)

train.drop(['COUPON_ID_hash','USER_ID_hash'], axis=1, inplace=True)
sub = test[['USER_ID_hash','COUPON_ID_hash']].copy()
test.drop(['COUPON_ID_hash','USER_ID_hash', 'DISPFROM', 'PURCHASE_FLG'], axis=1, inplace=True)

In [3]:
print(len(sub.USER_ID_hash.unique()))
print(len(sub.COUPON_ID_hash.unique()))

22873
310


In [4]:
train.set_index('DISPFROM', inplace=True)
train.index = pd.to_datetime(train.index)

# Get Train Val

In [5]:
train_range = []

start = pd.to_datetime('01/08/2012')
end = start
for i in range(23):
    end = end + datetime.timedelta(days=7)

In [6]:
xgb_train = train[(train.index >= start) & (train.index < end)]
xgb_eval = train[(train.index >= end)]

In [7]:
print(len(xgb_train[xgb_train.PURCHASE_FLG==0]),len(xgb_train[xgb_train.PURCHASE_FLG==1]))
print(len(xgb_eval[xgb_eval.PURCHASE_FLG==0]),len(xgb_eval[xgb_eval.PURCHASE_FLG==1]))

3234269 545442
112892 17724


In [8]:
xgb_train = xgb.DMatrix(xgb_train.drop('PURCHASE_FLG', axis=1), label=xgb_train.PURCHASE_FLG.values.reshape(-1))

In [9]:
xgb_eval = xgb.DMatrix(xgb_eval.drop('PURCHASE_FLG', axis=1), label=xgb_eval.PURCHASE_FLG.values.reshape(-1))

# Train

In [10]:
xgb_params = {'booster':'gbtree',
              'objective': 'binary:logistic',
              'eta': 0.005,
              'max_depth': 12,
              'eval_metric': 'logloss'
             }

num_round = 5000

watchlist = [(xgb_train, 'train'), (xgb_eval, 'valid')]
xgb_model = xgb.train(xgb_params, xgb_train, num_round, watchlist, early_stopping_rounds=200, verbose_eval=100)

[0]	train-logloss:0.68811	valid-logloss:0.68887
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 200 rounds.
[100]	train-logloss:0.40752	valid-logloss:0.40760
[200]	train-logloss:0.27710	valid-logloss:0.27785
[300]	train-logloss:0.20989	valid-logloss:0.21150
[400]	train-logloss:0.17386	valid-logloss:0.17671
[500]	train-logloss:0.15406	valid-logloss:0.15825
[600]	train-logloss:0.14296	valid-logloss:0.14854
[700]	train-logloss:0.13658	valid-logloss:0.14353
[800]	train-logloss:0.13277	valid-logloss:0.14100
[900]	train-logloss:0.13024	valid-logloss:0.13981
[1000]	train-logloss:0.12859	valid-logloss:0.13924
[1100]	train-logloss:0.12741	valid-logloss:0.13894
[1200]	train-logloss:0.12648	valid-logloss:0.13885
[1300]	train-logloss:0.12577	valid-logloss:0.13886
[1400]	train-logloss:0.12509	valid-logloss:0.13883
[1500]	train-logloss:0.12451	valid-logloss:0.13881
[1600]	train-logloss:0.12399	valid-logloss:0

In [11]:
xgb_model.save_model('CPP_REPO_xgb_new_world_23-1.model')

# Predict

In [12]:
xgb_testA = xgb.DMatrix(test.iloc[:len(test)//3])
xgb_testB = xgb.DMatrix(test.iloc[len(test)//3:len(test)//3*2])
xgb_testC = xgb.DMatrix(test.iloc[len(test)//3*2:])

In [13]:
y_predA = xgb_model.predict(xgb_testA)
y_predB = xgb_model.predict(xgb_testB)
y_predC = xgb_model.predict(xgb_testC)

In [14]:
y_pred = y_predA.tolist() + y_predB.tolist() + y_predC.tolist()

# Convert to Submission Format

In [15]:
sub['PURCHASE_FLG'] = y_pred

In [None]:
sub.sort_values("PURCHASE_FLG", inplace = True, ascending=False)
sub.drop_duplicates(subset =['USER_ID_hash', 'COUPON_ID_hash'], inplace = True) 

In [16]:
grouped = sub.groupby('USER_ID_hash')

In [17]:
def get_top10(row):
    pred = grouped.get_group(row.USER_ID_hash).sort_values(by=['PURCHASE_FLG'],ascending=False)
    pred = ' '.join(map(str, pred.head(10).COUPON_ID_hash.values))
    return pred

In [20]:
submission['PURCHASED_COUPONS'] = submission.apply(get_top10, axis=1)

In [21]:
submission.to_csv('sub_CPP_REPRO_XGB_new_world.csv', index=False)

In [22]:
submission

Unnamed: 0,USER_ID_hash,PURCHASED_COUPONS
0,0000b53e182165208887ba65c079fc21,42cc500acba3c79883cfd40adcd5ae96 42cc500acba3c...
1,00035b86e6884589ec8d28fbf2fe7757,5e47b887e154f746883013f863c3ffe1 05c58bb36b58b...
2,0005b1068d5f2b8f2a7c978fcfe1ca06,2af19a2244a2c2466b87b98e065cdfa7 9193590f0f6d2...
3,000cc06982785a19e2a2fdb40b1c9d59,42cc500acba3c79883cfd40adcd5ae96 42cc500acba3c...
4,0013518e41c416cd6a181d277dd8ca0b,5e47b887e154f746883013f863c3ffe1 5e47b887e154f...
...,...,...
22868,fff1a623187cefd7a594e338709b0f40,0c015306597566b632bebfb63b7e59f3 c988d799bc7db...
22869,fff4a076cfda6ff9dbe85e1cb678791b,42cc500acba3c79883cfd40adcd5ae96 42cc500acba3c...
22870,fff970d2014c3e10a77e38d540239017,42cc500acba3c79883cfd40adcd5ae96 42cc500acba3c...
22871,fffafc024e264d5d539813444cf61199,51da52d5516033bea13972588b671184 51da52d551603...
