In [1]:
import pandas as pd
import os
import numpy as np
import itertools
import xgboost as xgb
from scipy.spatial.distance import euclidean
import joblib
import datetime

pd.set_option('display.max_columns', None)

# Load Data

In [2]:
dsdir = 'dataset/coupon-purchase-prediction'

In [3]:
train = pd.read_csv('CPP_REPRO_cl_train.csv').sample(frac=1, random_state=0).reset_index(drop=True)
test =  pd.read_csv('CPP_REPRO_cl_test.csv')
submission = pd.read_csv(os.path.join(dsdir,'sample_submission.csv'))

# Preprocess Some Columns

In [4]:
train.DISPFROM = pd.to_datetime(train.DISPFROM)
train.DISPEND = pd.to_datetime(train.DISPEND)
train.VALIDFROM = pd.to_datetime(train.VALIDFROM)
train.VALIDEND = pd.to_datetime(train.VALIDEND)
train.REG_DATE = pd.to_datetime(train.REG_DATE)
train.WITHDRAW_DATE = pd.to_datetime(train.WITHDRAW_DATE)

test.DISPFROM = pd.to_datetime(test.DISPFROM)
test.DISPEND = pd.to_datetime(test.DISPEND)
test.VALIDFROM = pd.to_datetime(test.VALIDFROM)
test.VALIDEND = pd.to_datetime(test.VALIDEND)
test.REG_DATE = pd.to_datetime(test.REG_DATE)
test.WITHDRAW_DATE = pd.to_datetime(test.WITHDRAW_DATE)

In [5]:
train.drop(['DISPEND', 'VALIDFROM', 'VALIDEND', 'REG_DATE', 'WITHDRAW_DATE'], axis=1, inplace=True)
test.drop(['DISPFROM', 'DISPEND', 'VALIDFROM', 'VALIDEND', 'REG_DATE', 'WITHDRAW_DATE'], axis=1, inplace=True)

In [6]:
train.drop(['USABLE_DATE_MON', 'USABLE_DATE_TUE', 'USABLE_DATE_WED', 'USABLE_DATE_THU', 'USABLE_DATE_FRI', 'USABLE_DATE_SAT', 'USABLE_DATE_SUN', 'USABLE_DATE_HOLIDAY', 'USABLE_DATE_BEFORE_HOLIDAY'], axis=1, inplace=True)
test.drop(['USABLE_DATE_MON', 'USABLE_DATE_TUE', 'USABLE_DATE_WED', 'USABLE_DATE_THU', 'USABLE_DATE_FRI', 'USABLE_DATE_SAT', 'USABLE_DATE_SUN', 'USABLE_DATE_HOLIDAY', 'USABLE_DATE_BEFORE_HOLIDAY'], axis=1, inplace=True)

In [7]:
train['SAME_PREF'] = train['PREF_NAME_COUPON'] == train['PREF_NAME_USER']
test['SAME_PREF'] =  test['PREF_NAME_COUPON'] == test['PREF_NAME_USER']

In [8]:
def calc_euc(row):
    return euclidean((row['LATITUDE_COUPON'],row['LONGITUDE_COUPON']),(row['LATITUDE_USER'],row['LONGITUDE_USER']))

In [9]:
print('TRAIN-LONGITUDE_DIST')
train['LONGITUDE_DIST'] = train.LONGITUDE_COUPON - train.LONGITUDE_USER
print('TRAIN-LATITUDE_DIST')
train['LATITUDE_DIST'] = train.LATITUDE_COUPON - train.LATITUDE_USER
print('TRAIN-EUCLIDEAN_DIST')
train['EUCLIDEAN_DIST'] = train.apply(calc_euc,axis=1)

print('TEST-LONGITUDE_DIST')
test['LONGITUDE_DIST'] = test.LONGITUDE_COUPON - test.LONGITUDE_USER
print('TEST-LATITUDE_DIST')
test['LATITUDE_DIST'] = test.LATITUDE_COUPON - test.LATITUDE_USER
print('TEST-EUCLIDEAN_DIST')
test['EUCLIDEAN_DIST'] = test.apply(calc_euc,axis=1)

TRAIN-LONGITUDE_DIST
TRAIN-LATITUDE_DIST
TRAIN-EUCLIDEAN_DIST
TEST-LONGITUDE_DIST
TEST-LATITUDE_DIST
TEST-EUCLIDEAN_DIST


In [10]:
x_train = train.drop(['USER_ID_hash', 'COUPON_ID_hash'],axis=1)

x_test = test.drop(['USER_ID_hash', 'COUPON_ID_hash'],axis=1)

In [11]:
categoricals = x_train.dtypes[x_train.dtypes == 'object'].index.tolist()

In [12]:
x_train_d = pd.get_dummies(x_train, columns=categoricals)
x_test_d = pd.get_dummies(x_test, columns=categoricals)

x_train_d, x_test_d = x_train_d.align(x_test_d, join='left', axis=1)

x_train_d.fillna(0,inplace=True)
x_test_d.fillna(0,inplace=True)

In [13]:
x_train_d.set_index('DISPFROM', inplace=True)

In [14]:
x_test_d.drop(['DISPFROM', 'TARGET'], axis=1, inplace=True)

# Split Data by Week

In [15]:
train_range = []

start = pd.to_datetime('01/08/2012')
end = start
for i in range(23):
    end = end + datetime.timedelta(days=7)

In [16]:
xgb_train = x_train_d[(x_train_d.index >= start) & (x_train_d.index < end)]
xgb_eval = x_train_d[(x_train_d.index >= end)]

# Convert to DMatrix

In [17]:
xgb_train = xgb.DMatrix(xgb_train.drop('TARGET', axis=1), label=xgb_train.TARGET.values.reshape(-1))

In [18]:
xgb_eval = xgb.DMatrix(xgb_eval.drop('TARGET', axis=1), label=xgb_eval.TARGET.values.reshape(-1))

# Train

In [19]:
xgb_params = {'booster':'gbtree',
              'objective': 'binary:logistic',
              'eta': 0.001,
              'max_depth': 12,
              'eval_metric': 'logloss',
              'seed': 0
             }

num_round = 5000

watchlist = [(xgb_train, 'train'), (xgb_eval, 'valid')]
xgb_model = xgb.train(xgb_params, xgb_train, num_round, watchlist, early_stopping_rounds=200, verbose_eval=100)

[0]	train-logloss:0.69217	valid-logloss:0.69246
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 200 rounds.
[100]	train-logloss:0.62600	valid-logloss:0.63045
[200]	train-logloss:0.57061	valid-logloss:0.57886
[300]	train-logloss:0.52390	valid-logloss:0.53598
[400]	train-logloss:0.48413	valid-logloss:0.49964
[500]	train-logloss:0.45011	valid-logloss:0.46829
[600]	train-logloss:0.42080	valid-logloss:0.44181
[700]	train-logloss:0.39510	valid-logloss:0.41934
[800]	train-logloss:0.37270	valid-logloss:0.39914
[900]	train-logloss:0.35258	valid-logloss:0.38304
[1000]	train-logloss:0.33491	valid-logloss:0.36918
[1100]	train-logloss:0.31920	valid-logloss:0.35737
[1200]	train-logloss:0.30521	valid-logloss:0.34759
[1300]	train-logloss:0.29289	valid-logloss:0.33976
[1400]	train-logloss:0.28201	valid-logloss:0.33109
[1500]	train-logloss:0.27257	valid-logloss:0.32325
[1600]	train-logloss:0.26420	valid-logloss:0

In [20]:
xgb_model.save_model('CPP_REPO_xgb_model_cut_23_1.model')

# Predict

In [21]:
xgb_testA = xgb.DMatrix(x_test_d.iloc[:len(x_test_d)//3])
xgb_testB = xgb.DMatrix(x_test_d.iloc[len(x_test_d)//3:len(x_test_d)//3*2])
xgb_testC = xgb.DMatrix(x_test_d.iloc[len(x_test_d)//3*2:])

In [22]:
y_predA = xgb_model.predict(xgb_testA)
y_predB = xgb_model.predict(xgb_testB)
y_predC = xgb_model.predict(xgb_testC)

In [23]:
y_pred = y_predA.tolist() + y_predB.tolist() + y_predC.tolist()

# Convert to Submission Format

In [24]:
sub = test[['USER_ID_hash','COUPON_ID_hash']]
sub['TARGET'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [25]:
grouped = sub.groupby('USER_ID_hash')

In [26]:
def get_top10(row):
    pred = grouped.get_group(row.USER_ID_hash).sort_values(by=['TARGET'],ascending=False)
    pred = ' '.join(map(str, pred.head(10).COUPON_ID_hash.values))
    return pred

In [27]:
submission['PURCHASED_COUPONS'] = submission.apply(get_top10, axis=1)

In [28]:
submission.to_csv('sub_CPP_REPRO_XGB_CUT.csv', index=False)

In [29]:
submission

Unnamed: 0,USER_ID_hash,PURCHASED_COUPONS
0,0000b53e182165208887ba65c079fc21,1d6cb6629979a1d0e9038c9309c6c41b c9e1dcbd8c98f...
1,00035b86e6884589ec8d28fbf2fe7757,262572324a598435b68ead0aff867e48 fc5f052a1bd97...
2,0005b1068d5f2b8f2a7c978fcfe1ca06,1d6cb6629979a1d0e9038c9309c6c41b 5a55bfad2ceef...
3,000cc06982785a19e2a2fdb40b1c9d59,79de77aa8c36fdf17cb3366e2084e353 0e917a0e87224...
4,0013518e41c416cd6a181d277dd8ca0b,5c5ed1a52294f8d92c6f152b934cc4d4 c65df9a9bdea0...
...,...,...
22868,fff1a623187cefd7a594e338709b0f40,9fe88dabce1401044b09d0642ca4e399 11cd8c9131da2...
22869,fff4a076cfda6ff9dbe85e1cb678791b,79de77aa8c36fdf17cb3366e2084e353 c76ea297ebd3a...
22870,fff970d2014c3e10a77e38d540239017,f0685cf6de3c1e1fd86d2f10784b85f5 46da51ba6dd20...
22871,fffafc024e264d5d539813444cf61199,98dc8da9b3c7ca94aeaa398aef52784f 9fe88dabce140...
