In [1]:
import pandas as pd
import os
import numpy as np
import xgboost as xgb
import joblib
import datetime
from time import time
from time import sleep

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# Load Data

In [2]:
# Directory holding the competition files; sample_submission.csv is read from here.
dsdir = 'dataset/coupon-purchase-prediction'

In [3]:
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
# The training frame is shuffled with a fixed seed and given a fresh 0..n-1 index.
train = joblib.load('CPP_REPRO_cl_train.pkl')
train = train.sample(frac=1, random_state=0).reset_index(drop=True)

test = joblib.load('CPP_REPRO_cl_test.pkl')

# Template of the submission file; filled in with predictions further down.
sample_submission_path = os.path.join(dsdir, 'sample_submission.csv')
submission = pd.read_csv(sample_submission_path)

# Preprocess Some Columns

In [4]:
# DISPFROM stays on the training frame (as a real datetime) because it is used
# later to split the rows by week: 23 weeks for training, the remainder for the
# early-stopping validation set.
train['DISPFROM'] = pd.to_datetime(train['DISPFROM'])

# The remaining date columns are removed from both frames; the test frame
# additionally loses DISPFROM.
unused_date_cols = ['DISPEND', 'VALIDFROM', 'VALIDEND', 'REG_DATE', 'WITHDRAW_DATE']
train.drop(columns=unused_date_cols, inplace=True)
test.drop(columns=['DISPFROM'] + unused_date_cols, inplace=True)

In [5]:
# Remove the USABLE_DATE_* flag columns from both frames so they do not end up
# in the feature matrices built below.
usable_date_cols = [
    'USABLE_DATE_MON',
    'USABLE_DATE_TUE',
    'USABLE_DATE_WED',
    'USABLE_DATE_THU',
    'USABLE_DATE_FRI',
    'USABLE_DATE_SAT',
    'USABLE_DATE_SUN',
    'USABLE_DATE_HOLIDAY',
    'USABLE_DATE_BEFORE_HOLIDAY',
]
train.drop(columns=usable_date_cols, inplace=True)
test.drop(columns=usable_date_cols, inplace=True)

In [6]:
# Feature frames: everything except the user / coupon identifier columns.
id_cols = ['USER_ID_hash', 'COUPON_ID_hash']
x_train = train.drop(columns=id_cols)
x_test = test.drop(columns=id_cols)

In [7]:
# Names of all columns of x_train whose dtype is pandas 'category'.
categoricals = x_train.select_dtypes(include='category').columns.tolist()

In [8]:
# One-hot encode the categorical columns of both frames, then force the test
# frame onto exactly the training frame's columns (join='left'): columns the
# test frame lacks are created, columns only it has are discarded.
x_train_d, x_test_d = pd.get_dummies(x_train, columns=categoricals).align(
    pd.get_dummies(x_test, columns=categoricals),
    join='left',
    axis=1,
)

# Missing values, including the ones introduced by the alignment, become 0.
x_train_d = x_train_d.fillna(0)
x_test_d = x_test_d.fillna(0)

In [9]:
# Index the training features by display date so rows can be selected by week.
x_train_d = x_train_d.set_index('DISPFROM')

In [10]:
# DISPFROM and TARGET are not model inputs; remove them from the test features.
x_test_d = x_test_d.drop(columns=['DISPFROM', 'TARGET'])

# Split Data by Week

In [11]:
train_range = []  # not referenced again in the visible cells; kept so the name still exists

# Training window: the n_train_weeks weeks that begin on `start`.
# `end` is the exclusive upper bound of that window, and the validation rows
# are the ones at or after it.
n_train_weeks = 23
start = pd.to_datetime('01/08/2012')  # parsed month-first, i.e. 8 January 2012
end = start + datetime.timedelta(weeks=n_train_weeks)

In [12]:
# Rows displayed in [start, end) are used for training; rows displayed at or
# after `end` form the validation set. Rows before `start` are in neither.
display_dates = x_train_d.index
in_train_window = (display_dates >= start) & (display_dates < end)
xgb_train = x_train_d.loc[in_train_window]
xgb_eval = x_train_d.loc[display_dates >= end]

# Convert to DMatrix

In [13]:
# Wrap the training rows in a DMatrix: TARGET is the label, every other column
# is a feature. Note this rebinds xgb_train from a DataFrame to a DMatrix.
train_labels = xgb_train['TARGET'].values.reshape(-1)
train_features = xgb_train.drop(columns='TARGET')
xgb_train = xgb.DMatrix(train_features, label=train_labels)

In [14]:
# Wrap the validation rows in a DMatrix: TARGET is the label, every other column
# is a feature. Note this rebinds xgb_eval from a DataFrame to a DMatrix.
eval_labels = xgb_eval['TARGET'].values.reshape(-1)
eval_features = xgb_eval.drop(columns='TARGET')
xgb_eval = xgb.DMatrix(eval_features, label=eval_labels)

# Train

In [15]:
# Booster configuration: tree booster on a binary logistic objective,
# evaluated with log loss, with a fixed seed.
xgb_params = dict(
    booster='gbtree',
    objective='binary:logistic',
    eta=0.001,
    max_depth=12,
    eval_metric='logloss',
    seed=0,
)

# Upper bound on boosting rounds; early stopping may end training sooner.
num_round = 5000

# The last entry of the watchlist ('valid') is the one early stopping monitors,
# as the training log below reports.
watchlist = [(xgb_train, 'train'), (xgb_eval, 'valid')]

xgb_model = xgb.train(
    xgb_params,
    xgb_train,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=200,  # stop once 'valid' has not improved for 200 rounds
    verbose_eval=100,           # log the metrics every 100 rounds
)

[0]	train-logloss:0.69216	valid-logloss:0.69248
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 200 rounds.
[100]	train-logloss:0.62612	valid-logloss:0.63062
[200]	train-logloss:0.57092	valid-logloss:0.57950
[300]	train-logloss:0.52448	valid-logloss:0.53694
[400]	train-logloss:0.48497	valid-logloss:0.50121
[500]	train-logloss:0.45106	valid-logloss:0.47021
[600]	train-logloss:0.42187	valid-logloss:0.44337
[700]	train-logloss:0.39646	valid-logloss:0.42028
[800]	train-logloss:0.37414	valid-logloss:0.40011
[900]	train-logloss:0.35397	valid-logloss:0.38326
[1000]	train-logloss:0.33626	valid-logloss:0.36922
[1100]	train-logloss:0.32057	valid-logloss:0.35641
[1200]	train-logloss:0.30625	valid-logloss:0.34591
[1300]	train-logloss:0.29364	valid-logloss:0.33737
[1400]	train-logloss:0.28283	valid-logloss:0.32903
[1500]	train-logloss:0.27340	valid-logloss:0.32139
[1600]	train-logloss:0.26494	valid-logloss:0

In [16]:
# Write the trained booster to disk.
# NOTE(review): this file name starts with 'CPP_REPO', while the other artifacts
# in this notebook use 'CPP_REPRO' — confirm whether the difference is intended.
xgb_model.save_model('CPP_REPO_xgb_model_cut_23_1.model')

# Predict

In [17]:
# Build the test DMatrix in three consecutive row chunks instead of one, to
# stay within the memory limit.
n_test_rows = len(x_test_d)
first_cut = n_test_rows // 3
second_cut = n_test_rows // 3 * 2

xgb_testA = xgb.DMatrix(x_test_d.iloc[:first_cut])
xgb_testB = xgb.DMatrix(x_test_d.iloc[first_cut:second_cut])
xgb_testC = xgb.DMatrix(x_test_d.iloc[second_cut:])

In [18]:
# Score each test chunk with the trained model, in the original A, B, C order.
y_predA, y_predB, y_predC = (
    xgb_model.predict(chunk) for chunk in (xgb_testA, xgb_testB, xgb_testC)
)

In [19]:
# Stitch the three chunk predictions back into one flat Python list, in the
# same order as the rows of the test frame.
y_pred = [
    score
    for chunk_pred in (y_predA, y_predB, y_predC)
    for score in chunk_pred.tolist()
]

# Convert to Submission Format

In [20]:
# Pair every (user, coupon) row of the test frame with its predicted score.
# The explicit .copy() makes `sub` an independent frame, so adding the TARGET
# column no longer writes to a slice of `test` (which raised pandas'
# SettingWithCopyWarning).
sub = test[['USER_ID_hash','COUPON_ID_hash']].copy()
sub['TARGET'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
# Group the scored (user, coupon) rows per user; get_top10 looks users up here.
grouped = sub.groupby(by='USER_ID_hash')

In [22]:
def get_top10(row):
    """Return one user's ten best-scored coupons as a space-separated string.

    Parameters
    ----------
    row
        A record exposing a ``USER_ID_hash`` attribute (here: one row of the
        submission frame).

    Returns
    -------
    str
        The ``COUPON_ID_hash`` values of that user's rows in the module-level
        ``grouped`` object, ordered by descending ``TARGET`` and cut off after
        ten entries (fewer if the user has fewer rows), joined with spaces.

    Raises
    ------
    KeyError
        If the user has no group in ``grouped``.
    """
    user_rows = grouped.get_group(row.USER_ID_hash)
    ranked = user_rows.sort_values(by=['TARGET'], ascending=False)
    best_coupons = ranked.head(10).COUPON_ID_hash.values
    return ' '.join(str(coupon) for coupon in best_coupons)

In [23]:
# Fill in each user's recommendation string by applying get_top10 row by row.
submission['PURCHASED_COUPONS'] = submission.apply(get_top10, axis='columns')

In [29]:
# Unique output file name: fixed prefix plus the current Unix time in whole seconds.
sub_name = 'sub_CPP_REPRO_XGB_{}.csv'.format(int(time()))

In [30]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf=sub_name, index=False)

In [31]:
# Display the finished submission frame as a visual check before uploading.
submission

Unnamed: 0,USER_ID_hash,PURCHASED_COUPONS
0,0000b53e182165208887ba65c079fc21,81c1c7241aadbb323b38689a64fbc83a e4db7645ae556...
1,00035b86e6884589ec8d28fbf2fe7757,fc5f052a1bd97696fbcab35d8d974b73 262572324a598...
2,0005b1068d5f2b8f2a7c978fcfe1ca06,1d6cb6629979a1d0e9038c9309c6c41b 81c1c7241aadb...
3,000cc06982785a19e2a2fdb40b1c9d59,0e917a0e872246a36e7a9a28df505562 79de77aa8c36f...
4,0013518e41c416cd6a181d277dd8ca0b,d79a889ee9d0712607a2672e96ba3d69 98dc8da9b3c7c...
...,...,...
22868,fff1a623187cefd7a594e338709b0f40,e4db7645ae556f252e60636df7c8eac8 9fe88dabce140...
22869,fff4a076cfda6ff9dbe85e1cb678791b,79de77aa8c36fdf17cb3366e2084e353 0e917a0e87224...
22870,fff970d2014c3e10a77e38d540239017,3c5bdd8fea0674f40e831568a7ea6a92 f0685cf6de3c1...
22871,fffafc024e264d5d539813444cf61199,98dc8da9b3c7ca94aeaa398aef52784f e4db7645ae556...


In [32]:
# Upload the CSV with the Kaggle CLI; the file name is passed both as the file
# to send (-f) and as the -m argument, which lets the polling cell below find
# this submission by name.
!kaggle competitions submit -c coupon-purchase-prediction -f $sub_name -m $sub_name

Successfully submitted to Coupon Purchase Prediction



  0%|          | 0.00/7.94M [00:00<?, ?B/s]
  1%|1         | 88.0k/7.94M [00:00<00:09, 883kB/s]
  2%|2         | 176k/7.94M [00:00<00:09, 857kB/s] 
  3%|2         | 232k/7.94M [00:00<00:25, 321kB/s]
  3%|3         | 272k/7.94M [00:01<00:44, 180kB/s]
  4%|3         | 304k/7.94M [00:01<01:01, 131kB/s]
  4%|4         | 344k/7.94M [00:01<01:08, 117kB/s]
  5%|4         | 384k/7.94M [00:02<01:13, 108kB/s]
  5%|5         | 424k/7.94M [00:02<01:17, 102kB/s]
  6%|5         | 456k/7.94M [00:02<01:01, 128kB/s]
  6%|5         | 480k/7.94M [00:03<01:23, 94.2kB/s]
  6%|6         | 512k/7.94M [00:03<01:05, 120kB/s] 
  7%|6         | 536k/7.94M [00:03<01:26, 89.5kB/s]
  7%|6         | 560k/7.94M [00:04<01:36, 80.1kB/s]
  7%|7         | 592k/7.94M [00:04<01:15, 102kB/s] 
  8%|7         | 616k/7.94M [00:04<01:32, 83.1kB/s]
  8%|7         | 648k/7.94M [00:04<01:11, 107kB/s] 
  8%|8         | 672k/7.94M [00:05<01:30, 84.1kB/s]
  9%|8         | 704k/7.94M [00:05<01:34, 80.7kB/s]
  9%|9         | 744k/7.94

 78%|#######7  | 6.17M/7.94M [01:18<00:27, 66.8kB/s]
 78%|#######8  | 6.21M/7.94M [01:19<00:20, 88.9kB/s]
 79%|#######8  | 6.23M/7.94M [01:19<00:23, 76.7kB/s]
 79%|#######8  | 6.26M/7.94M [01:19<00:25, 70.1kB/s]
 79%|#######9  | 6.30M/7.94M [01:20<00:23, 74.7kB/s]
 80%|#######9  | 6.31M/7.94M [01:20<00:26, 64.4kB/s]
 80%|########  | 6.36M/7.94M [01:20<00:19, 86.7kB/s]
 80%|########  | 6.38M/7.94M [01:21<00:22, 74.1kB/s]
 81%|########  | 6.41M/7.94M [01:21<00:16, 95.1kB/s]
 81%|########1 | 6.44M/7.94M [01:21<00:19, 81.5kB/s]
 81%|########1 | 6.46M/7.94M [01:22<00:21, 73.8kB/s]
 82%|########1 | 6.51M/7.94M [01:22<00:18, 80.4kB/s]
 82%|########2 | 6.55M/7.94M [01:23<00:17, 82.3kB/s]
 83%|########2 | 6.59M/7.94M [01:23<00:13, 107kB/s] 
 83%|########3 | 6.61M/7.94M [01:23<00:16, 83.9kB/s]
 84%|########3 | 6.64M/7.94M [01:24<00:17, 80.0kB/s]
 84%|########4 | 6.68M/7.94M [01:24<00:12, 105kB/s] 
 84%|########4 | 6.70M/7.94M [01:24<00:15, 86.2kB/s]
 85%|########4 | 6.73M/7.94M [01:25<00:16, 77.

In [34]:
found = False
while 1:
    submission_list = !kaggle competitions submissions -c coupon-purchase-prediction
    for sub_row in submission_list:
        if sub_name in sub_row and 'complete' in sub_row:
            scores = sub_row[sub_row.find('complete')+10:].split()
            print('Private :',scores[1],'\t|\tPublic :',scores[0])
            found = True
            break
    if found:
        break
    sleep(10)

Private : 0.00535 	|	Public : 0.00593
