In [None]:
import pandas as pd
import os
import numpy as np
import itertools
from xgboost import XGBClassifier
from scipy.spatial.distance import euclidean
import joblib

# Lift the column limit so displayed DataFrames show every column.
pd.set_option('display.max_columns', None)

In [None]:
# Directory (relative to the working directory) from which sample_submission.csv is read.
dsdir = 'dataset/coupon-purchase-prediction'

In [None]:
# Training table: read, shuffle all rows with a fixed seed, then renumber
# the index from 0 so it no longer reflects the on-disk order.
train = pd.read_csv('CPP_REPRO_cl_train.csv')
train = train.sample(frac=1, random_state=0)
train = train.reset_index(drop=True)

# Test table is kept in its on-disk order.
test = pd.read_csv('CPP_REPRO_cl_test.csv')

# Sample submission, located inside the dataset directory.
submission_path = os.path.join(dsdir, 'sample_submission.csv')
submission = pd.read_csv(submission_path)

In [None]:
# Remove the same six date columns from both tables, modifying each in place.
date_columns = ['DISPFROM', 'DISPEND', 'VALIDFROM', 'VALIDEND', 'REG_DATE', 'WITHDRAW_DATE']
for frame in (train, test):
    frame.drop(date_columns, axis=1, inplace=True)

In [None]:
# Row-wise boolean flag: True where PREF_NAME_COUPON equals PREF_NAME_USER.
for frame in (train, test):
    frame['SAME_PREF'] = frame['PREF_NAME_COUPON'].eq(frame['PREF_NAME_USER'])

In [None]:
def calc_euc(row):
    """Return the Euclidean distance between a row's coupon and user coordinates.

    ``row`` must support key lookup for ``LATITUDE_COUPON``,
    ``LONGITUDE_COUPON``, ``LATITUDE_USER`` and ``LONGITUDE_USER``.
    The two (latitude, longitude) pairs are treated as plain 2-D points;
    no geodesic correction is applied.
    """
    coupon_point = (row['LATITUDE_COUPON'], row['LONGITUDE_COUPON'])
    user_point = (row['LATITUDE_USER'], row['LONGITUDE_USER'])
    return euclidean(coupon_point, user_point)

In [None]:
# Add coupon-to-user distance features to BOTH tables.
#
# Each table's features are derived from that table's own rows. Previously
# the test columns were computed from `train`, so the test set received the
# (shuffled) training rows' distances aligned by index instead of its own.
# Running the same loop body for both frames keeps the two feature sets
# defined identically.
for label, frame in (('TRAIN', train), ('TEST', test)):
    print(label + '-LONGITUDE_DIST')
    # Signed coordinate differences; a column-wise subtraction yields the
    # same values as a row-by-row lambda.
    frame['LONGITUDE_DIST'] = frame['LONGITUDE_COUPON'] - frame['LONGITUDE_USER']
    print(label + '-LATITUDE_DIST')
    frame['LATITUDE_DIST'] = frame['LATITUDE_COUPON'] - frame['LATITUDE_USER']
    print(label + '-EUCLIDEAN_DIST')
    # Straight-line distance in (latitude, longitude) space, one row at a time.
    frame['EUCLIDEAN_DIST'] = frame.apply(calc_euc, axis=1)

In [None]:
# Identifier columns are not used as model inputs.
id_columns = ['USER_ID_hash', 'COUPON_ID_hash']

# Training features exclude the identifiers and the label; the label is
# taken as a flat 1-D array.
x_train = train.drop(columns=id_columns + ['TARGET'])
y_train = train['TARGET'].values.reshape(-1)

# The test table is only stripped of the identifiers.
x_test = test.drop(columns=id_columns)

In [None]:
# Names of the training feature columns whose dtype is 'object'.
categoricals = [column for column, dtype in x_train.dtypes.items() if dtype == 'object']

In [None]:
# Display the list of object-dtype column names that will be one-hot encoded.
categoricals

In [None]:
# One-hot encode the object-dtype columns of each table, then align the
# columns. join='left' keeps exactly the training columns: dummy columns
# that exist only in the test table are dropped, and training columns
# missing from the test table are added there as NaN.
x_train_d, x_test_d = pd.get_dummies(x_train, columns=categoricals).align(
    pd.get_dummies(x_test, columns=categoricals), join='left', axis=1)

# Replace every remaining NaN (including those introduced by align) with 0.
x_train_d = x_train_d.fillna(0)
x_test_d = x_test_d.fillna(0)

In [None]:
# scale_pos_weight is (len(y_train) - sum(y_train)) / sum(y_train).
# NOTE(review): this equals the negative/positive ratio only if TARGET is 0/1 - confirm.
positives = y_train.sum()
negatives = len(y_train) - positives

xgb = XGBClassifier(
    random_state=0,
    verbosity=1,
    n_jobs=-1,
    learning_rate=0.01,
    max_depth=24,
    subsample=0.8,
    scale_pos_weight=negatives / positives,
)

In [None]:
%%time
# Fit the classifier on the one-hot encoded training features; the %%time
# cell magic above reports how long this cell takes.
xgb.fit(x_train_d, y_train)

In [None]:
# Persist the fitted model to disk.
# NOTE(review): this file name says 'REPO' while the other artifacts in this
# notebook use 'REPRO' - confirm whether that is intentional before renaming,
# since other code may load the model under this exact name.
joblib.dump(xgb,'CPP_REPO_XGB.mdl')

In [None]:
# Cut the encoded test matrix into three consecutive row slices. The first
# two have len // 3 rows each; the last one takes whatever remains.
third = len(x_test_d) // 3
A = x_test_d.iloc[:third]
B = x_test_d.iloc[third:2 * third]
C = x_test_d.iloc[2 * third:]

In [None]:
# Predict class probabilities slice by slice, in A, B, C order.
y_predA, y_predB, y_predC = (xgb.predict_proba(part) for part in (A, B, C))

In [None]:
# Take column 1 of each probability array and chain the three slices back
# into one flat Python list, preserving the original row order (A, B, C).
y_pred = [
    prob
    for chunk in (y_predA, y_predB, y_predC)
    for prob in chunk[:, 1].tolist()
]

In [None]:
# New frame with one row per test (user, coupon) pair and its predicted score.
sub = test.loc[:, ['USER_ID_hash', 'COUPON_ID_hash']].assign(TARGET=y_pred)

In [None]:
# Group the scored pairs by user; get_top10 looks up one user's rows from this object.
grouped = sub.groupby('USER_ID_hash')

In [None]:
def get_top10(row):
    """Return up to ten coupon ids for ``row``'s user as one space-separated string.

    Looks up the user's rows in the notebook-level ``grouped`` object by
    ``row.USER_ID_hash``, orders them by ``TARGET`` from highest to lowest,
    and joins the ``COUPON_ID_hash`` values of the first ten rows with
    single spaces.
    """
    user_rows = grouped.get_group(row.USER_ID_hash)
    ranked = user_rows.sort_values(by=['TARGET'], ascending=False)
    best_ids = ranked.head(10).COUPON_ID_hash.values
    return ' '.join(str(coupon_id) for coupon_id in best_ids)

In [None]:
# For every submission row, store the space-separated coupon string returned by get_top10.
submission['PURCHASED_COUPONS'] = submission.apply(get_top10, axis=1)

In [None]:
# Write the submission to CSV without the DataFrame index column.
submission.to_csv('sub_CPP_REPRO_XGB.csv', index=False)

In [None]:
# Display the final submission table.
submission