In [None]:
__author__ = 'Nick Sarris (ngs5st)'

import os
import gc
import time
import random
import numpy as np
import pandas as pd
import lightgbm as lgb

from datetime import datetime
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
import itertools

# Show which files are present in ./data before the later cells read from it.
print(os.listdir("./data"))

In [None]:
def seed_everything(seed=1235):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` into ``os.environ``.

    Args:
        seed: integer seed applied to every generator.

    Note:
        ``PYTHONHASHSEED`` is read by the interpreter at startup, so writing
        it here is visible to child processes but does not change hash
        randomisation of the already-running kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(1235)

In [None]:
start_time = time.time()
print("Loading Data ...")

# Columns read from both CSV files. 'order_id' and 'product_id' are only
# needed to build the submission and are dropped from the model inputs below.
features = [
    'reordered_sum', 'add_to_cart_order_inverted_mean', 'add_to_cart_order_relative_mean',
    'reorder_prob',  'last', 'prev1', 'prev2', 'median', 'mean', 'dep_reordered_ratio',
    'aisle_reordered_ratio', 'aisle_products', 'aisle_reordered', 'dep_products', 'dep_reordered',
    'prod_users_unq', 'prod_users_unq_reordered', 'order_number', 'prod_add_to_card_mean',
    'days_since_prior_order', 'order_dow', 'order_hour_of_day', 'reorder_ration',
    'user_orders', 'user_order_starts_at', 'user_mean_days_since_prior',
    'user_average_basket', 'user_distinct_products', 'user_reorder_ratio', 'user_total_products',
    'prod_orders', 'prod_reorders', 'up_order_rate', 'up_orders_since_last_order',
    'up_order_rate_since_first_order', 'up_orders', 'up_first_order', 'up_last_order',
    'up_mean_cart_position', 'days_since_prior_order_mean', 'order_dow_mean', 'order_hour_of_day_mean',
    'user_product_reordered_ratio', 'order_id', 'product_id']

categories = ['product_id', 'aisle_id', 'department_id']
# 'product_id' is already listed above; only append the names that are missing
# so the usecols list contains no duplicates.
features.extend(col for col in categories if col not in features)

directory = "./data/"
train_df = pd.read_csv(directory + "order_train.csv", usecols=features + ['reordered'])
test_df = pd.read_csv(directory + "order_test.csv", usecols=features)

# Keep the identifiers and the target before they are removed from the frames.
order_ids = test_df['order_id'].values
product_ids = test_df['product_id'].values
labels = train_df['reordered'].values.astype(np.float32)

# Reassign instead of dropping in place so re-running the cell stays predictable.
train_df = train_df.drop(columns=['reordered', 'order_id', 'product_id'])
test_df = test_df.drop(columns=['order_id', 'product_id'])

# read_csv(usecols=...) returns columns in file order, not in `features` order.
# Both frames are later converted to plain NumPy arrays, where features are
# matched purely by position, so force the test columns into the train order.
test_df = test_df[train_df.columns]

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
class LgbWrapper(object):
    """Thin wrapper giving a LightGBM booster a ``train`` / ``predict`` interface.

    Args:
        seed: value stored under the ``'seed'`` key of the training parameters.
        params: mapping of LightGBM training parameters, or ``None`` for an
            empty parameter set. The mapping is copied, so the caller's dict
            is never modified.
        num_boost_round: maximum number of boosting iterations.
        early_stopping_rounds: forwarded to ``lgb.train``.
        verbose_eval: forwarded to ``lgb.train``.
    """

    def __init__(self, seed=2017, params=None, num_boost_round=400,
                 early_stopping_rounds=10, verbose_eval=20):
        # Copy so that writing 'seed' does not leak into the caller's dict,
        # and tolerate the documented default of params=None.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        self.num_boost_round = num_boost_round
        self.early_stopping_rounds = early_stopping_rounds
        self.verbose_eval = verbose_eval

    def train(self, xtra, ytra, xte, yte):
        """Fit a booster on ``(xtra, ytra)`` using ``(xte, yte)`` as validation set.

        The fitted booster is stored on ``self.gbdt``.
        """
        ytra = ytra.ravel()
        yte = yte.ravel()
        dtrain = lgb.Dataset(xtra, label=ytra)
        dvalid = lgb.Dataset(xte, label=yte)
        watchlist = [dvalid]
        # NOTE(review): the early_stopping_rounds / verbose_eval keyword
        # arguments were removed from lgb.train in LightGBM 4.0; switch to the
        # lgb.early_stopping / lgb.log_evaluation callbacks when upgrading.
        self.gbdt = lgb.train(self.param, dtrain, self.num_boost_round,
            watchlist, early_stopping_rounds=self.early_stopping_rounds,
            verbose_eval=self.verbose_eval)

    def predict(self, x):
        """Return the fitted booster's predictions for ``x``."""
        return self.gbdt.predict(x)

In [None]:
def get_oof(clf, ntrain, ntest, kf, train, labels, test):
    """Compute out-of-fold train predictions and fold-averaged test predictions.

    For every ``(train_index, test_index)`` pair in ``kf`` the classifier is
    trained on the training part, used to predict the held-out part of
    ``train``, and used to predict all of ``test``. The test predictions are
    averaged over however many folds ``kf`` yields.

    Args:
        clf: object exposing ``train(x_tr, y_tr, x_te, y_te)`` and ``predict(x)``.
        ntrain: number of rows in ``train``.
        ntest: number of rows in ``test``.
        kf: iterable of ``(train_index, test_index)`` pairs; a one-shot
            generator is fine because it is consumed exactly once.
        train: array indexable by the index arrays from ``kf``.
        labels: array of targets, indexable like ``train``.
        test: array passed unchanged to ``clf.predict``.

    Returns:
        Tuple ``(oof_train, oof_test)`` with shapes ``(ntrain, 1)`` and
        ``(ntest, 1)``.

    Raises:
        ValueError: if ``kf`` yields no folds.
    """
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    # Collect one row per fold instead of assuming a fixed number of folds.
    fold_test_preds = []

    for train_index, test_index in kf:
        x_tr = train[train_index]
        y_tr = labels[train_index]
        x_te = train[test_index]
        y_te = labels[test_index]

        clf.train(x_tr, y_tr, x_te, y_te)
        oof_train[test_index] = clf.predict(x_te)
        fold_test_preds.append(np.asarray(clf.predict(test), dtype=np.float64))

    if not fold_test_preds:
        raise ValueError("kf yielded no folds; cannot compute out-of-fold predictions")

    # Assigning into the pre-sized buffer keeps the (ntest,) shape check.
    oof_test[:] = np.vstack(fold_test_preds).mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
start_time = time.time()
print("Splitting Data ...")

# Convert the train inputs to a NumPy array and record its row count.
train_df = np.array(train_df)
ntrain = len(train_df)

# Same for the test inputs.
test_df = np.array(test_df)
ntest = len(test_df)

# The targets are converted to a NumPy array as well.
labels = np.array(labels)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Establishing Parameters ...")

# LightGBM training parameters for the binary classifier.
lgb_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # A list (not a set) so the metric order handed to LightGBM is the same on
    # every run; set iteration order of strings depends on per-process hashing.
    'metric': ['binary_logloss', 'auc'],
    'num_leaves': 256,
    'min_sum_hessian_in_leaf': 20,
    'max_depth': 12,
    'learning_rate': 0.05,
    'feature_fraction': 0.6,
    'verbose': 1,
}

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Training Model ...")

lg = LgbWrapper(params=lgb_params, seed=2017)

# Five shuffled folds with a fixed random_state; split() returns a one-shot
# generator of (train_index, test_index) pairs, consumed by get_oof.
kf = KFold(n_splits=5, shuffle=True, random_state=2018).split(train_df)

lg_oof_train, lg_oof_test = get_oof(
    clf=lg,
    ntrain=ntrain,
    ntest=ntest,
    kf=kf,
    train=train_df,
    labels=labels,
    test=test_df,
)

# ROC AUC of the out-of-fold predictions against the training targets.
cv_auc = roc_auc_score(labels, lg_oof_train)
print("LG-CV: {}".format(cv_auc))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Generate Submission ...")

# Identifier columns captured from the test file when the data was loaded.
submission = pd.DataFrame({"order_id": order_ids, "product_id": product_ids})
# Fold-averaged test predictions returned by get_oof.
submission['prediction'] = lg_oof_test
submission.to_csv("./data/output_data.csv", index=False)

print("--- %s seconds ---" % (time.time() - start_time))