In [None]:
__author__ = 'Nick Sarris (ngs5st)'

import os
import time
import random
import numpy as np
import pandas as pd
import lightgbm as lgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
import itertools

# Show the entries of the local ./data directory that the cells below read from.
print(os.listdir("./data"))

In [None]:
def seed_everything(seed=1235):
    """Seed the random number sources used by this notebook.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``; its
            string form is also stored in the ``PYTHONHASHSEED`` environment
            variable.
    """
    seed_text = str(seed)
    random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is presumably only read when an interpreter
    # starts, so this likely affects child processes rather than the running
    # kernel -- confirm before relying on it for hash reproducibility.
    os.environ['PYTHONHASHSEED'] = seed_text
    np.random.seed(seed)


seed_everything(seed=1235)

In [None]:
# Load the two CSV inputs and the pickled label frame, timing the whole step.
start_time = time.time()
print("Loading Data ...")

directory = "./data/"

# Explicit dtypes (unsigned ints, bool, category) are requested per column.
order_train = pd.read_csv(
    directory + 'order_products__train.csv',
    dtype=dict(
        order_id=np.uint32,
        product_id=np.uint16,
        add_to_cart_order=np.uint8,
        reordered=bool,
    ),
)
orders = pd.read_csv(
    directory + 'orders.csv',
    dtype=dict(
        order_id=np.uint32,
        user_id=np.uint32,
        eval_set='category',
        order_number=np.uint8,
        order_dow=np.uint8,
        order_hour_of_day=np.uint8,
    ),
)

# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it was produced by a trusted step of this project.
labels = pd.read_pickle(directory + 'previous_products.pkl')

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Restrict `orders` to the train/test rows, attach their order_id and eval_set
# to `labels` via user_id, and drop a column of `order_train` that is not
# used by the merge on order_id/product_id further down.
start_time = time.time()
print("Establish Orders ...")

orders = orders[orders.eval_set.isin(['train', 'test'])]

# Inner join on user_id; user_id itself is discarded once the join is done.
labels = pd.merge(
    labels,
    orders[['order_id', 'user_id', 'eval_set']],
    on='user_id',
).drop(columns=['user_id'])

# Mutates order_train in place: running this cell a second time raises a
# KeyError because the column is already gone.
order_train.drop(columns=['add_to_cart_order'], inplace=True)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Build the labelled chunk for one fold and persist it as a pickle:
#   1. collect the unique order ids present in `labels`,
#   2. slice out the ids belonging to the current fold,
#   3. right-join the known train rows onto the label rows for those orders,
#   4. mark label rows without a train match as not reordered.
start_time = time.time()
# NOTE(review): this status text is identical to the one printed by the
# previous cell; kept unchanged so existing logs still match.
print("Establish Orders ...")

# From here on `orders` is an array of unique order ids, no longer the
# DataFrame that was loaded from orders.csv.
orders = np.unique(labels.order_id)
size = orders.shape[0]

# `size` equals the number of unique order ids, so fold 0 spans all of them.
fold = 0
current = orders[fold * size:(fold + 1) * size]
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
current = labels.loc[np.isin(labels.order_id, current), :]
# how='right' keeps every row of `current`; rows with no counterpart in
# order_train come back with NaN in `reordered`.
current = pd.merge(order_train, current, on=['order_id', 'product_id'], how='right')
# Assign the filled column back explicitly: the former
# `current.reordered.fillna(False, inplace=True)` is a chained in-place update
# that newer pandas versions warn about and that does not modify `current`
# under Copy-on-Write. Casting to bool keeps the dtype independent of whether
# fillna downcasts the object column.
current['reordered'] = current['reordered'].fillna(False).astype(bool)
current.to_pickle('./data/chunk_{}.pkl'.format(fold))

print("--- %s seconds ---" % (time.time() - start_time))