In [None]:
__author__ = 'Nick Sarris (ngs5st)'

import os
import time
import random
import numpy as np
import pandas as pd
import lightgbm as lgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
import itertools

# Show the entries of ./data, the directory the later cells read their
# inputs from and write their pickled outputs to.
print(os.listdir("./data"))

In [None]:
def seed_everything(seed=1235):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and
    stores ``seed`` (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Parameters
    ----------
    seed : int, default 1235
        Value handed to ``random.seed`` and ``np.random.seed`` and written
        to ``os.environ['PYTHONHASHSEED']``.
    """
    # NOTE(review): PYTHONHASHSEED is presumably only honoured by
    # interpreters started after this point, not the running kernel - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once, before any other cell draws random numbers.
seed_everything(1235)

In [None]:
start_time = time.time()
print("Loading Data ...")

directory = "./data/"

# Explicit narrow dtypes for the CSV columns (presumably chosen to keep the
# large tables small in memory - confirm the value ranges fit).
prior_dtypes = {
    'order_id': np.uint32,
    'product_id': np.uint16,
    'add_to_cart_order': np.uint8,
    'reordered': bool,
}
orders_dtypes = {
    'order_id': np.uint32,
    'user_id': np.uint32,
    'eval_set': 'category',
    'order_number': np.uint8,
    'order_dow': np.uint8,
    'order_hour_of_day': np.uint8,
}

order_prior = pd.read_csv(directory + 'order_products__prior.csv', dtype=prior_dtypes)
orders = pd.read_csv(directory + 'orders.csv', dtype=orders_dtypes)

# NOTE(review): read_pickle unpickles arbitrary objects; only load these
# files if they were produced by a trusted earlier step.
labels = pd.read_pickle(directory + 'chunk_0.pkl')
user_product = pd.read_pickle(directory + 'previous_products.pkl')

elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

In [None]:
start_time = time.time()
print("Processing Orders [1] ...")

# Step 1: total days_since_prior_order for each (user_id, order_number).
days_per_order = orders[['user_id', 'order_number', 'days_since_prior_order']].groupby(
    ['user_id', 'order_number'])['days_since_prior_order'].sum()

# Step 2: running total of those days within each user_id (index level 0).
running_days = days_per_order.groupby(level=[0]).cumsum()

# Step 3: back to a flat frame, with the value column renamed.
order_comsum = running_days.reset_index().rename(
    columns={'days_since_prior_order': 'days_since_prior_order_comsum'})

order_comsum.to_pickle('./data/orders_comsum.pkl')

elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

In [None]:
start_time = time.time()
print("Processing Orders [2] ...")

# Attach order_id to every (user_id, order_number) cumulative-days row.
# An order_id column left over from a previous run of this cell is dropped
# first: otherwise the merge would produce order_id_x / order_id_y and the
# column selection below would raise KeyError, making the cell impossible
# to re-run. Only the columns actually needed from `orders` are merged.
order_comsum = pd.merge(
    order_comsum.drop('order_id', axis=1, errors='ignore'),
    orders[['user_id', 'order_number', 'order_id']],
    on=['user_id', 'order_number']) \
    [['user_id', 'order_number', 'days_since_prior_order_comsum', 'order_id']]

# (order_id, product_id, eval_set) rows for the prior orders; eval_set is the
# only column required from `orders`.
order_product = pd.merge(
    order_prior, orders[['order_id', 'eval_set']],
    on='order_id')[['order_id', 'product_id', 'eval_set']]

# Append the same three columns taken from `labels`.
order_product_train_test = labels[['order_id', 'product_id', 'eval_set']]
order_product = pd.concat([order_product, order_product_train_test])

# Bring in user_id / order_number / cumulative days via order_id, then keep
# only the (user_id, product_id) pairs present in `user_product`.
order_product = pd.merge(order_product, order_comsum, on='order_id')
order_product = pd.merge(order_product, user_product, on=['user_id', 'product_id'])

# For each (user_id, product_id): cumulative days at every order_number in
# which the pair appears, then the differences between consecutive entries
# (NaN treated as 0). A pair that appears in a single order yields an empty
# array, because np.diff returns one element fewer than its input.
temp = order_product.groupby(['user_id', 'product_id', 'order_number'])\
    ['days_since_prior_order_comsum'].sum().groupby(level=[0, 1]).apply(
    lambda x: np.diff(np.nan_to_num(x)))

temp = temp.to_frame('periods').reset_index()
temp.to_pickle('./data/product_period.pkl')

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Orders [3] ...")


def _gap_from_end(gaps, k):
    """Return the k-th element of ``gaps`` counted from the end (1 = last), or NaN if it has fewer than k elements."""
    return gaps[-k] if len(gaps) >= k else np.nan


def _past_gap_stat(gaps, func):
    """Apply ``func`` to every element of ``gaps`` except the last; NaN when nothing precedes the last element."""
    # Guarding here avoids the "mean of empty slice" RuntimeWarning that
    # np.mean / np.median emit on an empty array; the result (NaN) is the same.
    return func(gaps[:-1]) if len(gaps) > 1 else np.nan


# Summarise the per-(user_id, product_id) `periods` arrays held in `temp`.
# `periods` comes from np.diff, so it is empty for a pair seen in only one
# order; every statistic below falls back to NaN in that case instead of
# raising IndexError (which an unguarded x[-1] would do).
aggregated = temp.copy()
aggregated['last'] = aggregated['periods'].apply(lambda x: _gap_from_end(x, 1))
aggregated['prev1'] = aggregated['periods'].apply(lambda x: _gap_from_end(x, 2))
aggregated['prev2'] = aggregated['periods'].apply(lambda x: _gap_from_end(x, 3))
# median / mean deliberately exclude the last element of each array.
aggregated['median'] = aggregated['periods'].apply(lambda x: _past_gap_stat(x, np.median))
aggregated['mean'] = aggregated['periods'].apply(lambda x: _past_gap_stat(x, np.mean))
aggregated = aggregated.drop('periods', axis=1)
aggregated.to_pickle('./data/product_periods_stat.pkl')

print("--- %s seconds ---" % (time.time() - start_time))