In [None]:
__author__ = 'Nick Sarris (ngs5st)'

import os
import time
import random
import numpy as np
import pandas as pd
import lightgbm as lgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
import itertools

# Show which entries are present in ./data.
data_dir_contents = os.listdir("./data")
print(data_dir_contents)

In [None]:
def seed_everything(seed=1235):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global ``np.random`` state,
    and stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Parameters
    ----------
    seed : int, default 1235
        Value handed to both seeders and written to the environment.
    """
    seed_text = str(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = seed_text
    np.random.seed(seed)


# Seed once up front with the notebook's fixed value.
seed_everything(seed=1235)

In [None]:
# Read the three input CSV files, pinning explicit dtypes for the id/count
# columns instead of relying on pandas' type inference.
start_time = time.time()
print("Loading Data ...")

directory = './data/'

# order_products__prior.csv: ids, cart position and the reordered flag.
order_prior = pd.read_csv(
    directory + "order_products__prior.csv",
    dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'add_to_cart_order': np.uint8,
        'reordered': bool,
    },
)

# orders.csv: eval_set is read as a pandas categorical.
orders = pd.read_csv(
    directory + "orders.csv",
    dtype={
        'order_id': np.uint32,
        'user_id': np.uint32,
        'eval_set': 'category',
        'order_number': np.uint8,
        'order_dow': np.uint8,
        'order_hour_of_day': np.uint8,
    },
)

# products.csv: product, aisle and department ids.
products = pd.read_csv(
    directory + "products.csv",
    dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8,
    },
)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Orders ...")

# Inner join on order_id: rows of `orders` without a match in `order_prior`
# (and vice versa) are dropped.
orders_products = orders.merge(order_prior, how="inner", on="order_id")

# Inner join on product_id to bring in each product's department and aisle ids;
# only those three columns of `products` are carried over.
orders_products_products = orders_products.merge(
    products[['product_id', 'department_id', 'aisle_id']],
    how="inner",
    on='product_id',
)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Departments ...")

# For every (user_id, department_id) pair compute:
#   dep_products  - number of distinct product_id values
#   dep_reordered - sum of the `reordered` column
# The built-in 'nunique' aggregation is used instead of a Python lambda: it
# yields the same counts, but a lambda is invoked once per group from Python,
# while the string form lets pandas use its optimized groupby implementation.
# The result is built in one chain (no inplace mutation), so the name is only
# ever bound to the finished frame.
user_dep_stat = (
    orders_products_products
    .groupby(['user_id', 'department_id'])
    .agg({'product_id': 'nunique', 'reordered': 'sum'})
    .rename(columns={
        'product_id': 'dep_products',
        'reordered': 'dep_reordered'})
    .reset_index()
)

# Persist the per-user department statistics.
user_dep_stat.to_pickle('./data/user_department_products.pkl')

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Aisles ...")

# For every (user_id, aisle_id) pair compute:
#   aisle_products  - number of distinct product_id values
#   aisle_reordered - sum of the `reordered` column
# The built-in 'nunique' aggregation is used instead of a Python lambda: it
# yields the same counts, but a lambda is invoked once per group from Python,
# while the string form lets pandas use its optimized groupby implementation.
# The result is built in one chain (no inplace mutation), so the name is only
# ever bound to the finished frame.
user_aisle_stat = (
    orders_products_products
    .groupby(['user_id', 'aisle_id'])
    .agg({'product_id': 'nunique', 'reordered': 'sum'})
    .rename(columns={
        'product_id': 'aisle_products',
        'reordered': 'aisle_reordered'})
    .reset_index()
)

# Persist the per-user aisle statistics.
user_aisle_stat.to_pickle('./data/user_aisle_products.pkl')

print("--- %s seconds ---" % (time.time() - start_time))