In [None]:
__author__ = 'Nick Sarris (ngs5st)'

import os
import gc
import time
import random
import numpy as np
import pandas as pd
import lightgbm as lgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
import itertools

# Print the names of the entries found in the data directory.
available_inputs = os.listdir("./data/")
print(available_inputs)

In [None]:
def seed_everything(seed=1235):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: value passed to both seeding functions (default 1235).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

seed_everything(1235)

In [None]:
start_time = time.time()
print("Loading Data ...")

directory = "./data/"

# Explicit narrow dtypes for the CSV columns (unsigned ints, bool, category).
prior_dtypes = {
    'order_id': np.uint32,
    'product_id': np.uint16,
    'add_to_cart_order': np.uint8,
    'reordered': bool,
}
orders_dtypes = {
    'order_id': np.uint32,
    'user_id': np.uint32,
    'eval_set': 'category',
    'order_number': np.uint8,
    'order_dow': np.uint8,
    'order_hour_of_day': np.uint8,
}
products_dtypes = {
    'product_id': np.uint16,
    'aisle_id': np.uint8,
    'department_id': np.uint8,
}

order_prior = pd.read_csv(directory + "order_products__prior.csv", dtype=prior_dtypes)
orders = pd.read_csv(directory + "orders.csv", dtype=orders_dtypes)
products = pd.read_csv(directory + "products.csv", dtype=products_dtypes)

# The pickled chunk carries rows for both evaluation sets; split it on
# eval_set and keep only the columns each side needs.
# NOTE(review): read_pickle must only be used on trusted files.
labelled_chunk = pd.read_pickle(directory + 'chunk_0.pkl')
is_test = labelled_chunk.eval_set == "test"
is_train = labelled_chunk.eval_set == "train"
order_test = labelled_chunk.loc[is_test, ['order_id', 'product_id']]
order_train = labelled_chunk.loc[is_train, ['order_id', 'product_id', 'reordered']]
# Drop the temporaries so the full chunk is not kept alive.
del labelled_chunk, is_test, is_train

user_dep_stat = pd.read_pickle(directory + 'user_department_products.pkl')
user_aisle_stat = pd.read_pickle(directory + 'user_aisle_products.pkl')

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Data [1] ...")

# One row per (product, user) pair: summed reorder flags and the number of
# prior order lines for that pair.
# NOTE(review): the 'sum' key in the rename mapping does not appear to match
# any column produced by this aggregation; kept as in the original.
prob = (
    pd.merge(order_prior, orders, on='order_id')
    .groupby(['product_id', 'user_id'])
    .agg({'reordered': 'sum', 'user_id': 'size'})
    .rename(columns={'sum': 'reordered', 'user_id': 'total'})
)

# Collapse both counts to 0/1 indicators before taking their ratio.
prob['reordered'] = (prob['reordered'] > 0).astype(np.float32)
prob['total'] = (prob['total'] > 0).astype(np.float32)
prob['reorder_prob'] = prob['reordered'] / prob['total']

# Average the per-pair value over every user of each product.
prob = (
    prob.groupby('product_id')
    .agg({'reorder_prob': 'mean'})
    .rename(columns={'mean': 'reorder_prob'})
    .reset_index()
)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Data [2] ...")

# Per-product statistics over the prior order lines: summed reorder flags,
# number of lines, and mean add-to-cart position.
prod_stat = order_prior.groupby('product_id').agg(
    {'reordered': ['sum', 'size'], 'add_to_cart_order': 'mean'})

# Flatten the (source column, function) column index down to the function
# name of each column. droplevel(0) yields exactly one label per column, in
# column order. `columns.levels[1]` must not be used here: it is the set of
# unique level values, whose order is independent of the column order, so
# assigning it can attach 'sum'/'mean' to the wrong columns.
prod_stat.columns = prod_stat.columns.droplevel(0)
# The misspelled output names ('prod_add_to_card_mean', 'reorder_ration')
# are kept unchanged because they end up as column names in the exported CSVs.
prod_stat.rename(columns={'sum':'prod_reorders', 'size':'prod_orders', 'mean':
                          'prod_add_to_card_mean'}, inplace=True)

prod_stat.reset_index(inplace=True)
prod_stat['reorder_ration'] = prod_stat['prod_reorders'] / prod_stat['prod_orders']

# Attach the per-product reorder probability computed in the previous cell.
prod_stat = pd.merge(prod_stat, prob, on='product_id')

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Data [3] ...")

# Per-user statistics computed from the rows of `orders` flagged as 'prior'.
user_stat = (
    orders.loc[orders.eval_set == 'prior', :]
    .groupby('user_id')
    .agg({'order_number': 'max',
          'days_since_prior_order': ['sum', 'mean', 'median']})
)

# Keep only the function name of each column, then give it a feature name.
user_stat.columns = user_stat.columns.droplevel(0)
user_feature_names = {
    'max': 'user_orders',
    'sum': 'user_order_starts_at',
    'mean': 'user_mean_days_since_prior',
    'median': 'user_median_days_since_prior',
}
user_stat = user_stat.rename(columns=user_feature_names).reset_index()

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Data [4] ...")

# Every prior order line joined with its order-level attributes.
orders_products = pd.merge(orders, order_prior, on="order_id")

# Per user: number of order lines, summed reorder flags, distinct products.
user_order_stat = (
    orders_products.groupby('user_id')
    .agg({'user_id': 'size',
          'reordered': 'sum',
          "product_id": lambda x: x.nunique()})
    .rename(columns={'user_id': 'user_total_products',
                     'product_id': 'user_distinct_products',
                     'reordered': 'user_reorder_ratio'})
    .reset_index()
)

# Turn the summed reorder flags into a share of the user's order lines.
user_order_stat['user_reorder_ratio'] = (
    user_order_stat['user_reorder_ratio'] / user_order_stat['user_total_products'])

user_stat = user_stat.merge(user_order_stat, on='user_id')
user_stat['user_average_basket'] = (
    user_stat['user_total_products'] / user_stat['user_orders'])

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Data [5] ...")

# Number of lines in each order, joined back onto every order line.
order_stat = (
    orders_products.groupby('order_id')
    .agg({'order_id': 'size'})
    .rename(columns={'order_id': 'order_size'})
    .reset_index()
)
orders_products = pd.merge(orders_products, order_stat, on='order_id')

# Cart position counted from the end of the order, and as a fraction of it.
orders_products['add_to_cart_order_inverted'] = (
    orders_products['order_size'] - orders_products['add_to_cart_order'])
orders_products['add_to_cart_order_relative'] = (
    orders_products['add_to_cart_order'] / orders_products['order_size'])

# Aggregations per (user, product) pair. The feature names below are assigned
# by position, so they must stay in the same order as this specification.
up_aggregations = {
    'user_id': 'size',
    'order_number': ['min', 'max'],
    'add_to_cart_order': ['mean', 'median'],
    'days_since_prior_order': ['mean', 'median'],
    'order_dow': ['mean', 'median'],
    'order_hour_of_day': ['mean', 'median'],
    'add_to_cart_order_inverted': ['mean', 'median'],
    'add_to_cart_order_relative': ['mean', 'median'],
    'reordered': ['sum'],
}
up_feature_names = [
    'up_orders',
    'up_first_order', 'up_last_order',
    'up_mean_cart_position', 'up_median_cart_position',
    'days_since_prior_order_mean', 'days_since_prior_order_median',
    'order_dow_mean', 'order_dow_median',
    'order_hour_of_day_mean', 'order_hour_of_day_median',
    'add_to_cart_order_inverted_mean', 'add_to_cart_order_inverted_median',
    'add_to_cart_order_relative_mean', 'add_to_cart_order_relative_median',
    'reordered_sum',
]

data = orders_products.groupby(['user_id', 'product_id']).agg(up_aggregations)
data.columns = up_feature_names

data['user_product_reordered_ratio'] = (data['reordered_sum'] + 1.0) / data['up_orders']

# Bring in the per-product and per-user statistics built in earlier cells.
data = (
    data.reset_index()
    .merge(prod_stat, on='product_id')
    .merge(user_stat, on='user_id')
)

data['up_order_rate'] = data['up_orders'] / data['user_orders']
data['up_orders_since_last_order'] = data['user_orders'] - data['up_last_order']
# NOTE(review): the numerator is user_orders, not up_orders, even though the
# feature is named as an order *rate* for the pair - confirm this is intended.
data['up_order_rate_since_first_order'] = (
    data['user_orders'] / (data['user_orders'] - data['up_first_order'] + 1))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Data [6] ...")

# Train and test rows receive the same sequence of joins: product
# attributes, order attributes, then the per-user department and aisle
# statistics.
order_train = (
    order_train
    .merge(products, on='product_id')
    .merge(orders, on='order_id')
    .merge(user_dep_stat, on=['user_id', 'department_id'])
    .merge(user_aisle_stat, on=['user_id', 'aisle_id'])
)

order_test = (
    order_test
    .merge(products, on='product_id')
    .merge(orders, on='order_id')
    .merge(user_dep_stat, on=['user_id', 'department_id'])
    .merge(user_aisle_stat, on=['user_id', 'aisle_id'])
)

# The lookup tables are no longer referenced after this cell; free them.
del products, orders, user_dep_stat, user_aisle_stat
gc.collect()

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Data [7] ...")

# Attach the (user, product) feature table to both row sets.
up_keys = ['product_id', 'user_id']
order_train = order_train.merge(data, on=up_keys)
order_test = order_test.merge(data, on=up_keys)

# The feature table has been merged into both frames; release it.
del data
gc.collect()

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Data [8] ...")

# Distinct users per product over all prior order lines.
prod_usr = (
    orders_products.groupby(['product_id'])
    .agg({'user_id': lambda x: x.nunique()})
    .rename(columns={'user_id': 'prod_users_unq'})
    .reset_index()
)

# Same count, restricted to the lines whose `reordered` flag is set.
prod_usr_reordered = (
    orders_products.loc[orders_products.reordered, :]
    .groupby(['product_id'])
    .agg({'user_id': lambda x: x.nunique()})
    .rename(columns={'user_id': 'prod_users_unq_reordered'})
    .reset_index()
)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Data [9] ...")

# Missing values in the pickled period statistics are replaced by 9999.
product_periods = pd.read_pickle(directory + 'product_periods_stat.pkl').fillna(9999)

# prod_usr_reordered is joined with how='left', so rows whose product has no
# entry there are kept (with a missing value) instead of being dropped.
order_train = (
    order_train
    .merge(prod_usr, on='product_id')
    .merge(prod_usr_reordered, on='product_id', how='left')
    .merge(product_periods, on=['user_id', 'product_id'])
)

order_test = (
    order_test
    .merge(prod_usr, on='product_id')
    .merge(prod_usr_reordered, on='product_id', how='left')
    .merge(product_periods, on=['user_id', 'product_id'])
)

del prod_usr, prod_usr_reordered, product_periods
gc.collect()

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
print("Processing Data [10] ...")

# Apply the same final feature fixes to the train and test frames.
for frame in (order_train, order_test):
    # prod_users_unq_reordered came in through a how='left' merge, so it can
    # hold missing values; treat those as zero. The result is assigned back
    # to the column explicitly: calling fillna(..., inplace=True) on the
    # column accessor is a chained in-place operation that is not guaranteed
    # to write through to the frame (it is a no-op under pandas
    # Copy-on-Write).
    frame['prod_users_unq_reordered'] = frame['prod_users_unq_reordered'].fillna(0)
    frame['aisle_reordered_ratio'] = frame['aisle_reordered'] / frame['user_orders']
    frame['dep_reordered_ratio'] = frame['dep_reordered'] / frame['user_orders']
del frame

# Write the finished feature tables without the row index.
order_train.to_csv("./data/order_train.csv", index=False)
order_test.to_csv("./data/order_test.csv", index=False)

print("--- %s seconds ---" % (time.time() - start_time))