# Load the imports

In [1]:
import gc
import pandas as pd
import numpy as np
import os
import arboretum
import lightgbm as lgb
import json
import sklearn.metrics
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from scipy.sparse import dok_matrix, coo_matrix
from sklearn.utils.multiclass import  type_of_target

# Load the datasets

In [2]:
# Raw Instacart tables. Text columns with few distinct values are read as
# `category`, and the `reordered` flag as bool.
aisles = pd.read_csv("data/aisles.csv",
                     dtype={'aisle': 'category'})
departments = pd.read_csv("data/departments.csv",
                          dtype={'department': 'category'})
order_prior = pd.read_csv("data/order_products__prior.csv",
                          dtype={'reordered': bool})
orders = pd.read_csv("data/orders.csv",
                     dtype={'eval_set': 'category'})
products = pd.read_csv("data/products.csv")

# SECURITY NOTE: the .pkl files below are loaded with pd.read_pickle, which
# can execute arbitrary code while unpickling. Only load pickles that were
# produced by trusted code.

# Product embeddings: keep the columns labelled 0..31 plus the join key.
product_embeddings = pd.read_pickle('data/product_embeddings.pkl')
embedings = list(range(32))
product_embeddings = product_embeddings[embedings + ['product_id']]

# `order_train` is taken from the pre-built pickle only. It is deliberately
# NOT read from data/order_products__train.csv first: that frame would be
# overwritten by this assignment before anything used it.
order_train = pd.read_pickle("data/train_test_set.pkl")
# Missing values in the period statistics are replaced with the sentinel 9999.
product_periods = pd.read_pickle("data/product_periods_stat.pkl").fillna(9999)
user_dep_stat = pd.read_pickle("data/user_department_products.pkl")
user_aisle_stat = pd.read_pickle("data/user_aisle_products.pkl")
order_streaks = pd.read_csv("data/order_streaks.csv")

# Get the train and test orders

In [3]:
# `order_train` holds both evaluation sets at this point; split it on
# `eval_set`. The test rows keep only the keys, the train rows also keep the
# `reordered` label.
# NOTE: `order_train` is overwritten here, so this cell can only be run once
# per load of the pickle.
order_test = order_train.loc[order_train['eval_set'] == "test",
                             ['order_id', 'product_id']]
order_train = order_train.loc[order_train['eval_set'] == "train",
                              ['order_id', 'product_id', 'reordered']]

# Compute other features

In [4]:
# For every (product, user) pair in the prior orders, count how many order
# lines exist (`total`) and how many of them were flagged as reorders
# (`reordered`). The merge with `orders` supplies the user_id.
prob = (
    pd.merge(order_prior, orders, on='order_id')
    .groupby(['product_id', 'user_id'])['reordered']
    .agg(['sum', 'size'])
    .rename(columns={'sum': 'reordered', 'size': 'total'})
)

# Share of a user's purchases of the product that were reorders.
prob['reorder_prob'] = prob['reordered'] / prob['total']

# Average that share over all users of each product; the result has the
# columns `product_id` and `reorder_prob`.
prob = (
    prob.groupby('product_id')
    .agg({'reorder_prob': 'mean'})
    .reset_index()
)


## Calculate the product statistics

In [5]:
# Per-product statistics over the prior orders:
#   prod_reorders          - number of order lines flagged as reordered
#   prod_orders            - total number of order lines
#   prod_add_to_card_mean  - mean add_to_cart_order
prod_stat = order_prior.groupby('product_id').agg(
    {
        'reordered': ['sum', 'size'],
        'add_to_cart_order': 'mean'
    })

# Flatten the (column, aggregate) MultiIndex by dropping the outer level.
# Do NOT use `columns.levels[1]` here: `levels` holds the sorted unique
# labels of the level ('mean', 'size', 'sum'), not one label per column, so
# assigning it attaches the wrong names to the aggregates.
prod_stat.columns = prod_stat.columns.droplevel(0)
prod_stat = prod_stat.rename(
    columns={'sum': 'prod_reorders',
             'size': 'prod_orders',
             'mean': 'prod_add_to_card_mean'}
).reset_index()

# Fraction of the product's order lines that were reorders. The column name
# (including its spelling) is kept because later cells select it by name.
prod_stat['reorder_ration'] = prod_stat['prod_reorders'] / prod_stat['prod_orders']

# Attach the per-product mean reorder probability computed in `prob`.
prod_stat = pd.merge(prod_stat, prob, on='product_id')

## Calculate user statistics

In [6]:
# Per-user order history, restricted to orders in the 'prior' eval set:
# the highest order_number plus the sum / mean / median of
# days_since_prior_order.
user_stat = orders.loc[orders['eval_set'] == 'prior'].groupby('user_id').agg(
    {
        'order_number': 'max',
        'days_since_prior_order': ['sum', 'mean', 'median'],
    })

# Keep only the aggregate name of each (column, aggregate) pair, then give
# the aggregates their feature names.
user_stat.columns = user_stat.columns.droplevel(0)
user_stat = user_stat.rename(
    columns={
        'max': 'user_orders',
        'sum': 'user_order_starts_at',
        'mean': 'user_mean_days_since_prior',
        'median': 'user_median_days_since_prior',
    }
).reset_index()

# One row per prior order line, enriched with the order header columns.
orders_products = pd.merge(orders, order_prior, on="order_id")

# Per user: number of order lines, number of lines flagged as reordered and
# number of distinct products. The reordered count is stored under
# 'user_reorder_ratio' and converted into a ratio just below.
user_order_stat = (
    orders_products.groupby('user_id')
    .agg({
        "user_id": "size",
        "reordered": "sum",
        "product_id": "nunique",
    })
    .rename(columns={
        'user_id': 'user_total_products',
        'product_id': 'user_distinct_products',
        'reordered': 'user_reorder_ratio',
    })
    .reset_index()
)

# Reordered lines divided by all lines bought by the user.
user_order_stat['user_reorder_ratio'] = user_order_stat['user_reorder_ratio'].div(
    user_order_stat['user_total_products'])

user_stat = pd.merge(user_stat, user_order_stat, on='user_id')

# Average basket size: total products bought divided by the number of orders.
user_stat['user_average_basket'] = user_stat['user_total_products'].div(
    user_stat['user_orders'])

## User product Features

In [7]:
# Number of distinct users that bought each product.
prod_usr = (
    orders_products.groupby('product_id')['user_id']
    .nunique()
    .rename('prod_users_unq')
    .reset_index()
)

# Number of distinct users that have at least one reordered line for each
# product. Products without any reordered line do not appear in this frame.
prod_usr_reordered = (
    orders_products.loc[orders_products['reordered'] == True]
    .groupby('product_id')['user_id']
    .nunique()
    .rename('prod_users_unq_reordered')
    .reset_index()
)

# Number of lines in each order.
order_stat = (
    orders_products.groupby('order_id')
    .size()
    .rename('order_size')
    .reset_index()
)

# Cart position counted from the end of the order, and as a fraction of the
# order size.
orders_products = pd.merge(orders_products, order_stat, on='order_id')
orders_products['add_to_cart_order_inverted'] = (
    orders_products['order_size'] - orders_products['add_to_cart_order'])
orders_products['add_to_cart_order_relative'] = (
    orders_products['add_to_cart_order'] / orders_products['order_size'])

# Aggregate the order lines per (user, product) pair.
data = orders_products.groupby(['user_id', 'product_id']).agg(
    {
        'user_id': 'size',
        'order_number': ['min', 'max'],
        'add_to_cart_order': ['mean', 'median'],
        'days_since_prior_order': ['mean', 'median'],
        'order_dow': ['mean', 'median'],
        'order_hour_of_day': ['mean', 'median'],
        'add_to_cart_order_inverted': ['mean', 'median'],
        'add_to_cart_order_relative': ['mean', 'median'],
        'reordered': ['sum'],
    })

# Flat feature names, assigned by position: they must stay in the same order
# as the aggregates requested above.
data.columns = [
    'up_orders',
    'up_first_order', 'up_last_order',
    'up_mean_cart_position', 'up_median_cart_position',
    'days_since_prior_order_mean', 'days_since_prior_order_median',
    'order_dow_mean', 'order_dow_median',
    'order_hour_of_day_mean', 'order_hour_of_day_median',
    'add_to_cart_order_inverted_mean', 'add_to_cart_order_inverted_median',
    'add_to_cart_order_relative_mean', 'add_to_cart_order_relative_median',
    'reordered_sum',
]

# 1 is added to the reorder count so the ratio is never 0 when
# reordered_sum is 0.
data['user_product_reordered_ratio'] = data['reordered_sum'].add(1.0).div(
    data['up_orders'])

# Bring user_id / product_id back as columns and attach the product-level
# and user-level statistics.
data = (
    data.reset_index()
    .merge(prod_stat, on='product_id')
    .merge(user_stat, on='user_id')
)

# Share of the user's orders that contain the product.
data['up_order_rate'] = data['up_orders'].div(data['user_orders'])
# Number of orders placed since the user last bought the product.
data['up_orders_since_last_order'] = data['user_orders'].sub(data['up_last_order'])
# NOTE(review): the numerator is user_orders, so this is not the product's
# order rate since its first purchase (that would divide up_orders).
# Confirm the intent before changing it; the column is selected as a model
# feature further down.
data['up_order_rate_since_first_order'] = data['user_orders'].div(
    data['user_orders'] - data['up_first_order'] + 1)

# Prepare final Train data

In [8]:
# Build the final training frame by joining every feature table onto the
# (order_id, product_id, reordered) training rows. All joins are inner joins
# unless stated otherwise, so rows without a match are dropped.
# NOTE: `order_train` is rebuilt from itself, so this cell must not be
# re-run without re-running the cells that create `order_train`.

# Product metadata (presumably supplies aisle_id / department_id used by the
# joins below - they are not present on order_train before this merge).
order_train = pd.merge(order_train, products, on='product_id')
# Order header (provides user_id and the order-level columns)
order_train = pd.merge(order_train, orders, on='order_id')
# Per-user department statistics
order_train = pd.merge(order_train, user_dep_stat, on=['user_id', 'department_id'])
# Per-user aisle statistics
order_train = pd.merge(order_train, user_aisle_stat, on=['user_id', 'aisle_id'])

# Number of distinct users per product
order_train = pd.merge(order_train, prod_usr, on='product_id')
# Number of distinct users that reordered the product. Products without any
# reordered line are missing from prod_usr_reordered, so a left join is used
# and the resulting NaN is replaced with 0.
order_train = pd.merge(order_train, prod_usr_reordered, on='product_id', how='left')
# Assign the filled column back explicitly: an in-place fillna on the
# attribute-accessed column is not guaranteed to update the frame.
order_train['prod_users_unq_reordered'] = order_train['prod_users_unq_reordered'].fillna(0)

# User-product, product and user features
order_train = pd.merge(order_train, data, on=['product_id', 'user_id'])

# Aisle and department reorder counts relative to the user's number of orders
order_train['aisle_reordered_ratio'] = order_train['aisle_reordered'] / order_train['user_orders']
order_train['dep_reordered_ratio'] = order_train['dep_reordered'] / order_train['user_orders']

order_train = pd.merge(order_train, product_periods, on=['user_id', 'product_id'])
# Streaks are optional per (user, product): keep rows without one.
order_train = pd.merge(order_train, order_streaks, on=['user_id', 'product_id'], how='left')
order_train = pd.merge(order_train, product_embeddings, on=['product_id'])
print(f"Final shape of train data {order_train.shape}")

Final shape of train data (8474661, 91)


# Prepare Final Test Data

In [9]:
# Build the final test frame with the same joins that are applied to the
# training rows. All joins are inner joins unless stated otherwise, so rows
# without a match are dropped.
# NOTE: `order_test` is rebuilt from itself, so this cell must not be re-run
# without re-running the cells that create `order_test`.

# Product metadata
order_test = pd.merge(order_test, products, on='product_id')
# Order header (provides user_id and the order-level columns)
order_test = pd.merge(order_test, orders, on='order_id')
# Per-user department statistics
order_test = pd.merge(order_test, user_dep_stat, on=['user_id', 'department_id'])
# Per-user aisle statistics
order_test = pd.merge(order_test, user_aisle_stat, on=['user_id', 'aisle_id'])
# Number of distinct users per product
order_test = pd.merge(order_test, prod_usr, on='product_id')
# Number of distinct users that reordered the product. Products without any
# reordered line are missing from prod_usr_reordered, so a left join is used
# and the resulting NaN is replaced with 0.
order_test = pd.merge(order_test, prod_usr_reordered, on='product_id', how='left')
# The fill must target order_test (not order_train), otherwise the test
# feature keeps its NaNs. The filled column is assigned back explicitly
# because an in-place fillna on the attribute-accessed column is not
# guaranteed to update the frame.
order_test['prod_users_unq_reordered'] = order_test['prod_users_unq_reordered'].fillna(0)

# User-product, product and user features
order_test = pd.merge(order_test, data, on=['product_id', 'user_id'])

# Aisle and department reorder counts relative to the user's number of orders
order_test['aisle_reordered_ratio'] = order_test['aisle_reordered'] / order_test['user_orders']
order_test['dep_reordered_ratio'] = order_test['dep_reordered'] / order_test['user_orders']

order_test = pd.merge(order_test, product_periods, on=['user_id', 'product_id'])
order_test = pd.merge(order_test, product_embeddings, on=['product_id'])
# Streaks are optional per (user, product): keep rows without one.
order_test = pd.merge(order_test, order_streaks, on=['user_id', 'product_id'], how='left')
print(f"Final shape of test data {order_test.shape}")

Final shape of test data (4833292, 90)


# Get the necessary features

In [15]:
# Categorical identifier columns; they are placed at the end of `features`.
categories = ['product_id', 'aisle_id', 'department_id']

# Columns selected for the model input, in output order: the named features,
# then the embedding columns listed in `embedings`, then `categories`.
# 'user_id' and 'order_id' are included in this list as well.
features = [
    'user_product_reordered_ratio',
    'reordered_sum',
    'add_to_cart_order_inverted_mean',
    'add_to_cart_order_relative_mean',
    'reorder_prob',
    'last', 'prev1', 'prev2', 'median', 'mean',
    'dep_reordered_ratio',
    'aisle_reordered_ratio',
    'aisle_products',
    'aisle_reordered',
    'dep_products',
    'dep_reordered',
    'prod_users_unq',
    'prod_users_unq_reordered',
    'order_number',
    'prod_add_to_card_mean',
    'days_since_prior_order',
    'order_dow',
    'order_hour_of_day',
    'reorder_ration',
    'user_orders',
    'user_order_starts_at',
    'user_mean_days_since_prior',
    'user_average_basket',
    'user_distinct_products',
    'user_reorder_ratio',
    'user_total_products',
    'prod_orders',
    'prod_reorders',
    'up_order_rate',
    'up_orders_since_last_order',
    'up_order_rate_since_first_order',
    'up_orders',
    'up_first_order',
    'up_last_order',
    'up_mean_cart_position',
    'days_since_prior_order_mean',
    'order_dow_mean',
    'order_hour_of_day_mean',
    'user_id',
    'order_id',
    *embedings,
    *categories,
]

In [18]:
# Training labels: the `reordered` column as a flat float32 array.
labels = order_train[['reordered']].values.astype(np.float32).ravel()

# Training inputs use the full feature list (including 'user_id').
# NOTE: `data` previously held the user-product feature frame and is
# rebound here to the training inputs.
data = order_train[features]

# The test inputs use the same columns without 'user_id'.
# NOTE: this mutates `features` in place, so the cell cannot be re-run
# without re-running the cell that defines `features`.
features.remove('user_id')
data_val = order_test[features]

In [19]:
# Report the (rows, columns) of the training and test inputs.
train_shape, val_shape = data.shape, data_val.shape
print(f"shape of data is {train_shape}, shape of data_val is {val_shape}")

shape of data is (8474661, 80), shape of data_val is (4833292, 79)


# Save the data

In [None]:
# Wrap the label array in a single-column frame so it can be pickled like
# the feature frames.
labels = pd.DataFrame(labels, columns=['labels'])

# Persist the training inputs, test inputs and training labels.
for _frame, _path in (
    (data, "data/final_train.pkl"),
    (data_val, "data/final_test.pkl"),
    (labels, "data/final_labels.pkl"),
):
    _frame.to_pickle(_path)