In [1]:
# Importing libraries
import joblib

import pandas as pd
import numpy as np
import scipy.stats as stats

from os import listdir
from os import cpu_count

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report

In [2]:
max_cores = cpu_count()

In [3]:
files = [file.split('.')[0] for file in listdir("../data")]

In [4]:
eval_expr = ', '.join(f'pd.read_csv(\'../data/{file}.csv\')' for file in files)

In [5]:
# Importing all files
products, orders, departments, order_products_train, aisles, order_products_prior, sample_submission = eval(eval_expr)

In [6]:
# Merging the orders DataFrame with order_products_prior to get combined dataframe with previous orders
prior_orders = pd.merge(orders, order_products_prior, on='order_id', how='inner')

In [7]:
#Creating predictors

#Possible predictor categories (examples)
#1. User predictors: How often does a user reorder? 
#2. Product Predictors: How often is a product reordered?
#3. User-Product Predictors: How often does a user buy a specific product?

In [8]:
# This could be made into a simple function call with dictionary arguments or **kwargs
total_orders_feature = prior_orders.groupby('user_id').agg(
    total_orders = pd.NamedAgg(column='order_number', aggfunc='max')
).reset_index()

In [9]:
user_reorder_ratio = prior_orders.groupby('user_id').agg(
    reorder_ratio = pd.NamedAgg(column='reordered', aggfunc='mean')
).reset_index()

In [10]:
# Joining the two user features on id 'user_id'
user_features = total_orders_feature.merge(user_reorder_ratio, on='user_id', how='inner')

In [11]:
# Product features

total_product_purchases = prior_orders.groupby('product_id').agg(
    total_product_purchases = pd.NamedAgg(column='order_id', aggfunc='count')
).reset_index()

In [12]:
product_reorder_ratio = prior_orders.groupby('product_id').agg(
    product_reorder_ratio = pd.NamedAgg(column='reordered', aggfunc='mean')
).reset_index()

In [13]:
# If the probability is higher than 50%, we might have a better predictor than a coin toss
product_reorder_ratio_filtered = product_reorder_ratio.loc[product_reorder_ratio['product_reorder_ratio'] > 0.5]

In [14]:
# Join product features
product_features = total_product_purchases.merge(product_reorder_ratio_filtered, on='product_id', how='left').fillna(0)

In [15]:
# User-product predictors
total_product_buys = prior_orders.groupby(['user_id', 'product_id']).agg(
    total_product_buys = pd.NamedAgg(column='order_id', aggfunc='count')
).reset_index()

In [16]:
# Add to Cart Order as an Ordinal Feature
pre_binning_dataframe = prior_orders.copy()

pre_binning_dataframe['add_to_cart_sequence'] = pd.cut(pre_binning_dataframe['add_to_cart_order'], bins=[1, 10, 20, float('Inf')], labels=[0, 1, 2])


In [17]:
product_feature_behaviour = total_product_buys.merge(pre_binning_dataframe[['user_id', 'product_id', 'add_to_cart_sequence']].dropna(subset=['add_to_cart_sequence']), on=['user_id', 'product_id'], how='inner')

In [18]:
features_user_product = product_feature_behaviour.merge(user_features, on='user_id', how='left')

In [19]:
# Combining features
# Using the features_user_product as base since there are both "product_id" and "user_id" columns
features_user_product = features_user_product.merge(product_features, on='product_id', how='inner')

In [20]:
# Setting up the data for modeling
# Since the test set was originally used for prediction and test_set_orders are not available, 
# we will use only the training data later. So, extracting eval_set == "test" is not strictly necessary.
orders_future = orders[((orders['eval_set']=='train') | (orders['eval_set']=='test'))]
orders_future = orders_future[ ['user_id', 'eval_set', 'order_id'] ]
orders_future.head(10)

Unnamed: 0,user_id,eval_set,order_id
10,1,train,1187899
25,2,train,1492625
38,3,test,2774568
44,4,test,329954
49,5,train,2196797
53,6,test,1528013
74,7,train,525192
78,8,train,880375
82,9,train,1094988
88,10,train,1822501


In [21]:
data_prep = features_user_product.merge(orders_future, on='user_id', how='left')


In [22]:
# Curating training dataframe
train_data = data_prep[data_prep['eval_set']=='train']

In [23]:
# We will receive NA values in the reordered column because some user_id/product_id combinations in the training data
# might not exist, which means that the user did not reorder a specific product.
train_data = train_data.merge(order_products_train[['product_id','order_id', 'reordered']], on=['product_id','order_id'], how='left')

In [24]:
# We adjust the data accordingly
train_data['reordered'] = train_data['reordered'].fillna(0)

In [25]:
feature_names = ['total_product_buys', 'total_orders', 'reorder_ratio', 'total_product_purchases', 'product_reorder_ratio', 'add_to_cart_sequence']

In [26]:
train_data_clean = train_data[feature_names]

In [27]:
train_data_clean.head(10)

Unnamed: 0,total_product_buys,total_orders,reorder_ratio,total_product_purchases,product_reorder_ratio,add_to_cart_sequence
0,10,10,0.694915,35791,0.77648,0
1,10,10,0.694915,35791,0.77648,0
2,1,33,0.502439,35791,0.77648,0
3,2,11,0.401361,35791,0.77648,0
4,14,27,0.698225,35791,0.77648,0
5,14,27,0.698225,35791,0.77648,0
6,14,27,0.698225,35791,0.77648,0
7,14,27,0.698225,35791,0.77648,0
8,14,27,0.698225,35791,0.77648,0
9,14,27,0.698225,35791,0.77648,0


In [28]:
# Creating test an validation sets
X_train, X_test, y_train, y_test = train_test_split(train_data_clean, train_data['reordered'], train_size=0.8, shuffle=True)

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [29]:
# Defining the hyperparameter space and fitting a Gradient Booster 
param_space = {
    'learning_rate': stats.uniform(0.01, 0.1),
    'max_iter': stats.randint(80, 250),
}

model = HistGradientBoostingClassifier()
search = RandomizedSearchCV(model, param_space, cv=3, n_iter=5, verbose=10, n_jobs=max_cores, scoring='f1')

search.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


RandomizedSearchCV(cv=3, estimator=HistGradientBoostingClassifier(), n_iter=5,
                   n_jobs=8,
                   param_distributions={'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9a06a842b0>,
                                        'max_iter': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9568c0f490>},
                   scoring='f1', verbose=10)

In [30]:
search.cv_results_

{'mean_fit_time': array([1616.58908765, 1538.38041409,  792.43772173,  749.81625247,
         619.98184665]),
 'std_fit_time': array([  2.36199737,   2.78070917, 152.27602075, 129.79063758,
          8.3213497 ]),
 'mean_score_time': array([219.12606859, 230.78754067, 141.81759938, 152.29845627,
        141.27303505]),
 'std_score_time': array([ 0.80629427,  1.57525794, 25.01384921, 10.35202759,  5.717522  ]),
 'param_learning_rate': masked_array(data=[0.016760583880484174, 0.08840817655222107,
                    0.058931133581481505, 0.08197784365231764,
                    0.05703840352007857],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_iter': masked_array(data=[183, 241, 80, 199, 162],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'learning_rate': 0.016760583880484174, 'max_iter': 183},
  {'learning_rate': 0.08840817655222107, 'max_

In [31]:
search.best_params_

{'learning_rate': 0.08840817655222107, 'max_iter': 241}

In [32]:
predictions = search.predict(X_test_scaled)

In [33]:
# Output classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.85      0.96      0.90   2989262
         1.0       0.65      0.29      0.40    729661

    accuracy                           0.83   3718923
   macro avg       0.75      0.63      0.65   3718923
weighted avg       0.81      0.83      0.80   3718923



In [34]:
# From the classification report, we can deduce that the model is much better at predicting whether a customer does not repurchase a certain product. Precision and recall values for recall are much lower for the positive class, that is for repurchases.