In [10]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

# Load the data
df = pd.read_feather('mined_data.feather')

df.drop(['order_id', 'days_since_prior_order', 
         'order_number', 'order_dow', 'order_hour_of_day',
         'reordered', 'product_name', 'days_since_user_first_order'], axis=1, inplace=True)

# Rows with -1 were initial product orders, not re-orders
df = df[df['days_since_user_ordered_product'] >= 0]

# Check if the product was re-ordered within the last 30 days.
df['reordered_within_30_days'] = (df['days_since_user_ordered_product'] <= 30).astype(int)
df.drop(['days_since_user_ordered_product'], axis=1, inplace=True)

X = df.drop('reordered_within_30_days', axis=1)
y = df['reordered_within_30_days']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=62)


In [11]:
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    device='cuda'
)

model.fit(X_train, y_train)


Without hyperparameter tuning

In [12]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


              precision    recall  f1-score   support

           0       0.59      0.05      0.10   1014460
           1       0.74      0.99      0.85   2810848

    accuracy                           0.74   3825308
   macro avg       0.66      0.52      0.47   3825308
weighted avg       0.70      0.74      0.65   3825308

ROC AUC: 0.6316111074255852


In [13]:

import optuna
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'tree_method': 'hist',
        'device': 'cuda',
        'objective': 'binary:logistic',
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_proba)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print("Best trial:")
print(study.best_trial)


[I 2025-04-08 22:56:48,218] A new study created in memory with name: no-name-50ea0176-2c69-47b9-aaac-cd47e8d2abd3
[I 2025-04-08 22:57:15,520] Trial 0 finished with value: 0.6379895365575217 and parameters: {'n_estimators': 266, 'max_depth': 9, 'learning_rate': 0.23018667419181088, 'subsample': 0.7813102489576821, 'colsample_bytree': 0.6917990175599067, 'gamma': 2.149582039236817, 'reg_alpha': 3.1101689142239923, 'reg_lambda': 4.880988037133747}. Best is trial 0 with value: 0.6379895365575217.
[I 2025-04-08 22:57:34,945] Trial 1 finished with value: 0.6270423321551828 and parameters: {'n_estimators': 96, 'max_depth': 7, 'learning_rate': 0.12771908282225813, 'subsample': 0.8150799419962194, 'colsample_bytree': 0.5849724048129109, 'gamma': 2.669794599484571, 'reg_alpha': 2.276593811701819, 'reg_lambda': 1.5088444062495503}. Best is trial 0 with value: 0.6379895365575217.
[I 2025-04-08 22:57:57,323] Trial 2 finished with value: 0.6352123567192655 and parameters: {'n_estimators': 160, 'max_

Best trial:
FrozenTrial(number=18, state=1, values=[0.6432776786970396], datetime_start=datetime.datetime(2025, 4, 8, 23, 4, 5, 681933), datetime_complete=datetime.datetime(2025, 4, 8, 23, 4, 38, 64639), params={'n_estimators': 268, 'max_depth': 10, 'learning_rate': 0.24033037535546375, 'subsample': 0.7634717600179476, 'colsample_bytree': 0.8777401074571476, 'gamma': 0.06116877269842491, 'reg_alpha': 0.5477930978584884, 'reg_lambda': 2.1852644618495285}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=300, log=False, low=50, step=1), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'learning_rate': FloatDistribution(high=0.3, log=False, low=0.01, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'reg_alpha': FloatDistribution(high

Now lets train the model with the "best" parameters

In [14]:
best_params = study.best_trial.params
best_params.update({
    'tree_method': 'hist',
    'device': 'cuda',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
})

model_optuna = XGBClassifier(**best_params)
model_optuna.fit(X_train, y_train)

y_pred = model_optuna.predict(X_test)
y_proba = model_optuna.predict_proba(X_test)[:, 1]

print("Classification Report (Optuna-tuned model):")
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


Classification Report (Optuna-tuned model):
              precision    recall  f1-score   support

           0       0.59      0.07      0.12   1014460
           1       0.74      0.98      0.85   2810848

    accuracy                           0.74   3825308
   macro avg       0.67      0.53      0.48   3825308
weighted avg       0.70      0.74      0.65   3825308

ROC AUC: 0.6432776786970396


Hmmm, not much better

Need to save the probs for a full roc graph

In [15]:
import pickle

with open('xg_30_probs.pkl', 'wb') as f:
    pickle.dump(y_proba, f)

## Run again for 14 days

In [16]:
df = pd.read_feather('mined_data.feather')

df.drop(['order_id', 'days_since_prior_order', 
         'order_number', 'order_dow', 'order_hour_of_day',
         'reordered', 'product_name', 'days_since_user_first_order'], axis=1, inplace=True)

df = df[df['days_since_user_ordered_product'] >= 0]

df['reordered_within_14_days'] = (df['days_since_user_ordered_product'] <= 14).astype(int)
df.drop(['days_since_user_ordered_product'], axis=1, inplace=True)

X = df.drop('reordered_within_14_days', axis=1)
y = df['reordered_within_14_days']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=62)


In [17]:

# Train model again for 14-day target
xg = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    device='cuda'
)
xg.fit(X_train, y_train)

y_pred = xg.predict(X_test)
y_proba = xg.predict_proba(X_test)[:, 1]

print("Classification Report (14-day target):")
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


Classification Report (14-day target):
              precision    recall  f1-score   support

           0       0.59      0.71      0.65   2046126
           1       0.57      0.44      0.50   1779182

    accuracy                           0.59   3825308
   macro avg       0.58      0.58      0.57   3825308
weighted avg       0.58      0.59      0.58   3825308

ROC AUC: 0.6152376870395381


In [18]:
with open('xg_14_probs.pkl', 'wb') as f:
    pickle.dump(y_proba, f)