In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Fixed seed so the partitions below (and every metric computed from them)
# are identical after Restart Kernel -> Run All.
SEED = 42

df = pd.read_feather('mined_data.feather')

# Remove the columns that are not used as model inputs. Assigning the result
# (instead of inplace=True) keeps the lineage of `df` explicit.
df = df.drop(columns=[
    'user_id', 'order_id', 'days_since_prior_order', 'product_id',
    'order_number', 'order_dow', 'order_hour_of_day', 'reordered',
    'product_name', 'days_since_user_first_order',
])

# -1 was used to represent first orders of a product for a user
df = df[df['days_since_user_ordered_product'] >= 0]

# Hold out 20% of the rows for the final evaluation.
training_set, testing_set = train_test_split(df, test_size=0.2, random_state=SEED)

# The hyperparameter search works on a 10% sample of the training rows,
# itself split 80/20 into a fitting part and a validation part.
small_train, small_test = train_test_split(
    training_set.sample(frac=0.1, random_state=SEED),
    test_size=0.2,
    random_state=SEED,
)
df.columns

Index(['aisle_air_fresheners_candles', 'aisle_asian_foods',
       'aisle_baby_accessories', 'aisle_baby_bath_body_care',
       'aisle_baby_food_formula', 'aisle_bakery_desserts',
       'aisle_baking_ingredients', 'aisle_baking_supplies_decor',
       'aisle_beauty', 'aisle_beers_coolers',
       ...
       'department_international', 'department_meat_seafood',
       'department_missing', 'department_other', 'department_pantry',
       'department_personal_care', 'department_pets', 'department_produce',
       'department_snacks', 'days_since_user_ordered_product'],
      dtype='object', length=156)

In [2]:
import optuna
from sklearn.metrics import mean_squared_error
import xgboost as xgb

def objective(trial):
    """Optuna objective: train an XGBoost regressor and score it.

    The model is fitted on ``small_train`` and evaluated on ``small_test``
    (both are notebook-level DataFrames), using every column except
    ``days_since_user_ordered_product`` as features and that column as target.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        Mean squared error of the fitted model's predictions on ``small_test``.
    """
    target = 'days_since_user_ordered_product'

    # Hyperparameters sampled for this trial.
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 0.5),
        "lambda": trial.suggest_float("lambda", 1, 10),
    }
    # Settings shared by every trial.
    fixed = {
        "tree_method": "hist",
        "objective": "reg:squarederror",
        "device": "cuda",
        "eval_metric": "rmse",
    }

    fit_features = small_train.drop(columns=[target])
    fit_target = small_train[target]

    regressor = xgb.XGBRegressor(**sampled, **fixed)
    regressor.fit(fit_features, fit_target)

    val_features = small_test.drop(columns=[target])
    val_predictions = regressor.predict(val_features)
    return mean_squared_error(small_test[target], val_predictions)

# Pass an explicitly seeded sampler: without one, the sampler that Optuna
# creates is unseeded, so the sequence of proposed hyperparameters (and the
# reported best_params) would differ on every run.
study = optuna.create_study(
    study_name="XGBoost",
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Each trial returns the validation MSE, which the study minimises.
study.optimize(objective, n_trials=100)

print(study.best_params)


[I 2025-04-08 23:32:24,034] A new study created in memory with name: XGBoost
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
[I 2025-04-08 23:32:27,222] Trial 0 finished with value: 1309.5734216638527 and parameters: {'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.22563621178122772, 'subsample': 0.7960577512058409, 'colsample_bytree': 0.9720860194803735, 'gamma': 0.050112938754335645, 'lambda': 9.38767706377512}. Best is trial 0 with value: 1309.5734216638527.
[I 2025-04-08 23:32:30,390] Trial 1 finished with value: 1309.611088697999 and parameters: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.11459188516864219, 'subsample': 0.6071595868997088, 'colsample_bytree': 0.6985053336280125, 'gamma': 0.16883979385831005, 'lambda': 4.415654856867698}. Best is trial 0 with value: 1309.5734216638527.
[I 2025-04-08 23:32:38,464] Trial 2 finished wi

{'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.21328669214955887, 'subsample': 0.5612422229400509, 'colsample_bytree': 0.7154256892399239, 'gamma': 0.046961219171458304, 'lambda': 3.4532788992701575}


In [3]:
# Refit on the full training split using the best hyperparameters from the study.
# NOTE: despite its name, y_train_log holds the target column as-is; no log
# transform is applied in this cell.
y_train_log = training_set['days_since_user_ordered_product']

final_model = xgb.XGBRegressor(
    tree_method="hist",
    objective="reg:squarederror",
    device="cuda",
    eval_metric="rmse",
    **study.best_params,
)
final_model.fit(
    training_set.drop(columns=['days_since_user_ordered_product']),
    y_train_log,
)

In [4]:
# Mean squared error of the final model on the held-out test split.
y_pred = final_model.predict(
    testing_set.drop(columns=['days_since_user_ordered_product'])
)
score = mean_squared_error(
    y_true=testing_set['days_since_user_ordered_product'],
    y_pred=y_pred,
)
print(score)

1299.893176384557


In [5]:
# Predictions and ground-truth target for the test split, used by the metrics cell.
y_pred = final_model.predict(
    testing_set.drop(columns=['days_since_user_ordered_product'])
)
y = testing_set['days_since_user_ordered_product']

In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Regression metrics for the test-set predictions: each scorer is called as
# scorer(y, y_pred); RMSE is derived from the MSE.
mse, mae, r2 = (
    scorer(y, y_pred)
    for scorer in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)

print(f"MSE: {mse}, RMSE: {rmse}, MAE: {mae}, R²: {r2}")


MSE: 1299.893176384557, RMSE: 36.05403134719552, MAE: 22.946104065436796, R²: 0.03671450471042037
