# Testing code for using bayesian hyperparameter optimisation vs grid search
---

In [82]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Modelling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score

import lightgbm as lgb

# Hyperparameter tuning
from hyperopt import fmin, rand, tpe, space_eval, STATUS_OK, Trials, hp
from hyperopt.pyll.stochastic import sample
# from bayes_opt import BayesianOptimization
# import optuna

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/retail-transaction-dataset/Retail_Transaction_Dataset.csv


I will be creating a simple model to predict whether a customer would shop multiple times in a month, and using bayesian hyperparameter optimisation and display improvements of the model using this.

For simplicity, I will be using the previous month to predict the following month

In [83]:
df = pd.read_csv("/kaggle/input/retail-transaction-dataset/Retail_Transaction_Dataset.csv")
display(df)

Unnamed: 0,CustomerID,ProductID,Quantity,Price,TransactionDate,PaymentMethod,StoreLocation,ProductCategory,DiscountApplied(%),TotalAmount
0,109318,C,7,80.079844,12/26/2023 12:32,Cash,"176 Andrew Cliffs\nBaileyfort, HI 93354",Books,18.677100,455.862764
1,993229,C,4,75.195229,8/5/2023 0:00,Cash,"11635 William Well Suite 809\nEast Kara, MT 19483",Home Decor,14.121365,258.306546
2,579675,A,8,31.528816,3/11/2024 18:51,Cash,"910 Mendez Ville Suite 909\nPort Lauraland, MO...",Books,15.943701,212.015651
3,799826,D,5,98.880218,10/27/2023 22:00,PayPal,"87522 Sharon Corners Suite 500\nLake Tammy, MO...",Books,6.686337,461.343769
4,121413,A,7,93.188512,12/22/2023 11:38,Cash,"0070 Michelle Island Suite 143\nHoland, VA 80142",Electronics,4.030096,626.030484
...,...,...,...,...,...,...,...,...,...,...
99995,726461,A,2,56.078258,7/17/2023 16:59,Credit Card,"3632 Darren Station Apt. 553\nEricaborough, RI...",Clothing,18.345145,91.581240
99996,328056,A,6,88.516406,5/30/2023 9:04,Credit Card,"821 Taylor Shoals\nEvansville, IL 70845",Electronics,3.995541,509.878179
99997,887304,B,4,72.385564,8/25/2023 7:59,Credit Card,"50653 Kara Lakes\nStephanieborough, RI 94492",Clothing,17.423979,239.092472
99998,326401,C,5,66.542239,2/5/2024 19:45,PayPal,"18756 Mcfarland Way Suite 866\nBarnettside, PR...",Electronics,14.345018,284.983717


### **EDA**
---

In [84]:
#checking for na values
df.isnull().sum(axis = 0)

CustomerID            0
ProductID             0
Quantity              0
Price                 0
TransactionDate       0
PaymentMethod         0
StoreLocation         0
ProductCategory       0
DiscountApplied(%)    0
TotalAmount           0
dtype: int64

In [85]:
#checking data types
df.dtypes

CustomerID              int64
ProductID              object
Quantity                int64
Price                 float64
TransactionDate        object
PaymentMethod          object
StoreLocation          object
ProductCategory        object
DiscountApplied(%)    float64
TotalAmount           float64
dtype: object

In [86]:
#converting transaction date to datetime format.
df["TransactionDate"] = pd.to_datetime(df["TransactionDate"])

In [87]:
df["TransactionDate"].dtype

dtype('<M8[ns]')

In [88]:
#checking min and max transaction date
print(f"max date: {df['TransactionDate'].max()}")
print(f"min date: {df['TransactionDate'].min()}")

max date: 2024-04-28 22:22:00
min date: 2023-04-29 22:27:00


### **Feature Engineering**
---

Grouping data for each month for each customer. Features to engineer include:
* Previous 2 months average spend
* Spend trend over previous 2 months (up or down)
* Average quantity of transactions over previous 2 months

In [89]:
#Grouping all data to month level

# creating a month field
df['Month_Year'] = (df['TransactionDate'].dt.strftime('%m') + "/" + df['TransactionDate'].dt.strftime('%Y'))
display(df['Month_Year'])

0        12/2023
1        08/2023
2        03/2024
3        10/2023
4        12/2023
          ...   
99995    07/2023
99996    05/2023
99997    08/2023
99998    02/2024
99999    02/2024
Name: Month_Year, Length: 100000, dtype: object

In [90]:
from datetime import datetime
# Generate all month/year combinations between April 2023 and April 2024
start_date = datetime.strptime('04/2023', '%m/%Y')
end_date = datetime.strptime('04/2024', '%m/%Y')
date_range = pd.date_range(start=start_date, end=end_date, freq='MS').strftime('%b_%Y').tolist()

# Get unique CustomerIDs
customer_ids = df['CustomerID'].unique()

# Create a DataFrame with all combinations of CustomerID and Month_Year
all_combinations = pd.MultiIndex.from_product([customer_ids, date_range], names=['CustomerID', 'Month_Year']).to_frame(index=False)

# Merge with the original DataFrame to find missing combinations
merged_df = pd.merge(all_combinations, df, on=['CustomerID', 'Month_Year'], how='left')

# Find rows that are missing in the DataFrame
missing_rows = merged_df[merged_df.isnull().any(axis=1)]

# Fill missing values with zeros except for CustomerID and Month_Year
missing_rows.fillna({'ProductID': 0, 'Quantity': 0, 'Price': 0, 'TransactionDate': '', 'TotalAmount': 0}, inplace=True)

# Append the missing rows to the original DataFrame
final_df = pd.concat([df, missing_rows], ignore_index=True)

display(final_df)


  missing_rows.fillna({'ProductID': 0, 'Quantity': 0, 'Price': 0, 'TransactionDate': '', 'TotalAmount': 0}, inplace=True)
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,CustomerID,ProductID,Quantity,Price,TransactionDate,PaymentMethod,StoreLocation,ProductCategory,DiscountApplied(%),TotalAmount,Month_Year
0,109318,C,7.0,80.079844,2023-12-26 12:32:00,Cash,"176 Andrew Cliffs\nBaileyfort, HI 93354",Books,18.677100,455.862764,12/2023
1,993229,C,4.0,75.195229,2023-08-05 00:00:00,Cash,"11635 William Well Suite 809\nEast Kara, MT 19483",Home Decor,14.121365,258.306546,08/2023
2,579675,A,8.0,31.528816,2024-03-11 18:51:00,Cash,"910 Mendez Ville Suite 909\nPort Lauraland, MO...",Books,15.943701,212.015651,03/2024
3,799826,D,5.0,98.880218,2023-10-27 22:00:00,PayPal,"87522 Sharon Corners Suite 500\nLake Tammy, MO...",Books,6.686337,461.343769,10/2023
4,121413,A,7.0,93.188512,2023-12-22 11:38:00,Cash,"0070 Michelle Island Suite 143\nHoland, VA 80142",Electronics,4.030096,626.030484,12/2023
...,...,...,...,...,...,...,...,...,...,...,...
1337790,771566,0,0.0,0.000000,NaT,,,,,0.000000,Dec_2023
1337791,771566,0,0.0,0.000000,NaT,,,,,0.000000,Jan_2024
1337792,771566,0,0.0,0.000000,NaT,,,,,0.000000,Feb_2024
1337793,771566,0,0.0,0.000000,NaT,,,,,0.000000,Mar_2024


In [91]:
# Group by CustomerID and Month_Year and aggregate other columns

aggregated_df = final_df.groupby(['CustomerID', 'Month_Year']).agg({
    'ProductID': pd.Series.nunique,
    'Quantity': 'sum',
    'TransactionDate': pd.Series.nunique,
    'TotalAmount': ['sum', 'mean']
}).reset_index()


# Rename columns 
aggregated_df.columns = ['CustomerID', 'Month_Year', 'DistinctProductCount', 'TotalQuantity', 'DistinctTransactionCount', 'TotalAmountSum', 'TotalAmountMean']

display(aggregated_df)

Unnamed: 0,CustomerID,Month_Year,DistinctProductCount,TotalQuantity,DistinctTransactionCount,TotalAmountSum,TotalAmountMean
0,14,08/2023,1,5.0,1,256.232791,256.232791
1,14,Apr_2023,1,0.0,0,0.000000,0.000000
2,14,Apr_2024,1,0.0,0,0.000000,0.000000
3,14,Aug_2023,1,0.0,0,0.000000,0.000000
4,14,Dec_2023,1,0.0,0,0.000000,0.000000
...,...,...,...,...,...,...,...
1337386,999997,Mar_2024,1,0.0,0,0.000000,0.000000
1337387,999997,May_2023,1,0.0,0,0.000000,0.000000
1337388,999997,Nov_2023,1,0.0,0,0.000000,0.000000
1337389,999997,Oct_2023,1,0.0,0,0.000000,0.000000


In [92]:
#trimming dataset 
trimmed_df = aggregated_df[aggregated_df["Month_Year"].isin(["05/2023","06/2023","07/2023","08/2023","09/2023","10/2023"])]
display(trimmed_df)

Unnamed: 0,CustomerID,Month_Year,DistinctProductCount,TotalQuantity,DistinctTransactionCount,TotalAmountSum,TotalAmountMean
0,14,08/2023,1,5.0,1,256.232791,256.232791
14,42,05/2023,1,7.0,1,502.656523,502.656523
28,49,06/2023,1,1.0,1,21.399047,21.399047
43,59,08/2023,1,4.0,1,139.612036,139.612036
57,65,06/2023,1,8.0,1,548.006625,548.006625
...,...,...,...,...,...,...,...
1337321,999910,10/2023,1,1.0,1,12.441084,12.441084
1337335,999931,09/2023,1,8.0,1,105.039745,105.039745
1337349,999977,07/2023,1,1.0,1,71.135444,71.135444
1337363,999990,08/2023,1,9.0,1,195.449291,195.449291


In [93]:
trimmed_df["DistinctTransactionCount"].value_counts()

DistinctTransactionCount
1    49829
2      200
3        1
Name: count, dtype: int64

In [94]:
#dropping duplicates
trimmed_df = trimmed_df.drop_duplicates(["CustomerID","Month_Year"])

In [95]:
display(trimmed_df[trimmed_df["TotalAmountSum"] == 0])

Unnamed: 0,CustomerID,Month_Year,DistinctProductCount,TotalQuantity,DistinctTransactionCount,TotalAmountSum,TotalAmountMean


In [96]:
trimmed_df["shopped_many"] = (
    ((trimmed_df["TotalQuantity"] > 1) &
     (trimmed_df["Month_Year"].isin(["06_2023", "08_2023","10/2023"])))
    .astype(int)
)

In [97]:
# Ensure Month_Year is in datetime format
trimmed_df["Month_Year"] = pd.to_datetime(trimmed_df["Month_Year"], format="%m/%Y")

# Filter for the base months
base_months = ["05/2023", "07/2023", "09/2023"]
base_months_dt = pd.to_datetime(base_months, format="%m/%Y")
combined_df = trimmed_df[trimmed_df["Month_Year"].isin(base_months_dt)].copy()

# Initialize shopped_many to 0
combined_df["shopped_many"] = 0

# Loop through each base month and update shopped_many from the next month
for month in base_months_dt:
    next_month = (month + pd.DateOffset(months=1))
    mask = (trimmed_df["Month_Year"] == next_month) & (trimmed_df["shopped_many"] == 1)
    customers = trimmed_df.loc[mask, "CustomerID"]
    combined_df.loc[combined_df["CustomerID"].isin(customers) & (combined_df["Month_Year"] == month), "shopped_many"] = 1

# Convert Month_Year back to string if needed
combined_df["Month_Year"] = combined_df["Month_Year"].dt.strftime("%m/%Y")

display(combined_df)

Unnamed: 0,CustomerID,Month_Year,DistinctProductCount,TotalQuantity,DistinctTransactionCount,TotalAmountSum,TotalAmountMean,shopped_many
14,42,05/2023,1,7.0,1,502.656523,502.656523,0
113,100,05/2023,1,8.0,1,710.062576,710.062576,0
127,101,07/2023,1,9.0,1,364.974295,364.974295,0
155,165,05/2023,1,4.0,1,181.983935,181.983935,0
239,223,07/2023,1,6.0,1,155.704311,155.704311,0
...,...,...,...,...,...,...,...,...
1337237,999855,05/2023,1,7.0,1,131.785289,131.785289,0
1337279,999897,07/2023,1,5.0,1,87.669918,87.669918,0
1337307,999902,07/2023,1,3.0,1,120.460567,120.460567,0
1337335,999931,09/2023,1,8.0,1,105.039745,105.039745,0


In [98]:
combined_df["shopped_many"].value_counts()

shopped_many
0    25001
1       62
Name: count, dtype: int64

In [105]:
train = combined_df[combined_df["Month_Year"].isin(["05/2023","07_2023"])]
test_blind = combined_df[combined_df["Month_Year"].isin(["09_2023"])]

In [106]:
#splitting label from data. 
X = train[~train["shopped_many"]]
y = train["shopped_many"]

KeyError: "None of [Index([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n       ...\n       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n      dtype='int64', length=8333)] are in the [columns]"

### Model Creation
---
Using LightGBM to replicate example

In [None]:
#keeping a small amount of the training data for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=13)

In [None]:
# Create a LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)

In [None]:
#model creation and initial run
clf = lgb.train(params, train_data, num_boost_round=100)

### Bayesian Hyperparameter Optimisation
---

In [None]:
#defining objective function
def objective(params):
    
    # Convert to integers where expected
    params = {
        'max_depth': int(params['max_depth']),
        
        # Configuration parameters
        'force_col_wise': 'true',
        'verbose': -1
    }
    
    # LightBGM classifier
    clf = lgb.train(params, train_data, num_boost_round=100)
       
    # Make predictions
    y_pred = clf.predict(X_test)
    
    # Calculate ROC AUC score
    roc_auc = roc_auc_score(y_test, y_pred)
    
    return {
        'loss': -roc_auc, 
        'status': STATUS_OK,
        'params': params
    }

In [None]:
# Full search space for final model
space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    'max_depth': hp.quniform('max_depth', 5, 15, 1),
    'n_estimators': hp.quniform('n_estimators', 5, 35, 1),
    'num_leaves': hp.quniform('num_leaves', 5, 50, 1),
    'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart']),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
}

# Small search space to visualise the fitting
space_small = {
    'max_depth': hp.quniform('max_depth', 2, 20, 1)
}

# Sample from space_small
max_depth_dist_small = []
for _ in range(1000):
    max_depth_dist_small.append(sample(space_small)['max_depth'])

In [107]:
# Create the algorithms
rand_algo = rand.suggest

# Create two trials objects
rand_trials = Trials()

In [108]:
# Create the algorithms
tpe_algo = tpe.suggest

# Create two trials objects
tpe_trials = Trials()

In [109]:
# Random Bayesian Optimisation
best = fmin(
    fn = objective,
    space = space_small,
    algo = rand_algo,
    max_evals = 100,
    trials = rand_trials
)

NameError: name 'objective' is not defined

In [110]:
# TPE Optimisation
best = fmin(
    fn = objective,
    space = space_small,
    algo = tpe_algo,
    max_evals = 100,
    trials = tpe_trials
)

NameError: name 'objective' is not defined

In [None]:
# View the results of the first trial
rand_trials.trials[0]['result']

In [None]:
# Number of evaluations
n_evals = len(rand_trials.trials)

# Create a dataframe of results
for i in range(n_evals):
    temp_dict = rand_trials.trials[i]['result']
    
    if i==0:
        df_results_rand = pd.DataFrame({
            'trial': [i],
            'max_depth': [temp_dict['params']['max_depth']],
            'loss': [-temp_dict['loss']]
        })
    else:
        df_temp = pd.DataFrame({
            'trial': [i],
            'max_depth': [temp_dict['params']['max_depth']],
            'loss': [-temp_dict['loss']]
        })
        
        df_results_rand = pd.concat([df_results_rand, df_temp], ignore_index=True, axis=0)
        
df_results_rand = df_results_rand.sort_values('max_depth')
print("Shape of results:\t",df_results_rand.shape)

In [None]:
# Number of evaluations
n_evals = len(tpe_trials.trials)

# Create a dataframe of results
for i in range(n_evals):
    temp_dict = tpe_trials.trials[i]['result']
    
    if i==0:
        df_results_tpe = pd.DataFrame({
            'trial': [i],
            'max_depth': [temp_dict['params']['max_depth']],
            'loss': [-temp_dict['loss']]
        })
    else:
        df_temp = pd.DataFrame({
            'trial': [i],
            'max_depth': [temp_dict['params']['max_depth']],
            'loss': [-temp_dict['loss']]
        })
        
        df_results_tpe = pd.concat([df_results_tpe, df_temp], ignore_index=True, axis=0)
        
df_results_tpe = df_results_tpe.sort_values('max_depth')
print("Shape of results:\t",df_results_tpe.shape)

### Grid Search
---