# Smart Retail ML Model

### Import modules

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
# Preprocessing Libraries
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Pipelines
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LinearRegression


# Model Evaluation Libraries
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV



### Load dataset and check size

In [40]:
# Get Raw GitHub URL
DATA_URL = (
    "https://raw.githubusercontent.com/"
    "msselizabeth/smart-ml-retail-model/"
    "development/data/raw/retail_sales_dataset.csv"
)

# Alternative way to get the dataset
DATA_URL = "../data/raw/retail_sales_dataset.csv"

# Read data and inspect data types
retail_data = pd.read_csv(
    DATA_URL)

# Remove duplicates if any
retail_data = retail_data.drop_duplicates()

print(retail_data.shape)
print(retail_data.dtypes)

(1000, 9)
Transaction ID       int64
Date                object
Customer ID         object
Gender              object
Age                  int64
Product Category    object
Quantity             int64
Price per Unit       int64
Total Amount         int64
dtype: object


###Convert and normalize the data

In [41]:
retail_data['Date'] = pd.to_datetime(
    retail_data['Date'], 
    errors='raise'
)

print("After: ", retail_data['Date'].dtype)  # datetime64[ns]

After:  datetime64[ns]


In [42]:
# Print data info
retail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Transaction ID    1000 non-null   int64         
 1   Date              1000 non-null   datetime64[ns]
 2   Customer ID       1000 non-null   object        
 3   Gender            1000 non-null   object        
 4   Age               1000 non-null   int64         
 5   Product Category  1000 non-null   object        
 6   Quantity          1000 non-null   int64         
 7   Price per Unit    1000 non-null   int64         
 8   Total Amount      1000 non-null   int64         
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 70.4+ KB


In [43]:
# Normalize date and set as index
retail_data['Date'] = retail_data['Date'].dt.normalize()

# Set the date as index for time-series analysis
retail_data.set_index('Date', inplace=True)

# Quick checks
print("Columns:", retail_data.columns)
print("Index name:", retail_data.index.name, "dtype:", retail_data.index.dtype)


Columns: Index(['Transaction ID', 'Customer ID', 'Gender', 'Age', 'Product Category',
       'Quantity', 'Price per Unit', 'Total Amount'],
      dtype='object')
Index name: Date dtype: datetime64[ns]


In [44]:
# Dates of January 2024 transactions:
jan24_retail_data = retail_data.loc['2024-01']

# To just see the unique dates that occur:
unique_dates = jan24_retail_data.index.date
print(unique_dates)

[datetime.date(2024, 1, 1) datetime.date(2024, 1, 1)]


In [45]:
# Remove all rows where the index month is Jan 2024 (because it;s only few of t24)
retail_data = retail_data[~(retail_data.index.to_period('M') == '2024-01')]

# Quick check
print("New date range:", retail_data.index.min().date(), "to", retail_data.index.max().date())

New date range: 2023-01-01 to 2023-12-31


### Clean ans Analyze the data

In [46]:
# Strip whitespace and rename all columns to snake_case
retail_data.columns = (
    retail_data.columns
      .str.strip()                             
      .str.lower()                             # lowercase all characters
      .str.replace(r'[^0-9a-z]+', '_', regex=True)  # non-alphanumerics → underscore
      .str.replace(r'__+', '_', regex=True)        # collapse repeated underscores
      .str.strip('_')                          # remove leading/trailing underscores
)

# Convert categorical columns to 'category' dtype
for col in retail_data.select_dtypes(include=['object']).columns:
    retail_data[col] = retail_data[col].astype('category')

In [47]:
# Inspect the first few rows of the dataset
print(retail_data.head())

            transaction_id customer_id  gender  age product_category  \
Date                                                                   
2023-11-24               1     CUST001    Male   34           Beauty   
2023-02-27               2     CUST002  Female   26         Clothing   
2023-01-13               3     CUST003    Male   50      Electronics   
2023-05-21               4     CUST004    Male   37         Clothing   
2023-05-06               5     CUST005    Male   30           Beauty   

            quantity  price_per_unit  total_amount  
Date                                                
2023-11-24         3              50           150  
2023-02-27         2             500          1000  
2023-01-13         1              30            30  
2023-05-21         1             500           500  
2023-05-06         2              50           100  


In [48]:
## Inspect shape, missing values, and duplicates and more
# 1. Shape
print(f"Shape of the dataset: {retail_data.shape}")

# 2. Missing values
print(f"Missing values in the dataset:\n{retail_data.isnull().sum()}")

# 3. Duplicates
print(f"Number of duplicate rows: {retail_data.duplicated().sum()}")

# 4. Negative or zero checks
print((retail_data['quantity'] <= 0).sum(), 'bad quantities')
print((retail_data['price_per_unit'] <= 0).sum(), 'bad prices')

# 5. Age range
min_age = retail_data['age'].min()
max_age = retail_data['age'].max()
print(f"Age range: {min_age} to {max_age}")

Shape of the dataset: (998, 8)
Missing values in the dataset:
transaction_id      0
customer_id         0
gender              0
age                 0
product_category    0
quantity            0
price_per_unit      0
total_amount        0
dtype: int64
Number of duplicate rows: 0
0 bad quantities
0 bad prices
Age range: 18 to 64


In [49]:
# Initial transaction count
# 6. Transaction count by gender
gender_counts = retail_data['gender'].value_counts()
print("Transaction counts by gender:")
print(gender_counts)

# 7. Transaction count by product category
category_counts = retail_data['product_category'].value_counts()
print("Transaction counts by product category:")
print(category_counts)

# 8. Transaction count by age group
age_counts = retail_data['age'].value_counts()
print("Transaction counts by age:")
print(age_counts)

Transaction counts by gender:
gender
Female    510
Male      488
Name: count, dtype: int64
Transaction counts by product category:
product_category
Clothing       351
Electronics    341
Beauty         306
Name: count, dtype: int64
Transaction counts by age:
age
43    31
64    31
57    30
51    30
34    28
54    28
22    27
62    27
47    26
46    25
42    25
23    24
40    24
50    23
27    23
52    22
60    22
30    22
26    22
31    22
35    22
28    21
41    21
18    21
20    21
19    21
21    20
25    20
55    20
49    19
56    19
32    19
38    19
61    18
39    18
48    18
63    17
45    17
59    17
53    17
29    16
37    16
36    15
44    15
24    15
58    14
33    10
Name: count, dtype: int64


In [50]:
# Checking if necessary to optimize transaction IDs and customer IDs
# Check for unique values in ID columns
n = len(retail_data)
print(f"Unique Transaction IDs: {retail_data['transaction_id'].nunique()} out of {n}")
print(f"Unique Customer IDs: {retail_data['customer_id'].nunique()} out of {n}")


Unique Transaction IDs: 998 out of 998
Unique Customer IDs: 998 out of 998


In [51]:
# Group by Generation
# Define the 2023 generation cut-points
gen_bins  = [18, 26, 42, 57, 77]               
gen_labels = ['Gen Z (18–26)', 
              'Millennials (27–42)', 
              'Gen X (43–57)', 
              'Baby Boomers (58–77)']

# Create a new Generation column
retail_data['generation'] = pd.cut(
    retail_data['age'],
    bins=gen_bins,
    labels=gen_labels,
    right=True,   
    include_lowest=True
)

# Now see the revenue by those cohorts
gen_rev = retail_data.groupby('generation')['total_amount'].sum()
print(gen_rev)

generation
Gen Z (18–26)            98530
Millennials (27–42)     143015
Gen X (43–57)           151245
Baby Boomers (58–77)     61680
Name: total_amount, dtype: int64


  gen_rev = retail_data.groupby('generation')['total_amount'].sum()


In [52]:
from pandas.api.types import CategoricalDtype
## Transformations

# Engineer date features
retail_data['month'] = retail_data.index.month
retail_data['day'] = retail_data.index.day
retail_data['day_of_week'] = retail_data.index.dayofweek
retail_data['is_weekend'] = retail_data['day_of_week'].isin([5, 6])  # Saturday=5, Sunday=6

# Map months to seasons
month_to_season = {
    1: 'winter',  2: 'winter',  3: 'spring',
    4: 'spring',  5: 'spring',  6: 'summer',
    7: 'summer',  8: 'summer',  9: 'fall',
    10: 'fall',   11: 'fall',   12: 'winter'
}
retail_data['season'] = retail_data.index.month.map(month_to_season)

# 4. Make Season an ordered categorical
season_cat = CategoricalDtype(['winter','spring','summer','fall'], ordered=True)
retail_data['season'] = retail_data['season'].astype(season_cat)

# 5. View table
print(retail_data.head())

            transaction_id customer_id  gender  age product_category  \
Date                                                                   
2023-11-24               1     CUST001    Male   34           Beauty   
2023-02-27               2     CUST002  Female   26         Clothing   
2023-01-13               3     CUST003    Male   50      Electronics   
2023-05-21               4     CUST004    Male   37         Clothing   
2023-05-06               5     CUST005    Male   30           Beauty   

            quantity  price_per_unit  total_amount           generation  \
Date                                                                      
2023-11-24         3              50           150  Millennials (27–42)   
2023-02-27         2             500          1000        Gen Z (18–26)   
2023-01-13         1              30            30        Gen X (43–57)   
2023-05-21         1             500           500  Millennials (27–42)   
2023-05-06         2              50         

In [53]:
# Check new dtypes
dtypes = retail_data.dtypes
print(dtypes)   

transaction_id         int64
customer_id         category
gender              category
age                    int64
product_category    category
quantity               int64
price_per_unit         int64
total_amount           int64
generation          category
month                  int32
day                    int32
day_of_week            int32
is_weekend              bool
season              category
dtype: object


In [54]:
from datetime import date

# 1. Define 2023 Canadian statutory holidays
canada_holidays_2023 = {
    date(2023, 1, 1):  "New Year’s Day",
    date(2023, 2, 20): "Family Day",
    date(2023, 4, 7):  "Good Friday",
    date(2023, 5, 22): "Victoria Day",
    date(2023, 7, 1):  "Canada Day",
    date(2023, 8, 7):  "Civic Holiday",
    date(2023, 9, 4):  "Labour Day",
    date(2023, 10, 9): "Thanksgiving",
    date(2023, 12, 25):"Christmas Day",
    date(2023, 12, 26):"Boxing Day"
}

# 2. Map the index dates to holiday names
retail_data['holiday'] = (
    pd.Series(retail_data.index.date, index=retail_data.index)
      .map(canada_holidays_2023)
)
# 3. Peek at the flagged dates
holiday_retail_data = retail_data[retail_data['holiday'].notna()]
print(holiday_retail_data[['holiday']].drop_duplicates())

                   holiday
Date                      
2023-12-26      Boxing Day
2023-07-01      Canada Day
2023-01-01  New Year’s Day
2023-10-09    Thanksgiving
2023-02-20      Family Day
2023-09-04      Labour Day
2023-05-22    Victoria Day
2023-08-07   Civic Holiday
2023-12-25   Christmas Day


In [55]:
# Convert Holiday to categorical
retail_data['holiday'] = retail_data['holiday'].astype('category')# Filter to holiday transactions
retail_data_hol = retail_data[retail_data['holiday'].notna()]

In [56]:
retail_data.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 998 entries, 2023-11-24 to 2023-04-12
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   transaction_id    998 non-null    int64   
 1   customer_id       998 non-null    category
 2   gender            998 non-null    category
 3   age               998 non-null    int64   
 4   product_category  998 non-null    category
 5   quantity          998 non-null    int64   
 6   price_per_unit    998 non-null    int64   
 7   total_amount      998 non-null    int64   
 8   generation        998 non-null    category
 9   month             998 non-null    int32   
 10  day               998 non-null    int32   
 11  day_of_week       998 non-null    int32   
 12  is_weekend        998 non-null    bool    
 13  season            998 non-null    category
 14  holiday           27 non-null     category
dtypes: bool(1), category(6), int32(3), int64(5)
memory usag

In [57]:
# Encode gender: Assume 'Male' = 1, 'Female' = 0
retail_data['gender_enc'] = retail_data['gender'].map({'Female': 0, 'Male': 1})

# Encode is_weekend: Already boolean — convert to int (True = 1, False = 0)
retail_data['is_weekend_enc'] = retail_data['is_weekend'].astype(int)

# Preview
print(retail_data[['gender', 'gender_enc', 'is_weekend', 'is_weekend_enc']].head())

            gender gender_enc  is_weekend  is_weekend_enc
Date                                                     
2023-11-24    Male          1       False               0
2023-02-27  Female          0       False               0
2023-01-13    Male          1       False               0
2023-05-21    Male          1        True               1
2023-05-06    Male          1        True               1


In [58]:
retail_data['gender_enc'] = retail_data['gender'].map({'Female': 0, 'Male': 1}).astype(int)
retail_data['is_weekend_enc'] = retail_data['is_weekend'].astype(int)

In [59]:
retail_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 998 entries, 2023-11-24 to 2023-04-12
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   transaction_id    998 non-null    int64   
 1   customer_id       998 non-null    category
 2   gender            998 non-null    category
 3   age               998 non-null    int64   
 4   product_category  998 non-null    category
 5   quantity          998 non-null    int64   
 6   price_per_unit    998 non-null    int64   
 7   total_amount      998 non-null    int64   
 8   generation        998 non-null    category
 9   month             998 non-null    int32   
 10  day               998 non-null    int32   
 11  day_of_week       998 non-null    int32   
 12  is_weekend        998 non-null    bool    
 13  season            998 non-null    category
 14  holiday           27 non-null     category
 15  gender_enc        998 non-null    int64   
 16  is_week

In [60]:
retail_data.describe()

Unnamed: 0,transaction_id,age,quantity,price_per_unit,total_amount,month,day,day_of_week,gender_enc,is_weekend_enc
count,998.0,998.0,998.0,998.0,998.0,998.0,998.0,998.0,998.0,998.0
mean,500.640281,41.377756,2.51503,179.719439,455.380762,6.56012,15.338677,2.963928,0.488978,0.288577
std,288.924735,13.688342,1.132751,189.541154,559.420064,3.447252,8.857363,2.018645,0.500129,0.453328
min,1.0,18.0,1.0,25.0,25.0,1.0,1.0,0.0,0.0,0.0
25%,251.25,29.0,1.0,30.0,60.0,4.0,7.0,1.0,0.0,0.0
50%,500.5,42.0,3.0,50.0,135.0,6.0,15.0,3.0,0.0,0.0
75%,750.75,53.0,4.0,300.0,900.0,10.0,23.0,5.0,1.0,1.0
max,1000.0,64.0,4.0,500.0,2000.0,12.0,31.0,6.0,1.0,1.0


In [61]:
retail_data.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 998 entries, 2023-11-24 to 2023-04-12
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   transaction_id    998 non-null    int64   
 1   customer_id       998 non-null    category
 2   gender            998 non-null    category
 3   age               998 non-null    int64   
 4   product_category  998 non-null    category
 5   quantity          998 non-null    int64   
 6   price_per_unit    998 non-null    int64   
 7   total_amount      998 non-null    int64   
 8   generation        998 non-null    category
 9   month             998 non-null    int32   
 10  day               998 non-null    int32   
 11  day_of_week       998 non-null    int32   
 12  is_weekend        998 non-null    bool    
 13  season            998 non-null    category
 14  holiday           27 non-null     category
 15  gender_enc        998 non-null    int64   
 16  is_week

In [62]:
retail_data

Unnamed: 0_level_0,transaction_id,customer_id,gender,age,product_category,quantity,price_per_unit,total_amount,generation,month,day,day_of_week,is_weekend,season,holiday,gender_enc,is_weekend_enc
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2023-11-24,1,CUST001,Male,34,Beauty,3,50,150,Millennials (27–42),11,24,4,False,fall,,1,0
2023-02-27,2,CUST002,Female,26,Clothing,2,500,1000,Gen Z (18–26),2,27,0,False,winter,,0,0
2023-01-13,3,CUST003,Male,50,Electronics,1,30,30,Gen X (43–57),1,13,4,False,winter,,1,0
2023-05-21,4,CUST004,Male,37,Clothing,1,500,500,Millennials (27–42),5,21,6,True,spring,,1,1
2023-05-06,5,CUST005,Male,30,Beauty,2,50,100,Millennials (27–42),5,6,5,True,spring,,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-16,996,CUST996,Male,62,Clothing,1,50,50,Baby Boomers (58–77),5,16,1,False,spring,,1,0
2023-11-17,997,CUST997,Male,52,Beauty,3,30,90,Gen X (43–57),11,17,4,False,fall,,1,0
2023-10-29,998,CUST998,Female,23,Beauty,4,25,100,Gen Z (18–26),10,29,6,True,fall,,0,1
2023-12-05,999,CUST999,Female,36,Electronics,3,50,150,Millennials (27–42),12,5,1,False,winter,,0,0


# Feature Construction

In [63]:
# Drop attributes
X = retail_data.drop(columns=['transaction_id','customer_id','age','gender','month','day','day_of_week','total_amount', 'price_per_unit'])
y = retail_data['total_amount']

print(X.columns)
# Identify categorical features
categorical_features = [
    'product_category',
    'generation',
    'holiday',
    'season'
]

# Identify numerical features
numerical_features = [col for col in X.columns if col not in categorical_features and col != 'total_amount']

# Quick sanity-check
print("Categorical:", categorical_features)
print("Numerical:  ", numerical_features)

Index(['product_category', 'quantity', 'generation', 'is_weekend', 'season',
       'holiday', 'gender_enc', 'is_weekend_enc'],
      dtype='object')
Categorical: ['product_category', 'generation', 'holiday', 'season']
Numerical:   ['quantity', 'is_weekend', 'gender_enc', 'is_weekend_enc']


In [64]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train

Unnamed: 0_level_0,product_category,quantity,generation,is_weekend,season,holiday,gender_enc,is_weekend_enc
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-01-11,Clothing,3,Millennials (27–42),False,winter,,0,0
2023-01-26,Clothing,4,Millennials (27–42),False,winter,,1,0
2023-04-02,Clothing,4,Millennials (27–42),True,spring,,1,1
2023-10-19,Clothing,2,Gen X (43–57),False,fall,,1,0
2023-07-24,Beauty,2,Gen Z (18–26),False,summer,,1,0
...,...,...,...,...,...,...,...,...
2023-02-03,Clothing,4,Gen Z (18–26),False,winter,,0,0
2023-02-25,Electronics,2,Baby Boomers (58–77),True,winter,,0,1
2023-04-24,Electronics,2,Millennials (27–42),False,spring,,0,0
2023-10-07,Electronics,4,Millennials (27–42),True,fall,,0,1


In [65]:
# Recreate list of updated numerical features
numerical_features = X_train.drop(columns=categorical_features).columns.tolist()

# ColumnTransformer 
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [66]:
# Transform the target variable Y using log transformation
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)


### Model: Linear Regression

#### Build Pipeline

In [67]:

pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', LinearRegression())
])


#### Cross Validation Score

In [68]:
# Cross Validation
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
print(f"Cross-validated R2 scores: {scores}")
print(f"Mean R2 score: {scores.mean():.4f}")

Cross-validated R2 scores: [-0.04608595  0.05052997  0.17974203  0.10965093  0.18899587]
Mean R2 score: 0.0966


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [69]:
#Fit on the log-transformed training data
pipeline.fit(X_train, y_train_log)

# Predict on the log-transformed test set
y_pred_log = pipeline.predict(X_test)

# Reverse log-transform to compare on original scale
y_pred = np.expm1(y_pred_log)

# Evaluate performance
mae= mean_absolute_error(y_test, y_pred)
rmse= np.sqrt(mean_squared_error(y_test, y_pred))

# Store the results
final_metrics_d = {
    'Model': '(Linear Regression)',
    'MAE': mae,
    'RMSE': rmse
}

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

MAE: 413.78
RMSE: 575.20


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


### Model: XGBoost

#### Build Pipeline

In [70]:
from xgboost import XGBRegressor

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42))
])


#### Cross Validation Score

In [71]:
# Cross Validation
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
print(f"Cross-validated R2 scores: {scores}")
print(f"Mean R2 score: {scores.mean():.4f}")

Cross-validated R2 scores: [-0.10188866  0.03687704  0.07714683  0.10064703  0.15521997]
Mean R2 score: 0.0536


#### Data Tranining 

In [72]:
#Fit on the log-transformed training data
pipeline.fit(X_train, y_train_log)

# Predict on the log-transformed test set
y_pred_log = pipeline.predict(X_test)

# Reverse log-transform to compare on original scale
y_pred = np.expm1(y_pred_log)

# Evaluate performance
mae= mean_absolute_error(y_test, y_pred)
rmse= np.sqrt(mean_squared_error(y_test, y_pred))

# Store the results
final_metrics_d = {
    'Model': '(Linear Regression)',
    'MAE': mae,
    'RMSE': rmse
}

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

MAE: 409.39
RMSE: 578.61


#### Grid Search

In [73]:
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [3, 5, 7],
    'regressor__learning_rate': [0.05, 0.1, 0.2],
    'regressor__subsample': [0.8, 1.0]
}

# GridSearchCV setting
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    verbose=1,
    n_jobs=-1 
)


grid_search.fit(X_train, y_train_log)

print("Best parameters found:", grid_search.best_params_)
print("Best RMSE (CV):", grid_search.best_score_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters found: {'regressor__learning_rate': 0.05, 'regressor__max_depth': 3, 'regressor__n_estimators': 100, 'regressor__subsample': 1.0}
Best RMSE (CV): 0.13774417808603648
