In [None]:
# üì¶ Install required libraries
!pip install xgboost lightgbm catboost scikit-learn pandas numpy

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

# üìÇ Load dataset
df = pd.read_csv("Warehouse_and_Retail_Sales.csv")

# üßπ Basic cleaning
df = df.dropna(subset=["RETAIL SALES"])
df.fillna("Unknown", inplace=True)

# üéØ Target variable
y = df["RETAIL SALES"]

# üß† Features
X = df.drop(["RETAIL SALES", "ITEM DESCRIPTION", "ITEM CODE"], axis=1)

# üî¢ Encode categorical columns
cat_cols = X.select_dtypes(include="object").columns
encoder = LabelEncoder()
for col in cat_cols:
    X[col] = encoder.fit_transform(X[col])

# ‚úÇÔ∏è Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


codes with hyperparameter tuning bossting and bagging xgboost,adaboost, lightgbm, catboost, randomforest


In [2]:
from xgboost import XGBRegressor

# üîç Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 1]
}

xgb = XGBRegressor(random_state=42)
grid = GridSearchCV(xgb, param_grid, cv=3, scoring='r2', verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

best_xgb = grid.best_estimator_

# ‚úÖ Predictions
y_train_pred = best_xgb.predict(X_train)
y_test_pred = best_xgb.predict(X_test)

# üìà Metrics
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mse = mean_squared_error(y_test, y_test_pred)
mape = mean_absolute_percentage_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("‚úÖ XGBoost Best Params:", grid.best_params_)
print(f"Train R¬≤: {train_r2:.4f}")
print(f"Test R¬≤: {test_r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MSE: {mse:.4f}")
print(f"MAPE: {mape:.4f}")


Fitting 3 folds for each of 36 candidates, totalling 108 fits
‚úÖ XGBoost Best Params: {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.8}
Train R¬≤: 0.9279
Test R¬≤: 0.9156
RMSE: 8.5249
MSE: 72.6733
MAPE: 517896103571110.6250


In [5]:
from sklearn.ensemble import AdaBoostRegressor

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'loss': ['linear', 'square']
}

ada = AdaBoostRegressor(random_state=42)
grid_ada = GridSearchCV(ada, param_grid, cv=3, scoring='r2', verbose=1, n_jobs=-1)
grid_ada.fit(X_train, y_train)

best_ada = grid_ada.best_estimator_

y_train_pred = best_ada.predict(X_train)
y_test_pred = best_ada.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mse = mean_squared_error(y_test, y_test_pred)
mape = mean_absolute_percentage_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("‚úÖ AdaBoost Best Params:", grid_ada.best_params_)
print(f"Train R¬≤: {train_r2:.4f} | Test R¬≤: {test_r2:.4f}")
print(f"RMSE: {rmse:.4f} | MSE: {mse:.4f} | MAPE: {mape:.4f}")


Fitting 3 folds for each of 24 candidates, totalling 72 fits
‚úÖ AdaBoost Best Params: {'learning_rate': 0.01, 'loss': 'linear', 'n_estimators': 100}
Train R¬≤: 0.9114 | Test R¬≤: 0.9023
RMSE: 9.1722 | MSE: 84.1297 | MAPE: 3177733824673613.5000


In [10]:
from lightgbm import LGBMRegressor

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 50, 100],
    'max_depth': [-1, 10, 20],
    'subsample': [0.8, 1.0]
}

lgb = LGBMRegressor(random_state=42)
grid_lgb = GridSearchCV(lgb, param_grid, cv=3, scoring='r2', verbose=1, n_jobs=-1)
grid_lgb.fit(X_train, y_train)

best_lgb = grid_lgb.best_estimator_

y_train_pred = best_lgb.predict(X_train)
y_test_pred = best_lgb.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mse = mean_squared_error(y_test, y_test_pred)
mape = mean_absolute_percentage_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("‚úÖ LightGBM Best Params:", grid_lgb.best_params_)
print(f"Train R¬≤: {train_r2:.4f} | Test R¬≤: {test_r2:.4f}")
print(f"RMSE: {rmse:.4f} | MSE: {mse:.4f} | MAPE: {mape:.4f}")


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 742
[LightGBM] [Info] Number of data points in the train set: 246113, number of used features: 6
[LightGBM] [Info] Start training from score 7.035706
‚úÖ LightGBM Best Params: {'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 50, 'subsample': 0.8}
Train R¬≤: 0.9478 | Test R¬≤: 0.9443
RMSE: 6.9261 | MSE: 47.9714 | MAPE: 364064922005848.0625


In [11]:
from catboost import CatBoostRegressor

param_grid = {
    'iterations': [200, 300],
    'depth': [6, 8, 10],
    'learning_rate': [0.03, 0.05, 0.1],
    'l2_leaf_reg': [3, 5, 7]
}

cat = CatBoostRegressor(random_seed=42, verbose=0)
grid_cat = GridSearchCV(cat, param_grid, cv=3, scoring='r2', verbose=1, n_jobs=-1)
grid_cat.fit(X_train, y_train)

best_cat = grid_cat.best_estimator_

y_train_pred = best_cat.predict(X_train)
y_test_pred = best_cat.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mse = mean_squared_error(y_test, y_test_pred)
mape = mean_absolute_percentage_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("‚úÖ CatBoost Best Params:", grid_cat.best_params_)
print(f"Train R¬≤: {train_r2:.4f} | Test R¬≤: {test_r2:.4f}")
print(f"RMSE: {rmse:.4f} | MSE: {mse:.4f} | MAPE: {mape:.4f}")


Fitting 3 folds for each of 54 candidates, totalling 162 fits
‚úÖ CatBoost Best Params: {'depth': 10, 'iterations': 200, 'l2_leaf_reg': 3, 'learning_rate': 0.05}
Train R¬≤: 0.9534 | Test R¬≤: 0.9511
RMSE: 6.4910 | MSE: 42.1330 | MAPE: 514583310201579.1250


In [None]:
# ‚úÖ Random Forest Regression with Grid Search and Evaluation

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

# Example: Suppose you already have X and y (your features and target)
# Replace this with your actual dataset
# Example dummy data for structure only
# X = your_dataframe.drop('target_column', axis=1)
# y = your_dataframe['target_column']

# For demo (remove this part if you already have X and y)
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize Random Forest
rf = RandomForestRegressor(random_state=42)

# Grid Search with 3-fold CV
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    verbose=1,
    n_jobs=-1  # use all CPU cores
)

# Fit model
grid_rf.fit(X_train, y_train)

# Get the best model
best_rf = grid_rf.best_estimator_

# Predictions
y_train_pred = best_rf.predict(X_train)
y_test_pred = best_rf.predict(X_test)

# Evaluation metrics
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mse = mean_squared_error(y_test, y_test_pred)
mape = mean_absolute_percentage_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Print results
print("‚úÖ Random Forest Best Params:", grid_rf.best_params_)
print(f"Train R¬≤: {train_r2:.4f} | Test R¬≤: {test_r2:.4f}")
print(f"RMSE: {rmse:.4f} | MSE: {mse:.4f} | MAPE: {mape:.4f}")


Fitting 3 folds for each of 72 candidates, totalling 216 fits


below codes without hyperparameter tuning

In [6]:
from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
ada.fit(X_train, y_train)

y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mse = mean_squared_error(y_test, y_test_pred)
mape = mean_absolute_percentage_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("‚úÖ AdaBoost Regression Results")
print(f"Train R¬≤: {train_r2:.4f}")
print(f"Test R¬≤: {test_r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MSE: {mse:.4f}")
print(f"MAPE: {mape:.4f}")


‚úÖ AdaBoost Regression Results
Train R¬≤: 0.8044
Test R¬≤: 0.7942
RMSE: 13.3116
MSE: 177.2000
MAPE: 8205868267129934.0000


In [7]:
from lightgbm import LGBMRegressor

lgb = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    random_state=42
)
lgb.fit(X_train, y_train)

y_train_pred = lgb.predict(X_train)
y_test_pred = lgb.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mse = mean_squared_error(y_test, y_test_pred)
mape = mean_absolute_percentage_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("‚úÖ LightGBM Regression Results")
print(f"Train R¬≤: {train_r2:.4f}")
print(f"Test R¬≤: {test_r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MSE: {mse:.4f}")
print(f"MAPE: {mape:.4f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004496 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 742
[LightGBM] [Info] Number of data points in the train set: 246113, number of used features: 6
[LightGBM] [Info] Start training from score 7.035706
‚úÖ LightGBM Regression Results
Train R¬≤: 0.9505
Test R¬≤: 0.9474
RMSE: 6.7325
MSE: 45.3264
MAPE: 373045926976240.0625


In [8]:
from catboost import CatBoostRegressor

cat = CatBoostRegressor(
    iterations=300,
    learning_rate=0.05,
    depth=8,
    verbose=0,
    random_seed=42
)
cat.fit(X_train, y_train)

y_train_pred = cat.predict(X_train)
y_test_pred = cat.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mse = mean_squared_error(y_test, y_test_pred)
mape = mean_absolute_percentage_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("‚úÖ CatBoost Regression Results")
print(f"Train R¬≤: {train_r2:.4f}")
print(f"Test R¬≤: {test_r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MSE: {mse:.4f}")
print(f"MAPE: {mape:.4f}")


‚úÖ CatBoost Regression Results
Train R¬≤: 0.9672
Test R¬≤: 0.9648
RMSE: 5.5016
MSE: 30.2681
MAPE: 432373476117985.6250


In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mse = mean_squared_error(y_test, y_test_pred)
mape = mean_absolute_percentage_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
 
print("‚úÖ Random Forest Regression Results")
print(f"Train R¬≤: {train_r2:.4f}")
print(f"Test R¬≤: {test_r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MSE: {mse:.4f}")
print(f"MAPE: {mape:.4f}")


‚úÖ Random Forest Regression Results
Train R¬≤: 0.9648
Test R¬≤: 0.9508
RMSE: 6.5102
MSE: 42.3822
MAPE: 317228652986902.6875
