In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv(r"C:\Users\Lenovo\Desktop\Retail Sales\Sourse\cleaned_data.csv")
df 

Unnamed: 0,Store ID,Product ID,Date,Units Sold,Sales Revenue (USD),Discount Percentage,Marketing Spend (USD),Store Location,Product Category,Day of the Week,Holiday Effect
0,Spearsland,52372247,2022-01-01,9,2741.69,20,81.0,Tanzania,Furniture,Saturday,False
1,Spearsland,52372247,2022-01-02,7,2665.53,0,0.0,Mauritania,Furniture,Sunday,False
2,Spearsland,52372247,2022-01-03,1,380.79,0,0.0,Saint Pierre and Miquelon,Furniture,Monday,False
3,Spearsland,52372247,2022-01-04,4,1523.16,0,0.0,Australia,Furniture,Tuesday,False
4,Spearsland,52372247,2022-01-05,2,761.58,0,0.0,Swaziland,Furniture,Wednesday,False
...,...,...,...,...,...,...,...,...,...,...,...
27946,Spearsland,50239115,2022-01-24,7,3501.61,0,137.0,Hong Kong,Clothing,Monday,False
27947,Spearsland,50239115,2022-01-26,3,1500.69,0,0.0,Sudan,Clothing,Wednesday,False
27948,Spearsland,50239115,2022-01-27,6,3001.38,0,0.0,South Georgia and the South Sandwich Islands,Clothing,Thursday,False
27949,Spearsland,50239115,2022-01-28,5,2501.15,0,0.0,Haiti,Clothing,Friday,False


In [191]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27951 entries, 0 to 27950
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Store ID               27951 non-null  object 
 1   Product ID             27951 non-null  int64  
 2   Date                   27951 non-null  object 
 3   Units Sold             27951 non-null  int64  
 4   Sales Revenue (USD)    27951 non-null  float64
 5   Discount Percentage    27951 non-null  int64  
 6   Marketing Spend (USD)  27951 non-null  float64
 7   Store Location         27951 non-null  object 
 8   Product Category       27951 non-null  object 
 9   Day of the Week        27951 non-null  object 
 10  Holiday Effect         27951 non-null  bool   
dtypes: bool(1), float64(2), int64(3), object(5)
memory usage: 2.2+ MB


In [192]:
df['Store ID'].nunique()

1

In [193]:
df['Product ID'].nunique()

42

In [194]:
df.drop(['Store ID','Date'], axis=1, inplace=True)

In [195]:
X = df.drop(columns=['Sales Revenue (USD)'])
y = df['Sales Revenue (USD)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [196]:
categorical_cols = ['Product ID', 'Store Location', 'Product Category', 'Day of the Week']
numerical_cols = ['Units Sold', 'Discount Percentage', 'Marketing Spend (USD)'] 
boolean_cols = ['Holiday Effect']

In [197]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('bool', 'passthrough', boolean_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

In [198]:
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    # Fit the model on training data
    pipeline = model.fit(preprocessor.fit_transform(X_train), y_train)

    # Predictions on training data
    y_pred_train = pipeline.predict(preprocessor.transform(X_train))
    
    # Calculate training R² and RMSE
    train_r2 = r2_score(y_train, y_pred_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    
    # Predictions on test data
    y_pred_test = pipeline.predict(preprocessor.transform(X_test))

    # Calculate test R² and RMSE
    test_r2 = r2_score(y_test, y_pred_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

    # Print train and test results
    print(f"{model_name} Model:")
    print(f"Train R²: {train_r2:.4f}")
    print(f"Train RMSE: {train_rmse:.4f}")
    print(f"Test R²: {test_r2:.4f}")
    print(f"Test RMSE: {test_rmse:.4f}\n")

    # Return results in a dictionary for further use
    results = {
        'Model': model_name,
        'Train R²': train_r2,
        'Train RMSE': train_rmse,
        'Test R²': test_r2,
        'Test RMSE': test_rmse
    }

    return results


In [199]:
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'K-Neighbors Regressor': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'SVR': SVR(),
    'XGBoost': XGBRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
}

In [200]:
results = []
for model_name, model in models.items():
    evaluate_model(model, X_train, y_train, X_test, y_test, model_name)

results_df = pd.DataFrame(results)

Linear Regression Model:
Train R²: 0.8710
Train RMSE: 696.8887
Test R²: 0.8670
Test RMSE: 706.1978

Random Forest Model:
Train R²: 0.9999
Train RMSE: 19.7679
Test R²: 0.9994
Test RMSE: 48.3900

K-Neighbors Regressor Model:
Train R²: 0.9608
Train RMSE: 384.3489
Test R²: 0.9393
Test RMSE: 476.9489

Decision Tree Model:
Train R²: 1.0000
Train RMSE: 0.0000
Test R²: 0.9991
Test RMSE: 56.6191

SVR Model:
Train R²: 0.0752
Train RMSE: 1865.9443
Test R²: 0.0743
Test RMSE: 1863.1244

XGBoost Model:
Train R²: 0.9993
Train RMSE: 51.3855
Test R²: 0.9991
Test RMSE: 58.9627

Gradient Boosting Model:
Train R²: 0.8689
Train RMSE: 702.5887
Test R²: 0.8671
Test RMSE: 706.0574



# Work on the best model XGBoost

In [201]:
X = preprocessor.fit_transform(X)

In [202]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [203]:
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 0.9, 1.0]
}

xgb_model = XGBRegressor()
xgb_grid_search = GridSearchCV(xgb_model, param_grid_xgb, cv=5, scoring='neg_mean_squared_error')

In [204]:
xgb_grid_search.fit(X_train, y_train)

In [205]:
best_xgb_params = xgb_grid_search.best_params_
best_xgb_model = xgb_grid_search.best_estimator_

In [207]:
best_xgb_pred = best_xgb_model.predict(X_test)
best_xgb_r2 = r2_score(y_test, best_xgb_pred)
best_xgb_rmse = np.sqrt(mean_squared_error(y_test, best_xgb_pred))

In [208]:
print("\nBest XGBoost Model:")
print("R²:", best_xgb_r2)
print("RMSE:", best_xgb_rmse)
print("Best Parameters:", best_xgb_params)


Best XGBoost Model:
R²: 0.9996820777625611
RMSE: 34.52722095477642
Best Parameters: {'learning_rate': 0.3, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}


In [209]:
import joblib
import pickle

In [210]:
# # Save the best model
# model_filename = 'best_xgb_model.pkl'
# joblib.dump(best_xgb_model, model_filename)

In [211]:
# # Save the preprocessing pipeline
# preprocessor_filename = 'preprocessor.pkl'
# joblib.dump(preprocessor, preprocessor_filename)