In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# --- Data loading -----------------------------------------------------------
# Both date columns are parsed at read time so the .dt accessor works below.
df = pd.read_csv("Store.csv", parse_dates=["Order Date", "Ship Date"])

# --- Derived columns --------------------------------------------------------
# Month, DayOfWeek and Quantity are cast to str so the one-hot encoder treats
# them as categories; Sales_Discount is the Sales x Discount interaction term.
df = df.assign(
    Year=lambda frame: frame['Order Date'].dt.year,
    Month=lambda frame: frame['Order Date'].dt.month.astype(str),
    DayOfWeek=lambda frame: frame['Order Date'].dt.dayofweek.astype(str),
    Quantity=lambda frame: frame['Quantity'].astype(str),
    Sales_Discount=lambda frame: frame['Sales'] * frame['Discount'],
)

# --- Column roles -----------------------------------------------------------
categorical_features = [
    'Region', 'Ship Mode', 'Segment', 'Category',
    'Sub-Category', 'Month', 'DayOfWeek', 'Quantity',
]
numerical_features = ['Sales', 'Discount', 'Year', 'Sales_Discount']
target = 'Profit'

# --- Model inputs -----------------------------------------------------------
# Categorical columns come first, followed by the numeric ones.
X = df[[*categorical_features, *numerical_features]]
y = df[target]

# --- Preprocessing ----------------------------------------------------------
# One-hot encode the categorical columns (first level dropped); every other
# column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[(
        'cat',
        OneHotEncoder(drop='first', handle_unknown='ignore'),
        categorical_features,
    )],
    remainder='passthrough',
)

# --- Hold-out split ---------------------------------------------------------
# 80/20 split of the raw (un-encoded) features with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split

# Fit the encoder on the training rows only. Fitting it on all of X would let
# the held-out rows influence preprocessing (data leakage). With no
# stratification, train_test_split picks rows from the row count and seed
# alone, so this selects the same training rows as the split further down.
X_fit_rows = train_test_split(X, y, test_size=0.2, random_state=42)[0]
preprocessor.fit(X_fit_rows)

# Transform data (all rows, using the train-fitted encoder)
X_processed = preprocessor.transform(X)

# Convert to DataFrame. ColumnTransformer returns either a SciPy sparse matrix
# or a dense ndarray depending on output density (see `sparse_threshold`), so
# densify only when the result actually is sparse.
X_df = pd.DataFrame(
    X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed
)

# Split (same test_size / random_state as above -> same rows)
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_df, y, test_size=0.2, random_state=42
)

# Run LazyPredict: fits a collection of regressors and returns a score table
lazy = LazyRegressor(verbose=0, ignore_warnings=True)
models, predictions = lazy.fit(X_train_p, X_test_p, y_train_p, y_test_p)

print(models)

  0%|          | 0/42 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003363 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 693
[LightGBM] [Info] Number of data points in the train set: 7995, number of used features: 59
[LightGBM] [Info] Start training from score 30.697456
                                    Adjusted R-Squared  \
Model                                                    
XGBRegressor                                      0.74   
HistGradientBoostingRegressor                     0.58   
ElasticNetCV                                      0.54   
ElasticNet                                        0.52   
LinearSVR                                         0.52   
KNeighborsRegressor                               0.48   
LGBMRegressor                                     0.48   
HuberRegressor                                    0.47   
TweedieRegresso

In [11]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Fit the encoder on the training rows only, so the held-out rows cannot
# influence preprocessing (data leakage). With no stratification,
# train_test_split picks rows from the row count and seed alone, so this
# selects the same training rows as the split of X_encoded below.
X_fit_rows = train_test_split(X, y, test_size=0.2, random_state=42)[0]
preprocessor.fit(X_fit_rows)

# Encode every row with the train-fitted preprocessor -> dense array.
# ColumnTransformer may return a sparse matrix or a dense ndarray depending on
# output density, so only call .toarray() when it exists.
X_encoded = preprocessor.transform(X)
if hasattr(X_encoded, "toarray"):
    X_encoded = X_encoded.toarray()

# Train-test split (these *_enc arrays are reused by later cells)
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# XGBoost model (default settings)
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train_enc, y_train_enc)
y_pred = xgb.predict(X_test_enc)

# Evaluation
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print("XGBoost Regressor")
print(f"R² Score: {r2_score(y_test_enc, y_pred):.4f}")
print(f"RMSE: {mean_squared_error(y_test_enc, y_pred) ** 0.5:.2f}")
print(f"MAE: {mean_absolute_error(y_test_enc, y_pred):.2f}")

XGBoost Regressor
R² Score: 0.7432
RMSE: 111.59
MAE: 23.57


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Fit ordinary least squares on the encoded training split
lr = LinearRegression()
lr.fit(X_train_enc, y_train_enc)

# Predict on the encoded hold-out split
y_pred_lr = lr.predict(X_test_enc)

# Evaluate
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print("Linear Regression")
print(f"R² Score: {r2_score(y_test_enc, y_pred_lr):.4f}")
print(f"RMSE: {mean_squared_error(y_test_enc, y_pred_lr) ** 0.5:.2f}")
print(f"MAE: {mean_absolute_error(y_test_enc, y_pred_lr):.2f}")

Linear Regression
R² Score: 0.3465
RMSE: 178.01
MAE: 43.26


In [17]:
from sklearn.tree import DecisionTreeRegressor

# Shallow, regularised tree (depth <= 5, at least 10 samples per leaf) fitted
# on the encoded training split and scored on the encoded hold-out split.
dt = DecisionTreeRegressor(max_depth=5, min_samples_leaf=10, random_state=42)
dt.fit(X_train_enc, y_train_enc)
y_pred_dt = dt.predict(X_test_enc)

# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print("Decision Tree Regressor")
print(f"R² Score: {r2_score(y_test_enc, y_pred_dt):.4f}")
print(f"RMSE: {mean_squared_error(y_test_enc, y_pred_dt) ** 0.5:.2f}")
print(f"MAE: {mean_absolute_error(y_test_enc, y_pred_dt):.2f}")

Decision Tree Regressor
R² Score: -0.0913
RMSE: 230.02
MAE: 49.73


In [22]:
from sklearn.ensemble import RandomForestRegressor
# Preprocessing and model are chained in one Pipeline, so the encoder is
# fitted on the training rows only when .fit() is called below.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, max_depth=8, random_state=42))
])

# Fit the model on the raw (un-encoded) training split
rf_pipeline.fit(X_train, y_train)

# Predict on the raw hold-out split; the pipeline applies the encoding
y_pred_rf = rf_pipeline.predict(X_test)

# Evaluation
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print("Random Forest Regressor")
print(f"R²: {r2_score(y_test, y_pred_rf):.4f}")
print(f"RMSE: {mean_squared_error(y_test, y_pred_rf) ** 0.5:.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred_rf):.2f}")

Random Forest Regressor
R²: 0.0876
RMSE: 210.33
MAE: 32.90
