# **Dataset Pre-Processing**

In [1]:
import pandas as pd
dataset = pd.read_csv('C:/Users/nextn/Downloads/Git/demand_prediction/data/raw/retail_store_inventory.csv')

In [2]:
dataset.drop(columns=['Store ID', 'Product ID'], axis=1, inplace=True)

In [3]:
dataset = dataset.rename(columns={'Inventory Level': 'Inventory', 'Units Sold': 'Sales',
                                  'Units Ordered': 'Order', 'Demand Forecast': 'Demand',
                                  'Weather Condition': 'Weather', 'Holiday/Promotion': 'Promotion',
                                  'Competitor Pricing': 'Competitor Price'})

In [4]:
dataset['Date'] = pd.to_datetime(dataset['Date'])

In [5]:
dataset['Year'] = dataset['Date'].dt.year
dataset['Month'] = dataset['Date'].dt.month
dataset['Day'] = dataset['Date'].dt.day

In [6]:
dataset.drop(columns=['Date'], axis=1, inplace=True)

In [7]:
dataset = pd.get_dummies(dataset, columns=['Category', 'Region', 'Weather', 'Seasonality'], drop_first=True).astype(int)

In [8]:
dataset['Year'] = dataset['Date'].dt.year
dataset['Month'] = dataset['Date'].dt.month
dataset['Day'] = dataset['Date'].dt.day

KeyError: 'Date'

In [None]:
dataset.drop(columns=['Date'], axis=1, inplace=True)

In [None]:
# OHE
dataset = pd.get_dummies(dataset, columns=['Category', 'Region', 'Weather', 'Seasonality'], drop_first=True).astype(int)

# **Feature Scaling**

In [None]:
X = dataset.drop('Demand', axis=1)
y = dataset['Demand']

In [None]:
feature_names = list(X.keys())
print(feature_names)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

print(f"Original number of features: {X_scaled.shape[1]}")
print(f"Number of components after PCA: {X_pca.shape[1]}")

In [None]:
# PCA
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

print(f"Original number of features: {X_scaled.shape[1]}")
print(f"Number of components after PCA: {X_pca.shape[1]}")

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

1. **SVR**

$$
\hat{y}(x) = \sum_{i=1}^{n} (\alpha_i - \alpha_i^*) K(x_i, x) + b
$$

In [None]:
from sklearn.svm import SVR
# Train
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train, y_train)

In [None]:
# Prediction
y_pred_svr = svr_model.predict(X_test)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Calculate metrics
mse_svr = mean_squared_error(y_test, y_pred_svr)
rmse_svr = np.sqrt(mse_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

# Print results
print("SVR Results:")
print("MSE:", mse_svr)
print("RMSE:", rmse_svr)
print("MAE:", mae_svr)
print("R-squared:", r2_svr)

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validation
cv_rmse_gb = cross_val_score(svr_model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
cv_rmse_mean_gb = np.mean(np.abs(cv_rmse_gb))
cv_rmse_std_gb = np.std(np.abs(cv_rmse_gb))
print(f"Cross-validation RMSE: {cv_rmse_mean_gb} ± {cv_rmse_std_gb}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
import numpy as np
from sklearn.metrics import mean_absolute_error

# Define a function to plot learning curves with MSE, RMSE, R², and MAE
def plot_learning_curve(estimator, X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 10)):
    # Compute learning curves for MSE
    train_sizes, train_scores_mse, val_scores_mse = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes, scoring='neg_mean_squared_error'
    )

    # Convert negative MSE to positive MSE
    train_scores_mse = -train_scores_mse
    val_scores_mse = -val_scores_mse

    # Compute RMSE from MSE
    train_scores_rmse = np.sqrt(train_scores_mse)
    val_scores_rmse = np.sqrt(val_scores_mse)

    # Compute learning curves for R²
    train_sizes, train_scores_r2, val_scores_r2 = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes, scoring='r2'
    )

    # Compute learning curves for MAE
    train_sizes, train_scores_mae, val_scores_mae = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes, scoring='neg_mean_absolute_error'
    )

    # Convert negative MAE to positive MAE
    train_scores_mae = -train_scores_mae
    val_scores_mae = -val_scores_mae

    # Calculate mean and standard deviation for all metrics
    metrics = {
        "MSE": (train_scores_mse, val_scores_mse),
        "RMSE": (train_scores_rmse, val_scores_rmse),
        "R²": (train_scores_r2, val_scores_r2),
        "MAE": (train_scores_mae, val_scores_mae)
    }

    means_stds = {metric: (np.mean(train, axis=1), np.std(train, axis=1),
                           np.mean(val, axis=1), np.std(val, axis=1))
                  for metric, (train, val) in metrics.items()}

    # Plot the learning curves
    plt.figure(figsize=(30, 8))

    metric_titles = {
        "MSE": "Mean Squared Error (MSE)",
        "RMSE": "Root Mean Squared Error (RMSE)",
        "R²": "R-squared (R²)",
        "MAE": "Mean Absolute Error (MAE)"
    }

    colors = {"train": 'r', "val": 'g'}

    for i, (metric, (train_mean, train_std, val_mean, val_std)) in enumerate(means_stds.items()):
        plt.subplot(1, 4, i + 1)
        plt.plot(train_sizes, train_mean, 'o-', color=colors["train"], label=f'Training {metric}')
        plt.plot(train_sizes, val_mean, 'o-', color=colors["val"], label=f'Validation {metric}')
        plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color=colors["train"])
        plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color=colors["val"])
        plt.xlabel('Training Set Size')
        plt.ylabel(metric_titles[metric])
        plt.title(f'{metric_titles[metric]} Learning Curve for {type(estimator).__name__}')
        plt.legend(loc='best')
        plt.grid(True)

    plt.tight_layout()
    plt.show()

# Plot the Learning Curve including MAE
plot_learning_curve(svr_model, X_train, y_train)

In [None]:
from lime.lime_tabular import LimeTabularExplainer
import matplotlib.pyplot as plt

def visualize_lime(estimator, X_train_scaled, X_test_scaled, feature_names, num_features=5, instance_index=0):
    # Initialize LIME explainer
    explainer = LimeTabularExplainer(
        X_train_scaled,
        feature_names=feature_names,
        mode="regression"
    )

    # Select the specified instance from the test set for explanation
    instance = X_test_scaled[instance_index]

    # Explain the prediction for the selected instance
    exp = explainer.explain_instance(instance, estimator.predict, num_features=num_features)

    # Print textual explanation for context
    print(f"LIME Output for {type(estimator).__name__} Model:")
    print(f"Prediction: {estimator.predict(instance.reshape(1, -1))[0]:.4f}")
    print("\nTop Contributing Features:")
    for feature, contribution in exp.as_list():
        print(f"{feature}: {contribution:.4f}")

    # Display explanation in notebook
    exp.show_in_notebook(show_table=True)

    # Customize and display Matplotlib plot
    fig = exp.as_pyplot_figure()
    fig.set_size_inches(8, 6)
    plt.title(f"LIME Explanation for Test Instance {instance_index + 1}", fontsize=14, fontweight='bold')
    plt.xlabel("Feature Contribution to Prediction", fontsize=12)
    plt.ylabel("Feature", fontsize=12)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.grid(axis="x", linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()

# Call the LIME visualization function
visualize_lime(svr_model, X_train, X_test, feature_names, num_features=10, instance_index=0)

2. **MLP Regressor**
$$
\hat{y} = f_{\text{out}}\left( \sum_{j=1}^{H} w_j^{(2)} \cdot f_{\text{act}}\left( \sum_{i=1}^{d} w_{ji}^{(1)} x_i + b_j^{(1)} \right) + b^{(2)} \right)
$$

In [None]:
from sklearn.neural_network import MLPRegressor
# Train
mlp_model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, early_stopping=True, validation_fraction=0.1, random_state=42)
mlp_model.fit(X_train, y_train)

In [None]:
y_pred_mlp = mlp_model.predict(X_test)

In [None]:
# Calculate metrics
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
rmse_mlp = np.sqrt(mse_mlp)
mae_mlp = mean_absolute_error(y_test, y_pred_mlp)
r2_mlp = r2_score(y_test, y_pred_mlp)

# Print results
print("MLP Results:")
print("MSE:", mse_mlp)
print("RMSE:", rmse_mlp)
print("MAE:", mae_mlp)
print("R-squared:", r2_mlp)

In [None]:
# Cross-validation
cv_rmse_mlp = cross_val_score(mlp_model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
cv_rmse_mean_mlp = np.mean(np.abs(cv_rmse_mlp))
cv_rmse_std_mlp = np.std(np.abs(cv_rmse_mlp))

print(f"Cross-validation RMSE: {cv_rmse_mean_mlp} ± {cv_rmse_std_mlp}")

In [None]:
# Plot the learning curve
plot_learning_curve(mlp_model, X_train, y_train)

In [None]:
# Call the function with the trained model and test data
visualize_lime(mlp_model, X_train, X_test, feature_names, num_features=10, instance_index=0)

3. **KNN Regressor**
$$
\hat{y}(\mathbf{x}) = \frac{1}{k} \sum_{i \in \mathcal{N}_k(\mathbf{x})} y_i
$$

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# Training
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train, y_train)

In [None]:
y_pred_knn = knn_model.predict(X_test)

In [None]:
# Calculate metrics
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

# Print results
print("KNN Results:")
print("MSE:", mse_knn)
print("RMSE:", rmse_knn)
print("MAE:", mae_knn)
print("R-squared:", r2_knn)

In [None]:
# Cross-validation
cv_rmse_knn = cross_val_score(knn_model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
cv_rmse_mean_knn = np.mean(np.abs(cv_rmse_knn))
cv_rmse_std_knn = np.std(np.abs(cv_rmse_knn))

print(f"Cross-validation RMSE: {cv_rmse_mean_knn} ± {cv_rmse_std_knn}")

In [None]:
# Plot the learning curve
plot_learning_curve(knn_model, X_train, y_train)

In [None]:
# Call the function with the trained model and test data
visualize_lime(knn_model, X_train, X_test, feature_names, num_features=10, instance_index=0)