### Importing libraries

In [1]:
from faker import Faker
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import streamlit as st

### Dataset Generation using 

In [2]:
fake = Faker()

In [3]:
def generate_inventory_data(num_records=1000):
    data = []
    for _ in range(num_records):
        record = {
            "Product_ID": fake.uuid4()[:8],
            "Product_Category": random.choice(["Electronics", "Groceries", "Clothing", "Furniture"]),
            "Price": round(random.uniform(10, 5000), 2),
            "Stock_Quantity": random.randint(0, 500),
            "Demand": random.randint(0, 300),
            "Season": random.choice(["Summer", "Winter", "Spring", "Fall"]),
            "Promotions": random.choice(["Yes", "No"]),
            "Weekday": random.choice(["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]),
            "Sales_Channel": random.choice(["Online", "In-Store"]),
            "Weather": random.choice(["Sunny", "Rainy", "Snowy", "Cloudy"]),
            "Lead_Time": random.randint(1, 30),
            "Supplier_Reliability": round(random.uniform(1, 5), 2),
            "Price_Change": random.choice(["Yes", "No"]),
            "Economic_Indicator": round(random.uniform(0.5, 5.0), 2),
            "Customer_Sentiment": random.choice(["Positive", "Neutral", "Negative"]),
        }
        data.append(record)
    df = pd.DataFrame(data)
    
    # Introduce inconsistencies
    # 1. Missing values
    for col in ["Demand", "Stock_Quantity"]:
        df.loc[df.sample(frac=0.1).index, col] = np.nan

    # 2. Outliers
    df.loc[df.sample(frac=0.05).index, "Price"] = df["Price"].max() * 10
    df.loc[df.sample(frac=0.05).index, "Demand"] = df["Demand"].max() * 5

    # 3. Duplicates
    duplicate_rows = df.sample(frac=0.05)
    df = pd.concat([df, duplicate_rows], ignore_index=True)

    # 4. Noisy data
    df.loc[df.sample(frac=0.1).index, "Weather"] = "Unknown"
    df.loc[df.sample(frac=0.1).index, "Sales_Channel"] = "Other"

    return df


In [4]:
# Generate and save the dataset
df = generate_inventory_data()
df.to_csv("inventory_data.csv", index=False)

In [5]:
# Load data
df = pd.read_csv("inventory_data.csv")

### Prediction and Evaluation after Pre-processing

In [6]:
# Separate features and target
X = df.drop(columns=["Demand"])  # Features
y = df["Demand"]                # Target variable

# Handle NaNs temporarily for training
y = y.fillna(0)
# Convert categorical features using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the HistGradientBoostingRegressor (handles NaN natively)
hgb_model = HistGradientBoostingRegressor(random_state=42)
hgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = hgb_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("HistGradientBoostingRegressor Evaluation (Raw Dataset):")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")

HistGradientBoostingRegressor Evaluation (Raw Dataset):
Mean Squared Error (MSE): 70713.10
R-squared (R²): -0.37


### Pre-processing

In [8]:
df.isnull().sum()

Product_ID                0
Product_Category          0
Price                     0
Stock_Quantity          105
Demand                  102
Season                    0
Promotions                0
Weekday                   0
Sales_Channel             0
Weather                   0
Lead_Time                 0
Supplier_Reliability      0
Price_Change              0
Economic_Indicator        0
Customer_Sentiment        0
dtype: int64

In [9]:
# Step 1: Handle missing values using imputation
# Impute numerical columns with mean
numerical_columns = df.select_dtypes(include=["float64", "int64"]).columns
numerical_imputer = SimpleImputer(strategy="mean")
df[numerical_columns] = numerical_imputer.fit_transform(df[numerical_columns])

In [10]:
df.isnull().sum()

Product_ID              0
Product_Category        0
Price                   0
Stock_Quantity          0
Demand                  0
Season                  0
Promotions              0
Weekday                 0
Sales_Channel           0
Weather                 0
Lead_Time               0
Supplier_Reliability    0
Price_Change            0
Economic_Indicator      0
Customer_Sentiment      0
dtype: int64

In [11]:
# Step 2: Remove duplicates
df_duplicates = df.drop_duplicates(inplace=True)

In [12]:
print(df_duplicates)

None


In [13]:
# Step 3: Feature Engineering
# Create new features such as 'Sales per Week' and 'Demand-to-Stock Ratio'
df['Demand_to_Stock_Ratio'] = df['Demand'] / (df['Stock_Quantity'] + 1e-6)  # Avoid division by zero
df['Sales_per_Week'] = df['Demand'] / 7  # Assuming 'Demand' is for the whole week

In [14]:
df.head()

Unnamed: 0,Product_ID,Product_Category,Price,Stock_Quantity,Demand,Season,Promotions,Weekday,Sales_Channel,Weather,Lead_Time,Supplier_Reliability,Price_Change,Economic_Indicator,Customer_Sentiment,Demand_to_Stock_Ratio,Sales_per_Week
0,03594d34,Electronics,1476.42,281.0,30.0,Winter,Yes,Thursday,In-Store,Sunny,17.0,4.2,No,3.89,Neutral,0.106762,4.285714
1,386aec8b,Clothing,4933.75,280.0,1500.0,Fall,No,Tuesday,In-Store,Sunny,20.0,2.18,Yes,2.51,Positive,5.357143,214.285714
2,bb432ecf,Clothing,3103.73,85.0,276.0,Summer,No,Friday,Other,Snowy,24.0,2.98,No,4.6,Neutral,3.247059,39.428571
3,12128559,Clothing,1482.51,139.0,1500.0,Summer,No,Saturday,Online,Unknown,11.0,2.13,No,2.28,Negative,10.791367,214.285714
4,16ae781e,Clothing,4915.92,97.0,36.0,Fall,Yes,Thursday,Other,Cloudy,6.0,3.57,No,2.29,Neutral,0.371134,5.142857


In [15]:
# Step 4: Feature Scaling (Normalize numerical features like 'Price' and 'Stock Quantity')
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [16]:
df.head()

Unnamed: 0,Product_ID,Product_Category,Price,Stock_Quantity,Demand,Season,Promotions,Weekday,Sales_Channel,Weather,Lead_Time,Supplier_Reliability,Price_Change,Economic_Indicator,Customer_Sentiment,Demand_to_Stock_Ratio,Sales_per_Week
0,03594d34,Electronics,-0.323892,0.212348,-0.650184,Winter,Yes,Thursday,In-Store,Sunny,0.240151,1.070876,No,0.862996,Neutral,0.106762,4.285714
1,386aec8b,Clothing,0.005934,0.204976,4.219209,Fall,No,Tuesday,In-Store,Sunny,0.592885,-0.648662,Yes,-0.185017,Positive,5.357143,214.285714
2,bb432ecf,Clothing,-0.168648,-1.232642,0.164694,Summer,No,Friday,Other,Snowy,1.063196,0.032343,No,1.40219,Neutral,3.247059,39.428571
3,12128559,Clothing,-0.323311,-0.834533,4.219209,Summer,No,Saturday,Online,Unknown,-0.465315,-0.691225,No,-0.359686,Negative,10.791367,214.285714
4,16ae781e,Clothing,0.004233,-1.144173,-0.630309,Fall,Yes,Thursday,Other,Cloudy,-1.053204,0.534584,No,-0.352092,Neutral,0.371134,5.142857


In [17]:
# Step 4: One-Hot Encoding for categorical features
df = pd.get_dummies(df, drop_first=True)

In [18]:
df.head()

Unnamed: 0,Price,Stock_Quantity,Demand,Lead_Time,Supplier_Reliability,Economic_Indicator,Demand_to_Stock_Ratio,Sales_per_Week,Product_ID_0086622e,Product_ID_010f200d,...,Weekday_Wednesday,Sales_Channel_Online,Sales_Channel_Other,Weather_Rainy,Weather_Snowy,Weather_Sunny,Weather_Unknown,Price_Change_Yes,Customer_Sentiment_Neutral,Customer_Sentiment_Positive
0,-0.323892,0.212348,-0.650184,0.240151,1.070876,0.862996,0.106762,4.285714,False,False,...,False,False,False,False,False,True,False,False,True,False
1,0.005934,0.204976,4.219209,0.592885,-0.648662,-0.185017,5.357143,214.285714,False,False,...,False,False,False,False,False,True,False,True,False,True
2,-0.168648,-1.232642,0.164694,1.063196,0.032343,1.40219,3.247059,39.428571,False,False,...,False,False,True,False,True,False,False,False,True,False
3,-0.323311,-0.834533,4.219209,-0.465315,-0.691225,-0.359686,10.791367,214.285714,False,False,...,False,True,False,False,False,False,True,False,False,False
4,0.004233,-1.144173,-0.630309,-1.053204,0.534584,-0.352092,0.371134,5.142857,False,False,...,False,False,True,False,False,False,False,False,True,False


### Prediction and Evaluation after Pre-processing

In [19]:
# Prepare the final dataset
X_p = df.drop(columns=['Demand'])  # Features
y_p = df['Demand']  # Target

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p, test_size=0.2, random_state=42)

# Train the HistGradientBoostingRegressor (handles NaN natively)
hgb_model = HistGradientBoostingRegressor(random_state=42)
hgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = hgb_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("HistGradientBoostingRegressor Evaluation (Raw Dataset):")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")

HistGradientBoostingRegressor Evaluation (Raw Dataset):
Mean Squared Error (MSE): 0.00
R-squared (R²): 1.00
