In [7]:
from faker import Faker
import random
import pandas as pd
import numpy as np

In [8]:
fake = Faker()

In [9]:
def generate_inventory_data(num_records=1000):
    data = []
    for _ in range(num_records):
        record = {
            "Product_ID": fake.uuid4()[:8],
            "Product_Category": random.choice(["Electronics", "Groceries", "Clothing", "Furniture"]),
            "Price": round(random.uniform(10, 5000), 2),
            "Stock_Quantity": random.randint(0, 500),
            "Demand": random.randint(0, 300),
            "Season": random.choice(["Summer", "Winter", "Spring", "Fall"]),
            "Promotions": random.choice(["Yes", "No"]),
            "Weekday": random.choice(["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]),
            "Sales_Channel": random.choice(["Online", "In-Store"]),
            "Weather": random.choice(["Sunny", "Rainy", "Snowy", "Cloudy"]),
            "Lead_Time": random.randint(1, 30),
            "Supplier_Reliability": round(random.uniform(1, 5), 2),
            "Price_Change": random.choice(["Yes", "No"]),
            "Economic_Indicator": round(random.uniform(0.5, 5.0), 2),
            "Customer_Sentiment": random.choice(["Positive", "Neutral", "Negative"]),
        }
        data.append(record)
    df = pd.DataFrame(data)
    
    # Introduce inconsistencies
    # 1. Missing values
    for col in ["Demand", "Stock_Quantity"]:
        df.loc[df.sample(frac=0.1).index, col] = np.nan

    # 2. Outliers
    df.loc[df.sample(frac=0.05).index, "Price"] = df["Price"].max() * 10
    df.loc[df.sample(frac=0.05).index, "Demand"] = df["Demand"].max() * 5

    # 3. Duplicates
    duplicate_rows = df.sample(frac=0.05)
    df = pd.concat([df, duplicate_rows], ignore_index=True)

    # 4. Noisy data
    df.loc[df.sample(frac=0.1).index, "Weather"] = "Unknown"
    df.loc[df.sample(frac=0.1).index, "Sales_Channel"] = "Other"

    return df


In [10]:
# Generate dataset
inventory_data = generate_inventory_data()
# Save the dataset to a CSV file
file_path = "Inventory_Demand_Forecasting_Synthetic_Dataset.csv"
inventory_data.to_csv(file_path, index=False)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the raw dataset
file_path = "Inventory_Demand_Forecasting_Synthetic_Dataset.csv"
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop(columns=["Demand"])  # Features
y = df["Demand"]                # Target variable

# Handle missing values in the target variable
y = y.fillna(y.mean())  # Fill missing values in the target variable with its mean

# Encode categorical variables using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the HistGradientBoostingRegressor model
hgb_model = HistGradientBoostingRegressor(random_state=42)

# Train the model on the data
hgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_hgb = hgb_model.predict(X_test)

# Evaluate the model
mse_hgb = mean_squared_error(y_test, y_pred_hgb)
r2_hgb = r2_score(y_test, y_pred_hgb)

# Print evaluation metrics
print("\nHistGradientBoostingRegressor:")
print(f"  Mean Squared Error (MSE): {mse_hgb:.2f}")
print(f"  R-squared (R²): {r2_hgb:.2f}")



HistGradientBoostingRegressor:
  Mean Squared Error (MSE): 118476.13
  R-squared (R²): -0.06


In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = "Inventory_Demand_Forecasting_Synthetic_Dataset.csv"
df = pd.read_csv(file_path)

# Step 1: Handle missing values using imputation
# Impute numerical columns with mean
numerical_columns = df.select_dtypes(include=["float64", "int64"]).columns
numerical_imputer = SimpleImputer(strategy="mean")
df[numerical_columns] = numerical_imputer.fit_transform(df[numerical_columns])

# Impute categorical columns with mode (most frequent value)
categorical_columns = df.select_dtypes(include=["object"]).columns
categorical_imputer = SimpleImputer(strategy="most_frequent")
df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])

# Step 2: Remove duplicates
df.drop_duplicates(inplace=True)

# Step 3: Feature Scaling (Normalize numerical features like 'Price' and 'Stock Quantity')
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Step 4: One-Hot Encoding for categorical features
df = pd.get_dummies(df, drop_first=True)

# Step 5: Feature Engineering
# Create new features such as 'Sales per Week' or 'Demand-to-Stock Ratio'
df['Demand_to_Stock_Ratio'] = df['Demand'] / (df['Stock_Quantity'] + 1e-6)  # Avoid division by zero
df['Sales_per_Week'] = df['Demand'] / 7  # Assuming 'Demand' is for the whole week

# Step 6: Dimensionality Reduction (PCA) if needed
# Apply PCA to reduce dimensionality if required (optional step)
pca = PCA(n_components=0.95)  # Retain 95% variance
df_reduced = pca.fit_transform(df.drop(columns=['Demand']))  # Dropping target column for PCA

# Prepare the final dataset
X = df.drop(columns=['Demand'])  # Features
y = df['Demand']  # Target

# Optionally, you can split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the first few rows of the preprocessed data
print("Preprocessed Data (first 5 rows):")
print(X_train.head())


Preprocessed Data (first 5 rows):
        Price  Stock_Quantity  Lead_Time  Supplier_Reliability  \
137 -0.324940       -0.787564  -1.004966              1.342719   
377 -0.392315       -0.929180  -0.532720              0.626254   
388 -0.054746        0.941644   1.474328              1.368615   
824 -0.378458       -1.711796  -0.650781             -1.505877   
767 -0.253054       -0.250913  -0.178535             -0.711723   

     Economic_Indicator  Product_ID_00d6056e  Product_ID_019d166a  \
137            1.703162                False                False   
377            1.588231                False                False   
388           -0.426897                False                False   
824           -1.292713                False                False   
767           -1.614520                False                False   

     Product_ID_01bf0e62  Product_ID_0229df92  Product_ID_024f7aae  ...  \
137                False                False                False  ...   
377 

In [5]:
gb_model = HistGradientBoostingRegressor(random_state=42)
hgb_model.fit(X_train, y_train)

# Step 7: Evaluate the model
y_pred_hgb = hgb_model.predict(X_test)

# Calculate performance metrics
mse_hgb = mean_squared_error(y_test, y_pred_hgb)
r2_hgb = r2_score(y_test, y_pred_hgb)

print("\nHistGradientBoostingRegressor Model Evaluation:")
print(f"  Mean Squared Error (MSE): {mse_hgb:.2f}")
print(f"  R-squared (R²): {r2_hgb:.2f}")


HistGradientBoostingRegressor Model Evaluation:
  Mean Squared Error (MSE): 0.00
  R-squared (R²): 1.00
