In [2]:
import os
import pandas as pd
import kagglehub

# Download latest version and get the correct path
dataset_path = kagglehub.dataset_download("anandaramg/global-superstore")

# List all files in the downloaded dataset directory
files = os.listdir(dataset_path)
print("Dataset Files:", files)

# Find the correct file (Global Superstore.txt)
file_name = None
for file in files:
    if file.startswith("Global Superstore") and file.endswith(".txt"):
        file_name = file
        break

# Check if file is found
if file_name:
    file_path = os.path.join(dataset_path, file_name)
    print("Full file path:", file_path)

    # Try loading with different delimiters
    try:
        df = pd.read_csv(file_path, delimiter=",")  # Try comma-separated first
    except:
        try:
            df = pd.read_csv(file_path, delimiter="\t")  # Try tab-separated
        except:
            print("Error: Could not read the file properly. Please check the delimiter.")

    # Display first few rows
    print(df.head())

else:
    print("Error: 'Global Superstore.txt' not found in the dataset.")

Downloading from https://www.kaggle.com/api/v1/datasets/download/anandaramg/global-superstore?dataset_version_number=1...


100%|██████████| 3.18M/3.18M [00:00<00:00, 108MB/s]

Extracting files...
Dataset Files: ['Global Superstore.txt']
Full file path: /root/.cache/kagglehub/datasets/anandaramg/global-superstore/versions/1/Global Superstore.txt





          Category         City        Country Customer ID     Customer Name  \
0  Office Supplies  Los Angeles  United States   LS-172304  Lycoris Saunders   
1  Office Supplies  Los Angeles  United States   MV-174854     Mark Van Huff   
2  Office Supplies  Los Angeles  United States   CS-121304      Chad Sievert   
3  Office Supplies  Los Angeles  United States   CS-121304      Chad Sievert   
4  Office Supplies  Los Angeles  United States   AP-109154    Arthur Prichep   

   Discount Market  记录数               Order Date        Order ID  ... Sales  \
0       0.0     US    1  2011-01-07 00:00:00.000  CA-2011-130813  ...    19   
1       0.0     US    1  2011-01-21 00:00:00.000  CA-2011-148614  ...    19   
2       0.0     US    1  2011-08-05 00:00:00.000  CA-2011-118962  ...    21   
3       0.0     US    1  2011-08-05 00:00:00.000  CA-2011-118962  ...   111   
4       0.0     US    1  2011-09-29 00:00:00.000  CA-2011-146969  ...     6   

    Segment                Ship Date       S

In [3]:
df.shape

(51290, 27)

In [4]:
df.columns

Index(['Category', 'City', 'Country', 'Customer ID', 'Customer Name',
       'Discount', 'Market', '记录数', 'Order Date', 'Order ID', 'Order Priority',
       'Product ID', 'Product Name', 'Profit', 'Quantity', 'Region', 'Row ID',
       'Sales', 'Segment', 'Ship Date', 'Ship Mode', 'Shipping Cost', 'State',
       'Sub-Category', 'Year', 'Market2', 'weeknum'],
      dtype='object')

In [5]:
!pip install pandas numpy matplotlib seaborn plotly scikit-learn statsmodels




In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score, classification_report
import statsmodels.api as sm



In [7]:
# Convert date columns to datetime format
df["Order Date"] = pd.to_datetime(df["Order Date"])
df["Ship Date"] = pd.to_datetime(df["Ship Date"])

# Calculate shipping delay
df["Shipping Delay"] = (df["Ship Date"] - df["Order Date"]).dt.days

# Preview dataset
# Instead of calling an undefined function, use the display function from IPython.display
display(df.head())

Unnamed: 0,Category,City,Country,Customer ID,Customer Name,Discount,Market,记录数,Order Date,Order ID,...,Segment,Ship Date,Ship Mode,Shipping Cost,State,Sub-Category,Year,Market2,weeknum,Shipping Delay
0,Office Supplies,Los Angeles,United States,LS-172304,Lycoris Saunders,0.0,US,1,2011-01-07,CA-2011-130813,...,Consumer,2011-01-09,Second Class,4.37,California,Paper,2011,North America,2,2
1,Office Supplies,Los Angeles,United States,MV-174854,Mark Van Huff,0.0,US,1,2011-01-21,CA-2011-148614,...,Consumer,2011-01-26,Standard Class,0.94,California,Paper,2011,North America,4,5
2,Office Supplies,Los Angeles,United States,CS-121304,Chad Sievert,0.0,US,1,2011-08-05,CA-2011-118962,...,Consumer,2011-08-09,Standard Class,1.81,California,Paper,2011,North America,32,4
3,Office Supplies,Los Angeles,United States,CS-121304,Chad Sievert,0.0,US,1,2011-08-05,CA-2011-118962,...,Consumer,2011-08-09,Standard Class,4.59,California,Paper,2011,North America,32,4
4,Office Supplies,Los Angeles,United States,AP-109154,Arthur Prichep,0.0,US,1,2011-09-29,CA-2011-146969,...,Consumer,2011-10-03,Standard Class,1.32,California,Paper,2011,North America,40,4


# Sales and Demand Analysis:

- Top Selling products and categories

In [8]:
# Grouping and sorting the top 10 best-selling products
top_products = df.groupby("Product Name")["Sales"].sum().sort_values(ascending=False).head(10).reset_index()

# Creating an interactive bar chart using Plotly
fig = px.bar(
    top_products,
    x="Product Name",
    y="Sales",
    title="Top 10 Best-Selling Products",
    labels={"Sales": "Total Sales", "Product Name": "Product"},
    color="Sales",  # Adds color gradient based on sales
    text_auto=True  # Displays values on bars
)

# Improving layout
fig.update_layout(
    xaxis=dict(tickangle=-45),  # Rotates x-axis labels for readability
    yaxis_title="Total Sales",
    xaxis_title="Product Name",
    template="plotly_white"  # Clean theme
)

# Show interactive plot
fig.show()

- Sale Trends over time

In [9]:
# Ensure 'Order Date' is in datetime format
df["Order Date"] = pd.to_datetime(df["Order Date"])

# Extract Year-Month period
df["Year-Month"] = df["Order Date"].dt.to_period("M").astype(str)  # Convert period to string for Plotly

# Aggregate sales by Year-Month
sales_trend = df.groupby("Year-Month")["Sales"].sum().reset_index()

# Create interactive line chart
fig = px.line(
    sales_trend,
    x="Year-Month",
    y="Sales",
    markers=True,
    title="Sales Trend Over Time",
    labels={"Sales": "Total Sales", "Year-Month": "Time Period"},
    line_shape="spline"  # Smooths the line for better visualization
)

# Improving layout
fig.update_layout(
    xaxis=dict(tickangle=-45, tickmode="array", tickvals=sales_trend["Year-Month"][::3]),  # Show every 3rd label
    yaxis_title="Total Sales",
    xaxis_title="Year-Month",
    template="plotly_white",
    hovermode="x unified"  # Shows values for a specific x-point
)

# Show interactive plot
fig.show()


# Inventory and Order Management

- Demand vs Discount Impact

In [10]:
# Create an interactive scatter plot
fig = px.scatter(
    df,
    x="Discount",
    y="Sales",
    title="Impact of Discount on Sales",
    labels={"Discount": "Discount (%)", "Sales": "Total Sales"},
    opacity=0.6,  # Adjust transparency for better visibility
    color="Sales",  # Color scale based on sales values
    size="Sales",  # Bubble size based on sales value
    hover_data=["Discount", "Sales"]  # Show values on hover
)

# Improve layout
fig.update_layout(
    template="plotly_white",
    xaxis=dict(title="Discount (%)"),
    yaxis=dict(title="Total Sales"),
    showlegend=False
)

# Show interactive plot
fig.show()


- Order Quantity Distribution

In [11]:
import plotly.figure_factory as ff

In [12]:
# Create a histogram with KDE (Kernel Density Estimation)
fig = ff.create_distplot(
    [df["Quantity"].dropna()],  # Drop missing values
    group_labels=["Ordered Quantity"],
    show_hist=True,
    show_rug=False,
    bin_size=1  # Adjust bin size for clarity
)

# Update layout for better visualization
fig.update_layout(
    title="Distribution of Ordered Quantities",
    xaxis_title="Quantity Ordered",
    yaxis_title="Density",
    template="plotly_white"
)

# Show interactive plot
fig.show()

# Shipping and Logistics Optimization

- Shipping cost vs Profitability

In [13]:
# Create an interactive scatter plot
fig = px.scatter(
    df,
    x="Shipping Cost",
    y="Profit",
    title="Shipping Cost vs. Profitability",
    labels={"Shipping Cost": "Shipping Cost ($)", "Profit": "Profit ($)"},
    opacity=0.6,  # Adjust transparency for better visibility
    color="Profit",  # Color scale based on profit values
    # Replacing 'size="Profit"' with a modified size calculation
    size=df['Profit'].abs() + 1e-6,  # Taking absolute value and adding a small positive value to avoid zero size
    hover_data=["Shipping Cost", "Profit"]  # Show values on hover
)

# Improve layout
fig.update_layout(
    template="plotly_white",
    xaxis=dict(title="Shipping Cost ($)"),
    yaxis=dict(title="Profit ($)"),
    showlegend=False
)

# Show interactive plot
fig.show()

- Average shipping delays per region

In [14]:
# Calculate average shipping delay per region
avg_delay_region = df.groupby("Region")["Shipping Delay"].mean().sort_values().reset_index()

# Create an interactive bar chart
fig = px.bar(
    avg_delay_region,
    x="Region",
    y="Shipping Delay",
    title="Average Shipping Delay by Region",
    labels={"Shipping Delay": "Average Delay (Days)", "Region": "Region"},
    color="Shipping Delay",  # Color based on delay values
    text_auto=True  # Display values on bars
)

# Improve layout
fig.update_layout(
    xaxis=dict(tickangle=-45),  # Rotates x-axis labels for readability
    yaxis_title="Average Delay (Days)",
    xaxis_title="Region",
    template="plotly_white"
)

# Show interactive plot
fig.show()


- Late Shipment Analysis

In [15]:
import plotly.graph_objects as go

# Define late shipments (delay > 3 days)
df["Late Shipment"] = df["Shipping Delay"] > 3

# Calculate the percentage of late shipments
late_shipment_rate = df["Late Shipment"].mean() * 100  # Convert to percentage

# Create an interactive gauge chart
fig = go.Figure(go.Indicator(
    mode="gauge+number",
    value=late_shipment_rate,
    title={"text": "Late Shipment Rate"},
    gauge={
        "axis": {"range": [0, 100]},  # Percentage range from 0 to 100
        "bar": {"color": "red"},  # Bar color for better visibility
        "steps": [
            {"range": [0, 30], "color": "green"},
            {"range": [30, 70], "color": "yellow"},
            {"range": [70, 100], "color": "red"}
        ],
    }
))

# Show interactive plot
fig.show()


Color Coded Ranges:
Green (0-30%) → Acceptable
Yellow (30-70%) → Moderate concern
Red (70-100%) → Critical issue

# Profitability and Market Insights

- Market-wise profitability

In [16]:
# Calculate total profit by market
market_profit = df.groupby("Market")["Profit"].sum().sort_values().reset_index()

# Create an interactive bar chart
fig = px.bar(
    market_profit,
    x="Market",
    y="Profit",
    title="Profitability by Market",
    labels={"Profit": "Total Profit ($)", "Market": "Market"},
    color="Profit",  # Color scale based on profit values
    text_auto=True  # Display profit values on bars
)

# Improve layout
fig.update_layout(
    xaxis=dict(tickangle=-45),  # Rotate x-axis labels for readability
    yaxis_title="Total Profit ($)",
    xaxis_title="Market",
    template="plotly_white"
)

# Show interactive plot
fig.show()


- High Value Customers (Customer Lifetime)


In [17]:
# Calculate lifetime sales for each customer and get top 10 customers
customer_ltv = df.groupby("Customer Name")["Sales"].sum().sort_values(ascending=False).head(10).reset_index()

# Create an interactive bar chart
fig = px.bar(
    customer_ltv,
    x="Customer Name",
    y="Sales",
    title="Top 10 Customers by Lifetime Sales",
    labels={"Sales": "Total Sales ($)", "Customer Name": "Customer"},
    color="Sales",  # Color scale based on sales values
    text_auto=True  # Display sales values on bars
)

# Improve layout
fig.update_layout(
    xaxis=dict(tickangle=-45),  # Rotate x-axis labels for better readability
    yaxis_title="Total Sales ($)",
    xaxis_title="Customer Name",
    template="plotly_white"
)

# Show interactive plot
fig.show()


# Predictive Analytics

- Demand Forecasting Using Machine Learning

In [18]:
# Ensure Order Date is in datetime format
df["Order Date"] = pd.to_datetime(df["Order Date"])

# Extract weekly periods
df["Week"] = df["Order Date"].dt.to_period("W") # Change to Period objects

# Aggregate weekly sales
weekly_sales = df.groupby("Week")["Sales"].sum().reset_index()

# Convert 'Week' to timestamps for plotting
weekly_sales["Week"] = weekly_sales["Week"].dt.start_time

# Train-test split
X = np.arange(len(weekly_sales)).reshape(-1, 1)
y = weekly_sales["Sales"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


# Train model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Predict sales
y_pred = model.predict(X_test)

# Evaluate model
mae = mean_absolute_error(y_test, y_pred)
print(f"Demand Forecasting MAE: {mae:.2f}")

# Create an interactive plot
fig = go.Figure()

# Add actual sales trace
fig.add_trace(go.Scatter(
    x=weekly_sales["Week"],
    y=weekly_sales["Sales"],
    mode="lines+markers",
    name="Actual Sales",
    line=dict(color="blue")
))

# Add predicted sales trace
fig.add_trace(go.Scatter(
    x=weekly_sales["Week"].iloc[len(X_train):],
    y=y_pred,
    mode="lines+markers",
    name="Predicted Sales",
    line=dict(color="red", dash="dash")
))

# Improve layout
fig.update_layout(
    title="Demand Forecasting: Actual vs Predicted Sales",
    xaxis_title="Week",
    yaxis_title="Total Sales ($)",
    template="plotly_white",
    hovermode="x"
)

# Show interactive plot
fig.show()


Demand Forecasting MAE: 38157.23


In [19]:
# Ensure Order Date is in datetime format
df["Order Date"] = pd.to_datetime(df["Order Date"])

# Extract weekly periods
df["Week"] = df["Order Date"].dt.to_period("W")  # Weekly aggregation

# Aggregate weekly sales
weekly_sales = df.groupby("Week")["Sales"].sum().reset_index()

# Convert 'Week' to timestamps for plotting
weekly_sales["Week"] = weekly_sales["Week"].dt.start_time

# Feature Engineering: Convert weeks to numerical index
weekly_sales["Week_Index"] = np.arange(len(weekly_sales))

# Define features (X) and target (y)
X = weekly_sales[["Week_Index"]]
y = weekly_sales["Sales"]

# Train-test split (keeping last 20% of weeks as test set)
split = int(0.8 * len(weekly_sales))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict sales
y_pred = model.predict(X_test)

# Evaluate model
mae = mean_absolute_error(y_test, y_pred)
print(f"Demand Forecasting MAE: {mae:.2f}")

# Prepare forecasted dates
forecast_dates = weekly_sales["Week"][split:]

# Create an interactive plot
fig = go.Figure()

# Add actual sales trace
fig.add_trace(go.Scatter(
    x=weekly_sales["Week"],
    y=weekly_sales["Sales"],
    mode="lines+markers",
    name="Actual Sales",
    line=dict(color="blue")
))

# Add predicted sales trace (Corrected date alignment)
fig.add_trace(go.Scatter(
    x=forecast_dates,
    y=y_pred,
    mode="lines+markers",
    name="Predicted Sales",
    line=dict(color="red", dash="dash")
))

# Improve layout
fig.update_layout(
    title="Demand Forecasting: Actual vs Predicted Sales",
    xaxis_title="Week",
    yaxis_title="Total Sales ($)",
    template="plotly_white",
    hovermode="x"
)

# Show interactive plot
fig.show()


Demand Forecasting MAE: 39406.87


In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error

# Ensure Order Date is in datetime format
df["Order Date"] = pd.to_datetime(df["Order Date"])

# Aggregate sales data by week
df["Week"] = df["Order Date"].dt.to_period("W")
weekly_sales = df.groupby("Week")["Sales"].sum().reset_index()
weekly_sales["Week"] = weekly_sales["Week"].dt.start_time

# Split into training and test sets
split = int(len(weekly_sales) * 0.8)  # 80% Training, 20% Test
train, test = weekly_sales.iloc[:split], weekly_sales.iloc[split:]

# Train ARIMA Model (Auto-order selection)
order = (5, 1, 2)  # (p,d,q) - Can be auto-selected using AIC minimization
model = ARIMA(train["Sales"], order=order)
model_fit = model.fit()

# Forecasting on test data
forecast_steps = len(test)  # Predict for the same length as test data
forecast = model_fit.forecast(steps=forecast_steps)

# Evaluate Model Performance
mae = mean_absolute_error(test["Sales"], forecast)
print(f"ARIMA Demand Forecasting MAE: {mae:.2f}")

# Create an interactive plot
fig = go.Figure()

# Actual Sales Trace
fig.add_trace(go.Scatter(
    x=weekly_sales["Week"],
    y=weekly_sales["Sales"],
    mode="lines+markers",
    name="Actual Sales",
    line=dict(color="blue")
))

# Forecasted Sales Trace
fig.add_trace(go.Scatter(
    x=test["Week"],
    y=forecast,
    mode="lines+markers",
    name="Predicted Sales (ARIMA)",
    line=dict(color="red", dash="dash")
))

# Improve layout
fig.update_layout(
    title="Demand Forecasting with ARIMA",
    xaxis_title="Week",
    yaxis_title="Total Sales ($)",
    template="plotly_white",
    hovermode="x"
)

# Show interactive plot
fig.show()


ARIMA Demand Forecasting MAE: 30436.46


In [32]:
pip install prophet




In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler

# Load dataset
df["Order Date"] = pd.to_datetime(df["Order Date"])

# Aggregate weekly sales
df["Week"] = df["Order Date"].dt.to_period("W")
weekly_sales = df.groupby("Week")["Sales"].sum().reset_index()
weekly_sales["Week"] = weekly_sales["Week"].dt.start_time  # Convert period to datetime

# Show dataset
from IPython.display import display

# Show dataset
display(weekly_sales)

Unnamed: 0,Week,Sales
0,2010-12-27,1122
1,2011-01-03,25830
2,2011-01-10,27175
3,2011-01-17,21796
4,2011-01-24,15336
...,...,...
205,2014-12-01,111233
206,2014-12-08,113997
207,2014-12-15,99448
208,2014-12-22,123279


In [37]:
# Prepare Data for Prophet
prophet_df = weekly_sales.rename(columns={"Week": "ds", "Sales": "y"})

# Initialize Prophet model
prophet = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)
prophet.fit(prophet_df)

# Forecast future sales
future = prophet.make_future_dataframe(periods=10, freq='W')
forecast = prophet.predict(future)

# Plot Prophet Results
fig = go.Figure()
fig.add_trace(go.Scatter(x=prophet_df["ds"], y=prophet_df["y"], mode="lines+markers", name="Actual Sales"))
fig.add_trace(go.Scatter(x=forecast["ds"], y=forecast["yhat"], mode="lines", name="Prophet Forecast", line=dict(color="red")))
fig.update_layout(title="Prophet-Based Demand Forecasting", xaxis_title="Week", yaxis_title="Total Sales")
fig.show()


DEBUG:cmdstanpy:input tempfile: /tmp/tmps8tzhya9/v9jj8dpz.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmps8tzhya9/olclyhcj.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=81834', 'data', 'file=/tmp/tmps8tzhya9/v9jj8dpz.json', 'init=/tmp/tmps8tzhya9/olclyhcj.json', 'output', 'file=/tmp/tmps8tzhya9/prophet_model960ag6e9/prophet_model-20250310000414.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
00:04:14 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
00:04:14 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


In [38]:
# Normalize Data for LSTM
scaler = MinMaxScaler()
weekly_sales["Scaled_Sales"] = scaler.fit_transform(weekly_sales[["Sales"]])

# Create LSTM Input Data
def create_sequences(data, seq_length=5):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length])
    return np.array(X), np.array(y)

seq_length = 5  # Lookback period
X, y = create_sequences(weekly_sales["Scaled_Sales"].values, seq_length)

# Split Data (80% Train, 20% Test)
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Reshape for LSTM [Samples, Timesteps, Features]
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Build LSTM Model
model = Sequential([
    LSTM(50, activation='relu', return_sequences=True, input_shape=(seq_length, 1)),
    LSTM(50, activation='relu'),
    Dense(25),
    Dense(1)
])

# Compile Model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train LSTM
model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=1)

# Predict Sales
y_pred = model.predict(X_test)

# Convert Back to Original Scale
y_test_actual = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_actual = scaler.inverse_transform(y_pred.reshape(-1, 1))

# Plot Actual vs Predicted Sales
fig = go.Figure()
fig.add_trace(go.Scatter(x=weekly_sales["Week"][train_size+seq_length:], y=y_test_actual.flatten(), mode="lines+markers", name="Actual Sales"))
fig.add_trace(go.Scatter(x=weekly_sales["Week"][train_size+seq_length:], y=y_pred_actual.flatten(), mode="lines", name="LSTM Forecast", line=dict(color="red")))
fig.update_layout(title="LSTM-Based Demand Forecasting", xaxis_title="Week", yaxis_title="Total Sales")
fig.show()



Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



Epoch 1/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - loss: 0.1131
Epoch 2/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0417  
Epoch 3/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0153
Epoch 4/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0142
Epoch 5/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0116
Epoch 6/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0131
Epoch 7/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0127
Epoch 8/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0126
Epoch 9/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0112
Epoch 10/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0117


In [39]:
# Average the predictions from Prophet & LSTM
final_forecast = (forecast["yhat"].iloc[-len(y_pred_actual):].values + y_pred_actual.flatten()) / 2

# Plot Final Hybrid Forecast
fig = go.Figure()
fig.add_trace(go.Scatter(x=weekly_sales["Week"][train_size+seq_length:], y=y_test_actual.flatten(), mode="lines+markers", name="Actual Sales"))
fig.add_trace(go.Scatter(x=weekly_sales["Week"][train_size+seq_length:], y=final_forecast, mode="lines", name="Hybrid Forecast (Prophet + LSTM)", line=dict(color="purple")))
fig.update_layout(title="Hybrid Demand Forecasting (Prophet + LSTM)", xaxis_title="Week", yaxis_title="Total Sales")
fig.show()


- Predicting Late Shipments

In [20]:
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Encode categorical feature
df["Ship Mode Encoded"] = df["Ship Mode"].astype("category").cat.codes

# Prepare features and target variable
X = df[["Ship Mode Encoded", "Shipping Cost", "Discount", "Quantity"]]
y = df["Late Shipment"].astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train classification model
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Predict late shipments
y_pred = clf.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Prediction Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_labels = ["On-Time", "Late"]

# Convert confusion matrix to interactive heatmap
fig_cm = ff.create_annotated_heatmap(
    z=cm,
    x=cm_labels,
    y=cm_labels,
    colorscale="Blues",
    showscale=True
)

fig_cm.update_layout(
    title="Confusion Matrix for Late Shipment Prediction",
    xaxis_title="Predicted Label",
    yaxis_title="Actual Label",
    template="plotly_white"
)

# Display confusion matrix
fig_cm.show()

# Feature Importance Analysis
feature_importances = pd.DataFrame(
    {"Feature": X.columns, "Importance": clf.feature_importances_}
).sort_values(by="Importance", ascending=False)

# Create an interactive bar chart for feature importance
fig_importance = px.bar(
    feature_importances,
    x="Importance",
    y="Feature",
    title="Feature Importance in Late Shipment Prediction",
    text_auto=True,
    orientation="h"
)

fig_importance.update_layout(template="plotly_white")

# Display feature importance chart
fig_importance.show()


Prediction Accuracy: 0.91
              precision    recall  f1-score   support

           0       0.86      0.85      0.86      3337
           1       0.93      0.93      0.93      6921

    accuracy                           0.91     10258
   macro avg       0.89      0.89      0.89     10258
weighted avg       0.91      0.91      0.91     10258



# Monte Carlo Simulation for Inventory Optimization

- Monte Carlo simulations can help optimize inventory levels by modeling demand variability, lead time, and reorder policies. This helps in reducing stockouts, minimizing holding costs, and improving supply chain efficiency.

In [41]:
# Aggregate sales per week
df["Week"] = df["Order Date"].dt.to_period("W")
weekly_demand = df.groupby("Week")["Sales"].sum().reset_index()
weekly_demand["Week"] = weekly_demand["Week"].dt.start_time  # Convert period to datetime

# Summary statistics for demand
mean_demand = weekly_demand["Sales"].mean()
std_demand = weekly_demand["Sales"].std()

print(f"Average Weekly Demand: {mean_demand:.2f}")
print(f"Standard Deviation of Demand: {std_demand:.2f}")

Average Weekly Demand: 60204.31
Standard Deviation of Demand: 27331.54


In [43]:
def monte_carlo_inventory_simulation(
    mean_demand, std_demand, lead_time_weeks=2, reorder_point_factor=1.5, simulations=10000
):
    """
    Simulates inventory levels using Monte Carlo to optimize reorder point.
    """
    results = []

    for _ in range(simulations):
        # Simulate demand variation (Assuming normal distribution)
        demand_sample = np.random.normal(mean_demand, std_demand, lead_time_weeks)

        # Calculate total demand over lead time
        total_demand = sum(demand_sample)

        # Simulate lead time variation (Assuming uniform distribution)
        lead_time = np.random.uniform(1, 3)  # Lead time fluctuates between 1-3 weeks

        # Safety stock calculation (Reorder Point Strategy)
        safety_stock = reorder_point_factor * std_demand
        reorder_point = (mean_demand * lead_time) + safety_stock

        # Stockout risk calculation
        stockout = total_demand > reorder_point

        results.append({
            "Total Demand": total_demand,
            "Lead Time": lead_time,
            "Reorder Point": reorder_point,
            "Stockout Occurrence": stockout
        })

    return pd.DataFrame(results)

# Run Monte Carlo Simulation
sim_results = monte_carlo_inventory_simulation(mean_demand, std_demand)

# Display simulation results
display(sim_results)


Unnamed: 0,Total Demand,Lead Time,Reorder Point,Stockout Occurrence
0,111029.994314,2.483149,190493.604491,False
1,38976.809168,2.271775,177767.952527,False
2,105517.402254,2.432806,187462.692887,False
3,73522.071016,1.557811,134784.264235,False
4,100998.735955,2.725383,205077.112206,False
...,...,...,...,...
9995,86035.580016,2.597114,197354.743328,False
9996,134321.959249,1.811559,150060.950445,False
9997,143459.475064,2.078308,166120.401201,False
9998,147742.329070,1.451834,128403.961466,True


In [45]:
import plotly.express as px

# Create an interactive histogram with KDE
fig = px.histogram(
    sim_results,
    x="Total Demand",
    nbins=30,  # Set number of bins
    title="Simulated Demand Distribution Over Lead Time",
    labels={"Total Demand": "Total Demand During Lead Time"},
    marginal="rug",  # Adds small marks for each data point
    opacity=0.7
)

# Add vertical lines for mean and standard deviations
fig.add_vline(x=mean_demand, line=dict(color="red", dash="dash"), annotation_text="Mean Demand", annotation_position="top left")
fig.add_vline(x=mean_demand + std_demand, line=dict(color="green", dash="dash"), annotation_text="Mean + 1 Std Dev", annotation_position="top left")
fig.add_vline(x=mean_demand - std_demand, line=dict(color="green", dash="dash"), annotation_text="Mean - 1 Std Dev", annotation_position="top left")

# Improve layout
fig.update_layout(
    xaxis_title="Total Demand During Lead Time",
    yaxis_title="Frequency",
    template="plotly_white"
)

# Show interactive plot
fig.show()


In [46]:
# Calculate Stockout Probability
stockout_rate = sim_results["Stockout Occurrence"].mean()
optimal_reorder_point = sim_results["Reorder Point"].mean()

print(f"Stockout Probability: {stockout_rate * 100:.2f}%")
print(f"Optimal Reorder Point: {optimal_reorder_point:.2f} units")

# Plot Reorder Point vs Demand
fig = go.Figure()
fig.add_trace(go.Histogram(x=sim_results["Total Demand"], name="Simulated Demand", opacity=0.75))
fig.add_trace(go.Scatter(
    x=[optimal_reorder_point, optimal_reorder_point],
    y=[0, 500],
    mode="lines",
    name="Optimal Reorder Point",
    line=dict(color="red", dash="dash")
))
fig.update_layout(title="Optimal Inventory Reorder Point", xaxis_title="Demand", yaxis_title="Frequency")
fig.show()


Stockout Probability: 22.13%
Optimal Reorder Point: 160972.34 units
