In [1]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime, timedelta
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Plot appearance. Order matters: the matplotlib style sheet is applied first,
# then seaborn's "husl" palette is set on top of it.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Banner confirming the setup cell ran, with a wall-clock timestamp of the run.
print("Libraries imported successfully!")
print(f"Analysis run on: {datetime.now():%Y-%m-%d %H:%M:%S}")

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 3, Finished, Available, Finished)

Libraries imported successfully!
Analysis run on: 2025-07-26 08:14:00


In [2]:
# Decide where input files are read from.
# The presence of the AZURE_SERVICE environment variable is treated as
# "running in Fabric"; otherwise paths are resolved relative to the current
# working directory (empty prefix).
if "AZURE_SERVICE" in os.environ:
    is_fabric = True
    data_location = "abfss://7e373771-c704-4855-bb94-026ffb6be497@onelake.dfs.fabric.microsoft.com/740e989a-d750-4fd9-a4d9-def5fe22a5db/Files/forecasting/"
    # Report the location that is actually used; the previous message named
    # /lakehouse/default/Files/, which is not the value assigned above.
    print(f"Running in Fabric, setting data location to {data_location}")
else:
    is_fabric = False
    data_location = ""
    print("Running locally, setting data location to current directory")

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 4, Finished, Available, Finished)

Running in Fabric, setting data location to /lakehouse/default/Files/


In [3]:
# Read the combined monthly sales + economic table, parse its Date column and
# put the rows in chronological order.
# An alternative input (file name suggests it adds trend/seasonality columns):
#   data_location + 'modelGeneratedData/overall_monthly_with_trend_seasonality_economic_and_future.csv'
data = (
    pd.read_csv(data_location + 'modelGeneratedData/overall_monthly_with_economic_and_future.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by='Date')
)

# Quick orientation: size, covered period and column dtypes, then a preview.
print("=== DATASET OVERVIEW ===")
print(f"Dataset shape: {data.shape}")
print(f"Date range: {data['Date'].min()} to {data['Date'].max()}")
print("\nData types:")
print(data.dtypes)
print("\nFirst few rows:")
data.head(5)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 5, Finished, Available, Finished)

=== DATASET OVERVIEW ===
Dataset shape: (121, 57)
Date range: 2015-01-01 00:00:00 to 2025-01-01 00:00:00

Data types:
Date                                  datetime64[ns]
Quantity Invoiced                              int64
Quantity Invoiced Mean                       float64
Quantity Invoiced Count                        int64
Unique Customers                               int64
Unique Products                                int64
Unique Categories                              int64
Unique SubCategories                           int64
Unique EndMarkets L1                           int64
Unique EndMarkets L2                           int64
future_orders_count                            int64
future_orders_qty_total                        int64
future_orders_qty_next_1m                      int64
future_orders_qty_next_3m                      int64
future_orders_qty_next_6m                      int64
future_orders_qty_next_12m                     int64
future_orders_avg_lead_time       

Unnamed: 0,Date,Quantity Invoiced,Quantity Invoiced Mean,Quantity Invoiced Count,Unique Customers,Unique Products,Unique Categories,Unique SubCategories,Unique EndMarkets L1,Unique EndMarkets L2,...,data_Factory_Utilization,data_Capacity_Utilization,Electricity Price,Electricity Price (Lag6),Gas Price,Gas Price (Lag6),Global Supply Chain Pressure Index,GSCPI (Lag1),Manufacturing Orders Volume Index,MOVI (Lag6)
0,2015-01-01,22725114,68039.26347,334,160,53,4,15,3,5,...,0.4302,76.7556,0.2276,0.2302,14.46,15.23,-0.5,-0.39,86.8,76.1
1,2015-02-01,23032809,63276.9478,364,183,52,4,15,3,5,...,0.4302,76.7556,0.2276,0.2302,14.46,15.23,-0.32,-0.5,98.8,89.3
2,2015-03-01,27527951,76679.52925,359,180,50,4,12,3,5,...,0.4302,76.7556,0.2276,0.2302,14.46,15.23,-0.38,-0.32,90.0,91.5
3,2015-04-01,25864804,68789.37234,376,177,58,4,15,3,6,...,0.4372,76.8014,0.2276,0.2302,14.46,15.23,-0.35,-0.38,83.3,87.4
4,2015-05-01,22517479,77379.65292,291,146,51,4,14,3,5,...,0.4381,76.8657,0.2276,0.2302,14.46,15.23,-0.54,-0.35,98.0,88.1


In [4]:
# Load the order export and aggregate it to one row per calendar month.
# pandas is already imported as `pd` in the first cell, so it is not
# re-imported here.
orders_created_date = pd.read_csv("/lakehouse/default/Files/Orders by Created Date Past 10 Years.csv")

# Snap every order date to the first day of its month so it matches the
# month-start `Date` values used in `data`.
orders_created_date['Date'] = pd.to_datetime(orders_created_date['Attribute Full Date']).dt.to_period('M').dt.to_timestamp()

# Clean Quantity Ordered (remove thousands-separator commas and convert to float)
orders_created_date['Quantity Ordered'] = orders_created_date['Quantity Ordered'].replace({',': ''}, regex=True).astype(float)

# Clean Net Order Value USD the same way
orders_created_date['Net Order Value USD'] = orders_created_date['Net Order Value USD'].replace({',': ''}, regex=True).astype(float)

# Monthly totals of both measures, with Date returned as a regular column
orders_grouped = orders_created_date.groupby(['Date'])[['Quantity Ordered', 'Net Order Value USD']].sum().reset_index()
display(orders_grouped)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 6, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 9d4c5518-acb6-4e6d-b169-532bf5f80c16)

In [5]:
# Left-join the monthly net order value onto the modelling frame by Date;
# months absent from orders_grouped get NaN.
data = pd.merge(
    data,
    orders_grouped.loc[:, ['Date', 'Net Order Value USD']],
    how='left',
    on='Date',
)
display(data)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 7, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, ac04e6ec-a931-4629-864a-b16262f6214c)

In [6]:
# Load the Germany CPI file (file name suggests a monthly series with a 2020 base)
# and parse its Date column so it can be joined on Date later.
# pandas is already imported as `pd` in the first cell, so it is not
# re-imported here.
cpi_data = pd.read_csv("/lakehouse/default/Files/forecasting/userProvidedData/Germany Economic data/CPI_Monthly_2020Base_Germany.csv")

cpi_data['Date'] = pd.to_datetime(cpi_data['Date'])
display(cpi_data)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 2d0ea34a-a232-4561-adc9-4ff5b6e46594)

In [7]:
# Left-join the CPI column onto the modelling frame by Date;
# dates absent from cpi_data get NaN.
data = pd.merge(
    data,
    cpi_data.loc[:, ['Date', 'CPI']],
    how='left',
    on='Date',
)
display(data)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 2060d175-5e2f-406c-92e4-c50d608b7904)

In [8]:
# Load the HICP file from the Germany economic data folder and parse its Date
# column so it can be joined on Date later.
# pandas is already imported as `pd` in the first cell, so it is not
# re-imported here.
hicp_data = pd.read_csv("/lakehouse/default/Files/forecasting/userProvidedData/Germany Economic data/hicp_data.csv")

hicp_data['Date'] = pd.to_datetime(hicp_data['Date'])
display(hicp_data)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 10, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a27b8c2a-3fb1-4af0-aa17-8d0c8c5b9c46)

In [9]:
# Left-join the HICP column onto the modelling frame by Date;
# dates absent from hicp_data get NaN.
data = pd.merge(
    data,
    hicp_data.loc[:, ['Date', 'HICP']],
    how='left',
    on='Date',
)
display(data)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 11, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 3e5849a8-afd3-4993-b348-329deb694f5b)

In [10]:
# Load the new-manufacturing-orders file (v2) from the Germany economic data
# folder and parse its Date column so it can be joined on Date later.
# pandas is already imported as `pd` in the first cell, so it is not
# re-imported here.
manufacturing_orders = pd.read_csv("/lakehouse/default/Files/forecasting/userProvidedData/Germany Economic data/new_manufacturing_orders_v2.csv")

manufacturing_orders['Date'] = pd.to_datetime(manufacturing_orders['Date'])
display(manufacturing_orders)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 12, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 05b6be20-bb85-4938-ab7e-8aa95c8bcde3)

In [11]:
# Left-join the manufacturing orders volume_index column onto the modelling
# frame by Date; dates absent from manufacturing_orders get NaN.
data = pd.merge(
    data,
    manufacturing_orders.loc[:, ['Date', 'volume_index']],
    how='left',
    on='Date',
)
display(data)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 13, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, cfd56b33-6e53-484f-a625-2cff1bb26aa1)

In [12]:
# Load the crude oil price file from the Germany economic data folder and
# parse its Date column so it can be joined on Date later.
# pandas is already imported as `pd` in the first cell, so it is not
# re-imported here.
df_crude_oil = pd.read_csv("/lakehouse/default/Files/forecasting/userProvidedData/Germany Economic data/Crude Oil.csv")

df_crude_oil['Date'] = pd.to_datetime(df_crude_oil['Date'])
display(df_crude_oil)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 14, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 55db7414-4783-47ce-8b95-4f2beb5e0d99)

In [13]:
# Left-join the Brent crude price (EUR) column onto the modelling frame by
# Date; dates absent from df_crude_oil get NaN.
data = pd.merge(
    data,
    df_crude_oil.loc[:, ['Date', 'Crude Oil Brent Europe Price in EUR']],
    how='left',
    on='Date',
)
display(data)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 15, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 9b093dba-89e3-4269-acbc-54c219b75c50)

In [14]:
# Load the Global Supply Chain Pressure Index (GSCPI) file.
# pandas is already imported as `pd` in the first cell, so it is not
# re-imported here.
df_gscpi = pd.read_csv("/lakehouse/default/Files/forecasting/userProvidedData/Germany Economic data/Global Supply Chain Pressure Index (GSCPI) - updated.csv")

# Dates are parsed with an explicit month/day/two-digit-year format.
df_gscpi['Date'] = pd.to_datetime(df_gscpi['Date'], format='%m/%d/%y')

# Move every date to the first day of its month so it matches the month-start
# `Date` values used in `data`. The assignment form replaces the former
# in-place set_index; the resulting frame is the same.
df_gscpi = df_gscpi.set_index('Date')
df_gscpi.index = df_gscpi.index.to_period('M').to_timestamp(how='start')  # Shift to 1st of month
df_gscpi = df_gscpi.reset_index()
display(df_gscpi)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 16, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, bcb33678-d58c-4721-91ed-5c0fd2c1d829)

In [15]:
# Left-join the GSCPI column onto the modelling frame by Date.
# NOTE: `data` already carries a 'Global Supply Chain Pressure Index' column
# from the base CSV, so pandas keeps both copies and suffixes them:
# '..._x' (existing) and '..._y' (from df_gscpi).
data = pd.merge(
    data,
    df_gscpi.loc[:, ['Date', 'Global Supply Chain Pressure Index']],
    how='left',
    on='Date',
)
display(data)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 17, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, b31db4b8-f20f-4647-9fe7-2ef0801b572f)

In [16]:
# Load the USD-to-EUR historical exchange rate file from the Germany economic
# data folder and parse its Date column so it can be joined on Date later.
# pandas is already imported as `pd` in the first cell, so it is not
# re-imported here.
df_exchange_rate = pd.read_csv("/lakehouse/default/Files/forecasting/userProvidedData/Germany Economic data/USD to EUR historical exchange rates.csv")

df_exchange_rate['Date'] = pd.to_datetime(df_exchange_rate['Date'])
display(df_exchange_rate)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 18, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a98987bc-cdea-4c8c-93b4-387c9e78c678)

In [17]:
# Left-join the USD-to-EUR exchange rate column onto the modelling frame by
# Date; dates absent from df_exchange_rate get NaN.
data = pd.merge(
    data,
    df_exchange_rate.loc[:, ['Date', 'USD to EUR Exchange Rate']],
    how='left',
    on='Date',
)
display(data)

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 19, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 16181b9d-e0a5-41cf-9304-54303b234304)

In [18]:
# Remove pandas' display row limit (a session-wide setting) so long outputs
# are not truncated.
pd.options.display.max_rows = None

# Missing-value count per column; with the limit lifted, all 64 rows of the
# summary are shown.
data.isnull().sum()

StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 20, Finished, Available, Finished)

Date                                      0
Quantity Invoiced                         0
Quantity Invoiced Mean                    0
Quantity Invoiced Count                   0
Unique Customers                          0
Unique Products                           0
Unique Categories                         0
Unique SubCategories                      0
Unique EndMarkets L1                      0
Unique EndMarkets L2                      0
future_orders_count                       0
future_orders_qty_total                   0
future_orders_qty_next_1m                 0
future_orders_qty_next_3m                 0
future_orders_qty_next_6m                 0
future_orders_qty_next_12m                0
future_orders_avg_lead_time               0
future_orders_min_lead_time               0
future_orders_max_lead_time               0
future_orders_due_next_month              0
future_orders_due_next_quarter            0
future_orders_unique_customers            0
future_orders_unique_products   

In [19]:
# Sanity check on the two suffixed GSCPI columns ('_x' and '_y'): True only
# when every row matches. NaN never equals NaN, so a True result also means
# neither column has a missing value.
data['Global Supply Chain Pressure Index_x'].eq(data['Global Supply Chain Pressure Index_y']).all()


StatementMeta(, b705ac7d-48f3-4765-88e8-bae77b912aee, 21, Finished, Available, Finished)

True