In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [5]:
# ===LOAD RAW DATA ===
df = pd.read_csv("../data/raw_sales_data.csv")
df.columns = df.columns.str.strip()

In [6]:
# ===CLEANING ===
df = df.dropna(subset=["Date"])
df = df.drop_duplicates()
df["Region"] = df["Region"].fillna("Unknown")
df["Product"] = df["Product"].replace("", "Unknown").fillna("Unknown")
df = df.dropna(subset=["Units Sold", "Unit Price"])
df["Date"] = pd.to_datetime(df["Date"])
df["Units Sold"] = df["Units Sold"].astype(int)
df["Unit Price"] = df["Unit Price"].astype(float)
df = df[df["Units Sold"] < 5000]
df["Revenue"] = df["Units Sold"] * df["Unit Price"]

category_map = {
    "Laptop": "Tech",
    "Tablet": "Tech",
    "Phone": "Mobile",
    "Monitor": "Accessories",
    "Unknown": "Other"
}
df["Category"] = df["Product"].map(category_map).fillna("Other")
df["Month"] = df["Date"].dt.to_period("M").astype(str)

In [7]:
# ===KPI CALCULATION ===
kpi = {
    "Total Revenue": df["Revenue"].sum(),
    "Total Units Sold": df["Units Sold"].sum(),
    "Average Order Value": df["Revenue"].sum() / len(df),
    "Revenue per Unit": df["Revenue"].sum() / df["Units Sold"].sum(),
    "Unique Products": df["Product"].nunique(),
    "Unique Regions": df["Region"].nunique()
}
for key, val in kpi.items():
    print(f"{key}: {val:,.2f}")

Total Revenue: 1,412,742.21
Total Units Sold: 2,657.00
Average Order Value: 2,616.19
Revenue per Unit: 531.71
Unique Products: 4.00
Unique Regions: 4.00


In [8]:
# ===FORECASTING ===
df["Month"] = pd.to_datetime(df["Date"]).dt.to_period("M").dt.to_timestamp()
monthly = df.groupby("Month")["Revenue"].sum().sort_index()

train = monthly.iloc[:-3]
test = monthly.iloc[-3:]

model = ExponentialSmoothing(train, trend="add", seasonal=None)
fitted = model.fit()
forecast = fitted.forecast(len(test))
mae = mean_absolute_error(test, forecast)
rmse = np.sqrt(mean_squared_error(test, forecast))

future_dates = pd.date_range(monthly.index[-1] + pd.offsets.MonthBegin(), periods=3, freq="MS")
future_forecast = fitted.forecast(len(test) + 3)[-3:]

print(f"Test MAE: {mae:.2f}")
print(f"Test RMSE: {rmse:.2f}")
for d, val in zip(future_dates.strftime("%Y-%m"), future_forecast):
    print(f"Forecast {d}: {val:,.2f}")

Test MAE: 14362.93
Test RMSE: 16428.03
Forecast 2023-07: 61,755.81
Forecast 2023-08: 58,909.37
Forecast 2023-09: 56,062.92


  self._init_dates(dates, freq)


In [9]:
# ===EXPORT CLEANED ===
df.to_excel("cleaned_sales_data.xlsx", index=False)
df.to_csv("cleaned_sales_data.csv", index=False)

In [10]:
# ===SAVE FORECAST PLOT ===
plt.figure(figsize=(10, 5))
plt.plot(train.index, train.values, label="Train", marker="o")
plt.plot(test.index, test.values, label="Test (actual)", marker="o")
plt.plot(test.index, forecast.values, label="Test (forecast)", linestyle="--", marker="x")
plt.plot(future_dates, future_forecast.values, label="Future Forecast", linestyle="--", marker="^")
plt.title("Monthly Revenue Forecast with Exponential Smoothing")
plt.xlabel("Month")
plt.ylabel("Revenue")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("monthly_forecast_es.png")
plt.close()