In [1]:
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objects as go

import matplotlib.pyplot as plt
import seaborn as sns
import threading
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor
from pymongo import MongoClient, errors

import warnings
warnings.filterwarnings("ignore")

pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

In [2]:
def concat_regular_parquet_files(folder_path, output_path=None, recursive=False):
    all_files = []
    
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".parquet") and not file.endswith("_noos_adjusted.parquet"):
                all_files.append(os.path.join(root, file))
        if not recursive:
            break

    if not all_files:
        raise FileNotFoundError("No .parquet files found in the given folder.")

    df_list = [pd.read_parquet(fp) for fp in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)

    if output_path:
        combined_df.to_parquet(output_path, index=False)
        print(f"✅ Saved concatenated file to: {output_path}")

    return combined_df

def concat_noos_parquet_files(folder_path, output_path=None, recursive=False):
    all_files = []
    
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".parquet") and file.endswith("_noos_adjusted.parquet"):
                all_files.append(os.path.join(root, file))
        if not recursive:
            break

    if not all_files:
        raise FileNotFoundError("No .parquet files found in the given folder.")

    df_list = [pd.read_parquet(fp) for fp in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)

    if output_path:
        combined_df.to_parquet(output_path, index=False)
        print(f"✅ Saved concatenated file to: {output_path}")

    return combined_df

In [3]:
regular_forecast = concat_regular_parquet_files("/Users/trentino/Work/OFM/droplet/2026 Forecast/forecasted_data", )

In [4]:
sales_data = pd.read_parquet("/Users/trentino/Work/OFM/droplet/2026 Forecast/data/clean_sales_data.parquet")

In [5]:
internal_brands = ['Campbell','Donkervoort','Dutch Dandies','Recall','The BLUEPRINT Premium','Nobel','Runway PARTY','J.C. RAGS']
internal_brands_list = [618, 427, 301, 438, 228, 1000, 804, 876]

In [6]:
regular_sales_data = sales_data[sales_data['season'].isin(["Summer"])].reset_index(drop=True)

In [7]:
summer_others = ['Skin care', 'Cufflinks', 'Colbert', 'Scents', 'Espadrilles', 'Wedding suit', 'Gadgets', 'Diversen', 'Instapper', 'Trendy shirt SS', 'Apron', 'Bermuda']

regular_articles = ['Polo SS', 'T-shirt SS', 'Short', 'Casual shirt SS', 'Sneakers', 'Jack outdoor', 'Swim short', 'Tie', 'Mix & match trousers',  "Others"]

In [8]:
regular_sales_data.loc[(regular_sales_data["articleGroupDescription"].isin(summer_others)) & (regular_sales_data['season'] == "Summer"), "articleGroupDescription"] = "Others"

regular_sales_data = regular_sales_data[regular_sales_data["articleGroupDescription"].isin(regular_articles)].reset_index(drop=True)
uncombined_sales_data = regular_sales_data.copy()
regular_sales_data = regular_sales_data.groupby(["salesDate", "articleGroupDescription", "brandDescription", "Inhouse_Brand"]).agg({"quantity":"sum"}).reset_index()

In [9]:
regular_forecast.loc[(regular_forecast['articleGroupDescription'].isin(regular_articles)) & (regular_forecast['Season'].isna()), "Season"] = "Summer"

In [10]:
regular_forecast[regular_forecast['Season'] == 'Summer']

Unnamed: 0,salesDate,articleGroupDescription,Approach,SalesForecast,Season
2,2026-01-01 to 2027-04-01,Jack outdoor,Adj: Historical,7721.0,Summer
12,2026-01-01 to 2027-04-01,Short,Adj: Historical,14480.0,Summer
13,2026-01-01 to 2027-04-01,Swim short,Adj: Pace,5372.0,Summer
15,2026-01-01 to 2027-04-01,Mix & match trousers,Adj: Historical,6251.0,Summer
16,2026-01-01 to 2027-04-01,Polo SS,Adj: Historical,52255.0,Summer
19,2026-01-01 to 2027-04-01,Tie,Adj: Historical,3738.0,Summer
21,2026-01-01 to 2027-04-01,Others,Raw Model,958.0,Summer
23,2026-01-01 to 2027-04-01,Sneakers,Adj: Historical,11227.0,Summer
30,2026-01-01 to 2027-04-01,T-shirt SS,Adj: Historical,38927.0,Summer
33,2026-01-01 to 2027-04-01,Casual shirt SS,Adj: Historical,17726.0,Summer


In [11]:
regular_forecast = regular_forecast[regular_forecast['Season'] == 'Summer']

In [12]:
inhouse_brands = ['Campbell',
 'Donkervoort',
 'Dutch Dandies',
 'Recall',
 'The BLUEPRINT Premium',
 'Nobel',
 'Runway PARTY',
 'J.C. RAGS']

regular_sales_data.loc[~(regular_sales_data["brandDescription"].isin(inhouse_brands)), "brandDescription"] = "Others"

In [13]:
def allocate_forecast_to_brands(sales_df, forecast_df, inhouse_brands, historical_weight=0.5):
    sales_df = sales_df.copy()
    sales_df["salesDate"] = pd.to_datetime(sales_df["salesDate"])

    # Define date ranges
    hist_mask = (sales_df["salesDate"].dt.month >= 1) & (sales_df["salesDate"].dt.month <= 12) & (sales_df["salesDate"].dt.year.isin([2023, 2024]))
    recent_mask = (sales_df["salesDate"] >= "2025-01-01") & (sales_df["salesDate"] <= "2025-12-31")

    def compute_brand_share(df, label):
        group = df.groupby(["articleGroupDescription", "brandDescription"])["quantity"].sum().reset_index()
        total = group.groupby("articleGroupDescription")["quantity"].transform("sum")
        group["share"] = group["quantity"] / total
        group.rename(columns={"share": f"{label}_share"}, inplace=True)
        return group[["articleGroupDescription", "brandDescription", f"{label}_share"]]

    hist_share = compute_brand_share(sales_df[hist_mask], "hist")
    recent_share = compute_brand_share(sales_df[recent_mask], "recent")

    # Merge and blend
    merged = pd.merge(hist_share, recent_share, on=["articleGroupDescription", "brandDescription"], how="outer").fillna(0)
    merged["blended_share"] = historical_weight * merged["hist_share"] + (1 - historical_weight) * merged["recent_share"]

    # Merge forecasted article sales
    forecast_df = forecast_df.copy()
    merged_forecast = pd.merge(forecast_df, merged, on="articleGroupDescription", how="left")

    # Calculate brand-level forecast
    merged_forecast["BrandForecast"] = merged_forecast["SalesForecast"] * merged_forecast["blended_share"]
    merged_forecast["BrandForecast"] = round(merged_forecast["BrandForecast"], 0)
    
    # Tag internal/external brands
    merged_forecast["Inhouse_Brand"] = merged_forecast["brandDescription"].isin(inhouse_brands)

    return merged_forecast[[
        "salesDate", "articleGroupDescription", "brandDescription", "Inhouse_Brand", 
        "Approach", "SalesForecast", "blended_share", "BrandForecast"
    ]]

def historical_brand_breakdown(sales_data, start_year=2023, end_year=2024, inhouse_brands=None):
    if inhouse_brands is None:
        inhouse_brands = []

    # Filter for historical Mar–Sept sales (exclude 2025)
    hist_mask = (
        (sales_data["salesDate"].dt.year >= start_year) &
        (sales_data["salesDate"].dt.year <= end_year) &
        (sales_data["salesDate"].dt.month >= 1) &
        (sales_data["salesDate"].dt.month <= 12)
    )
    hist_df = sales_data.loc[hist_mask].copy()

    # Group by article + brand and sum quantities
    group = hist_df.groupby(["articleGroupDescription", "brandDescription"])["quantity"].sum().reset_index()

    # Compute total sales per article to get brand share
    total = group.groupby("articleGroupDescription")["quantity"].transform("sum")
    group["brand_share"] = group["quantity"] / total

    # Also calculate total article-level sales
    article_totals = group.groupby("articleGroupDescription")["quantity"].sum().reset_index()
    article_totals = article_totals.rename(columns={"quantity": "article_total_sales"})

    # Merge article totals back to group
    merged = pd.merge(group, article_totals, on="articleGroupDescription")

    # Estimate brand-level "forecast-like" numbers from historical totals
    merged["brand_quantity_estimate"] = merged["brand_share"] * merged["article_total_sales"]

    # Flag in-house brands
    merged["Inhouse_Brand"] = merged["brandDescription"].isin(inhouse_brands)

    return merged

In [14]:
historical_brand_summary = historical_brand_breakdown(regular_sales_data, start_year=2023, end_year=2025, inhouse_brands=None)

grouped = historical_brand_summary.groupby("Inhouse_Brand").agg({"brand_quantity_estimate": "sum"}).reset_index()
total_quantity = grouped["brand_quantity_estimate"].sum()
grouped["contribution_%"] = grouped["brand_quantity_estimate"] /  total_quantity * 100
grouped["contribution_%"] = grouped["contribution_%"].round(2)
grouped

Unnamed: 0,Inhouse_Brand,brand_quantity_estimate,contribution_%
0,False,397391.0,100.0


In [15]:
# brands = list(sales_data['brandDescription'].dropna().unique())

In [16]:
bifurcated_sales = allocate_forecast_to_brands(regular_sales_data, regular_forecast, inhouse_brands)
grouped = bifurcated_sales.groupby("Inhouse_Brand").agg({
    "BrandForecast": "sum"
}).reset_index()
total_quantity = grouped["BrandForecast"].sum()
grouped["contribution_%"] = grouped["BrandForecast"] / total_quantity * 100
grouped["contribution_%"] = grouped["contribution_%"].round(2)
grouped

Unnamed: 0,Inhouse_Brand,BrandForecast,contribution_%
0,False,104754.0,66.03
1,True,53901.0,33.97


In [17]:
sales_forecasted = bifurcated_sales[["salesDate", "articleGroupDescription", "brandDescription", "Inhouse_Brand", "BrandForecast"]]

sales_forecasted = sales_forecasted.rename(columns = {"BrandForecast":"quantity"})

sales_forecasted["category"] = "Forecasted Sales"

sales_forecasted["Inhouse_Brand"] = sales_forecasted["Inhouse_Brand"].replace({True:"Internal", False:"External"})

sales_forecasted["salesDate"] = pd.to_datetime(datetime(2026, 12, 31))

In [18]:
sales_forecasted.head(1)

Unnamed: 0,salesDate,articleGroupDescription,brandDescription,Inhouse_Brand,quantity,category
0,2026-12-31,Jack outdoor,Campbell,Internal,1962.0,Forecasted Sales


In [19]:
uncombined_sales_data.loc[(uncombined_sales_data["Discount%"] == 0), "IsDiscount"] = True
uncombined_sales_data["IsDiscount"] = uncombined_sales_data["IsDiscount"].fillna(False)

# Step 1: Calculate sales proportions (discount vs non-discount)
grouped = uncombined_sales_data[(uncombined_sales_data["salesDate"] >= "2022-01-01")].groupby(
    ["articleGroupDescription", "brandDescription", "IsDiscount"]
).agg({"quantity": "sum"}).reset_index()

# Total sales per article-brand
totals = grouped.groupby(
    ["articleGroupDescription", "brandDescription"]
)["quantity"].sum().reset_index().rename(columns={"quantity": "total_quantity"})

# Merge to compute proportion
merged = pd.merge(grouped, totals, on=["articleGroupDescription", "brandDescription"])
merged["sales_proportion"] = merged["quantity"] / merged["total_quantity"]

# Pivot into separate columns for True and False
proportions_pivot = merged.pivot(
    index=["articleGroupDescription", "brandDescription"],
    columns="IsDiscount",
    values="sales_proportion"
).reset_index()

# Handle missing columns by filling with 0, then rename
proportions_pivot = proportions_pivot.rename(columns={
    True: "discount_prop",
    False: "non_discount_prop"
})

# Ensure both columns exist
if "discount_prop" not in proportions_pivot.columns:
    proportions_pivot["discount_prop"] = 0.0
if "non_discount_prop" not in proportions_pivot.columns:
    proportions_pivot["non_discount_prop"] = 0.0

# Step 2: Pricing - max price (non-discount), avg price (discounted but less than max)
non_discount_price_df = uncombined_sales_data[(uncombined_sales_data["IsDiscount"] == False) & (uncombined_sales_data["salesDate"] >= "2024-01-01")] \
    .groupby(["articleGroupDescription", "brandDescription"])["retailPrice"].mean().reset_index() \
    .rename(columns={"retailPrice": "non_discount_price"})

discount_df = pd.merge(
    uncombined_sales_data[uncombined_sales_data["IsDiscount"] == True],
    non_discount_price_df,
    on=["articleGroupDescription", "brandDescription"],
    how="inner"
)

discount_df = discount_df[discount_df["retailPrice"] < discount_df["non_discount_price"]]

discount_price_df = uncombined_sales_data[(uncombined_sales_data["salesDate"] >= "2024-01-01")].groupby(["articleGroupDescription", "brandDescription"]) \
    .agg({"retailPrice": "mean"}).reset_index() \
    .rename(columns={"retailPrice": "discount_price"})

# Step 3: Combine price info
price_data_cleaned = pd.merge(non_discount_price_df, discount_price_df,
                              on=["articleGroupDescription", "brandDescription"], how="left")

# Step 4: Combine proportion + pricing
pricing_data = pd.merge(proportions_pivot, price_data_cleaned,
                        on=["articleGroupDescription", "brandDescription"], how="left")

# Step 5: Merge with forecast
sales_forecasted = pd.merge(sales_forecasted, pricing_data,
                            on=["articleGroupDescription", "brandDescription"], how="left")

# Ensure fill for proportions (if any are missing)
sales_forecasted["discount_prop"] = sales_forecasted["discount_prop"].fillna(0)
sales_forecasted["non_discount_prop"] = sales_forecasted["non_discount_prop"].fillna(1)

# Step 6: Project sales and revenue
sales_forecasted["discount_sales"] = sales_forecasted["quantity"] * sales_forecasted["discount_prop"]
sales_forecasted["non_discount_sales"] = sales_forecasted["quantity"] * sales_forecasted["non_discount_prop"]

sales_forecasted["discount_revenue"] = sales_forecasted["discount_sales"] * sales_forecasted["discount_price"]
sales_forecasted["non_discount_revenue"] = sales_forecasted["non_discount_sales"] * sales_forecasted["non_discount_price"]

sales_forecasted["retailPrice"] = (
    sales_forecasted["discount_revenue"].fillna(0) + sales_forecasted["non_discount_revenue"].fillna(0)
).round(2)

# Final output
sales_forecasted = sales_forecasted[[
"salesDate", "articleGroupDescription", "brandDescription", "Inhouse_Brand", "quantity", "category", "retailPrice"
]]

print(sales_forecasted["retailPrice"].sum())

sales_forecasted.head()

2241435.02


Unnamed: 0,salesDate,articleGroupDescription,brandDescription,Inhouse_Brand,quantity,category,retailPrice
0,2026-12-31,Jack outdoor,Campbell,Internal,1962.0,Forecasted Sales,147329.56
1,2026-12-31,Jack outdoor,Donkervoort,Internal,218.0,Forecasted Sales,10704.45
2,2026-12-31,Jack outdoor,Dutch Dandies,Internal,1.0,Forecasted Sales,0.0
3,2026-12-31,Jack outdoor,J.C. RAGS,Internal,521.0,Forecasted Sales,25291.21
4,2026-12-31,Jack outdoor,Others,External,5019.0,Forecasted Sales,0.0


In [20]:
regular_articles

['Polo SS',
 'T-shirt SS',
 'Short',
 'Casual shirt SS',
 'Sneakers',
 'Jack outdoor',
 'Swim short',
 'Tie',
 'Mix & match trousers',
 'Others']

In [21]:
uncombined_sales_data[uncombined_sales_data['articleGroupDescription'].isin(regular_articles)].groupby(["year","articleGroupDescription"]).agg({"quantity":"sum", "retailPrice":"sum"})

Unnamed: 0_level_0,Unnamed: 1_level_0,quantity,retailPrice
year,articleGroupDescription,Unnamed: 2_level_1,Unnamed: 3_level_1
2019,Casual shirt SS,10166.0,542418.44
2019,Jack outdoor,2242.0,473609.5
2019,Others,3632.0,259812.42
2019,Polo SS,30583.0,1910710.36
2019,Short,11095.0,696681.25
2019,Sneakers,3661.0,511238.95
2019,Swim short,1701.0,81416.78
2019,T-shirt SS,12492.0,513994.58
2019,Tie,2878.0,143606.29
2020,Casual shirt SS,9120.0,454609.62


In [22]:
sales_forecasted.groupby("articleGroupDescription").agg({"quantity":"sum", "retailPrice":"sum"})

Unnamed: 0_level_0,quantity,retailPrice
articleGroupDescription,Unnamed: 1_level_1,Unnamed: 2_level_1
Casual shirt SS,17726.0,381033.55
Jack outdoor,7721.0,183325.22
Mix & match trousers,6251.0,321691.98
Others,957.0,25291.29
Polo SS,52256.0,824681.98
Short,14481.0,149036.42
Sneakers,11227.0,31995.11
Swim short,5372.0,35861.37
T-shirt SS,38926.0,274789.25
Tie,3738.0,13728.85


In [23]:
external_others_articles = list(sales_forecasted[(sales_forecasted['retailPrice'] == 0.0)]['articleGroupDescription'].unique())

temp_data = sales_data[(sales_data['articleGroupDescription'].isin(external_others_articles)) &
                       (sales_data['Inhouse_Brand'] == False)&
                       (sales_data['season'].isin(['Summer', "Winter"]))].groupby("articleGroupDescription").agg({'quantity':"sum","retailPrice":"sum"}).reset_index()

temp_data['avg_cost'] = round(temp_data['retailPrice'] / temp_data['quantity'], 1)

sales_forecasted = pd.merge(sales_forecasted, temp_data[['articleGroupDescription', 'avg_cost']], on = 'articleGroupDescription', how = 'left')

sales_forecasted.loc[(sales_forecasted['retailPrice'] == 0.0), 'retailPrice'] = round(sales_forecasted['quantity'] * sales_forecasted['avg_cost'], 2)

sales_forecasted = sales_forecasted.drop(columns = 'avg_cost')

In [24]:
sales_forecasted['quantity'].sum(), sales_forecasted['retailPrice'].sum()

(np.float64(158655.0), np.float64(11691293.62))

In [25]:
uncombined_sales_data["year"] = uncombined_sales_data["salesDate"].dt.year
uncombined_sales_data["month"] = uncombined_sales_data["salesDate"].dt.month

In [26]:
uncombined_sales_data[(uncombined_sales_data["month"].isin([1,2,3,4,5,6,7,8,9,10,11,12]))].groupby("year").agg({"retailPrice":"sum", "quantity":'sum'}).reset_index()

Unnamed: 0,year,retailPrice,quantity
0,2019,5133488.57,78450.0
1,2020,5770478.91,86080.0
2,2021,7814637.29,102512.0
3,2022,9546280.84,123345.0
4,2023,10099323.93,139114.0
5,2024,10668821.43,135913.0
6,2025,10034412.51,122364.0


In [27]:
sales_forecasted.loc[sales_forecasted["articleGroupDescription"].isin(regular_articles), "season"] = "Summer"

In [28]:
# sales_forecasted['season'] = sales_forecasted['season'].fillna("Year Round")

In [35]:
# sales_forecasted.to_csv("/Users/trentino/Work/OFM/droplet/2026 Forecast/data/sales_forecast_summer_regular_2026_season.csv", index=False)

In [30]:
uncombined_sales_data.loc[uncombined_sales_data["articleGroupDescription"].isin(regular_articles), "season"] = "Summer"

In [31]:
combined_summ = uncombined_sales_data.groupby(['year', "Inhouse_Brand", 'season', 'articleGroupDescription']).agg({"quantity":"sum", "retailPrice":"sum"}).reset_index()

In [38]:
combined_summ[(combined_summ['year'] == 2024) & (combined_summ['articleGroupDescription'] == "Others")]

Unnamed: 0,year,Inhouse_Brand,season,articleGroupDescription,quantity,retailPrice
93,2024,False,Summer,Others,17.0,2338.54
103,2024,True,Summer,Others,648.0,45143.29


In [33]:
combined_summ[(combined_summ['year'] == 2025) & (combined_summ['articleGroupDescription'] == "Others")]

Unnamed: 0,year,Inhouse_Brand,season,articleGroupDescription,quantity,retailPrice
113,2025,False,Summer,Others,467.0,11318.07
123,2025,True,Summer,Others,116.0,15143.94


In [42]:
combined_summ[(combined_summ['year'] == 2024) & (combined_summ['Inhouse_Brand'] == True)]

Unnamed: 0,year,Inhouse_Brand,season,articleGroupDescription,quantity,retailPrice
100,2024,True,Summer,Casual shirt SS,8517.0,301044.77
101,2024,True,Summer,Jack outdoor,1855.0,208252.1
102,2024,True,Summer,Mix & match trousers,1529.0,214948.82
103,2024,True,Summer,Others,648.0,45143.29
104,2024,True,Summer,Polo SS,19433.0,972290.88
105,2024,True,Summer,Short,5834.0,293629.8
106,2024,True,Summer,Sneakers,297.0,28795.98
107,2024,True,Summer,Swim short,831.0,30712.85
108,2024,True,Summer,T-shirt SS,6966.0,247729.28
109,2024,True,Summer,Tie,98.0,4278.81
