In [1]:
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objects as go

import matplotlib.pyplot as plt
import seaborn as sns
import threading
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor
from pymongo import MongoClient, errors

import warnings
warnings.filterwarnings("ignore")

pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

In [2]:
def concat_regular_parquet_files(folder_path, output_path=None, recursive=False):
    all_files = []
    
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".parquet") and not file.endswith("_noos_adjusted.parquet"):
                all_files.append(os.path.join(root, file))
        if not recursive:
            break

    if not all_files:
        raise FileNotFoundError("No .parquet files found in the given folder.")

    df_list = [pd.read_parquet(fp) for fp in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)

    if output_path:
        combined_df.to_parquet(output_path, index=False)
        print(f"✅ Saved concatenated file to: {output_path}")

    return combined_df

def concat_noos_parquet_files(folder_path, output_path=None, recursive=False):
    all_files = []
    
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".parquet") and file.endswith("_noos_adjusted.parquet"):
                all_files.append(os.path.join(root, file))
        if not recursive:
            break

    if not all_files:
        raise FileNotFoundError("No .parquet files found in the given folder.")

    df_list = [pd.read_parquet(fp) for fp in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)

    if output_path:
        combined_df.to_parquet(output_path, index=False)
        print(f"✅ Saved concatenated file to: {output_path}")

    return combined_df

In [3]:
regular_forecast = concat_regular_parquet_files("/Users/trentino/Work/OFM/droplet/2026 Forecast/forecasted_data")

In [5]:
sales_data = pd.read_parquet("/Users/trentino/Work/OFM/droplet/2026 Forecast/data/clean_sales_data.parquet")

In [6]:
internal_brands = ['Campbell','Donkervoort','Dutch Dandies','Recall','The BLUEPRINT Premium','Nobel','Runway PARTY','J.C. RAGS']
internal_brands_list = [618, 427, 301, 438, 228, 1000, 804, 876]

In [7]:
regular_sales_data = sales_data[sales_data['season'].isin(["Winter"])].reset_index(drop=True)

In [8]:
winter_others = ['Dress Shirt short sleeve', 'T-shirt KM', 'Scarf', 'Mix & match waistcoat', 'Jack/jas', 'Pajama', 'Accessorry', 'Cap', 'Belt', 'Tuxedo shirt', 'Tuxedo', 'Gift', 'Dresshirt', 'Shoes', 'Umbrella']

regular_articles = ['Crew-neck sweater', 'Pull over half zip', 'Pullover rollneck', 'Padded jacket', 'Coat', "Others"]

In [9]:
regular_sales_data.loc[(regular_sales_data["articleGroupDescription"].isin(winter_others)) & (regular_sales_data['season'] == "Winter"), "articleGroupDescription"] = "Others"

regular_sales_data = regular_sales_data[regular_sales_data["articleGroupDescription"].isin(regular_articles)].reset_index(drop=True)
uncombined_sales_data = regular_sales_data.copy()
regular_sales_data = regular_sales_data.groupby(["salesDate", "articleGroupDescription", "brandDescription", "Inhouse_Brand"]).agg({"quantity":"sum"}).reset_index()

In [10]:
regular_forecast.loc[(regular_forecast['articleGroupDescription'].isin(regular_articles)) & (regular_forecast['Season'].isna()), "Season"] = "Winter"

In [11]:
regular_forecast[regular_forecast['Season'] == 'Winter']

Unnamed: 0,salesDate,articleGroupDescription,Approach,SalesForecast,Season
0,2026-01-01 to 2027-04-01,Crew-neck sweater,Adj: Historical,30791.0,Winter
5,2026-01-01 to 2027-04-01,Padded jacket,Adj: Historical,6923.0,Winter
8,2026-01-01 to 2027-04-01,Coat,Adj: Historical,3915.0,Winter
9,2026-01-01 to 2027-04-01,Others,Adj: Historical,364.0,Winter
11,2026-01-01 to 2027-04-01,Pull over half zip,Adj: Historical,11082.0,Winter
26,2026-01-01 to 2027-04-01,Pullover rollneck,Adj: Historical,11520.0,Winter


In [12]:
regular_forecast = regular_forecast[regular_forecast['Season'] == 'Winter']

In [13]:
inhouse_brands = ['Campbell',
 'Donkervoort',
 'Dutch Dandies',
 'Recall',
 'The BLUEPRINT Premium',
 'Nobel',
 'Runway PARTY',
 'J.C. RAGS']

regular_sales_data.loc[~(regular_sales_data["brandDescription"].isin(inhouse_brands)), "brandDescription"] = "Others"

In [14]:
def allocate_forecast_to_brands(sales_df, forecast_df, inhouse_brands, historical_weight=0.5):
    sales_df = sales_df.copy()
    sales_df["salesDate"] = pd.to_datetime(sales_df["salesDate"])

    # Define date ranges
    hist_mask = (sales_df["salesDate"].dt.month >= 1) & (sales_df["salesDate"].dt.month <= 12) & (sales_df["salesDate"].dt.year.isin([2023, 2024]))
    recent_mask = (sales_df["salesDate"] >= "2025-01-01") & (sales_df["salesDate"] <= "2025-12-31")

    def compute_brand_share(df, label):
        group = df.groupby(["articleGroupDescription", "brandDescription"])["quantity"].sum().reset_index()
        total = group.groupby("articleGroupDescription")["quantity"].transform("sum")
        group["share"] = group["quantity"] / total
        group.rename(columns={"share": f"{label}_share"}, inplace=True)
        return group[["articleGroupDescription", "brandDescription", f"{label}_share"]]

    hist_share = compute_brand_share(sales_df[hist_mask], "hist")
    recent_share = compute_brand_share(sales_df[recent_mask], "recent")

    # Merge and blend
    merged = pd.merge(hist_share, recent_share, on=["articleGroupDescription", "brandDescription"], how="outer").fillna(0)
    merged["blended_share"] = historical_weight * merged["hist_share"] + (1 - historical_weight) * merged["recent_share"]

    # Merge forecasted article sales
    forecast_df = forecast_df.copy()
    merged_forecast = pd.merge(forecast_df, merged, on="articleGroupDescription", how="left")

    # Calculate brand-level forecast
    merged_forecast["BrandForecast"] = merged_forecast["SalesForecast"] * merged_forecast["blended_share"]
    merged_forecast["BrandForecast"] = round(merged_forecast["BrandForecast"], 0)
    
    # Tag internal/external brands
    merged_forecast["Inhouse_Brand"] = merged_forecast["brandDescription"].isin(inhouse_brands)

    return merged_forecast[[
        "salesDate", "articleGroupDescription", "brandDescription", "Inhouse_Brand", 
        "Approach", "SalesForecast", "blended_share", "BrandForecast"
    ]]

def historical_brand_breakdown(sales_data, start_year=2023, end_year=2024, inhouse_brands=None):
    if inhouse_brands is None:
        inhouse_brands = []

    # Filter for historical Mar–Sept sales (exclude 2025)
    hist_mask = (
        (sales_data["salesDate"].dt.year >= start_year) &
        (sales_data["salesDate"].dt.year <= end_year) &
        (sales_data["salesDate"].dt.month >= 1) &
        (sales_data["salesDate"].dt.month <= 12)
    )
    hist_df = sales_data.loc[hist_mask].copy()

    # Group by article + brand and sum quantities
    group = hist_df.groupby(["articleGroupDescription", "brandDescription"])["quantity"].sum().reset_index()

    # Compute total sales per article to get brand share
    total = group.groupby("articleGroupDescription")["quantity"].transform("sum")
    group["brand_share"] = group["quantity"] / total

    # Also calculate total article-level sales
    article_totals = group.groupby("articleGroupDescription")["quantity"].sum().reset_index()
    article_totals = article_totals.rename(columns={"quantity": "article_total_sales"})

    # Merge article totals back to group
    merged = pd.merge(group, article_totals, on="articleGroupDescription")

    # Estimate brand-level "forecast-like" numbers from historical totals
    merged["brand_quantity_estimate"] = merged["brand_share"] * merged["article_total_sales"]

    # Flag in-house brands
    merged["Inhouse_Brand"] = merged["brandDescription"].isin(inhouse_brands)

    return merged

In [15]:
historical_brand_summary = historical_brand_breakdown(regular_sales_data, start_year=2023, end_year=2025, inhouse_brands=None)

grouped = historical_brand_summary.groupby("Inhouse_Brand").agg({"brand_quantity_estimate": "sum"}).reset_index()
total_quantity = grouped["brand_quantity_estimate"].sum()
grouped["contribution_%"] = grouped["brand_quantity_estimate"] /  total_quantity * 100
grouped["contribution_%"] = grouped["contribution_%"].round(2)
grouped

Unnamed: 0,Inhouse_Brand,brand_quantity_estimate,contribution_%
0,False,156387.0,100.0


In [16]:
# brands = list(sales_data['brandDescription'].dropna().unique())

In [17]:
bifurcated_sales = allocate_forecast_to_brands(regular_sales_data, regular_forecast, inhouse_brands)
grouped = bifurcated_sales.groupby("Inhouse_Brand").agg({
    "BrandForecast": "sum"
}).reset_index()
total_quantity = grouped["BrandForecast"].sum()
grouped["contribution_%"] = grouped["BrandForecast"] / total_quantity * 100
grouped["contribution_%"] = grouped["contribution_%"].round(2)
grouped

Unnamed: 0,Inhouse_Brand,BrandForecast,contribution_%
0,False,25922.0,40.13
1,True,38671.0,59.87


In [18]:
sales_forecasted = bifurcated_sales[["salesDate", "articleGroupDescription", "brandDescription", "Inhouse_Brand", "BrandForecast"]]

sales_forecasted = sales_forecasted.rename(columns = {"BrandForecast":"quantity"})

sales_forecasted["category"] = "Forecasted Sales"

sales_forecasted["Inhouse_Brand"] = sales_forecasted["Inhouse_Brand"].replace({True:"Internal", False:"External"})

sales_forecasted["salesDate"] = pd.to_datetime(datetime(2026, 12, 31))

In [19]:
sales_forecasted.head(1)

Unnamed: 0,salesDate,articleGroupDescription,brandDescription,Inhouse_Brand,quantity,category
0,2026-12-31,Crew-neck sweater,Campbell,Internal,1985.0,Forecasted Sales


In [20]:
uncombined_sales_data.loc[(uncombined_sales_data["Discount%"] == 0), "IsDiscount"] = True
uncombined_sales_data["IsDiscount"] = uncombined_sales_data["IsDiscount"].fillna(False)

# Step 1: Calculate sales proportions (discount vs non-discount)
grouped = uncombined_sales_data[(uncombined_sales_data["salesDate"] >= "2022-01-01")].groupby(
    ["articleGroupDescription", "brandDescription", "IsDiscount"]
).agg({"quantity": "sum"}).reset_index()

# Total sales per article-brand
totals = grouped.groupby(
    ["articleGroupDescription", "brandDescription"]
)["quantity"].sum().reset_index().rename(columns={"quantity": "total_quantity"})

# Merge to compute proportion
merged = pd.merge(grouped, totals, on=["articleGroupDescription", "brandDescription"])
merged["sales_proportion"] = merged["quantity"] / merged["total_quantity"]

# Pivot into separate columns for True and False
proportions_pivot = merged.pivot(
    index=["articleGroupDescription", "brandDescription"],
    columns="IsDiscount",
    values="sales_proportion"
).reset_index()

# Handle missing columns by filling with 0, then rename
proportions_pivot = proportions_pivot.rename(columns={
    True: "discount_prop",
    False: "non_discount_prop"
})

# Ensure both columns exist
if "discount_prop" not in proportions_pivot.columns:
    proportions_pivot["discount_prop"] = 0.0
if "non_discount_prop" not in proportions_pivot.columns:
    proportions_pivot["non_discount_prop"] = 0.0

# Step 2: Pricing - max price (non-discount), avg price (discounted but less than max)
non_discount_price_df = uncombined_sales_data[(uncombined_sales_data["IsDiscount"] == False) & (uncombined_sales_data["salesDate"] >= "2024-01-01")] \
    .groupby(["articleGroupDescription", "brandDescription"])["retailPrice"].mean().reset_index() \
    .rename(columns={"retailPrice": "non_discount_price"})

discount_df = pd.merge(
    uncombined_sales_data[uncombined_sales_data["IsDiscount"] == True],
    non_discount_price_df,
    on=["articleGroupDescription", "brandDescription"],
    how="inner"
)

discount_df = discount_df[discount_df["retailPrice"] < discount_df["non_discount_price"]]

discount_price_df = uncombined_sales_data[(uncombined_sales_data["salesDate"] >= "2024-01-01")].groupby(["articleGroupDescription", "brandDescription"]) \
    .agg({"retailPrice": "mean"}).reset_index() \
    .rename(columns={"retailPrice": "discount_price"})

# Step 3: Combine price info
price_data_cleaned = pd.merge(non_discount_price_df, discount_price_df,
                              on=["articleGroupDescription", "brandDescription"], how="left")

# Step 4: Combine proportion + pricing
pricing_data = pd.merge(proportions_pivot, price_data_cleaned,
                        on=["articleGroupDescription", "brandDescription"], how="left")

# Step 5: Merge with forecast
sales_forecasted = pd.merge(sales_forecasted, pricing_data,
                            on=["articleGroupDescription", "brandDescription"], how="left")

# Ensure fill for proportions (if any are missing)
sales_forecasted["discount_prop"] = sales_forecasted["discount_prop"].fillna(0)
sales_forecasted["non_discount_prop"] = sales_forecasted["non_discount_prop"].fillna(1)

# Step 6: Project sales and revenue
sales_forecasted["discount_sales"] = sales_forecasted["quantity"] * sales_forecasted["discount_prop"]
sales_forecasted["non_discount_sales"] = sales_forecasted["quantity"] * sales_forecasted["non_discount_prop"]

sales_forecasted["discount_revenue"] = sales_forecasted["discount_sales"] * sales_forecasted["discount_price"]
sales_forecasted["non_discount_revenue"] = sales_forecasted["non_discount_sales"] * sales_forecasted["non_discount_price"]

sales_forecasted["retailPrice"] = (
    sales_forecasted["discount_revenue"].fillna(0) + sales_forecasted["non_discount_revenue"].fillna(0)
).round(2)

# Final output
sales_forecasted = sales_forecasted[[
"salesDate", "articleGroupDescription", "brandDescription", "Inhouse_Brand", "quantity", "category", "retailPrice"
]]

print(sales_forecasted["retailPrice"].sum())

sales_forecasted.head()

2111998.08


Unnamed: 0,salesDate,articleGroupDescription,brandDescription,Inhouse_Brand,quantity,category,retailPrice
0,2026-12-31,Crew-neck sweater,Campbell,Internal,1985.0,Forecasted Sales,100469.33
1,2026-12-31,Crew-neck sweater,Donkervoort,Internal,326.0,Forecasted Sales,13665.16
2,2026-12-31,Crew-neck sweater,Dutch Dandies,Internal,1294.0,Forecasted Sales,116714.62
3,2026-12-31,Crew-neck sweater,J.C. RAGS,Internal,2554.0,Forecasted Sales,137565.81
4,2026-12-31,Crew-neck sweater,Others,External,13251.0,Forecasted Sales,0.0


In [21]:
external_others_articles = list(sales_forecasted[(sales_forecasted['retailPrice'] == 0.0)]['articleGroupDescription'].unique())

temp_data = sales_data[(sales_data['articleGroupDescription'].isin(external_others_articles)) &
                       (sales_data['Inhouse_Brand'] == False)&
                       (sales_data['season'].isin(['Summer', "Winter"]))].groupby("articleGroupDescription").agg({'quantity':"sum","retailPrice":"sum"}).reset_index()

temp_data['avg_cost'] = round(temp_data['retailPrice'] / temp_data['quantity'], 1)

sales_forecasted = pd.merge(sales_forecasted, temp_data[['articleGroupDescription', 'avg_cost']], on = 'articleGroupDescription', how = 'left')

sales_forecasted.loc[(sales_forecasted['retailPrice'] == 0.0), 'retailPrice'] = round(sales_forecasted['quantity'] * sales_forecasted['avg_cost'], 2)

sales_forecasted = sales_forecasted.drop(columns = 'avg_cost')

In [22]:
sales_forecasted['quantity'].sum(), sales_forecasted['retailPrice'].sum()

(np.float64(64593.0), np.float64(6164651.9799999995))

In [23]:
uncombined_sales_data["year"] = uncombined_sales_data["salesDate"].dt.year
uncombined_sales_data["month"] = uncombined_sales_data["salesDate"].dt.month

In [24]:
uncombined_sales_data[(uncombined_sales_data["month"].isin([1,2,3,4,5,6,7,8,9,10,11,12]))].groupby("year").agg({"retailPrice":"sum", "quantity":'sum'}).reset_index()

Unnamed: 0,year,retailPrice,quantity
0,2019,3242631.86,21662.0
1,2020,4161587.42,34231.0
2,2021,4236880.24,36384.0
3,2022,5088121.79,49406.0
4,2023,6198356.31,64931.0
5,2024,6688131.27,63501.0
6,2025,3402569.27,27955.0


In [25]:
sales_forecasted["quantity"].sum(), sales_forecasted["retailPrice"].sum()

(np.float64(64593.0), np.float64(6164651.9799999995))

In [26]:
winter_clothes = ['Crew-neck sweater', 'Pull over half zip', 'Pullover rollneck', 'Padded jacket', 'Coat', "Others"]

In [27]:
sales_forecasted[sales_forecasted['articleGroupDescription'] =='Others']['quantity'].sum(), sales_forecasted[sales_forecasted['articleGroupDescription'] =='Others']['retailPrice'].sum()

(np.float64(364.0), np.float64(25838.940000000002))

In [28]:
sales_forecasted.loc[sales_forecasted["articleGroupDescription"].isin(winter_clothes), "season"] = "Winter"

In [29]:
sales_forecasted

Unnamed: 0,salesDate,articleGroupDescription,brandDescription,Inhouse_Brand,quantity,category,retailPrice,season
0,2026-12-31,Crew-neck sweater,Campbell,Internal,1985.0,Forecasted Sales,100469.33,Winter
1,2026-12-31,Crew-neck sweater,Donkervoort,Internal,326.0,Forecasted Sales,13665.16,Winter
2,2026-12-31,Crew-neck sweater,Dutch Dandies,Internal,1294.0,Forecasted Sales,116714.62,Winter
3,2026-12-31,Crew-neck sweater,J.C. RAGS,Internal,2554.0,Forecasted Sales,137565.81,Winter
4,2026-12-31,Crew-neck sweater,Others,External,13251.0,Forecasted Sales,1466885.7,Winter
5,2026-12-31,Crew-neck sweater,Recall,Internal,599.0,Forecasted Sales,34414.68,Winter
6,2026-12-31,Crew-neck sweater,The BLUEPRINT Premium,Internal,10780.0,Forecasted Sales,376414.95,Winter
7,2026-12-31,Padded jacket,Campbell,Internal,918.0,Forecasted Sales,74536.83,Winter
8,2026-12-31,Padded jacket,Donkervoort,Internal,471.0,Forecasted Sales,39908.24,Winter
9,2026-12-31,Padded jacket,J.C. RAGS,Internal,459.0,Forecasted Sales,39629.66,Winter


In [42]:
sales_forecasted['retailPrice'].sum(), sales_forecasted['quantity'].sum()

(np.float64(6164651.9799999995), np.float64(64593.0))

In [30]:
check = [sales_forecasted['quantity'].notna()].groupby(["Inhouse_Brand","articleGroupDescription", "brandDescription"]).agg({"quantity":'sum', "retailPrice":"sum"}).reset_index()

In [31]:
check['quantity'].sum(), check['retailPrice'].sum()

(np.float64(64593.0), np.float64(6164651.9799999995))

In [32]:
check[check['articleGroupDescription'] == 'Padded jacket']

Unnamed: 0,Inhouse_Brand,articleGroupDescription,brandDescription,quantity,retailPrice
3,External,Padded jacket,Others,4849.0,1455669.8
19,Internal,Padded jacket,Campbell,918.0,74536.83
20,Internal,Padded jacket,Donkervoort,471.0,39908.24
21,Internal,Padded jacket,J.C. RAGS,459.0,39629.66
22,Internal,Padded jacket,Recall,35.0,5387.43
23,Internal,Padded jacket,The BLUEPRINT Premium,191.0,5334.07


In [34]:
# sales_forecasted['season'] = sales_forecasted['season'].fillna("Year Round")

In [45]:
# sales_forecasted.to_csv("/Users/trentino/Work/OFM/droplet/2026 Forecast/data/sales_forecast_winter_regular_2026_season.csv", index=False)

In [36]:
uncombined_sales_data.loc[uncombined_sales_data["articleGroupDescription"].isin(winter_clothes), "season"] = "Winter"

In [37]:
combined_summ = uncombined_sales_data.groupby(['year', "Inhouse_Brand", 'season', 'articleGroupDescription']).agg({"quantity":"sum", "retailPrice":"sum"}).reset_index()

In [49]:
combined_summ[(combined_summ['articleGroupDescription'] == "Others") & (combined_summ['year'] == 2025)]

Unnamed: 0,year,Inhouse_Brand,season,articleGroupDescription,quantity,retailPrice
72,2025,False,Winter,Others,538.0,11521.63
78,2025,True,Winter,Others,97.0,25874.52


In [38]:
combined_summ.groupby("year").agg({"quantity":"sum", "retailPrice":"sum"}).reset_index()

Unnamed: 0,year,quantity,retailPrice
0,2019,21662.0,3242631.86
1,2020,34231.0,4161587.42
2,2021,36384.0,4236880.24
3,2022,49406.0,5088121.79
4,2023,64931.0,6198356.31
5,2024,63501.0,6688131.27
6,2025,27955.0,3402569.27


In [50]:
combined_summ

Unnamed: 0,year,Inhouse_Brand,season,articleGroupDescription,quantity,retailPrice
0,2019,False,Winter,Crew-neck sweater,6926.0,682184.94
1,2019,False,Winter,Others,21.0,427.93
2,2019,False,Winter,Padded jacket,3975.0,1130202.55
3,2019,False,Winter,Pull over half zip,2027.0,224933.91
4,2019,False,Winter,Pullover rollneck,1359.0,147520.42
5,2019,True,Winter,Crew-neck sweater,3358.0,268695.07
6,2019,True,Winter,Others,105.0,18045.91
7,2019,True,Winter,Padded jacket,2586.0,618036.12
8,2019,True,Winter,Pull over half zip,510.0,64141.12
9,2019,True,Winter,Pullover rollneck,795.0,88443.89


In [60]:
combined_summ[(combined_summ['year'] == 2025) & (combined_summ['Inhouse_Brand'] == True)]

Unnamed: 0,year,Inhouse_Brand,season,articleGroupDescription,quantity,retailPrice
76,2025,True,Winter,Coat,1725.0,421400.86
77,2025,True,Winter,Crew-neck sweater,5478.0,319481.92
78,2025,True,Winter,Others,97.0,25874.52
79,2025,True,Winter,Padded jacket,766.0,102934.47
80,2025,True,Winter,Pull over half zip,4331.0,298603.62
81,2025,True,Winter,Pullover rollneck,3185.0,164133.75


In [57]:
combined_summ[(combined_summ['year'] == 2024) & (combined_summ['Inhouse_Brand'] == True)]

Unnamed: 0,year,Inhouse_Brand,season,articleGroupDescription,quantity,retailPrice
64,2024,True,Winter,Coat,2349.0,620088.46
65,2024,True,Winter,Crew-neck sweater,10851.0,589862.19
66,2024,True,Winter,Others,21.0,10102.23
67,2024,True,Winter,Padded jacket,1265.0,151851.43
68,2024,True,Winter,Pull over half zip,15443.0,857148.99
69,2024,True,Winter,Pullover rollneck,7817.0,408389.81
