In [1]:
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objects as go

import matplotlib.pyplot as plt
import seaborn as sns
import threading
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor
from pymongo import MongoClient, errors

import warnings
warnings.filterwarnings("ignore")

pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

In [2]:
def concat_regular_parquet_files(folder_path, output_path=None, recursive=False):
    all_files = []
    
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".parquet") and not file.endswith("_noos_adjusted.parquet"):
                all_files.append(os.path.join(root, file))
        if not recursive:
            break

    if not all_files:
        raise FileNotFoundError("No .parquet files found in the given folder.")

    df_list = [pd.read_parquet(fp) for fp in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)

    if output_path:
        combined_df.to_parquet(output_path, index=False)
        print(f"✅ Saved concatenated file to: {output_path}")

    return combined_df

def concat_noos_parquet_files(folder_path, output_path=None, recursive=False):
    all_files = []
    
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".parquet") and file.endswith("_noos_adjusted.parquet"):
                all_files.append(os.path.join(root, file))
        if not recursive:
            break

    if not all_files:
        raise FileNotFoundError("No .parquet files found in the given folder.")

    df_list = [pd.read_parquet(fp) for fp in all_files]
    combined_df = pd.concat(df_list, ignore_index=True)

    if output_path:
        combined_df.to_parquet(output_path, index=False)
        print(f"✅ Saved concatenated file to: {output_path}")

    return combined_df

In [3]:
noos_forecast = concat_noos_parquet_files("/Users/trentino/Work/OFM/droplet/2026 Forecast/forecasted_data")

In [4]:
sales_data = pd.read_parquet("/Users/trentino/Work/OFM/droplet/2026 Forecast/data/clean_sales_data.parquet")

In [5]:
internal_brands = ['Campbell','Donkervoort','Dutch Dandies','Recall','The BLUEPRINT Premium','Nobel','Runway PARTY','J.C. RAGS']
internal_brands_list = [618, 427, 301, 438, 228, 1000, 804, 876]

In [6]:
noos_sales_data = sales_data[sales_data['season'].isin(["NOOS"])].reset_index(drop=True)

In [7]:
noos_articles = ['Casual belt', 'Shoe care', 'Mix & match jacket', 'Crew-neck sweater', 'T-shirt SS', 'Dress belt', 'Jeans', 'Polo SS', 'Boxershort', 'Chino', 'Dress Shirt LS', 'Socks', 'Short', '5-pocket',
                 'Mix & match trousers', 'Suit', 'V-neck sweater', 'Casual shirt LS', "Others"]

noos_others = ['Trousers', 'Jack outdoor', 'Padded jacket', 'Dress Shirt extra long sleeve', 'Cardigan', 'Pullover rollneck', 'Spencer', 'Pull over half zip', 'Polo LS', 'Dress Shoe', 'Tie', 'Bow', 'Jewel', 
               'Book', 'Scents', 'Wallet', 'Others', 'Verpakking', 'Tuxedo', 'Hoodie', 'T-shirt LS', 'Cufflinks', 'Wedding suit', 'Waistcoat', 'Tuxedo shirt', 'Watch', 'Skin care', 'Pocket square', 'Suspenders',
               'Sweatpants', 'Sweatshirt', 'Loafers', 'Sneakers', 'Gloves', 'Jacket', 'Boots', 'Lang model', '3/4 model', 'Gadgets', 'Vest', 'Casual Shoe', 'Shawl', 'Tailor-made shirt', 'Dames blouse dress LM', 
               'Swim short', 'Apron', 'Casual shirt SS', 'Dames blazer', 'Dames pantalon', 'Dames rok', 'Jack/jas', 'Bermuda', 'Merchandise', 'Knitwear', 'Dress Shirt SS', 'Umbrella', 'Blazer', 'Coat', 
               'Giftcard', 'Bag', 'Sweater', 'Tailor-made suit 2 pcs', 'Polo KM', 'Headwear', 'Trendy shirt LS', 'Leather jacket', 'Bodywarmer', 'Shoes', 'Mix & match waistcoat', 'Sunglasses', 'Accessorry', 
               'Cap', 'Parka', 'Polo LM', 'Overshirt', 'Dresshirt', 'Casual shirt long sleeve', 'Dress Shirt long sleeve', 'T-shirt short sleeve', 'Polo short sleeve', 'Pullover half zip', 'Polo long sleeve', 
               'Slippers', 'Casual shirt short sleeve', 'T-shirt long sleeve']


In [8]:
noos_sales_data.loc[(noos_sales_data["articleGroupDescription"].isin(noos_others)) & (noos_sales_data['season'] == "NOOS"), "articleGroupDescription"] = "Others"
# noos_sales_data.loc[~(noos_sales_data["articleGroupDescription"].isin(noos_articles)), "articleGroupDescription"] = "Others"
noos_sales_data = noos_sales_data[noos_sales_data["articleGroupDescription"].isin(noos_articles)].reset_index(drop=True)
uncombined_sales_data = noos_sales_data.copy()
noos_sales_data = noos_sales_data.groupby(["salesDate", "articleGroupDescription", "brandDescription", "Inhouse_Brand"]).agg({"quantity":"sum"}).reset_index()

In [9]:
inhouse_brands = ['Campbell',
 'Donkervoort',
 'Dutch Dandies',
 'Recall',
 'The BLUEPRINT Premium',
 'Nobel',
 'Runway PARTY',
 'J.C. RAGS']

noos_sales_data.loc[~(noos_sales_data["brandDescription"].isin(inhouse_brands)), "brandDescription"] = "Others"

In [10]:
def allocate_forecast_to_brands(sales_df, forecast_df, inhouse_brands, historical_weight=0.5):
    sales_df = sales_df.copy()
    sales_df["salesDate"] = pd.to_datetime(sales_df["salesDate"])

    # Define date ranges
    hist_mask = (sales_df["salesDate"].dt.month >= 1) & (sales_df["salesDate"].dt.month <= 12) & (sales_df["salesDate"].dt.year.isin([2023, 2024]))
    recent_mask = (sales_df["salesDate"] >= "2025-01-01") & (sales_df["salesDate"] <= "2025-12-31")

    def compute_brand_share(df, label):
        group = df.groupby(["articleGroupDescription", "brandDescription"])["quantity"].sum().reset_index()
        total = group.groupby("articleGroupDescription")["quantity"].transform("sum")
        group["share"] = group["quantity"] / total
        group.rename(columns={"share": f"{label}_share"}, inplace=True)
        return group[["articleGroupDescription", "brandDescription", f"{label}_share"]]

    hist_share = compute_brand_share(sales_df[hist_mask], "hist")
    recent_share = compute_brand_share(sales_df[recent_mask], "recent")

    # Merge and blend
    merged = pd.merge(hist_share, recent_share, on=["articleGroupDescription", "brandDescription"], how="outer").fillna(0)
    merged["blended_share"] = historical_weight * merged["hist_share"] + (1 - historical_weight) * merged["recent_share"]

    # Merge forecasted article sales
    forecast_df = forecast_df.copy()
    merged_forecast = pd.merge(forecast_df, merged, on="articleGroupDescription", how="left")

    # Calculate brand-level forecast
    merged_forecast["BrandForecast"] = merged_forecast["SalesForecast"] * merged_forecast["blended_share"]
    merged_forecast["BrandForecast"] = round(merged_forecast["BrandForecast"], 0)
    
    # Tag internal/external brands
    merged_forecast["Inhouse_Brand"] = merged_forecast["brandDescription"].isin(inhouse_brands)

    return merged_forecast[[
        "salesDate", "articleGroupDescription", "brandDescription", "Inhouse_Brand", 
        "Approach", "SalesForecast", "blended_share", "BrandForecast"
    ]]

def historical_brand_breakdown(sales_data, start_year=2023, end_year=2024, inhouse_brands=None):
    if inhouse_brands is None:
        inhouse_brands = []

    # Filter for historical Mar–Sept sales (exclude 2025)
    hist_mask = (
        (sales_data["salesDate"].dt.year >= start_year) &
        (sales_data["salesDate"].dt.year <= end_year) &
        (sales_data["salesDate"].dt.month >= 1) &
        (sales_data["salesDate"].dt.month <= 12)
    )
    hist_df = sales_data.loc[hist_mask].copy()

    # Group by article + brand and sum quantities
    group = hist_df.groupby(["articleGroupDescription", "brandDescription"])["quantity"].sum().reset_index()

    # Compute total sales per article to get brand share
    total = group.groupby("articleGroupDescription")["quantity"].transform("sum")
    group["brand_share"] = group["quantity"] / total

    # Also calculate total article-level sales
    article_totals = group.groupby("articleGroupDescription")["quantity"].sum().reset_index()
    article_totals = article_totals.rename(columns={"quantity": "article_total_sales"})

    # Merge article totals back to group
    merged = pd.merge(group, article_totals, on="articleGroupDescription")

    # Estimate brand-level "forecast-like" numbers from historical totals
    merged["brand_quantity_estimate"] = merged["brand_share"] * merged["article_total_sales"]

    # Flag in-house brands
    merged["Inhouse_Brand"] = merged["brandDescription"].isin(inhouse_brands)

    return merged

In [11]:
historical_brand_summary = historical_brand_breakdown(noos_sales_data, start_year=2023, end_year=2025, inhouse_brands=inhouse_brands)

grouped = historical_brand_summary.groupby("Inhouse_Brand").agg({"brand_quantity_estimate": "sum"}).reset_index()
total_quantity = grouped["brand_quantity_estimate"].sum()
grouped["contribution_%"] = grouped["brand_quantity_estimate"] /  total_quantity * 100
grouped["contribution_%"] = grouped["contribution_%"].round(2)
grouped

Unnamed: 0,Inhouse_Brand,brand_quantity_estimate,contribution_%
0,False,510678.0,80.79
1,True,121439.0,19.21


In [12]:
bifurcated_sales = allocate_forecast_to_brands(noos_sales_data, noos_forecast, inhouse_brands)
grouped = bifurcated_sales.groupby("Inhouse_Brand").agg({
    "BrandForecast": "sum"
}).reset_index()
total_quantity = grouped["BrandForecast"].sum()
grouped["contribution_%"] = grouped["BrandForecast"] / total_quantity * 100
grouped["contribution_%"] = grouped["contribution_%"].round(2)
grouped

Unnamed: 0,Inhouse_Brand,BrandForecast,contribution_%
0,False,167076.0,74.4
1,True,57489.0,25.6


In [13]:
sales_forecasted = bifurcated_sales[["salesDate", "articleGroupDescription", "brandDescription", "Inhouse_Brand", "BrandForecast"]]

sales_forecasted = sales_forecasted.rename(columns = {"BrandForecast":"quantity"})

sales_forecasted["category"] = "Forecasted Sales"

sales_forecasted["Inhouse_Brand"] = sales_forecasted["Inhouse_Brand"].replace({True:"Internal", False:"External"})

sales_forecasted["salesDate"] = pd.to_datetime(datetime(2026, 12, 31))

In [14]:
sales_forecasted.head(1)

Unnamed: 0,salesDate,articleGroupDescription,brandDescription,Inhouse_Brand,quantity,category
0,2026-12-31,Casual shirt LS,Campbell,Internal,2856.0,Forecasted Sales


In [15]:
uncombined_sales_data.loc[(uncombined_sales_data["Discount%"] == 0), "IsDiscount"] = True
uncombined_sales_data["IsDiscount"] = uncombined_sales_data["IsDiscount"].fillna(False)

# Step 1: Calculate sales proportions (discount vs non-discount)
grouped = uncombined_sales_data[(uncombined_sales_data["salesDate"] >= "2022-01-01")].groupby(
    ["articleGroupDescription", "brandDescription", "IsDiscount"]
).agg({"quantity": "sum"}).reset_index()

# Total sales per article-brand
totals = grouped.groupby(
    ["articleGroupDescription", "brandDescription"]
)["quantity"].sum().reset_index().rename(columns={"quantity": "total_quantity"})

# Merge to compute proportion
merged = pd.merge(grouped, totals, on=["articleGroupDescription", "brandDescription"])
merged["sales_proportion"] = merged["quantity"] / merged["total_quantity"]

# Pivot into separate columns for True and False
proportions_pivot = merged.pivot(
    index=["articleGroupDescription", "brandDescription"],
    columns="IsDiscount",
    values="sales_proportion"
).reset_index()

# Handle missing columns by filling with 0, then rename
proportions_pivot = proportions_pivot.rename(columns={
    True: "discount_prop",
    False: "non_discount_prop"
})

# Ensure both columns exist
if "discount_prop" not in proportions_pivot.columns:
    proportions_pivot["discount_prop"] = 0.0
if "non_discount_prop" not in proportions_pivot.columns:
    proportions_pivot["non_discount_prop"] = 0.0

# Step 2: Pricing - max price (non-discount), avg price (discounted but less than max)
non_discount_price_df = uncombined_sales_data[(uncombined_sales_data["IsDiscount"] == False) & (uncombined_sales_data["salesDate"] >= "2024-01-01")] \
    .groupby(["articleGroupDescription", "brandDescription"])["retailPrice"].mean().reset_index() \
    .rename(columns={"retailPrice": "non_discount_price"})

discount_df = pd.merge(
    uncombined_sales_data[uncombined_sales_data["IsDiscount"] == True],
    non_discount_price_df,
    on=["articleGroupDescription", "brandDescription"],
    how="inner"
)

discount_df = discount_df[discount_df["retailPrice"] < discount_df["non_discount_price"]]

discount_price_df = uncombined_sales_data[(uncombined_sales_data["salesDate"] >= "2024-01-01")].groupby(["articleGroupDescription", "brandDescription"]) \
    .agg({"retailPrice": "mean"}).reset_index() \
    .rename(columns={"retailPrice": "discount_price"})

# Step 3: Combine price info
price_data_cleaned = pd.merge(non_discount_price_df, discount_price_df,
                              on=["articleGroupDescription", "brandDescription"], how="left")

# Step 4: Combine proportion + pricing
pricing_data = pd.merge(proportions_pivot, price_data_cleaned,
                        on=["articleGroupDescription", "brandDescription"], how="left")

# Step 5: Merge with forecast
sales_forecasted = pd.merge(sales_forecasted, pricing_data,
                            on=["articleGroupDescription", "brandDescription"], how="left")

# Ensure fill for proportions (if any are missing)
sales_forecasted["discount_prop"] = sales_forecasted["discount_prop"].fillna(0)
sales_forecasted["non_discount_prop"] = sales_forecasted["non_discount_prop"].fillna(1)

# Step 6: Project sales and revenue
sales_forecasted["discount_sales"] = sales_forecasted["quantity"] * sales_forecasted["discount_prop"]
sales_forecasted["non_discount_sales"] = sales_forecasted["quantity"] * sales_forecasted["non_discount_prop"]

sales_forecasted["discount_revenue"] = sales_forecasted["discount_sales"] * sales_forecasted["discount_price"]
sales_forecasted["non_discount_revenue"] = sales_forecasted["non_discount_sales"] * sales_forecasted["non_discount_price"]

sales_forecasted["retailPrice"] = (
    sales_forecasted["discount_revenue"].fillna(0) + sales_forecasted["non_discount_revenue"].fillna(0)
).round(2)

# Final output
sales_forecasted = sales_forecasted[[
"salesDate", "articleGroupDescription", "brandDescription", "Inhouse_Brand", "quantity", "category", "retailPrice"
]]

print(sales_forecasted["retailPrice"].sum())

sales_forecasted.head()

4468992.199999999


Unnamed: 0,salesDate,articleGroupDescription,brandDescription,Inhouse_Brand,quantity,category,retailPrice
0,2026-12-31,Casual shirt LS,Campbell,Internal,2856.0,Forecasted Sales,122184.8
1,2026-12-31,Casual shirt LS,Donkervoort,Internal,576.0,Forecasted Sales,19762.57
2,2026-12-31,Casual shirt LS,J.C. RAGS,Internal,376.0,Forecasted Sales,25799.52
3,2026-12-31,Casual shirt LS,Others,External,3517.0,Forecasted Sales,0.0
4,2026-12-31,Dress belt,Dutch Dandies,Internal,93.0,Forecasted Sales,7082.18


In [16]:
external_others_articles = list(sales_forecasted[(sales_forecasted['retailPrice'] == 0.0)]['articleGroupDescription'].unique())

temp_data = sales_data[(sales_data['articleGroupDescription'].isin(external_others_articles)) &
                       (sales_data['Inhouse_Brand'] == False)&
                       (sales_data['season'].isin(['Summer', "Winter"]))].groupby("articleGroupDescription").agg({'quantity':"sum","retailPrice":"sum"}).reset_index()

temp_data['avg_cost'] = round(temp_data['retailPrice'] / temp_data['quantity'], 1)

sales_forecasted = pd.merge(sales_forecasted, temp_data[['articleGroupDescription', 'avg_cost']], on = 'articleGroupDescription', how = 'left')

sales_forecasted.loc[(sales_forecasted['retailPrice'] == 0.0), 'retailPrice'] = round(sales_forecasted['quantity'] * sales_forecasted['avg_cost'], 2)

sales_forecasted = sales_forecasted.drop(columns = 'avg_cost')

In [17]:
sales_forecasted['quantity'].sum(), sales_forecasted['retailPrice'].sum()

(np.float64(224565.0), np.float64(16487874.700000005))

In [18]:
uncombined_sales_data["year"] = uncombined_sales_data["salesDate"].dt.year
uncombined_sales_data["month"] = uncombined_sales_data["salesDate"].dt.month

In [19]:
uncombined_sales_data[(uncombined_sales_data["month"].isin([1,2,3,4,5,6,7,8,9,10,11,12]))].groupby("year").agg({"retailPrice":"sum", "quantity":'sum'}).reset_index()

Unnamed: 0,year,retailPrice,quantity
0,2019,10411014.94,108006.0
1,2020,11399523.86,135067.0
2,2021,14578137.22,153372.0
3,2022,20018178.14,202032.0
4,2023,20608680.19,212169.0
5,2024,21941335.65,219206.0
6,2025,21048359.51,200742.0


In [20]:
sales_forecasted[sales_forecasted['articleGroupDescription'] == 'Others']['retailPrice'].sum()

np.float64(1599614.1199999999)

In [21]:
winter_clothes = ['V-neck sweater', 'Crew-neck sweater'] 

summer_clothes = ['Polo SS', 'Dress belt', 'Mix & match trousers', 'Mix & match jacket', 'Short'] 

year_round_clothes =  ['Jeans', 'Dress Shirt LS', 'Socks', 'T-shirt SS', 'Chino', 'Suit', 'Casual belt', '5-pocket', 'Casual shirt LS', 'Boxershort', 'Shoe care'] 


In [22]:
sales_forecasted.loc[sales_forecasted["articleGroupDescription"].isin(summer_clothes), "season"] = "Summer"
sales_forecasted.loc[sales_forecasted["articleGroupDescription"].isin(winter_clothes), "season"] = "Winter"
sales_forecasted.loc[sales_forecasted["articleGroupDescription"].isin(year_round_clothes), "season"] = "Year Round"

In [23]:
sales_forecasted['season'] = sales_forecasted['season'].fillna("Year Round")

In [24]:
sales_forecasted.to_csv("/Users/trentino/Work/OFM/droplet/2026 Forecast/data/sales_forecast_noos_2026_season.csv", index=False)

In [25]:
sales_data[sales_data['articleGroupDescription'] == 'Shoe care'].groupby("year").agg({"quantity":"sum", 'retailPrice':"sum"}).reset_index()

Unnamed: 0,year,quantity,retailPrice
0,2019,442.0,3067.71
1,2020,1749.0,18256.88
2,2021,1718.0,16625.04
3,2022,3232.0,35283.14
4,2023,4114.0,48018.61
5,2024,3740.0,48184.66
6,2025,3114.0,42932.96


In [26]:
sales_forecasted.loc[sales_forecasted['articleGroupDescription'] == 'Shoe care', 'retailPrice'] = 88908.56

In [27]:
sales_forecasted

Unnamed: 0,salesDate,articleGroupDescription,brandDescription,Inhouse_Brand,quantity,category,retailPrice,season
0,2026-12-31,Casual shirt LS,Campbell,Internal,2856.0,Forecasted Sales,122184.8,Year Round
1,2026-12-31,Casual shirt LS,Donkervoort,Internal,576.0,Forecasted Sales,19762.57,Year Round
2,2026-12-31,Casual shirt LS,J.C. RAGS,Internal,376.0,Forecasted Sales,25799.52,Year Round
3,2026-12-31,Casual shirt LS,Others,External,3517.0,Forecasted Sales,325674.2,Year Round
4,2026-12-31,Dress belt,Dutch Dandies,Internal,93.0,Forecasted Sales,7082.18,Summer
5,2026-12-31,Dress belt,Others,External,11531.0,Forecasted Sales,792179.7,Summer
6,2026-12-31,Dress belt,Recall,Internal,15.0,Forecasted Sales,299.25,Summer
7,2026-12-31,T-shirt SS,J.C. RAGS,Internal,1744.0,Forecasted Sales,45194.84,Year Round
8,2026-12-31,T-shirt SS,Others,External,23450.0,Forecasted Sales,1207675.0,Year Round
9,2026-12-31,Mix & match trousers,Donkervoort,Internal,107.0,Forecasted Sales,8923.27,Summer


In [28]:
sorted(uncombined_sales_data['articleGroupDescription'].unique())

['5-pocket',
 'Boxershort',
 'Casual belt',
 'Casual shirt LS',
 'Chino',
 'Crew-neck sweater',
 'Dress Shirt LS',
 'Dress belt',
 'Jeans',
 'Mix & match jacket',
 'Mix & match trousers',
 'Others',
 'Polo SS',
 'Shoe care',
 'Short',
 'Socks',
 'Suit',
 'T-shirt SS',
 'V-neck sweater']

In [29]:
uncombined_sales_data.loc[uncombined_sales_data["articleGroupDescription"].isin(summer_clothes), "season"] = "Summer"
uncombined_sales_data.loc[uncombined_sales_data["articleGroupDescription"].isin(winter_clothes), "season"] = "Winter"
uncombined_sales_data.loc[uncombined_sales_data["articleGroupDescription"].isin(year_round_clothes), "season"] = "Year Round"

In [30]:
combined_summ = uncombined_sales_data.groupby(['year', "Inhouse_Brand", 'season', 'articleGroupDescription']).agg({"quantity":"sum", "retailPrice":"sum"}).reset_index()

In [31]:
combined_summ[(combined_summ['year'] == 2025) & (combined_summ['season'] == "Summer")]

Unnamed: 0,year,Inhouse_Brand,season,articleGroupDescription,quantity,retailPrice
198,2025,False,Summer,Dress belt,7514.0,434524.85
199,2025,False,Summer,Mix & match jacket,5205.0,1515402.23
200,2025,False,Summer,Mix & match trousers,5693.0,809696.98
201,2025,False,Summer,Polo SS,3479.0,321252.38
202,2025,False,Summer,Short,2211.0,189602.47
217,2025,True,Summer,Dress belt,2.0,335.57
218,2025,True,Summer,Mix & match jacket,496.0,70254.78
219,2025,True,Summer,Mix & match trousers,639.0,53896.04


In [32]:
combined_summ[(combined_summ['year'] == 2025) & (combined_summ['articleGroupDescription'] == "Others")]

Unnamed: 0,year,Inhouse_Brand,season,articleGroupDescription,quantity,retailPrice
197,2025,False,NOOS,Others,29275.0,3299093.95
216,2025,True,NOOS,Others,8634.0,1413126.86


In [34]:
combined_summ[(combined_summ['year'] == 2024) & (combined_summ['Inhouse_Brand'] == True) & (combined_summ['season'] == "NOOS")]

Unnamed: 0,year,Inhouse_Brand,season,articleGroupDescription,quantity,retailPrice
181,2024,True,NOOS,Others,2732.0,1163246.01
