In [1]:
# !pip3 install fastparquet

In [2]:
import os
import sys
import numpy as np
import pandas as pd
from datetime import datetime, timedelta


pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)
pd.set_option('future.no_silent_downcasting', True)

In [3]:
sales_data = pd.read_parquet("/Users/trentino/Work/OFM/droplet/2026 Forecast/data/sales_data.parquet",  engine='fastparquet')
weather_data = pd.read_parquet("/Users/trentino/Work/OFM/droplet/2026 Forecast/data/open_meteo_weather.parquet",  engine='fastparquet')
noos_sales_data = pd.read_csv('/Users/trentino/Work/OFM/droplet/sales_forecasting/src/nos_data/processed_data.csv')

  noos_sales_data = pd.read_csv('/Users/trentino/Work/OFM/droplet/sales_forecasting/src/nos_data/processed_data.csv')


In [179]:
sales_data['brandDescription'].nunique()

343

In [4]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5939285 entries, 0 to 5939284
Data columns (total 43 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   lineType                   object        
 1   sender                     object        
 2   receiver                   object        
 3   ichRef                     object        
 4   testIndication             object        
 5   messageType                object        
 6   messageNumber              object        
 7   messageDate                datetime64[ns]
 8   salesDateStart             datetime64[ns]
 9   salesDateEnd               datetime64[ns]
 10  supplierType               object        
 11  supplierIdentifier         object        
 12  corporateOfficeType        object        
 13  corporateOfficeIdentifier  object        
 14  locationType               object        
 15  locationIdentifier         object        
 16  currency                   object   

In [5]:
# Data Type Correction
def process_sales_data(sales_data, noos_sales_data):
    sales_data.rename(columns = {"articleIndentifier":"articleIdentifier"}, inplace=True)
    noos_sales_data.rename(columns = {"articleIndentifier":"articleIdentifier"}, inplace=True)

    sales_data = sales_data.replace({'':np.nan})

    dtype_mapping = {
        "locationIdentifier": "string",
        "salesDate": "datetime64[ns]",
        "articleIdentifier": "string",
        "articleGroupIdentifier": "int64",
        "articleGroupDescription": "string",
        "brandIdentifier": "float64",
        "brandDescription": "string",
        "seasonIdentifier": "float32",
        "receiptNumber": "string",
        "customerIdentifier": "float64",
        "quantity": "float64",
        "purchaseValue": "float64",
        "adviceRetailValue": "float64",
        "retailValueWithoutTax": "float64",
        "retailValue": "float64",
        "purchasePrice": "float64",
        "adviceRetailPrice": "float64",
        "retailPriceWithoutTax": "float64",
        "retailPrice": "float64",
        "locationIdentifier":"float32"
    }
    
    sales_data = sales_data.astype(dtype_mapping)
    noos_sales_data = noos_sales_data.astype(dtype_mapping)
    sales_data = pd.concat([sales_data, noos_sales_data], ignore_index=True)
    
    mask = sales_data["salesDate"].between("2019-01-01", "2025-12-31", inclusive="both")
    
    sales_data.loc[mask] = (
        sales_data.loc[mask]
        .drop_duplicates([
            "salesDate",
            "articleIdentifier",
            "receiptNumber",
            "brandIdentifier",
            'articleGroupIdentifier',
            "quantity",
        ])
    )

    sales_data.reset_index(drop=True, inplace=True)

    columns_to_drop = [
    "lineType", "sender", "receiver", "ichRef", "testIndication", "messageType", "messageNumber", "messageDate", "supplierType", "supplierIdentifier", "corporateOfficeType",
    "corporateOfficeIdentifier","locationType", "currency", 'lineNumber', 'updateCode',"articleType", "articleGroupType", "brandType", "seasonType", "seasonDescription", "customerType",
    "salesDateStart","salesDateEnd", "customerIdentifier",  "receiptNumber"] # "adviceRetailValue", "retailValueWithoutTax", "retailValue",  "retailPriceWithoutTax",

    sales_data = sales_data.drop(columns = columns_to_drop)

    sales_data = sales_data.drop_duplicates().reset_index(drop=True)

    # Filtering Relevant Stores & Seasons
    relevant_stores = [1, 2, 4, 5, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 41, 54, 98]
    relevant_seasons = [4, 5, 9, 75, 76, 77, 78, 79, 80, 211, 212, 213, 214, 221, 222, 223, 224, 231, 232, 233, 234, 241, 242, 243, 244, 251, 252, 253, 254]

    sales_data = sales_data[sales_data["locationIdentifier"].isin(relevant_stores)].reset_index(drop=True)
    sales_data = sales_data[sales_data["seasonIdentifier"].isin(relevant_seasons)].reset_index(drop=True)

    sales_data["Revenue"] = (sales_data["quantity"] * sales_data["retailPrice"]).round(2)
    
    # Configuring inhouse brands
    internal_brand_identifiers = [228, 301, 427, 804, 618, 438, 1000, 876]
    internal_brand_descriptions = ["Runway PARTY", "Recall", "Dutch Dandies", "J.C. RAGS", "Nobel", "Donkervoort", "Campbell", "The BLUEPRINT Premium"]

    sales_data["Inhouse_Brand"] = sales_data["brandIdentifier"].isin(internal_brand_identifiers)

    # Marking Returns
    sales_data["Is_Return"] = sales_data["quantity"] < 0

    # Marking Seasons
    season_mapping = {4.0:"NOOS", 5.0:"NOOS", 9.0:"NOOS", 75.0:"Summer", 76.0:"Winter", 77.0:"Summer", 78.0:"Winter", 79.0:"Summer", 80.0:"Winter",
    211.0:"Spring", 212.0:"Summer", 213.0:"Fall", 214.0:"Winter", 221.0:"Spring", 222.0:"Summer", 223.0:"Fall", 224.0:"Winter", 
    231.0:"Spring", 232.0:"Summer", 233.0:"Fall", 234.0:"Winter", 241.0:"Spring", 242.0:"Summer", 243.0:"Fall", 244.0:"Winter",
    251.0:"Spring", 252.0:"Summer", 253.0:"Fall", 254.0:"Winter"}

    sales_data["season"] = sales_data["seasonIdentifier"].map(season_mapping)
    
    sales_data["season"] = sales_data["season"].replace({"Spring":"Summer", "Fall":"Winter"})

    # sales_data = sales_data[sales_data["quantity"] >= 0]

    # Create Month Year Columns
    sales_data["year"] = sales_data["salesDate"].dt.year
    sales_data["month"] = sales_data["salesDate"].dt.month
    sales_data["year_month"] = sales_data['salesDate'].dt.to_period('M')

    return sales_data

def process_price_data():
    advised_retail_data = pd.read_parquet("/Users/trentino/Work/OFM/droplet/sales_forecasting/src/data/processed/avp_total.parquet")
    barcode_data = pd.read_parquet("/Users/trentino/Work/OFM/droplet/sales_forecasting/src/data/processed/clean_barcode_data.parquet")
    
    price_tag_data = pd.merge(barcode_data, advised_retail_data, right_on="Art. nr", left_on="serial_number", how = "left")
    price_tag_data = price_tag_data.rename(columns = {"AVP":"tag_price"}).drop(columns = ["articleGroupIdentifier", "advisedRetailPriceWithTaxes_barcode", "description"])
    
    price_tag_data = price_tag_data[["articleIdentifier", "tag_price", "purchasePrice_barcode"]].dropna().reset_index(drop=True)
    
    price_tag_data["purchasePrice_barcode"] = price_tag_data["purchasePrice_barcode"].round(2)
    return price_tag_data


def create_discounts(sales_data):
    sales_data["adviceRetailPrice_with_tax"] = round((sales_data['adviceRetailPrice'] * 0.21 + sales_data['adviceRetailPrice']), 2)

    sales_data = sales_data.drop(index = sales_data[(sales_data["purchaseValue"] == 0.0) & (sales_data["purchasePrice_barcode"].isna())].index).reset_index(drop=True)
    
    sales_data.loc[(sales_data["purchasePrice_barcode"].isna(), "purchasePrice_barcode")] = sales_data["purchasePrice"]
    
    sales_data["purchaseValue"] = (sales_data["purchasePrice_barcode"] * sales_data["quantity"]).round(2)
    
    sales_data.loc[(sales_data["tag_price"] < 1), "tag_price"] = sales_data["adviceRetailPrice_with_tax"]
    
    # Correcting the retailPrice where the quantity is psotive but retailPrice is negative
    sales_data.loc[(sales_data["quantity"] > 0) & (sales_data["retailPrice"] < 0), "retailPrice"] *= -1
    
    # Correcting the Revenue where the quantity is negative but Revenue is Positive
    sales_data.loc[sales_data["quantity"] < 0, "Revenue"] *= -1
    
    # Discount Percentage Calculation
    sales_data["Discount%"] = np.where(
        sales_data["tag_price"].notna() & (sales_data["tag_price"] > 0),
        ((sales_data["tag_price"] - sales_data["retailPrice"]) / sales_data["tag_price"]) * 100,
        np.nan
    ).round(2)
    
    print("Total Rows where Discount% is negative (-) :",sales_data[(sales_data["Discount%"] < 0)].shape[0])
    
    sales_data["Margin%"] = np.where(
        (sales_data["retailPrice"].notna()) & (sales_data["retailPrice"] > 0) & (sales_data["purchasePrice"].notna()) & (sales_data["purchasePrice"] > 0),
        ((sales_data["retailPrice"] - sales_data["purchasePrice"]) / sales_data["retailPrice"]) * 100,
        np.nan
    ).round(2)

    return sales_data

def handle_missing_sales_price_data(sales_data):
    sales_data['salesDate'] = pd.to_datetime(sales_data['salesDate'])
    
    sales_data['year_month'] = sales_data['salesDate'].dt.to_period('M')
    
    cols_to_fill = ['purchaseValue', 'purchasePrice_barcode', 'purchasePrice']
    
    monthly_means = (
        sales_data.groupby(['brandIdentifier', 'articleGroupDescription', 'year_month'])[cols_to_fill]
        .mean()
        .reset_index()
        .rename(columns={col: f"{col}_monthly_mean" for col in cols_to_fill})
    )
    
    sales_data = sales_data.merge(monthly_means, on=['brandIdentifier', 'articleGroupDescription', 'year_month'], how='left')
    
    for col in cols_to_fill:
        mean_col = f"{col}_monthly_mean"
        sales_data[col] = np.where(sales_data[col] < 1, sales_data[mean_col], sales_data[col])
    
    sales_data.drop(columns=[f"{col}_monthly_mean" for col in cols_to_fill] + ['year_month'], inplace=True)
    return sales_data

def process_weather_data(weather_data):
    store_location_name_mapping = {
        "41": "Reusel", "22": "Goes", "1": "Geldermalsen", "17": "Almere", "11": "Oosterhout", "19": "Beek", "24": "Naaldwijk",
        "54": "Woerden", "15": "Voorburg", "10": "Doesburg", "4": "Waalwijk", "16": "Utrecht", "23": "Sliedrecht", "12": "Reusel", 
        "98": "Geldermalsen", "20": "Sluis", "9": "Veldhoven", "18": "Alkmaar", "21": "Weert", "8": "Druten", "14": "Nieuwegein",
        "2": "Rosmalen", "5": "Ede"
    }
    
    location_store_mapping = {}
    
    for store_id, city in store_location_name_mapping.items():
        location_store_mapping.setdefault(city, []).append(store_id)
    
    weather_data["locationIdentifier"] = weather_data["city"].map(location_store_mapping)
    
    weather_data = weather_data.explode("locationIdentifier")

    weather_data["locationIdentifier"] = weather_data["locationIdentifier"].astype("float32")

    relevant_stores = [1, 2, 4, 5, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 41, 54, 98]
    weather_data = weather_data[weather_data["locationIdentifier"].isin(relevant_stores)]
    weather_data['date'] = pd.to_datetime(weather_data['date'].dt.strftime("%Y-%m-%d"))

    weather_data = (
    weather_data.groupby("date", as_index=False)
    .agg(
        temperature_2m_mean=("temperature_2m_mean", "mean"),
        apparent_temperature_mean=("apparent_temperature_mean", "mean")
    )
    .round(2)
    )
    
    return weather_data

def analyze_article_seasonality(df):
    df["salesDate"] = pd.to_datetime(df["salesDate"])

    df["month"] = df["salesDate"].dt.month
    df["season_period"] = np.where(df["month"].isin([3, 4, 5, 6, 7, 8, 9]), "Mar-Sep", "Oct-Feb")

    sales_by_period = df.groupby(["articleGroupDescription", "season_period"])["quantity"].sum().unstack()

    sales_by_period = sales_by_period.fillna(0)

    sales_by_period["sales_ratio"] = sales_by_period["Mar-Sep"] / (sales_by_period["Oct-Feb"] + 1e-5)

    sales_by_month = df.groupby(["articleGroupDescription", "month"])["quantity"].sum().unstack()
    sales_by_month = sales_by_month.fillna(0)
    sales_by_month["CV"] = sales_by_month.std(axis=1) / (sales_by_month.mean(axis=1) + 1e-5)

    result = sales_by_period.merge(sales_by_month["CV"], on="articleGroupDescription")

    result["sales_pattern"] = np.where(
        result["CV"] < 0.2, "Stable Year-round",
        np.where(result["sales_ratio"] > 2, "Summer-Heavy", 
                 np.where(result["sales_ratio"] < 0.5, "Winter-Heavy", "Mixed Pattern"))
    )

    return result.reset_index()

In [6]:
# Processed Sales Data
data = process_sales_data(sales_data=sales_data, noos_sales_data=noos_sales_data)

# Processed Weather Data
processed_weather_data = process_weather_data(weather_data)

In [7]:
price_tag_data = process_price_data()

In [8]:
data = pd.merge(data, processed_weather_data, left_on='salesDate', right_on='date', how = "left").drop(columns = "date")

In [9]:
data = pd.merge(data, price_tag_data, on = "articleIdentifier", how = "left")

print("Missing Barcode Data : ", data[data["tag_price"].isna()]["articleIdentifier"].nunique())

Missing Barcode Data :  14966


In [10]:
# Handle Missing Price Data (<1)
data = handle_missing_sales_price_data(data)

# Create Discount Columns
data = create_discounts(data)

  sales_data.loc[(sales_data["purchasePrice_barcode"].isna(), "purchasePrice_barcode")] = sales_data["purchasePrice"]


Total Rows where Discount% is negative (-) : 11012


In [11]:
data.groupby(["year"]).agg({"quantity":"sum", 'Revenue':"sum"}).reset_index()

Unnamed: 0,year,quantity,Revenue
0,2019,431052.0,40969382.65
1,2020,446187.0,38418472.78
2,2021,477182.0,45873777.23
3,2022,594225.0,58823748.95
4,2023,668356.0,63380595.96
5,2024,667411.0,68299457.79
6,2025,611118.0,63675312.86


|Year  | NOOS      | Seasonals |Total      |
|-----|-----------|-----------|-----------|
|2020  |€8,136,425 |€24,256,182|€32,392,607|
|2021  |€9,747,913 |€26,642,841|€36,390,755|
|2022  |€15,159,234|€33,474,872|€48,634,106|
|2023  |€15,999,940|€38,288,726|€54,288,667|
|2024  |€17,618,167|€40,832,701|€58,450,868|
|2025  |€9,864,079 |€21,627,674|€31,491,753|


|year |quantity   |Revenue     |
|-----|-----------|------------|
2020  |484122.0	  |€29696771.51|
2021  |517737.0	  |€34515535.05|
2022  |601556.0	  |€45755509.59|
2023  |657480.0	  |€50360893.08|
2024  |649789.0	  |€54239282.10|
2025  |708582.0	  |€49963913.50|


In [12]:
main_data = data.copy() #.groupby(["salesDate", "season", "articleGroupIdentifier", "articleGroupDescription", "brandIdentifier", "brandDescription", "Inhouse_Brand", "temperature_2m_mean", "apparent_temperature_mean"]).agg({"quantity":"sum"}).reset_index()

In [13]:
data = data.groupby(["salesDate", "season", "articleGroupIdentifier", "articleGroupDescription", "temperature_2m_mean", "apparent_temperature_mean"]).agg({"quantity":"sum"}).reset_index()

#### Identifying top-selling articles by season for other categories without NOOS

In [14]:
data_for_breakdown = data.copy()

In [15]:
# Here am converting all negative numbers to positive so that we know the exact number of sales.
data_for_breakdown['quantity'] = data_for_breakdown['quantity'].abs() 

In [16]:
# Identifying which articles sells more in the respective seasons
without_noos = data_for_breakdown[data_for_breakdown['season'] != 'NOOS'].reset_index(drop=True)
article_sales_patterns = analyze_article_seasonality(without_noos)

In [17]:
# Identify the articles where the total sales is more than 15K units.
article_sales_patterns[article_sales_patterns["sales_pattern"] == "Summer-Heavy"].sort_values("Mar-Sep", ascending=False).reset_index(drop=True)

Unnamed: 0,articleGroupDescription,Mar-Sep,Oct-Feb,sales_ratio,CV,sales_pattern
0,Polo SS,324422.0,17332.0,18.71809,0.94416,Summer-Heavy
1,T-shirt SS,169648.0,22932.0,7.397872,0.788541,Summer-Heavy
2,Short,119713.0,4364.0,27.43194,1.170273,Summer-Heavy
3,Casual shirt SS,85425.0,3861.0,22.1251,1.004321,Summer-Heavy
4,Sneakers,44534.0,21064.0,2.114223,0.295124,Summer-Heavy
5,Jack outdoor,29269.0,8910.0,3.284961,0.553778,Summer-Heavy
6,Swim short,19512.0,1442.0,13.53121,1.095289,Summer-Heavy
7,Tie,17115.0,6006.0,2.84965,0.369763,Summer-Heavy
8,Mix & match trousers,14300.0,6902.0,2.071863,0.238437,Summer-Heavy
9,Bermuda,12859.0,245.0,52.48571,1.319684,Summer-Heavy


In [18]:
# Identify the articles where the total sales is more than 15K units.
article_sales_patterns[article_sales_patterns["sales_pattern"] == "Winter-Heavy"].sort_values("Oct-Feb", ascending=False).reset_index(drop=True)

Unnamed: 0,articleGroupDescription,Mar-Sep,Oct-Feb,sales_ratio,CV,sales_pattern
0,Crew-neck sweater,50491.0,105279.0,0.479592,0.597733,Winter-Heavy
1,Pull over half zip,15929.0,69001.0,0.230852,0.970004,Winter-Heavy
2,Pullover rollneck,6169.0,50578.0,0.12197,1.137703,Winter-Heavy
3,Padded jacket,9067.0,34638.0,0.261765,0.87621,Winter-Heavy
4,Coat,2529.0,10317.0,0.245129,0.851228,Winter-Heavy
5,Shawl,1590.0,8849.0,0.179681,1.020908,Winter-Heavy
6,Parka,2092.0,8793.0,0.237917,0.910844,Winter-Heavy
7,Boots,2322.0,7530.0,0.308367,0.81287,Winter-Heavy
8,Gloves,394.0,2503.0,0.157411,1.189245,Winter-Heavy
9,Umbrella,1.0,263.0,0.003802,2.866356,Winter-Heavy


In [19]:
# Identify the articles where the total sales is more than 15K units.
subset = article_sales_patterns.loc[
    article_sales_patterns["sales_pattern"].isin(["Stable Year-round", "Mixed Pattern"])
].copy()

subset["total"] = subset["Mar-Sep"] + subset["Oct-Feb"]

subset = subset.sort_values("total", ascending=False, ignore_index=True)
subset

Unnamed: 0,articleGroupDescription,Mar-Sep,Oct-Feb,sales_ratio,CV,sales_pattern,total
0,Trendy shirt LS,102991.0,96550.0,1.066712,0.252908,Mixed Pattern,199541.0
1,Casual shirt LS,89501.0,95007.0,0.942046,0.236117,Mixed Pattern,184508.0
2,Chino,83788.0,47797.0,1.752997,0.163217,Stable Year-round,131585.0
3,Dress Shirt LS,81022.0,48404.0,1.67387,0.149592,Stable Year-round,129426.0
4,Cardigan,36511.0,54516.0,0.66973,0.420684,Mixed Pattern,91027.0
5,Jeans,52287.0,38240.0,1.367338,0.158584,Stable Year-round,90527.0
6,Sweatshirt,24502.0,36127.0,0.678219,0.43107,Mixed Pattern,60629.0
7,Jacket,37221.0,22561.0,1.649794,0.183262,Stable Year-round,59782.0
8,5-pocket,27744.0,19259.0,1.440573,0.152035,Stable Year-round,47003.0
9,Dress Shoe,27926.0,17263.0,1.617679,0.313811,Mixed Pattern,45189.0


In [20]:
# Identifying individual articles with high sales in their respective seasons, thereby providing more data for modeling.
summer_articles = list(article_sales_patterns[article_sales_patterns["sales_pattern"] == "Summer-Heavy"].sort_values("Mar-Sep", ascending=False).reset_index(drop=True).head(10)["articleGroupDescription"].unique())
winter_articles = list(article_sales_patterns[article_sales_patterns["sales_pattern"] == "Winter-Heavy"].sort_values("Oct-Feb", ascending=False).reset_index(drop=True).head(5)["articleGroupDescription"].unique())
stable_articles = list(subset.sort_values("total", ascending=False, ignore_index=True).head(20)["articleGroupDescription"].unique())

#### Identifying top-selling articles by season for the NOOS category.

In [21]:
# Identifying which articles sells more in the respective seasons
noos_data = data_for_breakdown[data_for_breakdown['season'] == 'NOOS'].reset_index(drop=True)
noos_article_sales_patterns = analyze_article_seasonality(noos_data)

In [22]:
# Identify the articles where the total sales is more than 15K units.
noos_article_sales_patterns[noos_article_sales_patterns["sales_pattern"] == "Summer-Heavy"].sort_values("Mar-Sep", ascending=False).reset_index(drop=True)

Unnamed: 0,articleGroupDescription,Mar-Sep,Oct-Feb,sales_ratio,CV,sales_pattern
0,Polo SS,25739.0,3331.0,7.727109,0.804964,Summer-Heavy
1,Dress belt,25652.0,11529.0,2.224998,0.276026,Summer-Heavy
2,Mix & match trousers,18096.0,8590.0,2.106636,0.240448,Summer-Heavy
3,Mix & match jacket,15799.0,7180.0,2.200418,0.270639,Summer-Heavy
4,Short,12675.0,473.0,26.79704,1.299227,Summer-Heavy
5,Tie,9664.0,4286.0,2.254783,0.270692,Summer-Heavy
6,Dress Shirt long sleeve,6555.0,2815.0,2.328597,1.663331,Summer-Heavy
7,Trousers,5646.0,2545.0,2.218468,0.269791,Summer-Heavy
8,Sneakers,4139.0,1939.0,2.134605,0.281402,Summer-Heavy
9,Wedding suit,3702.0,1346.0,2.750371,0.582094,Summer-Heavy


In [23]:
# Identify the articles where the total sales is more than 15K units.
noos_article_sales_patterns[noos_article_sales_patterns["sales_pattern"] == "Winter-Heavy"].sort_values("Mar-Sep", ascending=False).reset_index(drop=True)

Unnamed: 0,articleGroupDescription,Mar-Sep,Oct-Feb,sales_ratio,CV,sales_pattern
0,V-neck sweater,17504.0,35648.0,0.491023,0.580404,Winter-Heavy
1,Crew-neck sweater,3964.0,11190.0,0.354245,0.743127,Winter-Heavy
2,Pullover rollneck,1909.0,7756.0,0.246132,0.980607,Winter-Heavy
3,Pull over half zip,487.0,1679.0,0.290054,1.091137,Winter-Heavy
4,Merchandise,171.0,557.0,0.307002,1.152204,Winter-Heavy
5,Coat,93.0,261.0,0.356322,0.923548,Winter-Heavy
6,Padded jacket,78.0,417.0,0.18705,1.022168,Winter-Heavy
7,Shawl,7.0,81.0,0.08642,1.169194,Winter-Heavy
8,Gloves,4.0,80.0,0.05,1.429867,Winter-Heavy
9,Sweater,1.0,5.0,0.2,2.891937,Winter-Heavy


In [24]:
# Identify the articles where the total sales is more than 15K units.
subset_noos = noos_article_sales_patterns.loc[
    noos_article_sales_patterns["sales_pattern"].isin(["Stable Year-round", "Mixed Pattern"])
].copy()

subset_noos["total"] = subset_noos["Mar-Sep"] + subset_noos["Oct-Feb"]

subset_noos = subset_noos.sort_values("total", ascending=False, ignore_index=True)
subset_noos

Unnamed: 0,articleGroupDescription,Mar-Sep,Oct-Feb,sales_ratio,CV,sales_pattern,total
0,Jeans,163816.0,152300.0,1.075614,0.317023,Mixed Pattern,316116.0
1,Dress Shirt LS,81495.0,50377.0,1.617703,0.159168,Stable Year-round,131872.0
2,Socks,78965.0,52302.0,1.509789,0.18663,Stable Year-round,131267.0
3,T-shirt SS,55506.0,35022.0,1.584889,0.179298,Stable Year-round,90528.0
4,Chino,30175.0,17744.0,1.700575,0.169968,Stable Year-round,47919.0
5,Suit,27672.0,14060.0,1.968137,0.230147,Mixed Pattern,41732.0
6,Casual belt,24580.0,12775.0,1.92407,0.261296,Mixed Pattern,37355.0
7,5-pocket,15297.0,10428.0,1.466916,0.270549,Mixed Pattern,25725.0
8,Casual shirt LS,14558.0,10997.0,1.323816,0.124898,Stable Year-round,25555.0
9,Boxershort,12189.0,7705.0,1.58196,0.239174,Mixed Pattern,19894.0


In [25]:
# Identifying individual articles with high sales in their respective seasons, thereby providing more data for modeling.
noos_summer_articles = list(noos_article_sales_patterns[noos_article_sales_patterns["sales_pattern"] == "Summer-Heavy"].sort_values("Mar-Sep", ascending=False).reset_index(drop=True).head(5)["articleGroupDescription"].unique())
noos_winter_articles = list(noos_article_sales_patterns[noos_article_sales_patterns["sales_pattern"] == "Winter-Heavy"].sort_values("Oct-Feb", ascending=False).reset_index(drop=True).head(2)["articleGroupDescription"].unique())
noos_stable_articles = list(subset_noos.sort_values("total", ascending=False, ignore_index=True).head(11)["articleGroupDescription"].unique())

# Final Articles for Forecast

In [26]:
final_articles_for_forecast = list(set([*summer_articles, *winter_articles, *stable_articles]))

final_noos_articles_for_forecast = list(set([*noos_summer_articles, *noos_winter_articles, *noos_stable_articles]))

print(f"There are {len(final_noos_articles_for_forecast)} NOOS articles and {len(final_articles_for_forecast)} other articles for forecasts, totaling {len(final_articles_for_forecast) + len(final_noos_articles_for_forecast)}.")

There are 18 NOOS articles and 35 other articles for forecasts, totaling 53.


In [27]:
# Excluding NOOS Category breakdown by season

print(f"Winter Articles ({len(winter_articles)}): {winter_articles} \n")
print(f"Summer Articles ({len(summer_articles)}): {summer_articles} \n")
print(f"Stable Year Round Articles ({len(stable_articles)}): {stable_articles} \n")
print(f"All articles combined ({len(final_articles_for_forecast)}) : {final_articles_for_forecast}")

Winter Articles (5): ['Crew-neck sweater', 'Pull over half zip', 'Pullover rollneck', 'Padded jacket', 'Coat'] 

Summer Articles (10): ['Polo SS', 'T-shirt SS', 'Short', 'Casual shirt SS', 'Sneakers', 'Jack outdoor', 'Swim short', 'Tie', 'Mix & match trousers', 'Bermuda'] 

Stable Year Round Articles (20): ['Trendy shirt LS', 'Casual shirt LS', 'Chino', 'Dress Shirt LS', 'Cardigan', 'Jeans', 'Sweatshirt', 'Jacket', '5-pocket', 'Dress Shoe', 'Overshirt', 'Polo LS', 'Hoodie', 'Trousers', 'V-neck sweater', 'Casual Shoe', 'Suit', 'Mix & match jacket', 'Headwear', 'Socks'] 

All articles combined (35) : ['Mix & match jacket', 'Trendy shirt LS', 'Crew-neck sweater', 'Pull over half zip', 'Sneakers', 'Dress Shoe', 'Casual shirt LS', 'Dress Shirt LS', 'T-shirt SS', 'Chino', 'Short', 'Casual Shoe', 'Jeans', 'Suit', 'Overshirt', 'Coat', 'V-neck sweater', 'Trousers', 'Socks', 'Casual shirt SS', 'Headwear', 'Polo SS', 'Mix & match trousers', 'Tie', 'Polo LS', 'Hoodie', 'Sweatshirt', '5-pocket', 'B

In [28]:
# NOOS Category breakdown by season

print(f"NOOS Winter Articles ({len(noos_winter_articles)}): {noos_winter_articles} \n")
print(f"NOOS Summer Articles ({len(noos_summer_articles)}): {noos_summer_articles} \n")
print(f"NOOS Stable Year Round Articles ({len(noos_stable_articles)}): {noos_stable_articles} \n")
print(f"NOOS all combined articles ({len(final_noos_articles_for_forecast)}): {final_noos_articles_for_forecast}")

NOOS Winter Articles (2): ['V-neck sweater', 'Crew-neck sweater'] 

NOOS Summer Articles (5): ['Polo SS', 'Dress belt', 'Mix & match trousers', 'Mix & match jacket', 'Short'] 

NOOS Stable Year Round Articles (11): ['Jeans', 'Dress Shirt LS', 'Socks', 'T-shirt SS', 'Chino', 'Suit', 'Casual belt', '5-pocket', 'Casual shirt LS', 'Boxershort', 'Shoe care'] 

NOOS all combined articles (18): ['Mix & match jacket', 'Mix & match trousers', '5-pocket', 'Boxershort', 'Crew-neck sweater', 'Jeans', 'Suit', 'Dress belt', 'V-neck sweater', 'Socks', 'Dress Shirt LS', 'Chino', 'Casual belt', 'T-shirt SS', 'Casual shirt LS', 'Shoe care', 'Short', 'Polo SS']


In [29]:
data.to_parquet("/Users/trentino/Work/OFM/droplet/2026 Forecast/data/processed_sales_data_for_model.parquet")

In [30]:
main_data.head()

Unnamed: 0,locationIdentifier,salesDate,articleIdentifier,articleGroupIdentifier,articleGroupDescription,brandIdentifier,brandDescription,seasonIdentifier,quantity,purchaseValue,adviceRetailValue,retailValueWithoutTax,retailValue,purchasePrice,adviceRetailPrice,retailPriceWithoutTax,retailPrice,Revenue,Inhouse_Brand,Is_Return,season,year,month,temperature_2m_mean,apparent_temperature_mean,tag_price,purchasePrice_barcode,adviceRetailPrice_with_tax,Discount%,Margin%
0,41.0,2024-07-02,2015000049971,162.0,T-shirt SS,505.0,Fred Perry,242.0,1.0,22.0,45.41,36.32,43.95,20.35,45.41,36.32,43.95,43.95,False,False,Summer,2024,7,14.74,13.07,54.95,22.0,54.95,20.02,53.7
1,41.0,2024-07-02,2015000064691,156.0,Sweatshirt,831.0,Peuterey,242.0,1.0,55.1,132.19,92.52,111.95,55.1,132.19,92.52,111.95,111.95,False,False,Summer,2024,7,14.74,13.07,159.95,55.099998,159.95,30.01,50.78
2,41.0,2024-07-02,8058700181255,125.0,Short,867.0,Replay,4.0,1.0,39.6,81.82,56.98,68.95,25.74,81.82,56.98,68.95,68.95,False,False,NOOS,2024,7,14.74,13.07,99.0,39.599998,99.0,30.35,62.67
3,41.0,2024-07-02,8719625578147,162.0,T-shirt SS,618.0,J.C. RAGS,233.0,1.0,9.0,37.15,18.14,21.95,10.68,37.15,18.14,21.95,21.95,True,False,Winter,2024,7,14.74,13.07,44.95,9.0,44.95,51.17,51.34
4,41.0,2024-07-02,8720199615794,122.0,Jeans,377.0,Denham,242.0,1.0,64.0,132.23,105.74,127.95,62.72,132.23,105.74,127.95,127.95,False,False,Summer,2024,7,14.74,13.07,160.0,64.0,160.0,20.03,50.98


In [31]:
main_data.to_parquet("/Users/trentino/Work/OFM/droplet/2026 Forecast/data/clean_sales_data.parquet")

In [32]:
main_data.groupby(['year', 'month']).agg({"quantity":"sum", "retailPrice":"sum"}).reset_index()

Unnamed: 0,year,month,quantity,retailPrice
0,2019,1,34188.0,3375900.63
1,2019,2,26619.0,2626830.07
2,2019,3,26158.0,2798223.29
3,2019,4,32501.0,3091790.58
4,2019,5,42304.0,4057980.1
5,2019,6,35784.0,3306139.2
6,2019,7,37446.0,3234312.5
7,2019,8,47331.0,3383446.14
8,2019,9,26363.0,2909862.96
9,2019,10,31409.0,3452170.03


In [33]:
main_data[main_data['articleGroupDescription'].isin(winter_articles)].groupby(['year']).agg({"quantity":"sum", "retailPrice":"sum"}).reset_index()

Unnamed: 0,year,quantity,retailPrice
0,2019,33505.0,4203711.97
1,2020,44974.0,5082429.73
2,2021,45186.0,5258705.12
3,2022,61057.0,6463838.1
4,2023,73697.0,7163861.4
5,2024,74845.0,7937190.25
6,2025,39546.0,4595032.3


In [157]:
others_check = list(main_data[(main_data['season'].isin(['Winter', "Summer"])) & ~(main_data['articleGroupDescription'].isin([*stable_articles,
                                                                                                                             *summer_articles,
                                                                                                                             *winter_articles]))]['articleGroupDescription'].unique())

In [143]:
len(stable_articles), len(summer_articles), len(winter_articles)

(20, 10, 5)

In [158]:
summer_others_check = list(main_data[(main_data["season"].isin(['Summer'])) & (main_data['articleGroupDescription'].isin(others_check))]['articleGroupDescription'].unique())
winter_others_check = list(main_data[(main_data["season"].isin(['Winter'])) & (main_data['articleGroupDescription'].isin(others_check))]['articleGroupDescription'].unique())

In [165]:
summer_actual_others = list(set(set(summer_others_check) - set(winter_others_check)))
winter_actual_others = list(set(set(winter_others_check) - set(summer_others_check)))
all_season_actual_others = list(set(set(winter_others_check).intersection(set(summer_others_check))))

In [166]:
len(summer_actual_others), len(winter_actual_others), len(all_season_actual_others)

(11, 15, 38)

In [168]:
len(set(winter_articles)) + len(set(summer_articles)) + len(set(stable_articles)) + len(set(summer_actual_others)) + len(set(winter_actual_others)) + len(set(all_season_actual_others))

99

In [174]:
print(winter_articles)

['Crew-neck sweater', 'Pull over half zip', 'Pullover rollneck', 'Padded jacket', 'Coat']


In [170]:
print(winter_actual_others)

['Dress Shirt short sleeve', 'T-shirt KM', 'Scarf', 'Mix & match waistcoat', 'Jack/jas', 'Pajama', 'Accessorry', 'Cap', 'Belt', 'Tuxedo shirt', 'Tuxedo', 'Gift', 'Dresshirt', 'Shoes', 'Umbrella']


In [175]:
print(summer_articles)

['Polo SS', 'T-shirt SS', 'Short', 'Casual shirt SS', 'Sneakers', 'Jack outdoor', 'Swim short', 'Tie', 'Mix & match trousers', 'Bermuda']


In [172]:
print(summer_actual_others)

['Skin care', 'Cufflinks', 'Colbert', 'Scents', 'Espadrilles', 'Wedding suit', 'Gadgets', 'Diversen', 'Instapper', 'Trendy shirt SS', 'Apron']


In [176]:
print(stable_articles)

['Trendy shirt LS', 'Casual shirt LS', 'Chino', 'Dress Shirt LS', 'Cardigan', 'Jeans', 'Sweatshirt', 'Jacket', '5-pocket', 'Dress Shoe', 'Overshirt', 'Polo LS', 'Hoodie', 'Trousers', 'V-neck sweater', 'Casual Shoe', 'Suit', 'Mix & match jacket', 'Headwear', 'Socks']


In [173]:
print(all_season_actual_others)

['Rugby', 'Slippers', 'Parka', 'Spencer', 'Dress Shirt extra long sleeve', 'Grandad', 'Casual shirt short sleeve', 'Sweatpants', 'T-shirt LS', 'Dress Shirt long sleeve', 'Gloves', 'T-shirt long sleeve', 'Polo short sleeve', 'Pocket square', 'Bow', 'T-shirt short sleeve', 'Pullover half zip', 'Boat Shoes', 'Knitwear', 'Dress belt', 'Boots', 'Casual belt', 'Leather jacket', 'Boxershort', 'Bag', 'Shirt', 'Suspenders', 'Waistcoat', 'Others', 'Dress Shirt SS', 'Loafers', 'Trendy shirt long sleeve', 'Polo long sleeve', 'Shawl', 'Casual shirt long sleeve', 'Blazer', 'Bodywarmer', 'Watch']
