In [None]:
# %%  Low-CPU mode
#import os
#for k in ("ARROW_NUM_THREADS", "OMP_NUM_THREADS", "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS", "NUMEXPR_MAX_THREADS"):
#    os.environ.setdefault(k, "1")


In [1]:
import pandas as pd

tx = pd.read_parquet("../data/processed/transactions_clean.parquet")
customers = pd.read_parquet("../data/processed/customers_clean.parquet")
articles  = pd.read_parquet("../data/processed/articles_clean.parquet")

In [2]:
# === Debug: unique Age / Gender in customers only ===
import pandas as pd

def _find_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

age_col = _find_col(customers, ["Age","age","customer_age"])
gen_col = _find_col(customers, ["Gender","gender","customer_gender"])

print("--- customers ---")
if age_col:
    ages = pd.to_numeric(customers[age_col], errors="coerce")
    print(f"Age column: '{age_col}' dtype={customers[age_col].dtype}")
    print(f" non-null={ages.notna().sum()}, unique={ages.dropna().nunique()}")
    if ages.notna().any():
        print(f" min={float(ages.min())}, max={float(ages.max())}")
        print(" sample uniques:", sorted(ages.dropna().unique().tolist())[:20])
else:
    print("Age column: NOT FOUND")

if gen_col:
    g = customers[gen_col].astype("string[python]").str.strip().str.lower()
    print(f"Gender column: '{gen_col}' dtype={customers[gen_col].dtype}")
    print(" unique (non-null):", sorted(g.dropna().unique().tolist()))
    print("\n value counts:\n", g.value_counts(dropna=True))
else:
    print("Gender column: NOT FOUND")


--- customers ---
Age column: 'Age' dtype=float64
 non-null=36592, unique=87
 min=17.0, max=103.0
 sample uniques: [17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0]
Gender column: 'Gender' dtype=object
 unique (non-null): ['female', 'male']

 value counts:
 Gender
female    34836
male       1756
Name: count, dtype: Int64


In [3]:
customers.dtypes

shopUserId                  object
invoiceFirstName    string[python]
invoiceLastName     string[python]
invoiceSSN                  object
invoiceZip          string[python]
invoiceCity                 object
invoiceCountryId            object
invoiceEmail                object
_row                         int64
Gender                      object
Age                        float64
Country                     object
dtype: object

## Build json for mental mapping

In [4]:
# %% [markdown]
# # Setup & Constants
# Lightweight imports and shared constants.

# %%
import re, json
import numpy as np
import pandas as pd
from pathlib import Path

NORDICS = ["Sweden", "Denmark", "Finland", "Norway"]

In [5]:
# %% [markdown]
# # Generic helpers
# Small utilities reused across the pipeline.

# %%
def status_from_orders(n: int) -> str:
    return "new" if n <= 1 else ("returning" if n <= 3 else "loyal")

def mode_or_first(s: pd.Series):
    m = s.mode()
    if not m.empty:
        return m.iat[0]
    s = s.dropna()
    return s.iat[0] if not s.empty else None

def _pick(df: pd.DataFrame, names, default=None) -> pd.Series:
    for n in names:
        if n in df.columns:
            return df[n]
    return pd.Series([default] * len(df), index=df.index)


In [6]:
# %% [markdown]
# # Normalization helpers
# Standardize ids and small categorical fields.

# %%
def _norm_id(s: pd.Series) -> pd.Series:
    # to string, strip, drop trailing ".0" if present (common after Parquet/float cast)
    s = s.astype("string[python]").str.strip()
    return s.str.replace(r"\.0+$", "", regex=True)

def _norm_gender(s: pd.Series):
    if s is None:
        return s
    s = s.astype("string[python]").str.strip().str.lower()
    return s.replace({"f": "female", "m": "male", "kvinnan": "female", "man": "male"})


In [7]:
# %% [markdown]
# # Tx base-frame builder
# Make a clean base transaction frame with unified schema.

# %%
def _build_base_tx(tx: pd.DataFrame) -> pd.DataFrame:
    country = _pick(tx, ["currency_country", "Country"], "Unknown").astype("string[python]").str.strip()
    country = country.replace({"": pd.NA}).fillna("Unknown")
    city = _pick(tx, ["invoiceCity", "city"], "Unknown").astype(object).fillna("Unknown")
    shop = _norm_id(tx["shopUserId"])
    order_id = tx["orderId"].astype("string[python]").str.strip()

    created_raw = tx["created"]
    created = created_raw if np.issubdtype(created_raw.dtype, np.datetime64) else pd.to_datetime(created_raw, errors="coerce")
    if isinstance(created, pd.Series) and np.issubdtype(created.dtype, np.datetime64):
        try:
            if getattr(created.dt, "tz", None) is not None:
                created = created.dt.tz_localize(None)
        except Exception:
            pass

    rev = pd.to_numeric(tx.get("line_total_sek"), errors="coerce").fillna(0)
    typ = tx["type"] if "type" in tx.columns else pd.Series([None] * len(tx), index=tx.index)
    price = tx["price"] if "price" in tx.columns else pd.Series([None] * len(tx), index=tx.index)

    return pd.DataFrame(
        {
            "country": country,
            "city": city,
            "shopUserId": shop,
            "orderId": order_id,
            "rev": rev,
            "created": created,
            "type": typ,
            "price": price,
        }
    )


In [8]:
# %% [markdown]
# # Tx base-frame builder
# Make a clean base transaction frame with unified schema.

# %%
def _build_base_tx(tx: pd.DataFrame) -> pd.DataFrame:
    country = _pick(tx, ["currency_country", "Country"], "Unknown").astype("string[python]").str.strip()
    country = country.replace({"": pd.NA}).fillna("Unknown")
    city = _pick(tx, ["invoiceCity", "city"], "Unknown").astype(object).fillna("Unknown")
    shop = _norm_id(tx["shopUserId"])
    order_id = tx["orderId"].astype("string[python]").str.strip()

    created_raw = tx["created"]
    created = created_raw if np.issubdtype(created_raw.dtype, np.datetime64) else pd.to_datetime(created_raw, errors="coerce")
    if isinstance(created, pd.Series) and np.issubdtype(created.dtype, np.datetime64):
        try:
            if getattr(created.dt, "tz", None) is not None:
                created = created.dt.tz_localize(None)
        except Exception:
            pass

    rev = pd.to_numeric(tx.get("line_total_sek"), errors="coerce").fillna(0)
    typ = tx["type"] if "type" in tx.columns else pd.Series([None] * len(tx), index=tx.index)
    price = tx["price"] if "price" in tx.columns else pd.Series([None] * len(tx), index=tx.index)

    return pd.DataFrame(
        {
            "country": country,
            "city": city,
            "shopUserId": shop,
            "orderId": order_id,
            "rev": rev,
            "created": created,
            "type": typ,
            "price": price,
        }
    )


In [9]:
# %% [markdown]
# # Customer normalization & collapse
# Normalize ids, pick Age/Gender variants, and collapse duplicates.

# %%
def _prep_customers(customers: pd.DataFrame) -> pd.DataFrame:
    c = customers.copy()
    c["shopUserId_norm"] = _norm_id(c["shopUserId"])
    c["Age"] = pd.to_numeric(_pick(c, ["Age", "age", "customer_age"]), errors="coerce").astype("Float64")
    c["Gender"] = _pick(c, ["Gender", "gender", "customer_gender"])
    c_agg = (
        c.groupby("shopUserId_norm", dropna=False)
        .agg(Age=("Age", mode_or_first), Gender=("Gender", mode_or_first))
        .reset_index()
    )
    return c_agg


In [10]:
# %% [markdown]
# # Customer normalization & collapse
# Normalize ids, pick Age/Gender variants, and collapse duplicates.

# %%
def _prep_customers(customers: pd.DataFrame) -> pd.DataFrame:
    c = customers.copy()
    c["shopUserId_norm"] = _norm_id(c["shopUserId"])
    c["Age"] = pd.to_numeric(_pick(c, ["Age", "age", "customer_age"]), errors="coerce").astype("Float64")
    c["Gender"] = _pick(c, ["Gender", "gender", "customer_gender"])
    c_agg = (
        c.groupby("shopUserId_norm", dropna=False)
        .agg(Age=("Age", mode_or_first), Gender=("Gender", mode_or_first))
        .reset_index()
    )
    return c_agg


In [11]:
# %% [markdown]
# # Merge tx + customers
# Attach age/gender and apply normalization.

# %%
def _merge_tx_customers(base_df: pd.DataFrame, c_agg: pd.DataFrame) -> pd.DataFrame:
    df = base_df.merge(c_agg, left_on="shopUserId", right_on="shopUserId_norm", how="left")
    df.drop(columns=["shopUserId_norm"], inplace=True)

    df["age"] = pd.to_numeric(df["Age"], errors="coerce").astype("Float64")
    df["gender"] = _norm_gender(df["Gender"])
    df.drop(columns=["Age", "Gender"], inplace=True)
    return df


In [12]:
# %% [markdown]
# # Optional backfill of country from customers
# If tx country is Unknown, pull from customers.Country when available.

# %%
def _backfill_country_from_customers(df: pd.DataFrame, customers: pd.DataFrame) -> pd.DataFrame:
    if "Country" in customers.columns and (df["country"] == "Unknown").any():
        country_map = (
            customers.assign(shopUserId_norm=_norm_id(customers["shopUserId"]))
            .dropna(subset=["shopUserId_norm"])
            .drop_duplicates("shopUserId_norm")
            .set_index("shopUserId_norm")["Country"]
            .astype("string[python]")
            .str.strip()
        )
        mask = df["country"].eq("Unknown")
        df.loc[mask, "country"] = df.loc[mask, "shopUserId"].map(country_map).fillna("Unknown")
    return df


In [13]:
# %% [markdown]
# # Splitter
# Produce a dict of country → dataframe restricted to NORDICS.

# %%
def split_nordics(tx: pd.DataFrame, customers: pd.DataFrame) -> dict:
    base_df = _build_base_tx(tx)
    c_agg = _prep_customers(customers)
    df = _merge_tx_customers(base_df, c_agg)
    df = _backfill_country_from_customers(df, customers)
    return {cname: df[df["country"] == cname].copy() for cname in NORDICS}


In [14]:
# %% [markdown]
# # Export helpers: totals & aggregates
# Small builders used by the JSON exporter.
# (UPDATED: added _agg_city_monthly)

# %%
def _country_totals(df: pd.DataFrame):
    total_revenue = int(np.rint(df["rev"].sum()))
    customers_cnt = int(df["shopUserId"].nunique())
    total_orders = int(df["orderId"].nunique())
    aov_country = None if total_orders == 0 else int(round(float(total_revenue) / total_orders))
    return total_revenue, customers_cnt, total_orders, aov_country

def _agg_city(df: pd.DataFrame):
    agg_city = (
        df.groupby("city", dropna=False, sort=False)
        .agg(total_revenue_sek=("rev", "sum"), customers_count=("shopUserId", "nunique"))
    )
    agg_city["total_revenue_sek"] = np.rint(agg_city["total_revenue_sek"]).astype("int64")
    agg_city["customers_count"] = agg_city["customers_count"].astype("int64")
    city_orders = df.groupby("city", dropna=False)["orderId"].nunique().rename("total_orders").astype("int64")
    return agg_city, city_orders

def _agg_customer(df: pd.DataFrame):
    df_cust = df[df["shopUserId"].notna()].copy()
    agg_customer = (
        df_cust.groupby(["city", "shopUserId"], dropna=False, sort=False)
        .agg(
            total_spent_sek=("rev", "sum"),
            total_orders=("orderId", "nunique"),
            first_order=("created", "min"),
            last_order=("created", "max"),
            age=("age", mode_or_first),
            gender=("gender", mode_or_first),
        )
    )
    agg_customer["total_spent_sek"] = np.rint(agg_customer["total_spent_sek"]).astype("int64")
    agg_customer = agg_customer.sort_index(level=["city", "shopUserId"])
    return agg_customer

def _agg_order(df: pd.DataFrame):
    df_cust = df[df["shopUserId"].notna()].copy()
    agg_order = (
        df_cust.groupby(["city", "shopUserId", "orderId"], dropna=False, sort=False)
        .agg(
            order_total_sek=("rev", "sum"),
            n_items=("orderId", "size"),
            created=("created", "min"),
            order_type=("type", mode_or_first),
            price=("price", mode_or_first),
        )
    )
    agg_order["order_total_sek"] = np.rint(agg_order["order_total_sek"]).astype("int64")
    agg_order = agg_order.sort_index(level=["city", "shopUserId", "orderId"])
    return agg_order

def _agg_city_monthly(df: pd.DataFrame) -> dict[str, dict[str, int]]:
    """
    NEW: sum revenue by city and calendar month (YYYY-MM).
    Returns {city -> { 'YYYY-MM': total_revenue_sek_int, ... }, ...}
    """
    dfm = df.copy()
    # Ensure naive, month string
    ym = dfm["created"]
    if getattr(ym.dt, "tz", None) is not None:
        ym = ym.dt.tz_localize(None)
    dfm["year_month"] = ym.dt.to_period("M").astype(str)

    g = (
        dfm.groupby(["city", "year_month"], dropna=False)["rev"]
        .sum()
        .round()
        .astype("int64")
    )
    out: dict[str, dict[str, int]] = {}
    for (cty, ym), val in g.items():
        ckey = "Unknown" if pd.isna(cty) else str(cty)
        out.setdefault(ckey, {})[ym] = int(val)
    return out

def _customers_by_channel(df: pd.DataFrame) -> dict[str, int]:
    """
    Count distinct customers per 'type' (channel).
    Assumes 'type' is always one of: 'telephone', 'web', or 'Email'.
    """
    if "type" not in df.columns:
        return {}
    tmp = df[["shopUserId", "type"]].dropna().copy()

    # No normalization needed; just use the type as channel
    tmp["channel"] = tmp["type"]

    # distinct customers per channel
    g = (
        tmp.dropna(subset=["channel", "shopUserId"])
           .groupby("channel")["shopUserId"]
           .nunique()
           .astype(int)
    )
    return {k: int(v) for k, v in g.items()}


In [15]:
# %% [markdown]
# # Export helpers: items extraction
# Build items_df from original tx aligned to country df indices.

# %%
def _build_items_grouped(df_country: pd.DataFrame, tx: pd.DataFrame, articles: pd.DataFrame | None = None):
    item_cols = ["sku","groupId","created","quantity","price_sek","name","line_total_sek","type","brand","category","price"]
    present   = [c for c in item_cols if c in tx.columns]

    items_df = pd.DataFrame({"city": df_country["city"], "shopUserId": df_country["shopUserId"], "orderId": df_country["orderId"]})
    for c in present:
        if c == "created":
            col_created = df_country["created"]
            try:
                if getattr(col_created.dt, "tz", None) is not None:
                    col_created = col_created.dt.tz_localize(None)
            except Exception:
                pass
            items_df[c] = col_created
        else:
            items_df[c] = tx.loc[df_country.index, c] if c in tx.columns else None

    # --- NEW: enrich brand/category from articles ---
    if articles is not None:
        art = articles.copy()

        # normalize join keys
        for key in ("sku", "groupId"):
            if key in art.columns:
                art[key] = art[key].astype("string[python]").str.strip()
        if "sku" in items_df.columns:
            items_df["sku"] = items_df["sku"].astype("string[python]").str.strip()
        if "groupId" in items_df.columns:
            items_df["groupId"] = items_df["groupId"].astype("string[python]").str.strip()

        # ensure target columns exist
        if "brand" not in items_df.columns:
            items_df["brand"] = pd.NA
        if "category" not in items_df.columns:
            items_df["category"] = pd.NA

        # maps by SKU (preferred)
        if "sku" in art.columns:
            if "brand" in art.columns:
                sku_brand = art.dropna(subset=["sku"]).drop_duplicates("sku").set_index("sku")["brand"]
                items_df["brand"] = items_df["brand"].fillna(items_df.get("sku").map(sku_brand))
            if "category" in art.columns:
                sku_cat = art.dropna(subset=["sku"]).drop_duplicates("sku").set_index("sku")["category"]
                items_df["category"] = items_df["category"].fillna(items_df.get("sku").map(sku_cat))

        # fallback maps by groupId
        if "groupId" in art.columns:
            if "brand" in art.columns:
                gid_brand = art.dropna(subset=["groupId"]).drop_duplicates("groupId").set_index("groupId")["brand"]
                items_df["brand"] = items_df["brand"].fillna(items_df.get("groupId").map(gid_brand))
            if "category" in art.columns:
                gid_cat = art.dropna(subset=["groupId"]).drop_duplicates("groupId").set_index("groupId")["category"]
                items_df["category"] = items_df["category"].fillna(items_df.get("groupId").map(gid_cat))

    items_df = items_df[df_country["shopUserId"].notna()].copy()
    items_df["city"] = items_df["city"].fillna("Unknown")
    return items_df.groupby(["city","shopUserId","orderId"], dropna=False, sort=False)


In [16]:
# %% [markdown]
# # Export helpers: JSON node builders
# Convert rows to JSON-friendly dicts.

# %%
def _item_dict(row: pd.Series):
    cr = row.get("created")
    if isinstance(cr, pd.Timestamp):
        cr = cr.isoformat(sep=" ")
    def nz(v): return None if pd.isna(v) else v
    def to_int(v): return None if pd.isna(v) else int(v)
    def to_float(v): return None if pd.isna(v) else float(v)
    return {
        "sku": nz(row.get("sku")),
        "groupId": nz(row.get("groupId")),
        "created": nz(cr),
        "quantity": to_int(row.get("quantity")),
        "price_sek": to_int(row.get("price_sek")),
        "name": nz(row.get("name")),
        "line_total_sek": to_int(row.get("line_total_sek")),
        "type": nz(row.get("type")),
        "brand": nz(row.get("brand")),
        "category": nz(row.get("category")),
        "price": to_float(row.get("price")),
    }


In [17]:
# %% [markdown]
# # Export: main function
# Assemble the JSON structure and write to disk.
# (UPDATED: inject per-city monthly revenue)

# %%
def export_country_json(df_country: pd.DataFrame, tx: pd.DataFrame, country_name: str, out_dir="/workspace/data/processed", articles: pd.DataFrame | None = None):
    df = df_country.copy()
    df["city"] = df["city"].fillna("Unknown")

    # --- totals ---
    total_revenue, customers_cnt, total_orders, aov_country = _country_totals(df)

    # --- city/ customer / order aggregates ---
    agg_city, city_orders = _agg_city(df)
    agg_customer = _agg_customer(df)
    agg_order = _agg_order(df)

    # --- monthly revenue per city (existing new feature) ---
    city_monthly_map = _agg_city_monthly(df)

    # --- NEW: unique customers by channel (from 'type') ---
    customers_by_channel = _customers_by_channel(df)

    # --- items ---
    items_grouped = _build_items_grouped(df_country, tx, articles=articles)  

    # ---------- build JSON ----------
    top_key = country_name.lower()
    result = {
        top_key: {
            "total_revenue_sek": int(total_revenue),
            "customers_count": int(customers_cnt),
            "total_orders": int(total_orders),
            "avg_order_value_sek": aov_country,
            "customers_by_channel": customers_by_channel,     # << NEW
            "cities": {},
        }
    }

    # cities
    for cty, row in agg_city.iterrows():
        ckey = "Unknown" if pd.isna(cty) else str(cty)
        orders_c = int(city_orders.get(cty, 0))
        rev_c = int(row["total_revenue_sek"])
        aov_c = None if orders_c == 0 else int(round(float(rev_c) / orders_c))

        result[top_key]["cities"][ckey] = {
            "total_revenue_sek": rev_c,
            "customers_count": int(row["customers_count"]),
            "total_orders": orders_c,
            "avg_order_value_sek": aov_c,
            # NEW: monthly breakdown here
            "monthly_revenue_sek": city_monthly_map.get(ckey, {}),
            "customers": {},
        }

    # customers + orders + items (unchanged)
    for (cty, uid), row in agg_customer.iterrows():
        status = status_from_orders(int(row["total_orders"]))
        first_iso = row["first_order"].isoformat(sep=" ") if pd.notna(row["first_order"]) else None
        last_iso = row["last_order"].isoformat(sep=" ") if pd.notna(row["last_order"]) else None

        age_val = None
        if "age" in row and pd.notna(row["age"]):
            try:
                age_val = int(row["age"])
            except Exception:
                age_val = None
        gender_val = None if "gender" not in row or pd.isna(row["gender"]) else str(row["gender"])

        cust_node = {
            "summary": {
                "total_orders": int(row["total_orders"]),
                "total_spent_sek": int(row["total_spent_sek"]),
                "first_order": first_iso,
                "last_order": last_iso,
                "status": status,
                "age": age_val,
                "gender": gender_val,
            },
            "orders": {},
        }

        try:
            cust_orders = agg_order.loc[(cty, uid)]
            if isinstance(cust_orders, pd.Series):
                cust_orders = cust_orders.to_frame().T
            for oid, orow in cust_orders.iterrows():
                try:
                    items_for_order = items_grouped.get_group((cty, uid, oid))
                    items = [_item_dict(r) for _, r in items_for_order.iterrows()]
                except KeyError:
                    items = []
                cust_node["orders"][str(oid)] = {
                    "created": orow["created"].isoformat(sep=" ") if pd.notna(orow["created"]) else None,
                    "order_total_sek": int(orow["order_total_sek"]),
                    "n_items": int(orow["n_items"]),
                    "order_type": None if pd.isna(orow["order_type"]) else orow["order_type"],
                    "price": None if pd.isna(orow.get("price")) else float(orow.get("price")),
                    "items": items,
                }
        except KeyError:
            pass

        ckey = "Unknown" if pd.isna(cty) else str(cty)
        result[top_key]["cities"][ckey]["customers"][str(uid)] = cust_node

    # write
    out_path = Path(out_dir) / f"{country_name}.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"Saved: {out_path}")


In [18]:
# %% [markdown]
# # Example usage
# Split once, export four times.

# %%
countries = split_nordics(tx, customers)
export_country_json(countries["Sweden"],  tx, "Sweden")
export_country_json(countries["Denmark"], tx, "Denmark")
export_country_json(countries["Finland"], tx, "Finland")
export_country_json(countries["Norway"],  tx, "Norway")


Saved: /workspace/data/processed/Sweden.json
Saved: /workspace/data/processed/Denmark.json
Saved: /workspace/data/processed/Finland.json
Saved: /workspace/data/processed/Norway.json


## Flatten json for quick math

In [19]:
# %% [markdown]
# # Imports & Futures

# %%
from __future__ import annotations
from pathlib import Path
import json
import pandas as pd


In [20]:
# %% [markdown]
# # Paths & Config

# %%
# ---- configure these paths ----
INPUT_DIR  = Path("../data/processed")     # Sweden.json, Denmark.json, Finland.json, Norway.json
OUTPUT_DIR = Path("../data/parquet_out")   # will contain the 5 Parquet files

COUNTRY_FILES = {
    "Sweden":  INPUT_DIR / "Sweden.json",
    "Denmark": INPUT_DIR / "Denmark.json",
    "Finland": INPUT_DIR / "Finland.json",
    "Norway":  INPUT_DIR / "Norway.json",
}


In [21]:
# %% [markdown]
# # Stable Column Schemas
# (UPDATED: added CS_COUNTRY_CHANNEL)

# %%
CS_COUNTRY = ["country","total_revenue_sek","customers_count","total_orders","avg_order_value_sek"]
CS_CITY    = ["country","city","total_revenue_sek","customers_count","total_orders","avg_order_value_sek"]

# Added age & gender here
CS_CUST    = ["country","city","customer_id","total_orders","total_spent_sek",
              "first_order","last_order","status","age","gender"]

CS_ORDERS  = ["country","city","customer_id","order_id","created","order_total_sek","n_items","order_type","price"]
CS_ITEMS   = ["country","city","customer_id","order_id","sku","groupId","created","quantity","price_sek","name","line_total_sek","type","brand","category","price"]

# monthly revenue per city (existing)
CS_CITY_MONTHLY = ["country","city","year_month","total_revenue_sek"]

# NEW: customers by channel per country
CS_COUNTRY_CHANNEL = ["country","channel","customers_count"]


In [22]:
# %% [markdown]
# # I/O Helpers

# %%
def load_json(path: Path) -> dict:
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

def save_parquet(df: pd.DataFrame, path: Path) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(path, index=False)


In [23]:
# %% [markdown]
# # DataFrame Utilities

# %%
def _ensure(df: pd.DataFrame | None, cols: list[str]) -> pd.DataFrame:
    """Ensure columns exist, preserve dtypes, and reorder."""
    if df is None or df.empty:
        return pd.DataFrame({c: pd.Series([], dtype="object") for c in cols})[cols]
    for c in cols:
        if c not in df.columns:
            df[c] = pd.NA
    return df[cols]


In [24]:
# %% [markdown]
# # JSON Shape Helpers

# %%
def _unwrap(obj: dict, hint: str) -> tuple[str, dict]:
    """
    Accept {"denmark": {...}} or {...}.
    Returns (country_name_capitalized, payload_dict).
    """
    if isinstance(obj, dict) and len(obj) == 1 and isinstance(next(iter(obj.values())), dict):
        k = next(iter(obj.keys()))
        return k.capitalize(), next(iter(obj.values()))
    if not isinstance(obj, dict):
        raise ValueError("Top-level JSON must be an object/dict")
    return hint, obj


In [25]:
# %% [markdown]
# # JSON Shape Helpers

# %%
def _unwrap(obj: dict, hint: str) -> tuple[str, dict]:
    """
    Accept {"denmark": {...}} or {...}.
    Returns (country_name_capitalized, payload_dict).
    """
    if isinstance(obj, dict) and len(obj) == 1 and isinstance(next(iter(obj.values())), dict):
        k = next(iter(obj.keys()))
        return k.capitalize(), next(iter(obj.values()))
    if not isinstance(obj, dict):
        raise ValueError("Top-level JSON must be an object/dict")
    return hint, obj


In [26]:

# %% [markdown]
# # Flatten: Country → Row Buckets
# (UPDATED: collect "country_channels" from root.customers_by_channel)

# %%
def flatten_country(obj: dict, country_hint: str) -> dict[str, list[dict]]:
    country, root = _unwrap(obj, country_hint)
    out = {
        "country_summary": [{
            "country": country,
            "total_revenue_sek": root.get("total_revenue_sek"),
            "customers_count": root.get("customers_count"),
            "total_orders": root.get("total_orders"),
            "avg_order_value_sek": root.get("avg_order_value_sek"),
        }],
        "country_channels": [],   # NEW
        "city_summary": [],
        "city_monthly": [],
        "customer_summary": [],
        "orders": [],
        "order_items": [],
    }

    # NEW: expand customers_by_channel dict into rows
    for ch, cnt in (root.get("customers_by_channel") or {}).items():
        out["country_channels"].append({
            "country": country,
            "channel": ch,
            "customers_count": cnt,
        })


    for city, cnode in (root.get("cities") or {}).items():
        out["city_summary"].append({
            "country": country, "city": city,
            "total_revenue_sek": cnode.get("total_revenue_sek"),
            "customers_count": cnode.get("customers_count"),
            "total_orders": cnode.get("total_orders"),
            "avg_order_value_sek": cnode.get("avg_order_value_sek"),
        })

        # NEW: monthly map { 'YYYY-MM': revenue }
        for ym, rev in (cnode.get("monthly_revenue_sek") or {}).items():
            out["city_monthly"].append({
                "country": country,
                "city": city,
                "year_month": ym,
                "total_revenue_sek": rev,
            })

        for cust_id, cst in (cnode.get("customers") or {}).items():
            summ = cst.get("summary") or {}
            out["customer_summary"].append({
                "country": country,
                "city": city,
                "customer_id": cust_id,
                "total_orders": summ.get("total_orders"),
                "total_spent_sek": summ.get("total_spent_sek"),
                "first_order": summ.get("first_order"),
                "last_order": summ.get("last_order"),
                "status": summ.get("status"),
                "age": summ.get("age"),
                "gender": summ.get("gender"),
            })
            for order_id, ordn in (cst.get("orders") or {}).items():
                out["orders"].append({
                    "country": country, "city": city, "customer_id": cust_id, "order_id": order_id,
                    "created": ordn.get("created"),
                    "order_total_sek": ordn.get("order_total_sek"),
                    "n_items": ordn.get("n_items"),
                    "order_type": ordn.get("order_type"),
                    "price": ordn.get("price"),
                })
                for it in (ordn.get("items") or []):
                    out["order_items"].append({
                        "country": country, "city": city, "customer_id": cust_id, "order_id": order_id,
                        "sku": it.get("sku"), "groupId": it.get("groupId"), "created": it.get("created"),
                        "quantity": it.get("quantity"), "price_sek": it.get("price_sek"), "name": it.get("name"),
                        "line_total_sek": it.get("line_total_sek"), "type": it.get("type"),
                        "brand": it.get("brand"), "category": it.get("category"), "price": it.get("price"),
                    })
    return out


In [27]:
# %% [markdown]
# # Aggregation Orchestrator
# (UPDATED: include "city_monthly" bucket)

# %%
def collect_buckets(country_files: dict[str, Path]) -> dict[str, list[dict]]:
    buckets = {k: [] for k in [
        "country_summary","country_channels","city_summary","city_monthly","customer_summary","orders","order_items"
    ]}
    for name, path in country_files.items():
        if not path.exists():
            print(f"[warn] missing: {path}")
            continue
        rows = flatten_country(load_json(path), name)
        for k, v in rows.items():
            buckets[k].extend(v)
        print(f"[ok] parsed {name}")
    return buckets


In [28]:
# %% [markdown]
# # Materialize DataFrames (Fixed Schemas)
# (UPDATED: return df_city_monthly)

# %%
def to_dataframes(buckets: dict[str, list[dict]]) -> dict[str, pd.DataFrame]:
    df_country = _ensure(pd.DataFrame(buckets["country_summary"]),   CS_COUNTRY)
    df_cc      = _ensure(pd.DataFrame(buckets["country_channels"]),  CS_COUNTRY_CHANNEL)  # NEW
    df_city    = _ensure(pd.DataFrame(buckets["city_summary"]),      CS_CITY)
    df_city_m  = _ensure(pd.DataFrame(buckets["city_monthly"]),      CS_CITY_MONTHLY)
    df_cust    = _ensure(pd.DataFrame(buckets["customer_summary"]),  CS_CUST)
    df_orders  = _ensure(pd.DataFrame(buckets["orders"]),            CS_ORDERS)
    df_items   = _ensure(pd.DataFrame(buckets["order_items"]),       CS_ITEMS)
    return {
        "country_summary": df_country,
        "country_channels": df_cc,         # NEW
        "city_summary": df_city,
        "city_monthly": df_city_m,
        "customer_summary": df_cust,
        "orders": df_orders,
        "order_items": df_items,
    }


In [29]:
# %% [markdown]
# # Persist to Parquet

# %%
def write_all_parquet(dfs: dict[str, pd.DataFrame], out_dir: Path) -> None:
    save_parquet(dfs["country_summary"],   out_dir / "country_summary.parquet")
    save_parquet(dfs["country_channels"],  out_dir / "country_customers_by_channel.parquet")  # NEW
    save_parquet(dfs["city_summary"],      out_dir / "city_summary.parquet")
    save_parquet(dfs["city_monthly"],      out_dir / "city_monthly_revenue.parquet")
    save_parquet(dfs["customer_summary"],  out_dir / "customer_summary.parquet")
    save_parquet(dfs["orders"],            out_dir / "orders.parquet")
    save_parquet(dfs["order_items"],       out_dir / "order_items.parquet")
    print(f"[done] wrote Parquet files to {out_dir.resolve()}")

In [30]:
# %% [markdown]
# # Main

# %%
def main():
    buckets = collect_buckets(COUNTRY_FILES)
    dfs = to_dataframes(buckets)
    write_all_parquet(dfs, OUTPUT_DIR)

if __name__ == "__main__":
    main()


[ok] parsed Sweden
[ok] parsed Denmark
[ok] parsed Finland
[ok] parsed Norway
[done] wrote Parquet files to /workspace/data/parquet_out


In [31]:
import pandas as pd
from pathlib import Path

OUTPUT_DIR = Path("/workspace/data/parquet_out/")

order_items_df = pd.read_parquet(OUTPUT_DIR / "order_items.parquet")
order_items_df.sample(10)


Unnamed: 0,country,city,customer_id,order_id,sku,groupId,created,quantity,price_sek,name,line_total_sek,type,brand,category,price
176301,Finland,Varkaus,408235,588421,200304,200304,2025-02-18 08:54:06,1,54,Laukku tummanharmaa/luonnonvalkoinen,54,telephone,Åshild,Accessoarer,4.9
269858,Norway,Tynset,215050,650036,261318-E085,261318,2025-04-03 19:40:49,1,481,Sports-bh,481,web,Glamorise,"Sport-bh,Bh utan bygel,Bh,Underkläder",509.0
176514,Finland,Forssa,769425,601632,200267,200267,2025-02-26 13:00:28,1,32,Laukku,32,telephone,unknown,unknown,2.9
293162,Norway,Hyen,772632,632550,200267,200267,2025-03-21 08:36:45,1,27,Veske,27,letter,unknown,unknown,29.0
197971,Finland,Savikylä,320113,231990,261634-3639,261637,2024-07-28 22:16:40,4,72,Nilkkasukka VID,288,web,Locköstrumpan,"Stödstrumpor,Strumpor,Underkläder",6.5
82376,Sweden,Billdal,826543,791858,210729-4446,210729,2025-08-12 10:38:48,1,169,Stickad kofta,169,telephone,Åshild,"Överdelar,Tröjor",169.0
55646,Sweden,Bagarmossen,855266,745073,260701-C080,260701,2025-06-24 13:25:01,1,1223,Korsett Fiore,1223,telephone,Anita,"Bh utan bygel,Korsetter,Underkläder",1223.2
223690,Norway,Moss,371272,814522,230047-4042,230047,2025-08-29 12:07:20,1,470,Kjole,470,web,Åshild,"Klänningar,Överdelar,Tunikor",498.0
261561,Norway,Moelv,876922,248271,210189-4446,210186,2024-08-09 08:52:19,1,206,Polojumper,206,telephone,Åshild,"Toppar,Överdelar",218.0
26813,Sweden,Köping,643454,587466,293647,293647,2025-02-17 12:57:30,1,1098,Fotvagga,1098,web,Good Living,"Fotvård,Vardagshjälpmedel",1098.0


In [None]:
# %% [markdown]
# # Top Categories by Season

from pathlib import Path
import pandas as pd

def _season_from_month(m: int) -> str:
    # Winter: Dec–Feb, Spring: Mar–May, Summer: Jun–Aug, Autumn: Sep–Nov
    if m in (12, 1, 2):   return "Winter"
    if m in (3, 4, 5):    return "Spring"
    if m in (6, 7, 8):    return "Summer"
    return "Autumn"

def build_top_categories_by_season(
    df_items: pd.DataFrame,
    category_sep: str = ",",
    keep_top_n: int = 10,
) -> pd.DataFrame:
    df = df_items.copy()

    # --- types & defaults ---
    df["created"]  = pd.to_datetime(df["created"], errors="coerce")
    df["quantity"] = pd.to_numeric(df.get("quantity"), errors="coerce").fillna(1).astype("int64")

    def _norm(s: pd.Series) -> pd.Series:
        s = s.astype("string[python]")
        s = s.where(~s.isna(), "Unknown")
        return s.str.strip().fillna("Unknown")

    df["country"]  = _norm(df.get("country"))
    df["category"] = _norm(df.get("category"))

    # --- season label (Dec belongs to next winter) ---
    m = df["created"].dt.month
    y = df["created"].dt.year
    season = m.map(_season_from_month).fillna("Unknown")
    season_year = y + (m == 12).astype(int)
    df["season_label"] = (season + " " + season_year.astype(str)).astype("string[python]")

    # --- explode categories (no sharing → integer counts) ---
    cat_lists = (
        df["category"]
          .str.split(category_sep)
          .apply(lambda parts: list(dict.fromkeys([p.strip() for p in (parts or []) if p and p.strip()])))
    )
    df_exp = df.loc[cat_lists.index, ["country","season_label","quantity"]].copy()
    df_exp["category"] = cat_lists
    df_exp = df_exp.explode("category", ignore_index=True)
    df_exp["category"] = _norm(df_exp["category"])

    # --- aggregate & rank ---
    agg = (
        df_exp.groupby(["country","season_label","category"], dropna=False)["quantity"]
              .sum().reset_index(name="count")
    )
    agg["count"] = agg["count"].astype("int64")

    agg = agg.sort_values(["country","season_label","count"], ascending=[True, True, False])
    agg["rank"] = (
        agg.groupby(["country","season_label"])["count"]
           .rank(method="first", ascending=False).astype(int)
    )

    top = (
        agg[agg["rank"] <= keep_top_n]
        .sort_values(["country","season_label","rank"], ascending=[True, True, True])
        .reset_index(drop=True)
    )

    # final tidy schema
    return top[["country","season_label","category","count","rank"]]

# --- usage ---
items_path = OUTPUT_DIR / "order_items.parquet"
top_path   = OUTPUT_DIR / "top_categories_by_season.parquet"
df_items = pd.read_parquet(items_path)
df_top = build_top_categories_by_season(df_items)
df_top.to_parquet(top_path, index=False)
print(f"[done] wrote {top_path.resolve()}")

[done] wrote /workspace/data/parquet_out/top_categories_by_season.parquet


In [38]:
# %% [markdown]
# # Top Products by Season

import pandas as pd

def _season_from_month(m: int) -> str:
    # Winter: Dec–Feb, Spring: Mar–May, Summer: Jun–Aug, Autumn: Sep–Nov
    if m in (12, 1, 2):   return "Winter"
    if m in (3, 4, 5):    return "Spring"
    if m in (6, 7, 8):    return "Summer"
    return "Autumn"

def build_top_groupids_by_season(
    df_items: pd.DataFrame,
    keep_top_n: int = 10,
) -> pd.DataFrame:
    df = df_items.copy()

    # --- types & defaults ---
    df["created"]  = pd.to_datetime(df["created"], errors="coerce")
    df["quantity"] = pd.to_numeric(df.get("quantity"), errors="coerce").fillna(1).astype("int64")

    def _norm(s: pd.Series) -> pd.Series:
        s = s.astype("string[python]")
        s = s.where(~s.isna(), "Unknown")
        return s.str.strip().fillna("Unknown")

    df["country"] = _norm(df.get("country"))
    df["groupId"] = _norm(df.get("groupId"))

    # choose a product-name column if present; normalize
    if   "name" in df.columns:           base_name = df["name"]
    elif "productName" in df.columns:     base_name = df["productName"]
    elif "title" in df.columns:           base_name = df["title"]
    elif "product_name" in df.columns:    base_name = df["product_name"]
    else:                                 base_name = pd.Series(pd.NA, index=df.index)
    df["name"] = _norm(base_name)

    # --- season label (Dec → next winter) ---
    m = df["created"].dt.month
    y = df["created"].dt.year
    season = m.map(_season_from_month).fillna("Unknown")
    season_year = y + (m == 12).astype(int)
    df["season_label"] = (season + " " + season_year.astype(str)).astype("string[python]")

    # --- aggregate counts by groupId ---
    gid_agg = (
        df.groupby(["country","season_label","groupId"], dropna=False)["quantity"]
          .sum()
          .reset_index(name="count")
    )
    gid_agg["count"] = gid_agg["count"].astype("int64")

    # representative name per (country, season_label, groupId) by max quantity (ties -> alphabetical)
    rep_name = (
        df.groupby(["country","season_label","groupId","name"], dropna=False)["quantity"]
          .sum().reset_index(name="qty")
          .sort_values(["country","season_label","groupId","qty","name"],
                       ascending=[True, True, True, False, True])
          .drop_duplicates(["country","season_label","groupId"])
          .rename(columns={"name":"rep_name"})
          [["country","season_label","groupId","rep_name"]]
    )
    gid_agg = gid_agg.merge(rep_name, on=["country","season_label","groupId"], how="left")

    # --- rank within (country, season) and take top N ---
    gid_agg = gid_agg.sort_values(["country","season_label","count"], ascending=[True, True, False])
    gid_agg["rank"] = (
        gid_agg.groupby(["country","season_label"])["count"]
               .rank(method="first", ascending=False).astype(int)
    )
    top = (
        gid_agg[gid_agg["rank"] <= keep_top_n]
        .sort_values(["country","season_label","rank"])
        .rename(columns={"groupId":"value", "rep_name":"name"})
        .reset_index(drop=True)
    )

    # final schema
    return top[["country","season_label","value","name","count","rank"]]


# --- usage ---
items_path = OUTPUT_DIR / "order_items.parquet"
top_path   = OUTPUT_DIR / "top_groupids_by_season.parquet"
df_items = pd.read_parquet(items_path)
df_top = build_top_groupids_by_season(df_items)
df_top.to_parquet(top_path, index=False)
print(f"[done] wrote {top_path.resolve()}")



[done] wrote /workspace/data/parquet_out/top_groupids_by_season.parquet


In [None]:
# %% [markdown]
# # Top Repurchased Products by Country

import pandas as pd

def build_top_repurchase_groupids_by_country_unique_days(
    df_items: pd.DataFrame,
    unique_days_threshold: int = 4,   # "more than 4 different dates"
    keep_top_n: int = 10,
) -> pd.DataFrame:
    df = df_items.copy()

    # --- types & normalization ---
    df["created"] = pd.to_datetime(df.get("created"), errors="coerce")
    df = df.dropna(subset=["created"])  # need a date to test "different dates"
    df["purchase_date"] = df["created"].dt.date
    df["quantity"] = pd.to_numeric(df.get("quantity"), errors="coerce").fillna(1).astype("int64")

    def _norm(s: pd.Series) -> pd.Series:
        s = s.astype("string[python]")
        s = s.where(~s.isna(), "Unknown")
        return s.str.strip().fillna("Unknown")

    df["country"]     = _norm(df.get("country"))
    df["groupId"]     = _norm(df.get("groupId"))
    df["customer_id"] = _norm(df.get("customer_id"))

    # best-available product name column
    if   "name" in df.columns:          base_name = df["name"]
    elif "productName" in df.columns:   base_name = df["productName"]
    elif "title" in df.columns:         base_name = df["title"]
    elif "product_name" in df.columns:  base_name = df["product_name"]
    else:                               base_name = pd.Series(pd.NA, index=df.index)
    df["name"] = _norm(base_name)

    # ---------- per customer, per (country, groupId): count UNIQUE purchase dates ----------
    per_cust = (
        df.groupby(["country","groupId","customer_id"], dropna=False)
          .agg(unique_days=("purchase_date", pd.Series.nunique),
               qty=("quantity","sum"))
          .reset_index()
    )
    eligible = per_cust[per_cust["unique_days"] > unique_days_threshold].copy()

    # ---------- # of repurchasing customers per (country, groupId) ----------
    rep_counts = (
        eligible.groupby(["country","groupId"], dropna=False)["customer_id"]
                .nunique().reset_index(name="repurchasers")
    ).astype({"repurchasers":"int64"})

    # ---------- representative name per (country, groupId) ----------
    # Pick the name with the highest overall quantity (ties → alphabetical)
    name_weight = (
        df.groupby(["country","groupId","name"], dropna=False)["quantity"]
          .sum().reset_index(name="qty")
    )
    rep_name = (
        name_weight.sort_values(["country","groupId","qty","name"],
                                ascending=[True, True, False, True])
                   .drop_duplicates(["country","groupId"])
                   .rename(columns={"name":"rep_name"})
                   [["country","groupId","rep_name"]]
    )

    out = rep_counts.merge(rep_name, on=["country","groupId"], how="left")

    # ---------- rank within country & top-N ----------
    out = out.sort_values(["country","repurchasers"], ascending=[True, False])
    out["rank"] = out.groupby("country")["repurchasers"].rank(method="first", ascending=False).astype(int)

    top = (out[out["rank"] <= keep_top_n]
           .sort_values(["country","rank"])
           .rename(columns={"groupId":"value", "rep_name":"name"})
           .reset_index(drop=True))

    return top[["country","value","name","repurchasers","rank"]]


In [45]:
from pathlib import Path
import pandas as pd

items_path = Path("../data/parquet_out/order_items.parquet")
df_items = pd.read_parquet(items_path)

df_top_repurchase_country = build_top_repurchase_groupids_by_country(
    df_items,
    threshold=4,
    keep_top_n=10
)

out_path = Path("../data/parquet_out/top_repurchase_groupids_by_country.parquet")
df_top_repurchase_country.to_parquet(out_path, index=False)
print(f"[done] wrote {out_path.resolve()}")

# quick peek for Denmark
print(
    df_top_repurchase_country.query("country == 'Denmark'")
                             .sort_values("rank")[["value","name","repurchasers","rank"]]
                             .to_string(index=False)
)


[done] wrote /workspace/data/parquet_out/top_repurchase_groupids_by_country.parquet
 value                            name  repurchasers  rank
261637                 Ankelsokker VID            44     1
260695                 Seamless BH-top            38     2
260646               Dametrusser 3-pak            23     3
263988                Støtteknæstrømpe            17     4
261475                       Benklæder            15     5
218982                         T-shirt            14     6
260513                   BH uden bøjle            14     7
260596             BH uden bøjle Stars            14     8
210186                      Turtleneck            13     9
261610 Stretchtrusser 2-pak Basic Maxi            13    10


In [None]:
# %% [markdown]
# # Top Brands by Country

import pandas as pd

def build_top_brands_by_country(df: pd.DataFrame, keep_top_n: int = 10) -> pd.DataFrame:
    df = df.copy()
    df["quantity"] = pd.to_numeric(df["quantity"], errors="coerce").fillna(1).astype("int64")
    df["country"]  = df["country"].astype("string")
    df["brand"]    = df["brand"].astype("string").str.strip()

    # exclude Unknown/unknown/na/empty and NA values
    mask = df["brand"].notna() & ~df["brand"].str.lower().isin({"unknown", "na", ""})
    df = df[mask]

    agg = (df.groupby(["country","brand"], dropna=False)["quantity"]
             .sum().reset_index(name="count").astype({"count":"int64"}))
    agg = agg.sort_values(["country","count","brand"], ascending=[True, False, True])
    agg["rank"] = agg.groupby("country")["count"].rank(method="first", ascending=False).astype(int)

    return (agg[agg["rank"] <= keep_top_n]
              .sort_values(["country","rank"])
              .reset_index(drop=True)[["country","brand","count","rank"]])


In [53]:
df_top_brands = build_top_brands_by_country(df_items, keep_top_n=10)

out_path = Path("../data/parquet_out/top_brands_by_country.parquet")
df_top_brands.to_parquet(out_path, index=False)
print(f"[done] wrote {out_path.resolve()}")

# quick peek for Denmark
print(
    df_top_brands.query("country == 'Denmark'")
                    .sort_values("rank")[["brand","count","rank"]]
                    .to_string(index=False)
)


[done] wrote /workspace/data/parquet_out/top_brands_by_country.parquet
        brand  count  rank
       Åshild  15409     1
       Louise   6434     2
    Glamorise   2847     3
  Good Living   2485     4
    Miss Mary   2300     5
       Sloggi   1146     6
Locköstrumpan   1054     7
        Trofé    886     8
     Swegmark    786     9
    Funq Wear    753    10


Each customer is counted once, in a mutually exclusive bucket based on the time from their first purchase to their first return on a different date.

In [None]:
# %% [markdown]
# # Return Buckets

import pandas as pd

def bucket_return_days(d: int) -> str | pd.NA:
    if pd.isna(d) or d <= 0:          return pd.NA          # no return or same-day only
    if 1 <= d <= 7:                   return "week 1"
    if 8 <= d <= 14:                  return "week 2"
    if 15 <= d <= 21:                 return "week 3"
    if 22 <= d <= 30:                 return "1 month"
    # months as 30-day blocks up to 12 months
    for m in range(2, 13):            # 2..12 months
        lo, hi = 30*(m-1)+1, 30*m     # e.g., 31-60, 61-90, ...
        if lo <= d <= hi:             return f"{m} months"
    if d > 365:                       return "> 1 year"
    return pd.NA

def count_return_buckets(df_items: pd.DataFrame) -> pd.DataFrame:
    df = df_items.copy()
    df["created"] = pd.to_datetime(df["created"], errors="coerce")
    df = df.dropna(subset=["created", "customer_id"])
    df["purchase_date"] = df["created"].dt.date

    # unique purchase dates per customer
    uniq = df[["customer_id","purchase_date"]].drop_duplicates()

    # first purchase per customer
    first_date = uniq.groupby("customer_id")["purchase_date"].min().rename("first_date")

    # earliest purchase strictly AFTER first_date → first return date
    tmp = uniq.merge(first_date, on="customer_id")
    tmp = tmp[tmp["purchase_date"] > tmp["first_date"]]
    first_return = tmp.groupby("customer_id")["purchase_date"].min().rename("first_return_date")

    # join and compute delta days
    timeline = first_date.to_frame().merge(first_return, left_index=True, right_index=True, how="left")
    timeline["days_to_return"] = (pd.to_datetime(timeline["first_return_date"]) - 
                                  pd.to_datetime(timeline["first_date"])).dt.days

    # bucketize (mutually exclusive)
    timeline["bucket"] = timeline["days_to_return"].apply(bucket_return_days)

    # keep only customers who returned (bucket not NA)
    ret = timeline.dropna(subset=["bucket"])

    # counts by bucket
    counts = (ret.groupby("bucket").size().reset_index(name="customers")
                .sort_values("customers", ascending=False)
                .reset_index(drop=True))

    # optional: enforce a readable bucket order
    order = (["week 1","week 2","week 3","1 month"] +
             [f"{m} months" for m in range(2,13)] + ["> 1 year"])
    counts["order"] = counts["bucket"].map({b:i for i,b in enumerate(order)})
    counts = counts.sort_values("order").drop(columns="order").reset_index(drop=True)

    return counts


In [55]:
# df_items = pd.read_parquet("data/order_items.parquet")
df_buckets = count_return_buckets(df_items)
print(df_buckets)


       bucket  customers
0      week 1       1284
1      week 2       1715
2      week 3       1219
3     1 month       1262
4    2 months       3047
5    3 months       2444
6    4 months       2188
7    5 months       1769
8    6 months       1994
9    7 months       1640
10   8 months       1286
11   9 months        890
12  10 months        722
13  11 months        606
14  12 months        464
15   > 1 year        586


In [56]:
df_buckets.to_parquet("../data/parquet_out/return_buckets_overall.parquet", index=False)
