In [139]:
# %%  Low-CPU mode
#import os
#for k in ("ARROW_NUM_THREADS", "OMP_NUM_THREADS", "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS", "NUMEXPR_MAX_THREADS"):
#    os.environ.setdefault(k, "1")


In [1]:
import pandas as pd

tx = pd.read_parquet("../data/processed/transactions_clean.parquet")
#customers = pd.read_parquet("../data/processed/customers_clean.parquet")
articles  = pd.read_parquet("../data/processed/articles_clean.parquet")

In [2]:
articles.dtypes

sku                         object
groupId                     object
brandId             string[python]
brand_missing                 int8
name                        object
description         string[python]
brand               string[python]
color               string[python]
colorId             string[python]
size                string[python]
size_missing                  int8
sizeId              string[python]
audience            string[python]
audienceId          string[python]
audience_missing              int8
category            string[python]
category_missing              int8
categoryId          string[python]
priceSEK            string[python]
priceEUR            string[python]
priceNOK            string[python]
priceDKK            string[python]
quantity            string[python]
publishedDate       string[python]
forSale             string[python]
fabrics                     object
fabric_primary              object
fabrics_en                  object
dtype: object

In [40]:
tx.isna().sum()


orderId                0
shopUserId             0
created                0
currencyId             0
sku                    0
groupId                0
quantity               0
price                  0
name                   0
type                   0
invoiceCity            0
category               0
brand                  0
sek_rate               0
price_sek              0
Age               104713
Gender            104713
line_total_sek         0
country                0
dtype: int64

## Build json for mental mapping

In [4]:
# %% [markdown]
# # Setup & Constants
# Lightweight imports and shared constants.

# %%
import re, json
import numpy as np
import pandas as pd
from pathlib import Path

NORDICS = ["Sweden", "Denmark", "Finland", "Norway"]

In [5]:
# %% [markdown]
# # Generic helpers
# Small utilities reused across the pipeline.

# %%
def status_from_orders(n: int) -> str:
    return "new" if n <= 1 else ("returning" if n <= 3 else "loyal")

def mode_or_first(s: pd.Series):
    m = s.mode()
    if not m.empty:
        return m.iat[0]
    s = s.dropna()
    return s.iat[0] if not s.empty else None

def _pick(tx: pd.DataFrame, names, default=None) -> pd.Series:
    for n in names:
        if n in tx.columns:
            return tx[n]
    return pd.Series([default] * len(tx), index=tx.index)


In [6]:
# %% [markdown]
# # Normalization helpers
# Standardize ids and small categorical fields.

# %%
def _norm_id(s: pd.Series) -> pd.Series:
    # to string, strip, drop trailing ".0" if present (common after Parquet/float cast)
    s = s.astype("string[python]").str.strip()
    return s.str.replace(r"\.0+$", "", regex=True)

def _norm_gender(s: pd.Series):
    if s is None:
        return s
    return s.astype("string[python]").str.strip().str.lower()


In [7]:
# %% [markdown]
# # Tx base-frame builder
# Make a clean base transaction frame with unified schema.

# %%
def _build_base_tx(tx: pd.DataFrame) -> pd.DataFrame:
    country = _pick(tx, ["currency_country", "Country"], "Unknown").astype("string[python]").str.strip()
    country = country.replace({"": pd.NA}).fillna("Unknown")
    city = _pick(tx, ["invoiceCity", "city"], "Unknown").astype(object).fillna("Unknown")
    shop = _norm_id(tx["shopUserId"])
    order_id = tx["orderId"].astype("string[python]").str.strip()

    created_raw = tx["created"]
    created = created_raw if np.issubdtype(created_raw.dtype, np.datetime64) else pd.to_datetime(created_raw, errors="coerce")
    if isinstance(created, pd.Series) and np.issubdtype(created.dtype, np.datetime64):
        try:
            if getattr(created.dt, "tz", None) is not None:
                created = created.dt.tz_localize(None)
        except Exception:
            pass

    rev = pd.to_numeric(tx.get("line_total_sek"), errors="coerce").fillna(0)
    typ = tx["type"] if "type" in tx.columns else pd.Series([None] * len(tx), index=tx.index)
    price = tx["price"] if "price" in tx.columns else pd.Series([None] * len(tx), index=tx.index)

    return pd.DataFrame(
        {
            "country": country,
            "city": city,
            "shopUserId": shop,
            "orderId": order_id,
            "rev": rev,
            "created": created,
            "type": typ,
            "price": price,
        }
    )


In [8]:
# %% [markdown]
# # Tx base-frame builder
# Make a clean base transaction frame with unified schema.

# %%
def _build_base_tx(tx: pd.DataFrame) -> pd.DataFrame:
    # country & city (no defaulting if you know they exist)
    country = tx["country"].astype("string[python]").str.strip()
    city = _pick(tx, ["invoiceCity", "city"], "Unknown").astype(object)

    # ids
    shop = _norm_id(tx["shopUserId"])
    order_id = tx["orderId"].astype("string[python]").str.strip()

    # dates
    created_raw = tx["created"]
    created = created_raw if np.issubdtype(created_raw.dtype, np.datetime64) else pd.to_datetime(created_raw, errors="coerce")
    if isinstance(created, pd.Series) and np.issubdtype(created.dtype, np.datetime64):
        try:
            if getattr(created.dt, "tz", None) is not None:
                created = created.dt.tz_localize(None)
        except Exception:
            pass

    # money & misc
    rev = pd.to_numeric(tx.get("line_total_sek"), errors="coerce").fillna(0)
    typ = tx["type"] if "type" in tx.columns else pd.Series([None] * len(tx), index=tx.index)
    price = tx["price"] if "price" in tx.columns else pd.Series([None] * len(tx), index=tx.index)

    # NEW: carry Age / Gender from tx
    age = pd.to_numeric(tx.get("Age"), errors="coerce").astype("Float64") if "Age" in tx.columns else pd.Series([pd.NA] * len(tx), index=tx.index, dtype="Float64")
    gender = tx.get("Gender") if "Gender" in tx.columns else pd.Series([pd.NA] * len(tx), index=tx.index, dtype="object")

    return pd.DataFrame(
        {
            "country": country,
            "city": city,
            "shopUserId": shop,
            "orderId": order_id,
            "rev": rev,
            "created": created,
            "type": typ,
            "price": price,
            "Age": age,          # <- keep case exactly as used downstream
            "Gender": gender,    # <- keep case exactly as used downstream
        }
    )



In [9]:
# %% [markdown]
# # Customer normalization & collapse
# Normalize ids, pick Age/Gender variants, and collapse duplicates.

# %%
def _prep_customers(customers: pd.DataFrame) -> pd.DataFrame:
    c = customers.copy()
    c["shopUserId_norm"] = _norm_id(c["shopUserId"])
    c["Age"] = pd.to_numeric(_pick(c, ["Age", "age", "customer_age"]), errors="coerce").astype("Float64")
    c["Gender"] = _pick(c, ["Gender", "gender", "customer_gender"])
    c_agg = (
        c.groupby("shopUserId_norm", dropna=False)
        .agg(Age=("Age", mode_or_first), Gender=("Gender", mode_or_first))
        .reset_index()
    )
    return c_agg


In [10]:
# %% [markdown]
# # Customer normalization & collapse
# Normalize ids, pick Age/Gender variants, and collapse duplicates.

# %%
def _prep_customers(customers: pd.DataFrame) -> pd.DataFrame:
    c = customers.copy()
    c["shopUserId_norm"] = _norm_id(c["shopUserId"])
    c_agg = (
        c.groupby("shopUserId_norm", dropna=False)
        .agg(Age=("Age", mode_or_first), Gender=("Gender", mode_or_first))
        .reset_index()
    )
    return c_agg


In [11]:
NORDICS = ["Sweden", "Denmark", "Finland", "Norway"]

def split_nordics(tx: pd.DataFrame) -> dict:
    base_tx = _build_base_tx(tx)
    return {cname: base_tx[base_tx["country"] == cname].copy() for cname in NORDICS}

In [12]:
# %% [markdown]
# # Export helpers: totals & aggregates
# Small builders used by the JSON exporter.
# (UPDATED: added _agg_city_monthly)

# %%
def _country_totals(tx: pd.DataFrame):
    total_revenue = int(np.rint(tx["rev"].sum()))
    customers_cnt = int(tx["shopUserId"].nunique())
    total_orders = int(tx["orderId"].nunique())
    aov_country = None if total_orders == 0 else int(round(float(total_revenue) / total_orders))
    return total_revenue, customers_cnt, total_orders, aov_country

def _agg_city(tx: pd.DataFrame):
    agg_city = (
        tx.groupby("city", dropna=False, sort=False)
        .agg(total_revenue_sek=("rev", "sum"), customers_count=("shopUserId", "nunique"))
    )
    agg_city["total_revenue_sek"] = np.rint(agg_city["total_revenue_sek"]).astype("int64")
    agg_city["customers_count"] = agg_city["customers_count"].astype("int64")
    city_orders = tx.groupby("city", dropna=False)["orderId"].nunique().rename("total_orders").astype("int64")
    return agg_city, city_orders

def _agg_customer(tx: pd.DataFrame):
    tx_cust = tx[tx["shopUserId"].notna()].copy()
    agg_customer = (
        tx_cust.groupby(["city", "shopUserId"], dropna=False, sort=False)
        .agg(
            total_spent_sek=("rev", "sum"),
            total_orders=("orderId", "nunique"),
            first_order=("created", "min"),
            last_order=("created", "max"),
            age=("age", mode_or_first),
            gender=("gender", mode_or_first),
        )
    )
    agg_customer["total_spent_sek"] = np.rint(agg_customer["total_spent_sek"]).astype("int64")
    agg_customer = agg_customer.sort_index(level=["city", "shopUserId"])
    return agg_customer

def _agg_order(tx: pd.DataFrame):
    tx_cust = tx[tx["shopUserId"].notna()].copy()
    agg_order = (
        tx_cust.groupby(["city", "shopUserId", "orderId"], dropna=False, sort=False)
        .agg(
            order_total_sek=("rev", "sum"),
            n_items=("orderId", "size"),
            created=("created", "min"),
            order_type=("type", mode_or_first),
            price=("price", mode_or_first),
        )
    )
    agg_order["order_total_sek"] = np.rint(agg_order["order_total_sek"]).astype("int64")
    agg_order = agg_order.sort_index(level=["city", "shopUserId", "orderId"])
    return agg_order

def _agg_city_monthly(tx: pd.DataFrame) -> dict[str, dict[str, int]]:
    """
    NEW: sum revenue by city and calendar month (YYYY-MM).
    Returns {city -> { 'YYYY-MM': total_revenue_sek_int, ... }, ...}
    """
    txm = tx.copy()
    # Ensure naive, month string
    ym = txm["created"]
    if getattr(ym.dt, "tz", None) is not None:
        ym = ym.dt.tz_localize(None)
    txm["year_month"] = ym.dt.to_period("M").astype(str)

    g = (
        txm.groupby(["city", "year_month"], dropna=False)["rev"]
        .sum()
        .round()
        .astype("int64")
    )
    out: dict[str, dict[str, int]] = {}
    for (cty, ym), val in g.items():
        ckey = "Unknown" if pd.isna(cty) else str(cty)
        out.setdefault(ckey, {})[ym] = int(val)
    return out

def _customers_by_channel(tx: pd.DataFrame) -> dict[str, int]:
    """
    Count distinct customers per 'type' (channel).
    Assumes 'type' is always one of: 'telephone', 'web', or 'Email'.
    """
    if "type" not in tx.columns:
        return {}
    tmp = tx[["shopUserId", "type"]].dropna().copy()

    # No normalization needed; just use the type as channel
    tmp["channel"] = tmp["type"]

    # distinct customers per channel
    g = (
        tmp.dropna(subset=["channel", "shopUserId"])
           .groupby("channel")["shopUserId"]
           .nunique()
           .astype(int)
    )
    return {k: int(v) for k, v in g.items()}


In [13]:
def _agg_customer(tx: pd.DataFrame):
    tx_cust = tx[tx["shopUserId"].notna()].copy()
    agg_customer = (
        tx_cust.groupby(["city", "shopUserId"], dropna=False, sort=False)
        .agg(
            total_spent_sek=("rev", "sum"),
            total_orders=("orderId", "nunique"),
            first_order=("created", "min"),
            last_order=("created", "max"),
            age=("Age", mode_or_first),
            gender=("Gender", mode_or_first),
        )
    )
    agg_customer["total_spent_sek"] = np.rint(agg_customer["total_spent_sek"]).astype("int64")
    return agg_customer.sort_index(level=["city", "shopUserId"])

In [14]:
# %% [markdown]
# # Export helpers: items extraction
# Build items_tx from original tx aligned to country tx indices.

# %%
def _build_items_grouped(tx_country: pd.DataFrame, tx: pd.DataFrame, articles: pd.DataFrame | None = None):
    item_cols = ["sku","groupId","created","quantity","price_sek","name","line_total_sek","type","brand","category","price"]
    present   = [c for c in item_cols if c in tx.columns]

    items_tx = pd.DataFrame({"city": tx_country["city"], "shopUserId": tx_country["shopUserId"], "orderId": tx_country["orderId"]})
    for c in present:
        if c == "created":
            col_created = tx_country["created"]
            try:
                if getattr(col_created.dt, "tz", None) is not None:
                    col_created = col_created.dt.tz_localize(None)
            except Exception:
                pass
            items_tx[c] = col_created
        else:
            items_tx[c] = tx.loc[tx_country.index, c] if c in tx.columns else None

    # --- NEW: enrich brand/category from articles ---
    if articles is not None:
        art = articles.copy()

        # normalize join keys
        for key in ("sku", "groupId"):
            if key in art.columns:
                art[key] = art[key].astype("string[python]").str.strip()
        if "sku" in items_tx.columns:
            items_tx["sku"] = items_tx["sku"].astype("string[python]").str.strip()
        if "groupId" in items_tx.columns:
            items_tx["groupId"] = items_tx["groupId"].astype("string[python]").str.strip()

        # ensure target columns exist
        if "brand" not in items_tx.columns:
            items_tx["brand"] = pd.NA
        if "category" not in items_tx.columns:
            items_tx["category"] = pd.NA

        # maps by SKU (preferred)
        if "sku" in art.columns:
            if "brand" in art.columns:
                sku_brand = art.dropna(subset=["sku"]).drop_duplicates("sku").set_index("sku")["brand"]
                items_tx["brand"] = items_tx["brand"].fillna(items_tx.get("sku").map(sku_brand))
            if "category" in art.columns:
                sku_cat = art.dropna(subset=["sku"]).drop_duplicates("sku").set_index("sku")["category"]
                items_tx["category"] = items_tx["category"].fillna(items_tx.get("sku").map(sku_cat))

        # fallback maps by groupId
        if "groupId" in art.columns:
            if "brand" in art.columns:
                gid_brand = art.dropna(subset=["groupId"]).drop_duplicates("groupId").set_index("groupId")["brand"]
                items_tx["brand"] = items_tx["brand"].fillna(items_tx.get("groupId").map(gid_brand))
            if "category" in art.columns:
                gid_cat = art.dropna(subset=["groupId"]).drop_duplicates("groupId").set_index("groupId")["category"]
                items_tx["category"] = items_tx["category"].fillna(items_tx.get("groupId").map(gid_cat))

    items_tx = items_tx[tx_country["shopUserId"].notna()].copy()
    items_tx["city"] = items_tx["city"].fillna("Unknown")
    return items_tx.groupby(["city","shopUserId","orderId"], dropna=False, sort=False)


In [15]:
# %% [markdown]
# # Export helpers: JSON node builders
# Convert rows to JSON-friendly dicts.

# %%
def _item_dict(row: pd.Series):
    cr = row.get("created")
    if isinstance(cr, pd.Timestamp):
        cr = cr.isoformat(sep=" ")
    def nz(v): return None if pd.isna(v) else v
    def to_int(v): return None if pd.isna(v) else int(v)
    def to_float(v): return None if pd.isna(v) else float(v)
    return {
        "sku": nz(row.get("sku")),
        "groupId": nz(row.get("groupId")),
        "created": nz(cr),
        "quantity": to_int(row.get("quantity")),
        "price_sek": to_int(row.get("price_sek")),
        "name": nz(row.get("name")),
        "line_total_sek": to_int(row.get("line_total_sek")),
        "type": nz(row.get("type")),
        "brand": nz(row.get("brand")),
        "category": nz(row.get("category")),
        "price": to_float(row.get("price")),
    }


In [16]:
# %% [markdown]
# # Export: main function
# Assemble the JSON structure and write to disk.
# (UPDATED: inject per-city monthly revenue)

# %%
def export_country_json(tx_country: pd.DataFrame, tx_full: pd.DataFrame, country_name: str, out_dir="/workspace/data/processed", articles: pd.DataFrame | None = None):
    tx_c = tx_country.copy()
    tx_c["city"] = tx_c["city"].fillna("Unknown")

    total_revenue, customers_cnt, total_orders, aov_country = _country_totals(tx_c)
    agg_city, city_orders = _agg_city(tx_c)
    agg_customer = _agg_customer(tx_c)
    agg_order = _agg_order(tx_c)
    city_monthly_map = _agg_city_monthly(tx_c)
    customers_by_channel = _customers_by_channel(tx_c)

    items_grouped = _build_items_grouped(tx_c, tx_full, articles=articles)
 

    # ---------- build JSON ----------
    top_key = country_name.lower()
    result = {
        top_key: {
            "total_revenue_sek": int(total_revenue),
            "customers_count": int(customers_cnt),
            "total_orders": int(total_orders),
            "avg_order_value_sek": aov_country,
            "customers_by_channel": customers_by_channel,     # << NEW
            "cities": {},
        }
    }

    # cities
    for cty, row in agg_city.iterrows():
        ckey = "Unknown" if pd.isna(cty) else str(cty)
        orders_c = int(city_orders.get(cty, 0))
        rev_c = int(row["total_revenue_sek"])
        aov_c = None if orders_c == 0 else int(round(float(rev_c) / orders_c))

        result[top_key]["cities"][ckey] = {
            "total_revenue_sek": rev_c,
            "customers_count": int(row["customers_count"]),
            "total_orders": orders_c,
            "avg_order_value_sek": aov_c,
            # NEW: monthly breakdown here
            "monthly_revenue_sek": city_monthly_map.get(ckey, {}),
            "customers": {},
        }

    # customers + orders + items (unchanged)
    for (cty, uid), row in agg_customer.iterrows():
        status = status_from_orders(int(row["total_orders"]))
        first_iso = row["first_order"].isoformat(sep=" ") if pd.notna(row["first_order"]) else None
        last_iso = row["last_order"].isoformat(sep=" ") if pd.notna(row["last_order"]) else None

        age_val = None
        if "age" in row and pd.notna(row["age"]):
            try:
                age_val = int(row["age"])
            except Exception:
                age_val = None
        gender_val = None if "gender" not in row or pd.isna(row["gender"]) else str(row["gender"])

        cust_node = {
            "summary": {
                "total_orders": int(row["total_orders"]),
                "total_spent_sek": int(row["total_spent_sek"]),
                "first_order": first_iso,
                "last_order": last_iso,
                "status": status,
                "age": age_val,
                "gender": gender_val,
            },
            "orders": {},
        }

        try:
            cust_orders = agg_order.loc[(cty, uid)]
            if isinstance(cust_orders, pd.Series):
                cust_orders = cust_orders.to_frame().T
            for oid, orow in cust_orders.iterrows():
                try:
                    items_for_order = items_grouped.get_group((cty, uid, oid))
                    items = [_item_dict(r) for _, r in items_for_order.iterrows()]
                except KeyError:
                    items = []
                cust_node["orders"][str(oid)] = {
                    "created": orow["created"].isoformat(sep=" ") if pd.notna(orow["created"]) else None,
                    "order_total_sek": int(orow["order_total_sek"]),
                    "n_items": int(orow["n_items"]),
                    "order_type": None if pd.isna(orow["order_type"]) else orow["order_type"],
                    "price": None if pd.isna(orow.get("price")) else float(orow.get("price")),
                    "items": items,
                }
        except KeyError:
            pass

        ckey = "Unknown" if pd.isna(cty) else str(cty)
        result[top_key]["cities"][ckey]["customers"][str(uid)] = cust_node

    # write
    out_path = Path(out_dir) / f"{country_name}.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"Saved: {out_path}")


In [17]:
# %% [markdown]
# # Example usage
# Split once, export four times.

# %%
countries = split_nordics(tx)
export_country_json(countries["Sweden"],  tx, "Sweden", articles=articles)
export_country_json(countries["Denmark"], tx, "Denmark", articles=articles)
export_country_json(countries["Finland"], tx, "Finland", articles=articles)
export_country_json(countries["Norway"],  tx, "Norway",  articles=articles)


Saved: /workspace/data/processed/Sweden.json
Saved: /workspace/data/processed/Denmark.json
Saved: /workspace/data/processed/Finland.json
Saved: /workspace/data/processed/Norway.json


## Flatten json for quick math

In [18]:
# %% [markdown]
# # Imports & Futures

# %%
from __future__ import annotations
from pathlib import Path
import json
import pandas as pd


In [19]:
# %% [markdown]
# # Paths & Config

# %%
# ---- configure these paths ----
INPUT_DIR  = Path("../data/processed")     # Sweden.json, Denmark.json, Finland.json, Norway.json
OUTPUT_DIR = Path("../data/parquet_out")   # will contain the 5 Parquet files

COUNTRY_FILES = {
    "Sweden":  INPUT_DIR / "Sweden.json",
    "Denmark": INPUT_DIR / "Denmark.json",
    "Finland": INPUT_DIR / "Finland.json",
    "Norway":  INPUT_DIR / "Norway.json",
}


In [20]:
# %% [markdown]
# # Stable Column Schemas
# (UPDATED: added CS_COUNTRY_CHANNEL)

# %%
CS_COUNTRY = ["country","total_revenue_sek","customers_count","total_orders","avg_order_value_sek"]
CS_CITY    = ["country","city","total_revenue_sek","customers_count","total_orders","avg_order_value_sek"]

# Added age & gender here
CS_CUST    = ["country","city","customer_id","total_orders","total_spent_sek",
              "first_order","last_order","status","age","gender"]

CS_ORDERS  = ["country","city","customer_id","order_id","created","order_total_sek","n_items","order_type","price"]
CS_ITEMS   = ["country","city","customer_id","order_id","sku","groupId","created","quantity","price_sek","name","line_total_sek","type","brand","category","price"]

# monthly revenue per city (existing)
CS_CITY_MONTHLY = ["country","city","year_month","total_revenue_sek"]

# NEW: customers by channel per country
CS_COUNTRY_CHANNEL = ["country","channel","customers_count"]


In [21]:
# %% [markdown]
# # I/O Helpers

# %%
def load_json(path: Path) -> dict:
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

def save_parquet(tx: pd.DataFrame, path: Path) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    tx.to_parquet(path, index=False)


In [22]:
# %% [markdown]
# # DataFrame Utilities

# %%
def _ensure(tx: pd.DataFrame | None, cols: list[str]) -> pd.DataFrame:
    """Ensure columns exist, preserve dtypes, and reorder."""
    if tx is None or tx.empty:
        return pd.DataFrame({c: pd.Series([], dtype="object") for c in cols})[cols]
    for c in cols:
        if c not in tx.columns:
            tx[c] = pd.NA
    return tx[cols]


In [23]:
# %% [markdown]
# # JSON Shape Helpers

# %%
def _unwrap(obj: dict, hint: str) -> tuple[str, dict]:
    """
    Accept {"denmark": {...}} or {...}.
    Returns (country_name_capitalized, payload_dict).
    """
    if isinstance(obj, dict) and len(obj) == 1 and isinstance(next(iter(obj.values())), dict):
        k = next(iter(obj.keys()))
        return k.capitalize(), next(iter(obj.values()))
    if not isinstance(obj, dict):
        raise ValueError("Top-level JSON must be an object/dict")
    return hint, obj


In [25]:

# %% [markdown]
# # Flatten: Country → Row Buckets
# (UPDATED: collect "country_channels" from root.customers_by_channel)

# %%
def flatten_country(obj: dict, country_hint: str) -> dict[str, list[dict]]:
    country, root = _unwrap(obj, country_hint)
    out = {
        "country_summary": [{
            "country": country,
            "total_revenue_sek": root.get("total_revenue_sek"),
            "customers_count": root.get("customers_count"),
            "total_orders": root.get("total_orders"),
            "avg_order_value_sek": root.get("avg_order_value_sek"),
        }],
        "country_channels": [],   # NEW
        "city_summary": [],
        "city_monthly": [],
        "customer_summary": [],
        "orders": [],
        "order_items": [],
    }

    # NEW: expand customers_by_channel dict into rows
    for ch, cnt in (root.get("customers_by_channel") or {}).items():
        out["country_channels"].append({
            "country": country,
            "channel": ch,
            "customers_count": cnt,
        })


    for city, cnode in (root.get("cities") or {}).items():
        out["city_summary"].append({
            "country": country, "city": city,
            "total_revenue_sek": cnode.get("total_revenue_sek"),
            "customers_count": cnode.get("customers_count"),
            "total_orders": cnode.get("total_orders"),
            "avg_order_value_sek": cnode.get("avg_order_value_sek"),
        })

        # NEW: monthly map { 'YYYY-MM': revenue }
        for ym, rev in (cnode.get("monthly_revenue_sek") or {}).items():
            out["city_monthly"].append({
                "country": country,
                "city": city,
                "year_month": ym,
                "total_revenue_sek": rev,
            })

        for cust_id, cst in (cnode.get("customers") or {}).items():
            summ = cst.get("summary") or {}
            out["customer_summary"].append({
                "country": country,
                "city": city,
                "customer_id": cust_id,
                "total_orders": summ.get("total_orders"),
                "total_spent_sek": summ.get("total_spent_sek"),
                "first_order": summ.get("first_order"),
                "last_order": summ.get("last_order"),
                "status": summ.get("status"),
                "age": summ.get("age"),
                "gender": summ.get("gender"),
            })
            for order_id, ordn in (cst.get("orders") or {}).items():
                out["orders"].append({
                    "country": country, "city": city, "customer_id": cust_id, "order_id": order_id,
                    "created": ordn.get("created"),
                    "order_total_sek": ordn.get("order_total_sek"),
                    "n_items": ordn.get("n_items"),
                    "order_type": ordn.get("order_type"),
                    "price": ordn.get("price"),
                })
                for it in (ordn.get("items") or []):
                    out["order_items"].append({
                        "country": country, "city": city, "customer_id": cust_id, "order_id": order_id,
                        "sku": it.get("sku"), "groupId": it.get("groupId"), "created": it.get("created"),
                        "quantity": it.get("quantity"), "price_sek": it.get("price_sek"), "name": it.get("name"),
                        "line_total_sek": it.get("line_total_sek"), "type": it.get("type"),
                        "brand": it.get("brand"), "category": it.get("category"), "price": it.get("price"),
                    })
    return out


In [26]:
# %% [markdown]
# # Aggregation Orchestrator
# (UPDATED: include "city_monthly" bucket)

# %%
def collect_buckets(country_files: dict[str, Path]) -> dict[str, list[dict]]:
    buckets = {k: [] for k in [
        "country_summary","country_channels","city_summary","city_monthly","customer_summary","orders","order_items"
    ]}
    for name, path in country_files.items():
        if not path.exists():
            print(f"[warn] missing: {path}")
            continue
        rows = flatten_country(load_json(path), name)
        for k, v in rows.items():
            buckets[k].extend(v)
        print(f"[ok] parsed {name}")
    return buckets


In [27]:
# %% [markdown]
# # Materialize DataFrames (Fixed Schemas)
# (UPDATED: return tx_city_monthly)

# %%
def to_dataframes(buckets: dict[str, list[dict]]) -> dict[str, pd.DataFrame]:
    tx_country = _ensure(pd.DataFrame(buckets["country_summary"]),   CS_COUNTRY)
    tx_cc      = _ensure(pd.DataFrame(buckets["country_channels"]),  CS_COUNTRY_CHANNEL)  # NEW
    tx_city    = _ensure(pd.DataFrame(buckets["city_summary"]),      CS_CITY)
    tx_city_m  = _ensure(pd.DataFrame(buckets["city_monthly"]),      CS_CITY_MONTHLY)
    tx_cust    = _ensure(pd.DataFrame(buckets["customer_summary"]),  CS_CUST)
    tx_orders  = _ensure(pd.DataFrame(buckets["orders"]),            CS_ORDERS)
    tx_items   = _ensure(pd.DataFrame(buckets["order_items"]),       CS_ITEMS)
    return {
        "country_summary": tx_country,
        "country_channels": tx_cc,         # NEW
        "city_summary": tx_city,
        "city_monthly": tx_city_m,
        "customer_summary": tx_cust,
        "orders": tx_orders,
        "order_items": tx_items,
    }


In [28]:
# %% [markdown]
# # Persist to Parquet

# %%
def write_all_parquet(txs: dict[str, pd.DataFrame], out_dir: Path) -> None:
    save_parquet(txs["country_summary"],   out_dir / "country_summary.parquet")
    save_parquet(txs["country_channels"],  out_dir / "country_customers_by_channel.parquet")  # NEW
    save_parquet(txs["city_summary"],      out_dir / "city_summary.parquet")
    save_parquet(txs["city_monthly"],      out_dir / "city_monthly_revenue.parquet")
    save_parquet(txs["customer_summary"],  out_dir / "customer_summary.parquet")
    save_parquet(txs["orders"],            out_dir / "orders.parquet")
    save_parquet(txs["order_items"],       out_dir / "order_items.parquet")
    print(f"[done] wrote Parquet files to {out_dir.resolve()}")

In [29]:
# %% [markdown]
# # Main

# %%
def main():
    buckets = collect_buckets(COUNTRY_FILES)
    txs = to_dataframes(buckets)
    write_all_parquet(txs, OUTPUT_DIR)

if __name__ == "__main__":
    main()


[ok] parsed Sweden
[ok] parsed Denmark
[ok] parsed Finland
[ok] parsed Norway
[done] wrote Parquet files to /workspace/data/parquet_out


In [30]:
import pandas as pd
from pathlib import Path

OUTPUT_DIR = Path("/workspace/data/parquet_out/")

order_items_tx = pd.read_parquet(OUTPUT_DIR / "order_items.parquet")
order_items_tx.sample(10)


Unnamed: 0,country,city,customer_id,order_id,sku,groupId,created,quantity,price_sek,name,line_total_sek,type,brand,category,price
60194,Sweden,Mjölby,245562,256993,352787,352787,2024-08-15 10:01:34,1,69,Tvättpåsar 4 delar,69,web,Ateljé Margaretha,"Tvätt & skötsel,Hushåll övrigt,Vardagshjälpmedel",69.0
44193,Sweden,Kiruna,398285,320448,261953-3641,261953,2024-09-25 15:10:44,1,98,Damstrumpor 5-pack,98,telephone,unknown,"Strumpor,Underkläder",98.0
55527,Sweden,Varberg,866714,821949,270599-4042,270599,2025-09-03 22:14:22,1,239,V-ringat nattl. blå blom,239,web,unknown,"Nattlinnen,Sovkläder",239.0
150137,Finland,Helsinki,294289,203155,260365-0040,216401,2024-07-03 11:40:32,1,385,Alushousut Adamo Basic,385,web,Swegmark,"Trosor,Underkläder,Gördlar",34.9
53180,Sweden,Alingsås,284860,192613,263988-3839,263988,2024-06-24 17:06:11,4,158,Stödknästrumpor i nylon med spets,632,web,Funq Wear,"Stödstrumpor,Underkläder",158.4
163550,Finland,Lahti,250044,846293,241297-0054,241562,2025-09-19 10:02:21,1,573,Veluurihousut,573,web,Åshild,"Byxor,Mjukisbyxor,Nederdelar,Mysplagg",51.9
41967,Sweden,Brandbergen,841064,387266,200312,200312,2024-11-01 14:11:49,1,98,Mönsterstickad mössa,98,letter,Åshild,"Accessoarer,Kepsar & mössor",98.0
209014,Norway,Drammen,200475,767171,292789,292789,2025-07-18 10:35:02,1,75,Tubeklemme for metalltuber 2-pk,75,web,Good Living,"Kökshjälpmedel,Vardagshjälpmedel",79.0
208570,Norway,Fredrikstad,406281,329469,266650-C095,266643,2024-09-30 18:18:12,1,311,Bh uten bøyle Betty,311,web,Trofé,"Bh utan bygel,Bh,Underkläder",329.0
209998,Norway,Drammen,839616,348295,290211,290211,2024-10-11 11:17:36,1,131,Urinflaske unisex,131,telephone,Good Living,"Inkontinens,Badrum/WC",139.0


In [31]:
# %% [markdown]
# # Top Categories by Season

from pathlib import Path
import pandas as pd

def _season_from_month(m: int) -> str:
    # Winter: Dec–Feb, Spring: Mar–May, Summer: Jun–Aug, Autumn: Sep–Nov
    if m in (12, 1, 2):   return "Winter"
    if m in (3, 4, 5):    return "Spring"
    if m in (6, 7, 8):    return "Summer"
    return "Autumn"

def build_top_categories_by_season(
    tx_items: pd.DataFrame,
    category_sep: str = ",",
    keep_top_n: int = 10,
) -> pd.DataFrame:
    tx = tx_items.copy()

    # --- types & defaults ---
    tx["created"]  = pd.to_datetime(tx["created"], errors="coerce")
    tx["quantity"] = pd.to_numeric(tx.get("quantity"), errors="coerce").fillna(1).astype("int64")

    def _norm(s: pd.Series) -> pd.Series:
        s = s.astype("string[python]")
        s = s.where(~s.isna(), "Unknown")
        return s.str.strip().fillna("Unknown")

    tx["country"]  = _norm(tx.get("country"))
    tx["category"] = _norm(tx.get("category"))

    # --- season label (Winter labeled by December’s year: Dec–Feb) ---
    m = tx["created"].dt.month
    y = tx["created"].dt.year
    season = m.map(_season_from_month).fillna("Unknown")
    season_year = y - (m <= 2).astype(int)  # Jan–Feb belong to previous December’s year
    tx["season_label"] = (season + " " + season_year.astype(str)).astype("string[python]")

    # --- explode categories (no sharing → integer counts) ---
    cat_lists = (
        tx["category"]
          .str.split(category_sep)
          .apply(lambda parts: list(dict.fromkeys([p.strip() for p in (parts or []) if p and p.strip()])))
    )
    tx_exp = tx.loc[cat_lists.index, ["country","season_label","quantity"]].copy()
    tx_exp["category"] = cat_lists
    tx_exp = tx_exp.explode("category", ignore_index=True)
    tx_exp["category"] = _norm(tx_exp["category"])

    # --- aggregate & rank ---
    agg = (
        tx_exp.groupby(["country","season_label","category"], dropna=False)["quantity"]
              .sum().reset_index(name="count")
    )
    agg["count"] = agg["count"].astype("int64")

    agg = agg.sort_values(["country","season_label","count"], ascending=[True, True, False])
    agg["rank"] = (
        agg.groupby(["country","season_label"])["count"]
           .rank(method="first", ascending=False).astype(int)
    )

    top = (
        agg[agg["rank"] <= keep_top_n]
        .sort_values(["country","season_label","rank"], ascending=[True, True, True])
        .reset_index(drop=True)
    )

    # final tidy schema
    return top[["country","season_label","category","count","rank"]]

# --- usage ---
items_path = OUTPUT_DIR / "order_items.parquet"
top_path   = OUTPUT_DIR / "top_categories_by_season.parquet"
tx_items = pd.read_parquet(items_path)
tx_top = build_top_categories_by_season(tx_items)
tx_top.to_parquet(top_path, index=False)
print(f"[done] wrote {top_path.resolve()}")


[done] wrote /workspace/data/parquet_out/top_categories_by_season.parquet


In [32]:
# %% [markdown]
# # Top Products by Season

import pandas as pd

def _season_from_month(m: int) -> str:
    if m in (12, 1, 2):   return "Winter"
    if m in (3, 4, 5):    return "Spring"
    if m in (6, 7, 8):    return "Summer"
    return "Autumn"

def build_top_groupids_by_season(
    tx_items: pd.DataFrame,
    keep_top_n: int = 10,
) -> pd.DataFrame:
    tx = tx_items.copy()

    # --- types & defaults ---
    tx["created"]  = pd.to_datetime(tx["created"], errors="coerce")
    tx["quantity"] = pd.to_numeric(tx.get("quantity"), errors="coerce").fillna(1).astype("int64")

    def _norm(s: pd.Series) -> pd.Series:
        s = s.astype("string[python]")
        s = s.where(~s.isna(), "Unknown")
        return s.str.strip().fillna("Unknown")

    tx["country"] = _norm(tx.get("country"))
    tx["groupId"] = _norm(tx.get("groupId"))
    tx["brand"]   = _norm(tx.get("brand"))

    # choose a product-name column if present; normalize
    if   "name" in tx.columns:         base_name = tx["name"]
    elif "productName" in tx.columns:  base_name = tx["productName"]
    elif "title" in tx.columns:        base_name = tx["title"]
    elif "product_name" in tx.columns: base_name = tx["product_name"]
    else:                              base_name = pd.Series(pd.NA, index=tx.index)
    tx["name"] = _norm(base_name)

    # --- season label (Winter labeled by December’s year: Dec–Feb) ---
    m = tx["created"].dt.month
    y = tx["created"].dt.year
    season = m.map(_season_from_month).fillna("Unknown")
    season_year = y - (m <= 2).astype(int)  # Jan–Feb belong to previous December’s year
    tx["season_label"] = (season + " " + season_year.astype(str)).astype("string[python]")

    # --- aggregate counts by groupId ---
    gid_agg = (
        tx.groupby(["country","season_label","groupId"], dropna=False)["quantity"]
          .sum()
          .reset_index(name="count")
    )
    gid_agg["count"] = gid_agg["count"].astype("int64")

    # representative (name, brand) per (country, season_label, groupId)
    rep_meta = (
        tx.groupby(["country","season_label","groupId","name","brand"], dropna=False)["quantity"]
          .sum().reset_index(name="qty")
          .sort_values(["country","season_label","groupId","qty","name","brand"],
                       ascending=[True, True, True, False, True, True])
          .drop_duplicates(["country","season_label","groupId"])
          .rename(columns={"name":"rep_name","brand":"rep_brand"})
          [["country","season_label","groupId","rep_name","rep_brand"]]
    )
    gid_agg = gid_agg.merge(rep_meta, on=["country","season_label","groupId"], how="left")

    # --- rank within (country, season) and take top N ---
    gid_agg = gid_agg.sort_values(["country","season_label","count"], ascending=[True, True, False])
    gid_agg["rank"] = (
        gid_agg.groupby(["country","season_label"])["count"]
               .rank(method="first", ascending=False).astype(int)
    )
    top = (
        gid_agg[gid_agg["rank"] <= keep_top_n]
        .sort_values(["country","season_label","rank"])
        .rename(columns={"groupId":"value", "rep_name":"name", "rep_brand":"brand"})
        .reset_index(drop=True)
    )

    # final schema (now includes brand)
    return top[["country","season_label","value","name","brand","count","rank"]]

# --- usage ---
items_path = OUTPUT_DIR / "order_items.parquet"
top_path   = OUTPUT_DIR / "top_groupids_by_season.parquet"
tx_items = pd.read_parquet(items_path)
tx_top = build_top_groupids_by_season(tx_items)
tx_top.to_parquet(top_path, index=False)
print(f"[done] wrote {top_path.resolve()}")


[done] wrote /workspace/data/parquet_out/top_groupids_by_season.parquet


In [33]:
# %% [markdown]
# # Top Repurchased Products by Country

import pandas as pd

def build_top_repurchase_groupids_by_country_unique_days(
    tx_items: pd.DataFrame,
    unique_days_threshold: int = 1,   # "more than 2 different dates"
    keep_top_n: int = 10,
) -> pd.DataFrame:
    tx = tx_items.copy()

    # --- types & normalization ---
    tx["created"] = pd.to_datetime(tx.get("created"), errors="coerce")
    tx = tx.dropna(subset=["created"])
    tx["purchase_date"] = tx["created"].dt.normalize()  # keeps datetime64[ns]

    # quantity: safe default = 1 if column missing
    if "quantity" in tx.columns:
        q = pd.to_numeric(tx["quantity"], errors="coerce")
        tx["quantity"] = q.fillna(1).astype("int64")
    else:
        tx["quantity"] = 1

    def _safe_series(colname: str) -> pd.Series:
        return tx[colname] if colname in tx.columns else pd.Series(pd.NA, index=tx.index)

    def _norm(s: pd.Series) -> pd.Series:
        s = s.astype("string[python]")
        s = s.where(~s.isna(), "Unknown")
        return s.str.strip().fillna("Unknown")

    tx["country"]     = _norm(_safe_series("country"))
    tx["groupId"]     = _norm(_safe_series("groupId"))
    tx["customer_id"] = _norm(_safe_series("customer_id"))
    tx["brand"]       = _norm(_safe_series("brand"))

    # best-available product name column
    for cand in ["name","productName","title","product_name"]:
        if cand in tx.columns:
            base_name = tx[cand]
            break
    else:
        base_name = pd.Series(pd.NA, index=tx.index)
    tx["name"] = _norm(base_name)

    # ---------- per customer, per (country, groupId): count UNIQUE purchase dates ----------
    per_cust = (
        tx.groupby(["country","groupId","customer_id"], dropna=False)
          .agg(unique_days=("purchase_date", pd.Series.nunique))
          .reset_index()
    )
    eligible = per_cust[per_cust["unique_days"] > unique_days_threshold].copy()

    # ---------- # of repurchasing customers per (country, groupId) ----------
    rep_counts = (
        eligible.groupby(["country","groupId"], dropna=False)["customer_id"]
                .nunique().reset_index(name="repurchasers")
        .astype({"repurchasers":"int64"})
    )

    # ---------- representative (name, brand) computed among repurchasers only ----------
    rep_keys = eligible[["country","groupId","customer_id"]]
    tx_rep = tx.merge(rep_keys, on=["country","groupId","customer_id"], how="inner")

    name_weight = (
        tx_rep.groupby(["country","groupId","name","brand"], dropna=False)["quantity"]
              .sum().reset_index(name="qty")
    )
    # pick (name, brand) with highest qty; tie-break by name, brand for determinism
    rep_meta = (
        name_weight.sort_values(["country","groupId","qty","name","brand"],
                                ascending=[True, True, False, True, True])
                   .drop_duplicates(["country","groupId"])
                   .rename(columns={"name":"rep_name", "brand":"rep_brand"})
                   [["country","groupId","rep_name","rep_brand"]]
    )

    out = rep_counts.merge(rep_meta, on=["country","groupId"], how="left")

    # ---------- rank within country & top-N ----------
    out = out.sort_values(["country","repurchasers"], ascending=[True, False])
    out["rank"] = out.groupby("country")["repurchasers"].rank(method="first", ascending=False).astype(int)

    top = (out[out["rank"] <= keep_top_n]
           .sort_values(["country","rank"])
           .rename(columns={"groupId":"value", "rep_name":"name", "rep_brand":"brand"})
           .reset_index(drop=True))

    return top[["country","value","name","brand","repurchasers","rank"]]




In [34]:
from pathlib import Path
import pandas as pd

items_path = Path("../data/parquet_out/order_items.parquet")
tx_items = pd.read_parquet(items_path)

tx_top_repurchase_country = build_top_repurchase_groupids_by_country_unique_days(
    tx_items,
)


out_path = Path("../data/parquet_out/top_repurchase_groupids_by_country.parquet")
tx_top_repurchase_country.to_parquet(out_path, index=False)
print(f"[done] wrote {out_path.resolve()}")

# quick peek for Denmark
print(
    tx_top_repurchase_country.query("country == 'Denmark'")
                             .sort_values("rank")[["value","name", "brand", "repurchasers","rank"]]
                             .to_string(index=False)
)


[done] wrote /workspace/data/parquet_out/top_repurchase_groupids_by_country.parquet
 value                                                         name         brand  repurchasers  rank
260695                                              Seamless BH-top        Louise            75     1
260646                                            Dametrusser 3-pak        Åshild            55     2
241562                                                 Velourbukser        Åshild            54     3
240187                                                  Fritidsbuks        Åshild            53     4
260513                                                BH uden bøjle     Glamorise            52     5
261637                                              Ankelsokker VID Locköstrumpan            47     6
210338                                                T-shirt 2-pak        Åshild            42     7
265843 Bomulds-BH uden bøjle med Magic Lift-function Cotton Support     Glamorise            41     

In [35]:
# %% [markdown]
# # Top Brands by Country

import pandas as pd

def build_top_brands_by_country(tx: pd.DataFrame, keep_top_n: int = 10) -> pd.DataFrame:
    tx = tx.copy()
    tx["quantity"] = pd.to_numeric(tx["quantity"], errors="coerce").fillna(1).astype("int64")
    tx["country"]  = tx["country"].astype("string")
    tx["brand"]    = tx["brand"].astype("string").str.strip()

    # exclude Unknown/unknown/na/empty and NA values
    mask = tx["brand"].notna() & ~tx["brand"].str.lower().isin({"unknown", "na", ""})
    tx = tx[mask]

    agg = (tx.groupby(["country","brand"], dropna=False)["quantity"]
             .sum().reset_index(name="count").astype({"count":"int64"}))
    agg = agg.sort_values(["country","count","brand"], ascending=[True, False, True])
    agg["rank"] = agg.groupby("country")["count"].rank(method="first", ascending=False).astype(int)

    return (agg[agg["rank"] <= keep_top_n]
              .sort_values(["country","rank"])
              .reset_index(drop=True)[["country","brand","count","rank"]])


In [36]:
tx_top_brands = build_top_brands_by_country(tx_items, keep_top_n=10)

out_path = Path("../data/parquet_out/top_brands_by_country.parquet")
tx_top_brands.to_parquet(out_path, index=False)
print(f"[done] wrote {out_path.resolve()}")

# quick peek for Denmark
print(
    tx_top_brands.query("country == 'Denmark'")
                    .sort_values("rank")[["brand","count","rank"]]
                    .to_string(index=False)
)


[done] wrote /workspace/data/parquet_out/top_brands_by_country.parquet
        brand  count  rank
       Åshild  15409     1
       Louise   6434     2
    Glamorise   2847     3
  Good Living   2485     4
    Miss Mary   2300     5
       Sloggi   1146     6
Locköstrumpan   1054     7
        Trofé    886     8
     Swegmark    786     9
    Funq Wear    753    10


Each customer is counted once, in a mutually exclusive bucket based on the time from their first purchase to their first return on a different date.

In [37]:
# %% [markdown]
# # Return Buckets

import pandas as pd

def bucket_return_days(d: int) -> str | pd.NA:
    if pd.isna(d) or d <= 0:          return pd.NA          # no return or same-day only
    if 1 <= d <= 7:                   return "week 1"
    if 8 <= d <= 14:                  return "week 2"
    if 15 <= d <= 21:                 return "week 3"
    if 22 <= d <= 30:                 return "1 month"
    # months as 30-day blocks up to 12 months
    for m in range(2, 13):            # 2..12 months
        lo, hi = 30*(m-1)+1, 30*m     # e.g., 31-60, 61-90, ...
        if lo <= d <= hi:             return f"{m} months"
    if d > 365:                       return "> 1 year"
    return pd.NA

def count_return_buckets(tx_items: pd.DataFrame) -> pd.DataFrame:
    tx = tx_items.copy()
    tx["created"] = pd.to_datetime(tx["created"], errors="coerce")
    tx = tx.dropna(subset=["created", "customer_id"])
    tx["purchase_date"] = tx["created"].dt.date

    # unique purchase dates per customer
    uniq = tx[["customer_id","purchase_date"]].drop_duplicates()

    # first purchase per customer
    first_date = uniq.groupby("customer_id")["purchase_date"].min().rename("first_date")

    # earliest purchase strictly AFTER first_date → first return date
    tmp = uniq.merge(first_date, on="customer_id")
    tmp = tmp[tmp["purchase_date"] > tmp["first_date"]]
    first_return = tmp.groupby("customer_id")["purchase_date"].min().rename("first_return_date")

    # join and compute delta days
    timeline = first_date.to_frame().merge(first_return, left_index=True, right_index=True, how="left")
    timeline["days_to_return"] = (pd.to_datetime(timeline["first_return_date"]) - 
                                  pd.to_datetime(timeline["first_date"])).dt.days

    # bucketize (mutually exclusive)
    timeline["bucket"] = timeline["days_to_return"].apply(bucket_return_days)

    # keep only customers who returned (bucket not NA)
    ret = timeline.dropna(subset=["bucket"])

    # counts by bucket
    counts = (ret.groupby("bucket").size().reset_index(name="customers")
                .sort_values("customers", ascending=False)
                .reset_index(drop=True))

    # optional: enforce a readable bucket order
    order = (["week 1","week 2","week 3","1 month"] +
             [f"{m} months" for m in range(2,13)] + ["> 1 year"])
    counts["order"] = counts["bucket"].map({b:i for i,b in enumerate(order)})
    counts = counts.sort_values("order").drop(columns="order").reset_index(drop=True)

    return counts


In [38]:
# tx_items = pd.read_parquet("data/order_items.parquet")
tx_buckets = count_return_buckets(tx_items)
print(tx_buckets)


       bucket  customers
0      week 1       1284
1      week 2       1714
2      week 3       1219
3     1 month       1262
4    2 months       3047
5    3 months       2444
6    4 months       2188
7    5 months       1769
8    6 months       1994
9    7 months       1640
10   8 months       1285
11   9 months        890
12  10 months        722
13  11 months        606
14  12 months        464
15   > 1 year        586


In [39]:
tx_buckets.to_parquet("../data/parquet_out/return_buckets_overall.parquet", index=False)
