In [45]:
from pandas import read_csv

tx = read_csv("../data/processed/transactions_clean.csv")

In [46]:
tx.dtypes

orderId               int64
shopUserId            int64
created              object
currencyId            int64
sku                  object
quantity            float64
price               float64
name                 object
type                 object
groupId             float64
currency_country     object
sek_rate            float64
price_sek             int64
Age                 float64
line_total_sek      float64
dtype: object

In [47]:
import pandas as pd

# --- Read inputs without forcing dtype=str ---
customers = pd.read_csv("../data/processed/customers_clean.csv")
articles  = pd.read_csv("../data/processed/articles_clean.csv")


  articles  = pd.read_csv("../data/processed/articles_clean.csv")


In [48]:
customers.dtypes

shopUserId            int64
invoiceFirstName     object
invoiceLastName      object
invoiceSSN           object
invoiceZip            int64
invoiceCity          object
invoiceCountryId      int64
invoiceEmail         object
Gender               object
Age                 float64
Country              object
dtype: object

In [49]:
articles.dtypes

sku                  object
groupId              object
brandId             float64
brand_missing         int64
name                 object
description          object
brand                object
color                object
colorId              object
size                 object
size_missing          int64
sizeId               object
audience             object
audienceId          float64
audience_missing      int64
category             object
category_missing      int64
categoryId           object
priceSEK            float64
priceEUR            float64
priceNOK            float64
priceDKK            float64
quantity              int64
publishedDate        object
forSale               int64
fabrics              object
fabric_primary       object
fabrics_en           object
dtype: object

In [50]:
# --- Normalize join keys to the same type: pandas nullable Int64 ---
# shopUserId
tx["shopUserId_key"]         = pd.to_numeric(tx["shopUserId"], errors="coerce").astype("Int64")
customers["shopUserId_key"]  = pd.to_numeric(customers["shopUserId"], errors="coerce").astype("Int64")

# groupId
tx["groupId_key"]            = pd.to_numeric(tx["groupId"], errors="coerce").astype("Int64")
articles["groupId_key"]      = pd.to_numeric(articles["groupId"], errors="coerce").astype("Int64")

# --- Customers merge (m:1 to avoid accidental duplication) ---
tx = tx.merge(
    customers[["shopUserId_key", "invoiceCity", "Gender"]].drop_duplicates("shopUserId_key"),
    on="shopUserId_key",
    how="left",
    validate="m:1"
)

# --- Articles merge (category, brand) ---
cols_articles = ["groupId_key"] + [c for c in ["category", "brand"] if c in articles.columns]
tx = tx.merge(
    articles[cols_articles].drop_duplicates("groupId_key"),
    on="groupId_key",
    how="left",
    validate="m:1"
)
tx = tx.drop(columns=["shopUserId_key", "groupId_key"])


In [51]:
tx.isna().mean()

orderId             0.000000
shopUserId          0.000000
created             0.000000
currencyId          0.000000
sku                 0.000000
quantity            0.000000
price               0.000000
name                0.000000
type                0.000000
groupId             0.275339
currency_country    0.000000
sek_rate            0.000000
price_sek           0.000000
Age                 0.365384
line_total_sek      0.000000
invoiceCity         0.046723
Gender              0.365384
category            0.000000
brand               0.000000
dtype: float64

In [55]:
tx.dtypes

orderId               int64
shopUserId            int64
created              object
currencyId            int64
sku                  object
quantity            float64
price               float64
name                 object
type                 object
groupId             float64
currency_country     object
sek_rate            float64
price_sek             int64
Age                 float64
line_total_sek      float64
invoiceCity          object
Gender               object
category             object
brand                object
dtype: object

In [72]:
import pandas as pd
import numpy as np

def normalize_tx_fast(tx: pd.DataFrame) -> pd.DataFrame:
    df = tx.copy()

    # 0) sensible defaults first
    df = df.convert_dtypes()  # -> StringDtype/Int64/Float64/boolean where possible

    # 1) datetimes
    if "created" in df.columns:
        df["created"] = pd.to_datetime(df["created"], errors="coerce")

    # 2) id-like strings (strip spaces, turn "" -> <NA>, drop trailing ".0")
    id_cols = ["orderId","shopUserId","currencyId","sku","name","type",
               "groupId","currency_country","invoiceCity","Gender","category","brand"]
    id_present = [c for c in id_cols if c in df.columns]
    for c in id_present:
        s = df[c].astype("string[python]").str.strip().replace({"": pd.NA})
        # remove only a terminal ".0" (keeps real dotted IDs)
        s = s.str.replace(r"\.0$", "", regex=True)
        df[c] = s

    # 3) numeric columns (set type explicitly only for the ones you care about)
    ints   = ["quantity","price_sek","line_total_sek", "Age", "price"]
    floats = ["sek_rate"]
    for c in [x for x in ints if x in df.columns]:
        df[c] = pd.to_numeric(df[c], errors="coerce").round(0).astype("Int64")
    for c in [x for x in floats if x in df.columns]:
        df[c] = pd.to_numeric(df[c], errors="coerce").astype("Float64")

    return df

# use it
tx = normalize_tx_fast(tx)


In [86]:
# === Cell 1: helpers + split ===
import re, json
import numpy as np
import pandas as pd
from pathlib import Path

NORDICS = ["Sweden", "Denmark", "Finland", "Norway"]

def normalize_city(x):
    if x is None or pd.isna(x) or (isinstance(x, str) and x.strip() == ""):
        return "Unknown"
    s = str(x)
    s = re.sub(r"\s+", " ", s).strip()
    s = re.sub(r"^(?:[A-Z]{1,3}[-\s])?\d{3,6}\s*", "", s, flags=re.IGNORECASE)
    s = re.sub(r"(?:,\s*)?(Denmark|Danmark|Sweden|Sverige|Norway|Norge|Finland|Suomi)\s*$", "", s, flags=re.IGNORECASE).strip()
    return "Unknown" if s in {"", "<NA>", "nan", "NaN"} else s

def status_from_orders(n):
    return "new" if n <= 1 else ("returning" if n <= 3 else "loyal")

def mode_or_first(s: pd.Series):
    m = s.mode()
    if not m.empty: return m.iat[0]
    s = s.dropna()
    return s.iat[0] if not s.empty else None

def split_nordics(tx: pd.DataFrame) -> dict:
    """Return dict of per-country DataFrames: {'Sweden': df_se, 'Denmark': df_dk, ...}"""
    # compact working frame from tx (shared for all countries)
    country = tx["currency_country"].astype("string[python]").str.strip().replace({"": pd.NA}).fillna("Unknown")
    city    = tx["invoiceCity"].map(normalize_city) if "invoiceCity" in tx.columns else pd.Series(["Unknown"]*len(tx), index=tx.index)
    city    = city.fillna("Unknown")
    shop    = tx["shopUserId"].astype("string[python]").str.strip().replace({"": pd.NA})
    order_id= tx["orderId"].astype("string[python]").str.strip()

    created = tx["created"] if np.issubdtype(tx["created"].dtype, np.datetime64) \
             else pd.to_datetime(tx["created"], errors="coerce")
    if isinstance(created, pd.Series) and np.issubdtype(created.dtype, np.datetime64):
        # drop tz if present
        try:
            if getattr(created.dt, "tz", None) is not None:
                created = created.dt.tz_localize(None)
        except Exception:
            pass

    rev   = pd.to_numeric(tx.get("line_total_sek"), errors="coerce").fillna(0)
    typ   = tx["type"] if "type" in tx.columns else pd.Series([None]*len(tx), index=tx.index)
    price = tx["price"] if "price" in tx.columns else pd.Series([None]*len(tx), index=tx.index)

    base_df = pd.DataFrame({
        "country": country,
        "city": city,
        "shopUserId": shop,
        "orderId": order_id,
        "rev": rev,
        "created": created,
        "type": typ,
        "price": price
    })

    # return filtered copies
    return {c: base_df[base_df["country"] == c].copy() for c in NORDICS}


In [89]:
# === Cell 2: pipeline for ONE country df; call this 4x (Sweden, Denmark, Finland, Norway) ===

def export_country_json(df_country: pd.DataFrame, tx: pd.DataFrame, country_name: str, out_dir="/workspace/data/processed"):
    df = df_country.copy()
    df["city"] = df["city"].fillna("Unknown")

    # --- totals (country) ---
    total_revenue   = int(np.rint(df["rev"].sum()))
    customers_cnt   = int(df["shopUserId"].nunique())
    total_orders    = int(df["orderId"].nunique())
    aov_country     = None if total_orders == 0 else int(round(float(total_revenue) / total_orders))

    # --- city aggregates (+ orders per city for AOV) ---
    agg_city = (
        df.groupby("city", dropna=False, sort=False)
          .agg(total_revenue_sek=("rev","sum"), customers_count=("shopUserId","nunique"))
    )
    agg_city["total_revenue_sek"] = np.rint(agg_city["total_revenue_sek"]).astype("int64")
    agg_city["customers_count"]   = agg_city["customers_count"].astype("int64")

    city_orders = (
        df.groupby("city", dropna=False)["orderId"]
          .nunique()
          .rename("total_orders")
          .astype("int64")
    )

    # --- customer summaries ---
    df_cust = df[df["shopUserId"].notna()].copy()
    agg_customer = (
        df_cust.groupby(["city","shopUserId"], dropna=False, sort=False)
              .agg(total_spent_sek=("rev","sum"),
                   total_orders=("orderId","nunique"),
                   first_order=("created","min"),
                   last_order=("created","max"))
    )
    agg_customer["total_spent_sek"] = np.rint(agg_customer["total_spent_sek"]).astype("int64")
    agg_customer = agg_customer.sort_index(level=["city","shopUserId"])

    # --- per-order (kept for downstream building) ---
    agg_order = (
        df_cust.groupby(["city","shopUserId","orderId"], dropna=False, sort=False)
              .agg(order_total_sek=("rev","sum"),
                   n_items=("orderId","size"),
                   created=("created","min"),
                   order_type=("type", mode_or_first),
                   price=("price", mode_or_first))
    )
    agg_order["order_total_sek"] = np.rint(agg_order["order_total_sek"]).astype("int64")
    agg_order = agg_order.sort_index(level=["city","shopUserId","orderId"])

    # --- items (brand/category/price on item level) ---
    item_cols = ["sku","groupId","created","quantity","price_sek","name","line_total_sek","type","brand","category","price"]
    present   = [c for c in item_cols if c in tx.columns]

    items_df = pd.DataFrame({"city": df_country["city"], "shopUserId": df_country["shopUserId"], "orderId": df_country["orderId"]})
    for c in present:
        if c == "created":
            col_created = df_country["created"]
            try:
                if getattr(col_created.dt, "tz", None) is not None:
                    col_created = col_created.dt.tz_localize(None)
            except Exception:
                pass
            items_df[c] = col_created
        else:
            items_df[c] = tx.loc[df_country.index, c] if c in tx.columns else None

    items_df = items_df[df_country["shopUserId"].notna()].copy()
    items_df["city"] = items_df["city"].fillna("Unknown")
    items_grouped = items_df.groupby(["city","shopUserId","orderId"], dropna=False, sort=False)

    # ---------- build JSON ----------
    top_key = country_name.lower()
    result = {
        top_key: {
            "total_revenue_sek": int(total_revenue),
            "customers_count": int(customers_cnt),
            "total_orders": int(total_orders),
            "avg_order_value_sek": aov_country,  # int or None
            "cities": {}
        }
    }

    # cities (include total_orders + AOV per city)
    for cty, row in agg_city.iterrows():
        ckey = "Unknown" if pd.isna(cty) else str(cty)
        orders_c = int(city_orders.get(cty, 0))
        rev_c    = int(row["total_revenue_sek"])
        aov_c    = None if orders_c == 0 else int(round(float(rev_c) / orders_c))

        result[top_key]["cities"][ckey] = {
            "total_revenue_sek": rev_c,
            "customers_count": int(row["customers_count"]),
            "total_orders": orders_c,
            "avg_order_value_sek": aov_c,
            "customers": {}
        }

    # item dict
    def _item_dict(row: pd.Series):
        cr = row.get("created")
        if isinstance(cr, pd.Timestamp): cr = cr.isoformat(sep=" ")
        def nz(v): return None if pd.isna(v) else v
        def to_int(v): return None if pd.isna(v) else int(v)
        def to_float(v): return None if pd.isna(v) else float(v)
        return {
            "sku": nz(row.get("sku")),
            "groupId": nz(row.get("groupId")),
            "created": nz(cr),
            "quantity": to_int(row.get("quantity")),
            "price_sek": to_int(row.get("price_sek")),
            "name": nz(row.get("name")),
            "line_total_sek": to_int(row.get("line_total_sek")),
            "type": nz(row.get("type")),
            "brand": nz(row.get("brand")),
            "category": nz(row.get("category")),
            "price": to_float(row.get("price")),
        }

    # customers + orders + items
    for (cty, uid), row in agg_customer.iterrows():
        status = status_from_orders(int(row["total_orders"]))
        first_iso = row["first_order"].isoformat(sep=" ") if pd.notna(row["first_order"]) else None
        last_iso  = row["last_order"].isoformat(sep=" ")  if pd.notna(row["last_order"])  else None

        cust_node = {
            "summary": {
                "total_orders": int(row["total_orders"]),
                "total_spent_sek": int(row["total_spent_sek"]),
                "first_order": first_iso,
                "last_order": last_iso,
                "status": status,
            },
            "orders": {}
        }

        try:
            cust_orders = agg_order.loc[(cty, uid)]
            if isinstance(cust_orders, pd.Series):
                cust_orders = cust_orders.to_frame().T
            for oid, orow in cust_orders.iterrows():
                try:
                    items_for_order = items_grouped.get_group((cty, uid, oid))
                    items = [_item_dict(r) for _, r in items_for_order.iterrows()]
                except KeyError:
                    items = []
                cust_node["orders"][str(oid)] = {
                    "created": orow["created"].isoformat(sep=" ") if pd.notna(orow["created"]) else None,
                    "order_total_sek": int(orow["order_total_sek"]),
                    "n_items": int(orow["n_items"]),
                    "order_type": None if pd.isna(orow["order_type"]) else orow["order_type"],
                    "price": None if pd.isna(orow.get("price")) else float(orow.get("price")),
                    "items": items,
                }
        except KeyError:
            pass

        ckey = "Unknown" if pd.isna(cty) else str(cty)
        result[top_key]["cities"][ckey]["customers"][str(uid)] = cust_node

    # ---------- save ----------
    out_path = Path(out_dir) / f"{country_name}.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"Saved: {out_path}")


# ==== Example usage ====
countries = split_nordics(tx)
export_country_json(countries["Sweden"],  tx, "Sweden")
export_country_json(countries["Denmark"], tx, "Denmark")
export_country_json(countries["Finland"], tx, "Finland")
export_country_json(countries["Norway"],  tx, "Norway")


Saved: /workspace/data/processed/Sweden.json
Saved: /workspace/data/processed/Denmark.json
Saved: /workspace/data/processed/Finland.json
Saved: /workspace/data/processed/Norway.json
