# In [None]:
# ============================================================
# BioMedStruct: end-to-end SYNTHETIC SIMULATION 
# ============================================================
# This notebook demonstrates the complete BioMedStruct pipeline
# on fully synthetic and randomly generated maintenance data.
# No real hospital data are used or reproduced.
#
# Stages:
# M1 – Source ingestion + provenance tracking
# M2 – Data cleaning and logical consistency
# M3 – Semantic normalization
# M4 – Feature engineering (leakage-safe)
# M5 – Quality gates
# M6 – Final structured dataset
# ============================================================

from pathlib import Path
import pandas as pd
import numpy as np
import hashlib
import json

# Reproducibility: one seeded generator drives every random draw below.
# The draws are consumed in statement order, so reordering or inserting
# rng.integers calls changes the generated dataset.
rng = np.random.default_rng(42)

# ------------------------------------------------------------
# 0) Synthetic heterogeneous maintenance data generation
# ------------------------------------------------------------
# Working directories are created under ./sim_biomedstruct (relative to the
# current working directory): data_raw receives the synthetic source files,
# out receives the pipeline outputs.
root = Path("./sim_biomedstruct")
raw_dir = root / "data_raw"
out_dir = root / "out"
raw_dir.mkdir(parents=True, exist_ok=True)
out_dir.mkdir(parents=True, exist_ok=True)

# All synthetic dates are expressed as day offsets from this origin.
base_date = pd.Timestamp("2015-01-01")

# Synthetic Site A (spreadsheet-style, FR headers)
# Four rows; dates are rendered as dd/mm/YYYY strings.
# Failure and intervention offsets are drawn independently, so an
# intervention date can fall before its failure date.
dfA = pd.DataFrame({
    "Service": ["Unit_A", "Unit_B", "Unit_C", "Unit_B"],
    "Désignation équipement": ["Device_X", "Device_Y", "Device_Z", "Device_W"],
    "Marque": ["Brand_A", "Brand_B", "Brand_C", "Brand_B"],
    "Modèle/Type": ["Model_1", "Model_2", "Model_3", "Model_4"],
    # Inventory numbers are random, zero-padded to four digits.
    "N° Inventaire": [f"INV_{i:04d}" for i in rng.integers(100, 999, size=4)],
    "Date panne": [(base_date + pd.Timedelta(days=int(d))).strftime("%d/%m/%Y")
                   for d in rng.integers(0, 300, size=4)],
    # NOTE: this header contains a typographic apostrophe (’), not an ASCII one.
    "Date d’intervention": [(base_date + pd.Timedelta(days=int(d))).strftime("%d/%m/%Y")
                            for d in rng.integers(1, 320, size=4)],
    "Type de panne": ["Type_A", "Type_B", "Type_C", "Type_A"],
    "Etat": ["ok", "ok", "pending", "ok"],
    "Nature d intervention": ["internal", "internal", "internal", "internal"],
})

# Synthetic Site B (CSV-style, EN headers, different formats)
# Three rows; dates are rendered as ISO YYYY-MM-DD strings and the service
# labels are lower-case, unlike Site A.
dfB = pd.DataFrame({
    "service": ["unit_a", "unit_b", "unit_b"],
    "designation": ["Device_X", "Device_Y", "Device_W"],
    "marque": ["Brand_A", "Brand_B", "Brand_B"],
    "modele": ["Model_1", "Model_2", "Model_4"],
    "inv_id": [f"INV_{i:04d}" for i in rng.integers(100, 999, size=3)],
    "failure_date": [(base_date + pd.Timedelta(days=int(d))).strftime("%Y-%m-%d")
                     for d in rng.integers(0, 300, size=3)],
    "date_interv": [(base_date + pd.Timedelta(days=int(d))).strftime("%Y-%m-%d")
                    for d in rng.integers(1, 320, size=3)],
    "failure_type": ["Type_A", "Type_B", "Type_A"],
    "status": ["ok", "ok", "ok"],
    "nature_interv": ["internal", "internal", "internal"]
})

# Persist both sources in their native formats for the ingestion stage.
# NOTE(review): writing .xlsx presumably requires an Excel writer engine
# (e.g. openpyxl) to be installed — confirm in the target environment.
dfA.to_excel(raw_dir / "siteA_synthetic.xlsx", index=False)
dfB.to_csv(raw_dir / "siteB_synthetic.csv", index=False, encoding="utf-8")

# ------------------------------------------------------------
# 1) Helpers: hashing and column harmonization
# ------------------------------------------------------------
def sha256_file(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``p``.

    The file is opened in binary mode and consumed in 1 MiB pieces, so the
    whole content is never held in memory at once.
    """
    block_size = 1 << 20
    digest = hashlib.sha256()
    with open(p, "rb") as stream:
        while True:
            piece = stream.read(block_size)
            if not piece:
                # An empty read marks end of file.
                break
            digest.update(piece)
    return digest.hexdigest()

# Canonical column name -> accepted source headers.
# Aliases are compared against headers that have already been trimmed,
# lower-cased and had typographic apostrophes (’) replaced by ASCII ones ('),
# so every alias must be written in that normalized form to be matched.
CANON_COLS = {
    "service": ["service"],
    "designation": ["désignation équipement", "designation"],
    "marque": ["marque"],
    "modele": ["modèle/type", "modele"],
    "inv_id": ["n° inventaire", "inv_id"],
    "date_panne": ["date panne", "failure_date"],
    # The ASCII-apostrophe spelling is the one produced by header
    # normalization; the typographic spelling is kept for callers that pass
    # raw (un-normalized) headers.
    "date_interv": ["date d'intervention", "date d’intervention", "date_interv"],
    "type_panne": ["type de panne", "failure_type"],
    "etat": ["etat", "status"],
    "nature_interv": ["nature d intervention", "nature_interv"],
}

def normalize_cols(cols):
    """Return the column labels as trimmed, lower-cased strings.

    Each label is converted with ``str()``, stripped of surrounding
    whitespace and lower-cased; typographic apostrophes (’) are replaced by
    the ASCII apostrophe (').
    """
    cleaned = []
    for label in cols:
        text = str(label).strip().lower()
        cleaned.append(text.replace("’", "'"))
    return cleaned

def map_to_canon(col):
    """Return the canonical name for ``col``, or ``col`` itself if unknown.

    The first entry of ``CANON_COLS`` whose alias list contains ``col`` wins.
    """
    matches = (canon for canon, aliases in CANON_COLS.items() if col in aliases)
    return next(matches, col)

# ------------------------------------------------------------
# 2) M1 – Ingestion + provenance
# ------------------------------------------------------------
def _read_source_tables(path):
    """Yield ``(sheet_name, DataFrame)`` for every table stored in ``path``.

    Excel workbooks (.xlsx/.xls, any letter case) yield one pair per sheet;
    any other file is read as a single CSV table with ``sheet_name`` None.
    All cells are loaded as strings.
    """
    if path.suffix.lower() in (".xlsx", ".xls"):
        # The context manager releases the workbook handle after the last sheet.
        with pd.ExcelFile(path) as workbook:
            for sheet in workbook.sheet_names:
                yield sheet, workbook.parse(sheet, dtype=str)
    else:
        yield None, pd.read_csv(path, dtype=str)


frames, prov_log = [], []

# sorted() makes the ingestion order independent of the filesystem.
for p in sorted(raw_dir.glob("*")):
    # Skip directories and hidden/temporary entries (e.g. "~$..." lock files)
    # that would otherwise be fed to the CSV reader.
    if not p.is_file() or p.name.startswith((".", "~$")):
        continue

    # Hash and timestamp once per file rather than once per sheet.
    file_hash = sha256_file(p)
    ingest_time = pd.Timestamp.now(tz="UTC").isoformat()

    for sh, df in _read_source_tables(p):
        df.columns = normalize_cols(df.columns)
        df = df.rename(columns={c: map_to_canon(c) for c in df.columns})
        # Provenance columns travel with every row down to the final dataset.
        df["__source_file"] = p.name
        df["__source_sheet"] = sh
        df["__ingest_time"] = ingest_time
        df["__file_hash"] = file_hash
        frames.append(df)
        # Use the hash value directly: reading it back from the frame fails
        # when a sheet contains headers but no rows.
        prov_log.append({"file": p.name, "sheet": sh, "hash": file_hash})

if not frames:
    raise ValueError(f"No ingestible source files found in {raw_dir}")

raw = pd.concat(frames, ignore_index=True)
# One JSON record per ingested table (file + sheet + content hash).
pd.DataFrame(prov_log).to_json(out_dir / "provenance_log.jsonl",
                               orient="records", lines=True)

# ------------------------------------------------------------
# 3) M2 – Cleaning and temporal consistency
# ------------------------------------------------------------
def parse_date(x):
    """Parse a single date value into a ``pd.Timestamp`` (``pd.NaT`` on failure).

    Strings are first tried as unambiguous ISO ``YYYY-MM-DD`` dates; only if
    that fails are they interpreted day-first (``dd/mm/YYYY``). Trying ISO
    explicitly avoids sending ISO strings through day-first inference.

    Parameters
    ----------
    x : str or any value accepted by ``pd.to_datetime``
        Raw cell value; ``None`` and unparseable values give ``pd.NaT``.

    Returns
    -------
    pd.Timestamp or pd.NaT
    """
    if x is None:
        return pd.NaT

    if isinstance(x, str):
        x = x.strip()
        try:
            iso = pd.to_datetime(x, format="%Y-%m-%d", errors="coerce")
        except (ValueError, TypeError):
            iso = pd.NaT
        if not pd.isna(iso):
            return iso

    try:
        return pd.to_datetime(x, dayfirst=True, errors="coerce")
    except Exception:
        # Deliberate best effort: one malformed cell must not abort the
        # whole column; it is reported as a missing date instead.
        return pd.NaT

# Convert both date columns from raw strings to datetimes. The outer
# pd.to_datetime guarantees a datetime dtype even when the column is empty
# or entirely unparseable, so later date arithmetic keeps working.
for c in ["date_panne", "date_interv"]:
    raw[c] = pd.to_datetime(raw[c].apply(parse_date), errors="coerce")

# Keep one record per (equipment, failure date): after sorting, the first
# row of each duplicate group is the one with the earliest intervention.
raw = raw.sort_values(["inv_id", "date_panne", "date_interv"])
raw = raw.drop_duplicates(subset=["inv_id", "date_panne"], keep="first")

# Temporal consistency: an intervention cannot precede its failure. Rows
# with a missing date on either side compare as False and are dropped too.
# .copy() detaches the result from the filtered frame so that the column
# assignments in later stages do not raise SettingWithCopyWarning.
raw = raw.loc[raw["date_interv"] >= raw["date_panne"]].copy()

# ------------------------------------------------------------
# 4) M3 – Semantic normalization
# ------------------------------------------------------------
# Upper-case the categorical labels so that spellings differing only by
# letter case (e.g. "Unit_A" vs "unit_a") collapse to one value.
for text_col in ("service", "type_panne"):
    raw[text_col] = raw[text_col].str.upper()

# ------------------------------------------------------------
# 5) M4 – Feature engineering (leakage-safe)
# ------------------------------------------------------------
def _prior_events_in_window(dates, window):
    """Count, for each event, the earlier events of the same series that
    fall within ``window`` before it.

    ``dates`` must be sorted ascending and contain no missing values. The
    window is open on the left: an event exactly ``window`` earlier is not
    counted. The current event itself is never counted.
    """
    stamps = dates.to_numpy(dtype="datetime64[ns]")
    # Position of the first event strictly later than (t - window).
    first_in_window = np.searchsorted(
        stamps, stamps - window.to_timedelta64(), side="right"
    )
    # Events between that position and the current one are the prior events.
    return pd.Series(np.arange(len(stamps)) - first_in_window, index=dates.index)


# Repair time in whole days between failure and intervention.
# NOTE(review): this value is only known once the intervention has taken
# place — confirm it is not used as a predictor for the same event.
raw["mttr_days"] = (raw["date_interv"] - raw["date_panne"]).dt.days

# Number of failures of the same equipment in the 90 days preceding each
# failure. Only earlier events are counted, so no future information leaks
# into the feature.
raw = raw.sort_values(["inv_id", "date_panne"])
datable = raw["inv_id"].notna() & raw["date_panne"].notna()
raw["pannes_90j"] = 0.0  # rows without an id or failure date keep 0
if datable.any():
    prior = (
        raw.loc[datable]
        .groupby("inv_id", sort=False)["date_panne"]
        .transform(lambda s: _prior_events_in_window(s, pd.Timedelta(days=90)))
    )
    # transform keeps the row order of its input, so positional assignment
    # through the same mask is aligned.
    raw.loc[datable, "pannes_90j"] = prior.to_numpy(dtype=float)

# Recent failure count weighted by repair time (missing repair time -> 0).
raw["criticality_score"] = raw["pannes_90j"] * raw["mttr_days"].fillna(0)

# ------------------------------------------------------------
# 6) M5 – Quality gates
# ------------------------------------------------------------
# Each gate is a plain Python bool (not numpy.bool_) so the dictionary
# prints cleanly and can be serialized with json if needed.
quality_gates = {
    # Every retained intervention happens on or after its failure.
    "temporal_consistency": bool((raw["date_interv"] >= raw["date_panne"]).all()),
    # No two rows describe the same (equipment, failure date) event.
    "unique_events": not raw.duplicated(subset=["inv_id", "date_panne"]).any(),
    # At least 80% of rows have a computed repair time.
    "feature_availability": bool(raw["mttr_days"].notna().mean() >= 0.8),
}

print("Quality gates:", quality_gates)

# ------------------------------------------------------------
# 7) M6 – Final structured dataset
# ------------------------------------------------------------
# Column order of the exported dataset: descriptive fields, event dates and
# type, engineered features, then provenance columns.
final_cols = (
    ["service", "designation", "marque", "modele", "inv_id"]
    + ["date_panne", "date_interv", "type_panne"]
    + ["mttr_days", "pannes_90j", "criticality_score"]
    + ["__source_file", "__source_sheet", "__ingest_time", "__file_hash"]
)

final_df = raw.loc[:, final_cols]
export_path = out_dir / "BioMedStruct_synthetic_structured_dataset.csv"
final_df.to_csv(export_path, index=False)

print("Final dataset shape:", final_df.shape)
# Last expression of the cell: shows a preview when run in a notebook.
final_df.head()
