In [None]:
# Absolute locations of the three event-log CSV files on the author's machine.
# These are consumed by the `paths` mapping further down; a path that does not
# exist is skipped by the loader rather than raising.
# NOTE(review): machine-specific Windows paths - adjust to your local checkout.
bpic_path = "C://Users//nikol//MT-repo//data//bpic2012_a//bpic2012_a.csv"
sepsis_path = "C://Users//nikol//MT-repo//data//sepsis//sepsis.csv"
emergency_ort_path = "C://Users//nikol//MT-repo//data//emergency_ORT/emergency_ORT.csv"

In [None]:
# Utilities to load logs and compute statistics
import pandas as pd
from pathlib import Path
from typing import Optional, Dict, List

# Diagnostics switches (all off by default)
VERBOSE = False                   # print per-dataset load / skip messages
DIAGNOSE_RESOURCES = False        # print resource-cleaning diagnostics
DIAG_PRINT_ALL_RESOURCES = False  # with diagnostics on, also list every cleaned resource
DIAG_RESOURCE_MAX_SHOW = 500      # cap on the number of resources in that listing

# Column-name candidates per role, tried in the order listed.
# Different logs label the same role differently, so each role has several spellings.
CASE_CANDIDATES = [
    'Case ID',
    'case_id',
    'caseid',
    'case',
    'case:concept:name',
    'SEHRegistratienummer',
]
ACTIVITY_CANDIDATES = [
    'activity',
    'Activity',
    'concept:name',
    'event',
    'event_name',
]
TIMESTAMP_CANDIDATES = [
    'timestamp',
    'Timestamp',
    'time:timestamp',
    'time',
    'event_time',
]
RESOURCE_CANDIDATES = [
    'resource',
    'Resource',
    'org:resource',
    'performer',
]


def pick_column(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    """Return the first column of *df* that matches one of *candidates*.

    Candidates are tried in the order given. For each candidate an exact
    match on the column label wins; otherwise a case-insensitive match is
    accepted and the column's actual label is returned.

    Args:
        df: Frame whose column labels are searched.
        candidates: Acceptable column names, most preferred first.

    Returns:
        The matching column label as it appears in *df*, or ``None`` when no
        candidate matches.
    """
    # Only string labels can be case-folded. Skipping non-string labels
    # (e.g. integer headers) keeps a stray numeric column from raising
    # AttributeError and hiding an otherwise valid match.
    cols_lower = {c.lower(): c for c in df.columns if isinstance(c, str)}
    for name in candidates:
        if name in df.columns:
            return name
        # Fall back to a case-insensitive match for this candidate.
        match = cols_lower.get(name.lower())
        if match is not None:
            return match
    return None


def ensure_datetime(s: pd.Series) -> pd.Series:
    """Return *s* as a datetime series, parsing it when necessary.

    A series that already has a datetime64 dtype is returned unchanged (the
    same object). Anything else is parsed with ``pd.to_datetime``; values
    that cannot be parsed become ``NaT`` instead of raising.

    Args:
        s: Series of timestamps, either already datetime-typed or parseable.

    Returns:
        A datetime-typed series aligned with *s*.
    """
    if pd.api.types.is_datetime64_any_dtype(s):
        return s
    # `infer_datetime_format` is intentionally not passed: it is deprecated
    # since pandas 2.0 (a strict version of it is the default) and passing it
    # emits a UserWarning on every call.
    return pd.to_datetime(s, errors='coerce', utc=False)


def compute_log_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise an event log as a one-row DataFrame of descriptive statistics.

    The case, activity, timestamp and (optional) resource columns are located
    with ``pick_column`` using the module-level ``*_CANDIDATES`` lists. Rows
    with a missing case, a missing activity or an unparseable timestamp are
    dropped before anything is counted.

    Args:
        df: Raw event log, one row per event. The input frame is not modified.

    Returns:
        A single-row frame with the columns ``Traces``, ``Variants``,
        ``Activities``, ``Resources``, ``Avg. trace length``,
        ``Max trace length``, ``Avg. trace cycle time (h)`` and
        ``Total duration (h)``. ``Resources`` is 0 when the log has no
        resource column.

    Raises:
        ValueError: If no case, activity or timestamp column can be found.
    """
    case_col = pick_column(df, CASE_CANDIDATES)
    act_col = pick_column(df, ACTIVITY_CANDIDATES)
    ts_col = pick_column(df, TIMESTAMP_CANDIDATES)
    res_col = pick_column(df, RESOURCE_CANDIDATES)

    if case_col is None or act_col is None or ts_col is None:
        missing = [n for n, v in [('case', case_col), ('activity', act_col), ('timestamp', ts_col)] if v is None]
        raise ValueError(f"Missing required column(s): {', '.join(missing)}")

    # Minimal cleaning: work on a private copy restricted to the columns we use.
    df = df[[c for c in [case_col, act_col, ts_col, res_col] if c is not None]].copy()
    act_raw = df[act_col]
    # astype(str) turns a missing activity into the literal string 'nan',
    # which dropna() below would then keep and count as a real activity.
    # Restore the missing values so those rows are actually dropped.
    df[act_col] = act_raw.astype(str).str.strip().where(act_raw.notna())
    df[ts_col] = ensure_datetime(df[ts_col])
    df = df.dropna(subset=[case_col, act_col, ts_col])

    has_resources = res_col is not None and res_col in df.columns

    # Resource normalization to avoid overcounting due to whitespace/case/placeholder tokens
    resources_raw_unique = 0
    resources_clean_unique = 0
    placeholder_counts: Dict[str, int] = {}
    if has_resources:
        # Raw, as-is (excluding real NaNs)
        res_raw = df[res_col]
        resources_raw_unique = int(res_raw.dropna().nunique())

        # Clean: strip, case-fold, and nullify placeholder tokens.
        # Real NaNs become the string 'nan' here and are caught by the
        # placeholder set below.
        res_s = res_raw.astype(str).str.strip()
        res_lower = res_s.str.lower()
        # Tokens that should not be treated as real resources
        empty_like = {'', 'nan', 'nat', 'none', 'null', 'na', 'n/a', '-', 'unknown', 'unk'}
        mask_placeholder = res_lower.isin(empty_like)
        # Track placeholder counts for diagnostics
        if DIAGNOSE_RESOURCES:
            for tok in sorted(empty_like):
                placeholder_counts[tok] = int((res_lower == tok).sum())
        # Placeholders become NaN; the remaining values are compared case-insensitively.
        res_norm = res_lower.mask(mask_placeholder)
        resources_clean_unique = int(res_norm.dropna().nunique())

        # Attach the normalized column for the diagnostics below (not used in
        # the summary). assign() returns a new frame, so this never writes
        # into the frame returned by dropna().
        df = df.assign(**{'__resource_norm__': res_norm})

    # Sort so that every case's events are in a deterministic order
    # (ties on the timestamp are broken by activity name).
    df = df.sort_values([case_col, ts_col, act_col]).reset_index(drop=True)

    # Core counts
    traces = int(df[case_col].nunique())
    activities = int(df[act_col].nunique())
    # Cleaned resource count only; stays 0 when there is no resource column.
    resources = int(resources_clean_unique) if has_resources else 0

    # Per-case metrics
    events_per_case = df.groupby(case_col)[act_col].size()
    avg_trace_length = float(events_per_case.mean()) if not events_per_case.empty else 0.0
    max_trace_length = int(events_per_case.max()) if not events_per_case.empty else 0

    # Variants (sequence of activities per case)
    seq_per_case = df.groupby(case_col)[act_col].apply(lambda s: ' > '.join(s.tolist()))
    variants = int(seq_per_case.nunique())

    # Cycle times per case (max - min)
    ts_agg = df.groupby(case_col)[ts_col].agg(['min', 'max'])
    cycle_times = (ts_agg['max'] - ts_agg['min']).dt.total_seconds().div(3600.0)  # hours
    avg_cycle_time_h = float(cycle_times.mean()) if not cycle_times.empty else 0.0

    # Total duration over the whole log (first event to last event)
    total_duration_h = float((df[ts_col].max() - df[ts_col].min()).total_seconds() / 3600.0) if len(df) else 0.0

    # Diagnostics output to help understand over-counting.
    # NOTE(review): `display` is not defined in this file; presumably the
    # notebook-provided IPython helper - confirm before running as a script.
    if DIAGNOSE_RESOURCES and has_resources:
        print("\n[Resource diagnostics]")
        print(f"- Raw unique resources (non-null): {resources_raw_unique}")
        print(f"- Cleaned unique resources: {resources_clean_unique}")
        if placeholder_counts:
            nonzero = {k: v for k, v in placeholder_counts.items() if v > 0}
            if nonzero:
                print("- Placeholder token occurrences (treated as empty):", nonzero)
        # Show top 20 resource values by frequency (cleaned)
        vc = df['__resource_norm__'].value_counts(dropna=True)
        if not vc.empty:
            print("- Top resource values (cleaned, top 20):")
            display(vc.head(20).rename_axis('resource').reset_index(name='events'))
        # Optionally print all unique cleaned resources
        if DIAG_PRINT_ALL_RESOURCES:
            uniques = df['__resource_norm__'].dropna().drop_duplicates().sort_values()
            to_show = uniques.head(DIAG_RESOURCE_MAX_SHOW)
            print(f"- Unique cleaned resources (showing up to {DIAG_RESOURCE_MAX_SHOW}): {len(uniques)} total")
            display(pd.DataFrame({'resource': to_show}))

    summary = pd.DataFrame([
        {
            'Traces': traces,
            'Variants': variants,
            'Activities': activities,
            # Cleaned/preferred resources only
            'Resources': resources,
            'Avg. trace length': round(avg_trace_length, 2),
            'Max trace length': int(max_trace_length),
            'Avg. trace cycle time (h)': round(avg_cycle_time_h, 2),
            'Total duration (h)': round(total_duration_h, 2),
        }
    ])
    return summary



def try_load_csv(
    path_str: str,
    *,
    delimiter: str = ';',
    verbose: Optional[bool] = None,
) -> Optional[pd.DataFrame]:
    """Load a delimited text file into a DataFrame, returning ``None`` on failure.

    This is a best-effort loader: a missing file or any error raised while
    reading is reported (when verbose) and turned into ``None`` so that the
    caller can simply skip the dataset.

    Args:
        path_str: Location of the CSV file.
        delimiter: Field separator passed to ``pd.read_csv``. Defaults to
            ``';'``, the separator this function always used before.
        verbose: Whether to print load / failure messages. ``None`` (the
            default) defers to the module-level ``VERBOSE`` flag.

    Returns:
        The loaded frame, or ``None`` when the file does not exist or cannot
        be read.
    """
    # Resolve the effective verbosity once; the global is only consulted when
    # the caller did not decide explicitly.
    verbose = VERBOSE if verbose is None else verbose
    try:
        p = Path(path_str)
        if not p.exists():
            if verbose:
                print(f"Not found: {p}")
            return None
        df = pd.read_csv(p, delimiter=delimiter)
        if verbose:
            print(f"Loaded {p} -> {len(df):,} rows, {len(df.columns)} cols")
        return df
    except Exception as e:
        # Deliberately broad: any read/parse problem means "dataset unavailable".
        if verbose:
            print(f"Failed to load {path_str}: {type(e).__name__}: {e}")
        return None

# Build one summary row per available dataset and show them as a single table.
# Keys are the labels shown in the 'Dataset' column; values are the CSV paths.
paths = {
    'BPIC2012_A': bpic_path,
    'Sepsis': sepsis_path,
    # The emergency log is included only when its file is present
    'Emergency_ort': emergency_ort_path,
}

rows = []
for name, p in paths.items():
    df = try_load_csv(p)
    # Only logs that loaded and contain at least one row are summarised.
    if df is not None and not df.empty:
        try:
            if VERBOSE:
                print(f"\n=== Dataset: {name} ===")
            stats = compute_log_stats(df)
            # Put the dataset label in the first column of its summary row.
            stats.insert(0, 'Dataset', name)
            rows.append(stats)
        except Exception as e:
            # A log that cannot be summarised is left out of the table.
            if VERBOSE:
                print(f"Skipping {name}: {type(e).__name__}: {e}")

if not rows:
    print("No datasets loaded; please check the paths.")
else:
    # Stack the per-dataset rows and show only the combined table.
    all_stats = pd.concat(rows, ignore_index=True)
    display(all_stats)