In [None]:
"""
NHS MCDA for patient pathway prioritization

This script implements a small, flexible Multi-Criteria Decision Analysis (MCDA)
for ranking patients within each Speciality (department).

Primary rule from user:
  1) Group (primary key): Speciality
  2) Within each Speciality, rank using criteria in this order (but implemented
     as a weighted MCDA so weights can be tuned):
       - Complexity (numerical) — larger => higher priority
       - Acuity (1..5, higher = worse) — larger => higher priority
       - VitalsTrend (categorical) — priority order: Deteriorating > Stable > Improving

Features:
  - Normalises numeric values (optionally within each Speciality)
  - Maps VitalsTrend to an ordinal score
  - Allows adjustable weights for each criterion
  - Handles missing data sensibly
  - Returns a ranked DataFrame per Speciality and an overall ordering (Speciality groups preserved)

Usage example included at bottom.
"""

from typing import Dict, Any
import pandas as pd
import numpy as np


DEFAULT_VITALS_ORDER = {
    'Deteriorating': 1.0,
    'Stable': 0.5,
    'Improving': 0.0
}


def normalize_series(s: pd.Series) -> pd.Series:
    """Min-max normalize a pandas Series to [0,1]. If constant, returns 0.5 for all.
    NaNs are left as NaN.
    """
    valid = s.dropna()
    if valid.empty:
        return s
    mn = valid.min()
    mx = valid.max()
    if mn == mx:
        # constant series; return 0.5 for known values
        out = s.copy()
        out.loc[s.notna()] = 0.5
        return out
    return (s - mn) / (mx - mn)


def compute_mcda_scores(
    df: pd.DataFrame,
    weights: Dict[str, float] = None,
    vitals_map: Dict[str, float] = None,
    normalize_within_Speciality: bool = True,
    patient_id: str = 'pseudo_patient_id',
    age_col: str = 'age',
    gender_col: str = 'sex',
    complexity_col: str = 'Complexity',
    acuity_col: str = 'Acuity',
    primarydiagnosis_col: str = 'Primary Diagnosis Summary',
    department_col: str = 'Speciality',
    vitals_col: str = 'Vitals Trend',
    waiting_col: str = 'Waiting Time (days)',
    admission_col: str = 'Time since Admission (days)',
    nextAction_col: str = 'nextAction',
    blocker_col: str = 'blocker',
    dischargedependence_col: str = 'Discharge Dependence'
) -> pd.DataFrame:

    df = df.copy()
    required = ['Speciality', complexity_col, acuity_col, waiting_col, admission_col, vitals_col]

    # Find missing columns
    missing = [c for c in required if c not in df.columns]

    # If any missing, create them with default value 0
    for c in missing:
        df[c] = 0

    # Ensure all expected keys are present (missing keys get 0.0)
    for k in ['complexity', 'acuity', 'waiting']:
        weights.setdefault(k, 0.0)

    w_total = sum(weights.values())
    if w_total == 0:
        raise ValueError('Sum of weights must be > 0')
    weights = {k: v / w_total for k, v in weights.items()}

    if vitals_map is None:
        vitals_map = DEFAULT_VITALS_ORDER

    df['_vitals_score_raw'] = df[vitals_col].map(vitals_map)
    if df['_vitals_score_raw'].isna().any():
        known_median = df['_vitals_score_raw'].median(skipna=True)
        df['_vitals_score_raw'].fillna(known_median, inplace=True)

    if normalize_within_Speciality:
        norm_complexity = df.groupby('Speciality')[complexity_col].transform(lambda s: normalize_series(s))
        norm_acuity = df.groupby('Speciality')[acuity_col].transform(lambda s: normalize_series(s))
        norm_wait = df.groupby('Speciality')[waiting_col].transform(lambda s: normalize_series(s))
        norm_admit = df.groupby('Speciality')[admission_col].transform(lambda s: normalize_series(s))
    else:
        norm_complexity = normalize_series(df[complexity_col])
        norm_acuity = normalize_series(df[acuity_col])
        norm_wait = normalize_series(df[waiting_col])
        norm_admit = normalize_series(df[admission_col])

    df['_norm_complexity'] = df.groupby('Speciality')[complexity_col].transform(lambda s: normalize_series(s)).fillna(0.5)
    df['_norm_acuity'] = df.groupby('Speciality')[acuity_col].transform(lambda s: normalize_series(s)).fillna(0.5)
    df['_norm_wait'] = df.groupby('Speciality')[waiting_col].transform(lambda s: normalize_series(s)).fillna(0.5)
    df['_norm_admit'] = df.groupby('Speciality')[admission_col].transform(lambda s: normalize_series(s)).fillna(0.5)

    df['MCDA_score'] = (
        weights['complexity'] * df['_norm_complexity'] +
        weights['acuity'] * df['_norm_acuity'] +
        weights['vitals'] * df['_vitals_score_raw'] +
        weights['waiting'] * df['_norm_wait'] +
        weights['admission'] * df['_norm_admit']
    )

    df['mcda_rank_within_Speciality'] = df.groupby('Speciality')['MCDA_score'].rank(method='first', ascending=False).astype(int)

    df['tie_breaker'] = df[[acuity_col, complexity_col, vitals_col]].apply(
        lambda row: f"Acuity={row[acuity_col]}|Complexity={row[complexity_col]}|Vitals={row[vitals_col]}", axis=1
    )

    df.drop(columns=['_vitals_score_raw', '_norm_complexity', '_norm_acuity', '_norm_wait', '_norm_admit'], inplace=True)

    return df


def rank_within_all_specialties(df_out: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a DataFrame ordered first by Speciality (alphabetical) and then by mcda_rank_within_Speciality.
    If you prefer a different Speciality order, reorder 'Speciality' before calling this function.
    """
    df = df_out.copy()
    df.sort_values(['Speciality', 'mcda_rank_within_Speciality'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df


# -------------------------------
# Example usage (run as script or import functions)
# -------------------------------
if __name__ == '__main__':
    # Load ScenarioA data
    from pathlib import Path
    import pandas as pd

    data_dir = Path('../data/scenarioA/')
    paths = {
        "current": data_dir / "ScenarioA_patients_current.csv",
        "coming": data_dir / "ScenarioA_patients_coming.csv",
        "historic": data_dir / "ScenarioA_patients_historic.csv"
    }

    dfs = {k: pd.read_csv(v) for k, v in paths.items()}
    for name, df in dfs.items():
        df.columns = [c.strip() for c in df.columns]
        dfs[name] = df

    current = dfs["current"].copy()
    coming = dfs["coming"].copy()
    historic = dfs["historic"].copy()

    # Choose which dataset to run: 'current', 'coming', or 'historic'
    which = 'current'
    input_df = dfs[which].copy()

    if which == 'current':
        weights = {
            'complexity': 0.20,
            'acuity': 0.30,
            'vitals': 0.30,
            'waiting': 0.10,
            'admission': 0.10
        }
    elif which == 'coming':
        weights = {
            'complexity': 0.30,
            'acuity': 0.50,
            'waiting': 0.20,
            'vitals': 0,
            'admission':0,
        }
    # elif which == 'historic':
    #     weights = {
    #         'complexity': 0.25,
    #         'acuity': 0.25,
    #         'vitals': 0.25,
    #         'waiting': 0.125,
    #         'admission': 0.125
    #     }

    ranked = compute_mcda_scores(input_df, weights=weights, normalize_within_Speciality=True)
    ordered = rank_within_all_specialties(ranked)

    save_name = f'mcda_ranked_patients_{which}.csv'
    ordered.to_csv(save_name, index=False)
    print(ordered.head())


In [None]:
from typing import Dict, Any
import pandas as pd
import numpy as np

DEFAULT_VITALS_ORDER = {
    'Deteriorating': 1.0,
    'Stable': 0.5,
    'Improving': 0.0
}

# ---------- Utilities ----------
def _keyize(name: str) -> str:
    """Lowercase, strip, collapse spaces/punct to create a matching key."""
    if name is None:
        return ""
    return "".join(ch for ch in name.strip().lower() if ch.isalnum())

def _apply_column_aliases(df: pd.DataFrame, alias_map: Dict[str, str]) -> pd.DataFrame:
    """
    Map many possible input column spellings to canonical names present in alias_map values.
    If a canonical column already exists, it is left untouched.
    """
    df = df.copy()
    # Build lookup from normalized current col names -> actual col name
    current = {_keyize(c): c for c in df.columns}

    for variants, canonical in alias_map.items():
        # variants can be a list/tuple of possible keys
        if isinstance(variants, (list, tuple, set)):
            found = None
            for v in variants:
                vkey = _keyize(v)
                if vkey in current:
                    found = current[vkey]
                    break
        else:
            vkey = _keyize(variants)
            found = current.get(vkey, None)

        if found is not None and canonical not in df.columns:
            df.rename(columns={found: canonical}, inplace=True)

    return df

def normalize_series(s: pd.Series) -> pd.Series:
    """Min-max normalize a pandas Series to [0,1]. If constant, returns 0.5 for all; NaNs preserved."""
    valid = s.dropna()
    if valid.empty:
        return s
    mn = valid.min()
    mx = valid.max()
    if mn == mx:
        out = s.copy()
        out.loc[s.notna()] = 0.5
        return out
    return (s - mn) / (mx - mn)

# ---------- Core MCDA ----------
def compute_mcda_scores(
    df: pd.DataFrame,
    weights: Dict[str, float] = None,
    vitals_map: Dict[str, float] = None,
    normalize_within_Speciality: bool = True,
    patient_id: str = 'pseudo_patient_id',
    age_col: str = 'age',
    gender_col: str = 'sex',
    complexity_col: str = 'Complexity',
    acuity_col: str = 'Acuity',
    primarydiagnosis_col: str = 'Primary Diagnosis Summary',
    department_col: str = 'Speciality',
    vitals_col: str = 'Vitals Trend',
    waiting_col: str = 'Waiting Time (days)',
    admission_col: str = 'Time since Admission (days)',
    nextAction_col: str = 'nextAction',
    blocker_col: str = 'blocker',
    dischargedependence_col: str = 'Discharge Dependence'
) -> pd.DataFrame:

    # 1) Standardize incoming columns (handles spaces/US-UK spellings/etc.)
    alias_map = {
        # department
        ('speciality', 'specialty', 'department', 'dept'): department_col,
        # vitals trend
        ('vitalstrend', 'vitals trend', 'vitals', 'trend'): vitals_col,
        # waiting days
        ('waitingtime(days)', 'waitingtime', 'waitingdays', 'waitdays'): waiting_col,
        # admission days
        ('timesinceadmission(days)', 'timesinceadmission', 'admissiondays', 'admitdays'): admission_col,
        # complexity / acuity
        ('complexity',): complexity_col,
        ('acuity',): acuity_col,
        # others used elsewhere
        ('primarydiagnosissummary', 'primarydiagnosis', 'diagnosis', 'dx'): primarydiagnosis_col,
        ('dischargedependence', 'discharge dependence'): dischargedependence_col,
        ('nextaction',): nextAction_col,
        ('blocker', 'block'): blocker_col,
        ('sex', 'gender'): gender_col,
        ('age',): age_col,
        ('pseudopatientid', 'patientid', 'id'): patient_id,
    }
    df = _apply_column_aliases(df, alias_map)

    # 2) Create missing required columns with safe defaults
    df = df.copy()
    # Required for scoring:
    required = [department_col, complexity_col, acuity_col, waiting_col, admission_col, vitals_col]

    # Decide defaults by type
    default_values: Dict[str, Any] = {
        department_col: "Unknown",
        complexity_col: 0.0,
        acuity_col: 0.0,
        waiting_col: 0.0,
        admission_col: 0.0,
        vitals_col: "Stable",   # Safe neutral-ish default
    }

    for c in required:
        if c not in df.columns:
            df[c] = default_values[c]

    # 3) Coerce numeric columns to numeric (errors='coerce' -> NaN -> later filled/handled)
    for c in [complexity_col, acuity_col, waiting_col, admission_col]:
        df[c] = pd.to_numeric(df[c], errors='coerce')

    # 4) Weights & vitals map defaults
    if weights is None:
        weights = dict(complexity=0.25, acuity=0.25, vitals=0.25, waiting=0.125, admission=0.125)
    # ensure all keys present
    for k in ['complexity', 'acuity', 'vitals', 'waiting', 'admission']:
        weights.setdefault(k, 0.0)

    w_total = sum(weights.values())
    if w_total == 0:
        # If user gives all zeros, fall back to equal non-zero weights on present criteria
        weights = dict(complexity=0.25, acuity=0.25, vitals=0.25, waiting=0.125, admission=0.125)
        w_total = 1.0
    weights = {k: v / w_total for k, v in weights.items()}

    if vitals_map is None:
        vitals_map = DEFAULT_VITALS_ORDER

    # 5) Vitals mapping -> numeric score, fill unknowns with median of known
    df['_vitals_score_raw'] = df[vitals_col].map(vitals_map)
    if df['_vitals_score_raw'].isna().any():
        known_median = df['_vitals_score_raw'].median(skipna=True)
        # If still NaN (e.g., no known), fall back to 0.5
        if pd.isna(known_median):
            known_median = 0.5
        df['_vitals_score_raw'] = df['_vitals_score_raw'].fillna(known_median)

    # 6) Normalization (within department or global)
    if normalize_within_Speciality:
        group = df.groupby(department_col, dropna=False)
        norm_complexity = group[complexity_col].transform(normalize_series)
        norm_acuity     = group[acuity_col].transform(normalize_series)
        norm_wait       = group[waiting_col].transform(normalize_series)
        norm_admit      = group[admission_col].transform(normalize_series)
    else:
        norm_complexity = normalize_series(df[complexity_col])
        norm_acuity     = normalize_series(df[acuity_col])
        norm_wait       = normalize_series(df[waiting_col])
        norm_admit      = normalize_series(df[admission_col])

    # Fill NaNs from normalization (e.g., all-NaN/constant groups) with neutral 0.5
    df['_norm_complexity'] = norm_complexity.fillna(0.5)
    df['_norm_acuity']     = norm_acuity.fillna(0.5)
    df['_norm_wait']       = norm_wait.fillna(0.5)
    df['_norm_admit']      = norm_admit.fillna(0.5)

    # 7) MCDA score
    df['MCDA_score'] = (
        weights['complexity'] * df['_norm_complexity'] +
        weights['acuity']     * df['_norm_acuity'] +
        weights['vitals']     * df['_vitals_score_raw'] +
        weights['waiting']    * df['_norm_wait'] +
        weights['admission']  * df['_norm_admit']
    )

    # 8) Rank within department
    df['mcda_rank_within_Speciality'] = (
        df.groupby(department_col)['MCDA_score']
          .rank(method='first', ascending=False)
          .astype(int)
    )

    # 9) Tie-breaker label (robust to missing)
    def _fmt_tie(row):
        a = row.get(acuity_col, np.nan)
        c = row.get(complexity_col, np.nan)
        v = row.get(vitals_col, "Unknown")
        return f"Acuity={a}|Complexity={c}|Vitals={v}"

    df['tie_breaker'] = df.apply(_fmt_tie, axis=1)

    # 10) Cleanup helper cols
    df.drop(columns=['_vitals_score_raw', '_norm_complexity', '_norm_acuity', '_norm_wait', '_norm_admit'], inplace=True)

    return df

def rank_within_all_specialties(df_out: pd.DataFrame, department_col: str = 'Speciality') -> pd.DataFrame:
    """
    Returns a DataFrame ordered first by department_col (alphabetical)
    and then by mcda_rank_within_Speciality.
    """
    df = df_out.copy()
    if department_col not in df.columns:
        # Graceful fallback
        department_col = next((c for c in df.columns if _keyize(c) in ('speciality','specialty','department')), 'Speciality')
        if department_col not in df.columns:
            df[department_col] = "Unknown"
    df.sort_values([department_col, 'mcda_rank_within_Speciality'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

# -------------------------------
# Example usage
# -------------------------------
if __name__ == '__main__':
    from pathlib import Path

    data_dir = Path('../data/scenarioA/')
    paths = {
        "current":  data_dir / "ScenarioA_patients_current.csv",
        "coming":   data_dir / "ScenarioA_patients_coming.csv",
        "historic": data_dir / "ScenarioA_patients_historic.csv"
    }

    dfs = {k: pd.read_csv(v) for k, v in paths.items()}

    # Trim whitespace in headers
    dfs = {k: df.rename(columns={c: c.strip() for c in df.columns}) for k, df in dfs.items()}

    which = 'coming'  # 'current' | 'coming' | 'historic'
    input_df = dfs[which].copy()

    if which == 'current':
        weights = {'complexity': 0.20, 'acuity': 0.30, 'vitals': 0.30, 'waiting': 0.10, 'admission': 0.10}
    elif which == 'coming':
        weights = {'complexity': 0.30, 'acuity': 0.50, 'waiting': 0.20, 'vitals': 0.0, 'admission': 0.0}
    else:  # 'historic' or anything else
        weights = {'complexity': 0.25, 'acuity': 0.25, 'vitals': 0.25, 'waiting': 0.125, 'admission': 0.125}

    ranked = compute_mcda_scores(input_df, weights=weights, normalize_within_Speciality=True)
    ordered = rank_within_all_specialties(ranked, department_col='Speciality')

    save_name = f'mcda_ranked_patients_{which}.csv'
    ordered.to_csv(save_name, index=False)
    print(ordered.head())


In [6]:
# -------------------------------
# Example usage (combine current + coming and rank)
# -------------------------------
if __name__ == '__main__':
    from pathlib import Path
    import pandas as pd

    data_dir = Path('../data/scenarioA/')
    paths = {
        "current":  data_dir / "ScenarioA_patients_current.csv",
        "coming":   data_dir / "ScenarioA_patients_coming.csv",
    }

    # Load & trim header whitespace
    dfs = {}
    for name, p in paths.items():
        df = pd.read_csv(p)
        df = df.rename(columns={c: c.strip() for c in df.columns})
        df["Dataset"] = name  # tag source
        dfs[name] = df

    # Combine current + coming
    input_df = pd.concat([dfs["current"], dfs["coming"]], ignore_index=True)

    # Choose unified weights for the combined list
    # (you can tweak these; using the more "current"-like set)
    weights = {
        'complexity': 0.20,
        'acuity':     0.30,
        'vitals':     0.30,  # rows without vitals will auto-fill neutrals
        'waiting':    0.10,
        'admission':  0.10
    }

    ranked = compute_mcda_scores(
        input_df,
        weights=weights,
        normalize_within_Speciality=True,   # normalize per Speciality
        department_col='Speciality',
        vitals_col='Vitals Trend',
        complexity_col='Complexity',
        acuity_col='Acuity',
        waiting_col='Waiting Time (days)',
        admission_col='Time since Admission (days)'
    )

    # Order by Speciality then MCDA rank (combined across current+coming)
    ordered = rank_within_all_specialties(ranked, department_col='Speciality')

    # Also add an overall priority rank (ignoring Speciality) if you want
    ordered['overall_rank'] = ordered['MCDA_score'].rank(method='first', ascending=False).astype(int)

    # Save results
    ordered.to_csv('mcda_ranked_patients_current_plus_coming.csv', index=False)
    print(ordered.head(10))


  pseudo_patient_id  age sex  Complexity  Acuity Primary Diagnosis Summary  \
0            SP0814    4   F        1.43       4                   K50-K52   
1            SP0853   50   F        1.39       4                   K55-K64   
2            SP0884   37   F        1.47       5                   K50-K52   
3            SP0838   31   F        1.49       5                   K00-K14   
4            SP0888   34   F        1.08       5                   K50-K52   
5            SP0978   95   F        1.48       5                   K20-K31   
6            SP0914   52   F        1.37       5                   K55-K64   
7            SP0864   40   M        1.08       5                   K40-K46   
8            SP0994   72   F        1.43       4                   K90-K93   
9            SP0899   68   F        0.58       4                   K80-K87   

          Speciality   Vitals Trend  Waiting Time (days)  \
0  Gastronenterology  Deteriorating                   33   
1  Gastronenterology 

In [8]:
# -------------------------------
# Example usage (combine current + coming with source label)
# -------------------------------
if __name__ == '__main__':
    from pathlib import Path
    import pandas as pd

    data_dir = Path('../data/scenarioA/')
    paths = {
        "current":  data_dir / "ScenarioA_patients_current.csv",
        "coming":   data_dir / "ScenarioA_patients_coming.csv",
    }

    dfs = {}
    for name, p in paths.items():
        df = pd.read_csv(p)
        df = df.rename(columns={c: c.strip() for c in df.columns})
        df["Dataset"] = name  # add source label column
        dfs[name] = df

    # Combine current + coming into one DataFrame
    combined_df = pd.concat([dfs["current"], dfs["coming"]], ignore_index=True)

    # Unified weight configuration
    weights = {
        'complexity': 0.25,
        'acuity':     0.35,
        'vitals':     0.25,  # some 'coming' patients may have no vitals → handled automatically
        'waiting':    0.10,
        'admission':  0.05
    }

    # Compute MCDA scores for the combined data
    ranked = compute_mcda_scores(
        combined_df,
        weights=weights,
        normalize_within_Speciality=True,
        department_col='Speciality',
        vitals_col='Vitals Trend',
        complexity_col='Complexity',
        acuity_col='Acuity',
        waiting_col='Waiting Time (days)',
        admission_col='Time since Admission (days)'
    )

    # Rank within each Speciality
    ordered = rank_within_all_specialties(ranked, department_col='Speciality')

    # Add an overall rank across all Specialities (optional)
    ordered['overall_rank'] = ordered['MCDA_score'].rank(method='first', ascending=False).astype(int)

    # Save to file
    save_path = 'mcda_ranked_patients_current_plus_coming.csv'
    ordered.to_csv(save_path, index=False)
    print(f"✅ Combined ranking saved to {save_path}")

    # Show preview
    print(ordered[['pseudo_patient_id', 'Speciality', 'MCDA_score',
                   'mcda_rank_within_Speciality', 'overall_rank', 'Dataset']].head(10))


PermissionError: [Errno 13] Permission denied: 'mcda_ranked_patients_current_plus_coming.csv'