In [1]:
import os
import re
import shutil
import warnings
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lifelines import KaplanMeierFitter

In [2]:
# Load the exported spreadsheet; the path is relative to the notebook's working directory.
df = pd.read_excel(r"../data/export_with_days_delta _051025_,תיקונים.xlsx")
# Optional filter (currently disabled): drop patients who waited more than a year.
#df = df[~(df['waiting_duration'] > 365)]

In [3]:
# Number of rows that have a non-missing waiting_duration value.
df.waiting_duration.notna().sum()

36

In [4]:
#df.to_excel("consort_data_22-10.xlsx", index=0)

In [5]:
# Root folder for everything this notebook writes; it is wiped and recreated
# by the output-directory cell before any file is produced.
OUTPUT_ROOT = Path("output_by_consort_23-10_b")
# Names of the consort-group flag columns looked up in `df`.
# Each name gets a "<name>__bool" column and (spaces replaced by "_")
# its own sub-folder under the output root.
consort_groups = [
    "N",
    "Eligible",
    "Randomized",
    "Dropout",
    "Research Dropout",
    "Clinical Dropout",
    "In Waiting List",
    "Finished",
    "Active",
    "Not Cooperative",
]


In [6]:

# Output base directory.
# WARNING: any existing directory at this path is deleted recursively so that
# every run starts from an empty folder; results of earlier runs are lost.
output_base = Path(OUTPUT_ROOT)
if output_base.exists():
    shutil.rmtree(output_base)
output_base.mkdir(parents=True, exist_ok=True)


In [7]:

def to_bool(value) -> bool:
    """Coerce a wide range of scalar inputs to a boolean.

    Rules, applied in order:
      * missing values (``None``, ``NaN``, ``pd.NA``) -> ``False``
      * a real ``bool`` -> returned unchanged
      * text in {"1", "true", "t", "yes", "y"} -> ``True``
      * text in {"0", "false", "f", "no", "n", ""} -> ``False``
      * anything that parses as a number -> ``True`` when it is non-zero
      * everything else -> ``False``

    Text comparison ignores case and surrounding whitespace. Numeric parsing
    tries ``int`` first and then ``float``, so float-typed cells such as
    ``1.0`` (what a 0/1 column becomes once it contains blanks) are
    recognised as ``True`` instead of silently falling back to ``False``.

    Args:
        value: A scalar cell value of any type.

    Returns:
        The boolean interpretation of ``value``.
    """
    if pd.isna(value):
        return False
    if isinstance(value, bool):
        return value
    s = str(value).strip().lower()
    if s in {"1", "true", "t", "yes", "y"}:
        return True
    if s in {"0", "false", "f", "no", "n", ""}:
        return False
    try:
        return bool(int(s))
    except ValueError:
        pass  # not an integer literal; try a float literal next
    try:
        number = float(s)
    except ValueError:
        return False
    # The texts "nan" / "inf" parse as floats but are not meaningful flags.
    return bool(np.isfinite(number) and number != 0)

In [8]:

def add_boolean_consort_columns(df: pd.DataFrame, consort_groups: List[str]) -> pd.DataFrame:
    """
    Return a copy of ``df`` with one ``<name>__bool`` column per consort group.

    For each name in ``consort_groups``:
      - if a column with that name exists, its values are converted with
        ``to_bool`` and stored as booleans in ``<name>__bool``;
      - otherwise ``<name>__bool`` is filled with ``False``.

    The input frame is not modified. Working on a copy avoids writing new
    columns into a filtered slice of another frame, which is what the caller
    passes in.
    """
    result = df.copy()
    for name in consort_groups:
        flag_col = name + "__bool"
        if name in result.columns:
            # astype(bool) keeps the dtype boolean even when the frame is empty
            result[flag_col] = result[name].apply(to_bool).astype(bool)
        else:
            result[flag_col] = False
    return result


In [9]:

def coerce_numeric_columns(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    """
    Return a copy of ``df`` with the listed columns converted to numeric.

    Values that cannot be parsed become NaN (``errors='coerce'``). Names in
    ``cols`` that are not columns of ``df`` are ignored. The input frame is
    left untouched so that re-running the calling cell is safe.
    """
    result = df.copy()
    for col in cols:
        if col in result.columns:
            result[col] = pd.to_numeric(result[col], errors='coerce')
    return result


In [10]:

def summarize_by_arm(subset: pd.DataFrame, arms: List[str]) -> Tuple[pd.DataFrame, List[pd.DataFrame]]:
    """
    Summarise one consort-group subset per arm.

    Args:
        subset: Rows belonging to one consort group. Must contain the columns
            'group', 'waiting_duration', 'did_started_therapy' and
            'suiteable_for_pp'; 'count' is optional.
        arms: Values of the 'group' column to report, in output order.

    Returns:
        A tuple ``(summary_df, dfs)``:
          - ``summary_df`` is indexed by arm ('group') and always has these
            columns, in this order:
              - count_sum (sum of 'count' if present and not all-NaN,
                otherwise the row count; truncated to int)
              - row_count
              - waiting_duration_mean
              - waiting_duration_std
              - waiting_duration_median
              - did_started_therapy_mean
              - suiteable_for_pp_mean
            An arm with no rows gets zero counts and NaN statistics.
          - ``dfs`` holds the per-arm row subsets, one per entry of ``arms``.
    """
    # A fixed column list keeps the layout stable no matter which arm is empty,
    # and lets an empty ``arms`` list still produce a valid (empty) frame.
    columns = [
        'group',
        'count_sum',
        'row_count',
        'waiting_duration_mean',
        'waiting_duration_std',
        'waiting_duration_median',
        'did_started_therapy_mean',
        'suiteable_for_pp_mean',
    ]
    rows = []
    dfs = []
    for arm in arms:
        arm_df = subset[subset['group'] == arm]
        dfs.append(arm_df)
        if arm_df.empty:
            rows.append({
                'group': arm,
                'count_sum': 0,
                'row_count': 0,
                'waiting_duration_mean': np.nan,
                'waiting_duration_std': np.nan,
                'waiting_duration_median': np.nan,
                'did_started_therapy_mean': np.nan,
                'suiteable_for_pp_mean': np.nan
            })
            continue

        # Prefer the explicit 'count' column; fall back to counting rows.
        if 'count' in arm_df.columns and arm_df['count'].notna().any():
            count_sum = arm_df['count'].sum(skipna=True)
        else:
            count_sum = len(arm_df)

        rows.append({
            'group': arm,
            'count_sum': int(count_sum) if not pd.isna(count_sum) else 0,
            'row_count': len(arm_df),
            'waiting_duration_mean': arm_df['waiting_duration'].mean(skipna=True),
            'waiting_duration_std': arm_df['waiting_duration'].std(skipna=True),
            'waiting_duration_median': arm_df['waiting_duration'].median(skipna=True),
            'did_started_therapy_mean': arm_df['did_started_therapy'].mean(skipna=True),
            'suiteable_for_pp_mean': arm_df['suiteable_for_pp'].mean(skipna=True)
        })

    summary_df = pd.DataFrame(rows, columns=columns).set_index('group')
    return summary_df, dfs


In [11]:

def save_summary_csv(summary_df: pd.DataFrame, out_dir: Path) -> Path:
    """
    Write ``summary_df`` to ``<out_dir>/summary_by_arm.csv``.

    ``out_dir`` (and any missing parents) is created first. The path of the
    written file is returned.
    """
    out_dir.mkdir(exist_ok=True, parents=True)
    target = out_dir.joinpath("summary_by_arm.csv")
    summary_df.to_csv(target)
    return target


def plot_metric_bar(summary_df: pd.DataFrame, metric_col: str, metric_label: str, out_dir: Path) -> Path:
    """
    Draw ``summary_df[metric_col]`` as a bar chart and save it as a PNG.

    Args:
        summary_df: Frame whose index supplies the bar labels.
        metric_col: Column to plot; also used as the PNG file stem.
        metric_label: Text for the y-axis and the first part of the title.
        out_dir: Destination folder (created if missing); its name is
            appended to the title.

    Returns:
        Path of the saved file, ``<out_dir>/<metric_col>.png``.

    Raises:
        KeyError: if ``metric_col`` is not a column of ``summary_df``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    # Look the column up before allocating a figure so a bad name leaks nothing.
    values = summary_df[metric_col]
    fname = out_dir / f"{metric_col}.png"
    fig, ax = plt.subplots()
    try:
        values.plot(kind='bar', ax=ax)  # default matplotlib colours
        ax.set_title(f"{metric_label} — {out_dir.name}")
        ax.set_xlabel("Group")
        ax.set_ylabel(metric_label)
        fig.tight_layout()
        fig.savefig(fname)
    finally:
        # The caller catches plotting errors and keeps looping, so the figure
        # must be released on the failure path as well.
        plt.close(fig)
    return fname


def fit_and_plot_km(subset: pd.DataFrame, arms: List[str], out_dir: Path) -> Tuple[bool, Optional[Path]]:
    """
    Fit and plot one Kaplan-Meier curve per arm.

    The fit uses:
      - duration: waiting_duration
      - event_observed: did_started_therapy (cast to int)

    Rows with a missing duration or event, or a negative duration, are
    dropped. Arms left without rows are skipped.

    Args:
        subset: Frame with 'group', 'waiting_duration' and
            'did_started_therapy' columns.
        arms: Values of 'group' to plot, one curve each.
        out_dir: Destination folder (created if missing).

    Returns:
        ``(True, <out_dir>/kaplan_meier.png)`` when at least one curve was
        drawn, otherwise ``(False, None)`` and no file is written.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    surv_df = subset[['group', 'waiting_duration', 'did_started_therapy']].copy()
    surv_df = surv_df.dropna(subset=['waiting_duration', 'did_started_therapy'])
    surv_df = surv_df[surv_df['waiting_duration'] >= 0]

    # Collect the arms that have data *before* allocating a figure, so the
    # "nothing to plot" path cannot leave an open figure behind.
    arm_frames = []
    for arm in arms:
        arm_df = surv_df[surv_df['group'] == arm]
        if not arm_df.empty:
            arm_frames.append((arm, arm_df))
    if not arm_frames:
        return False, None

    fname = out_dir / "kaplan_meier.png"
    fig, ax = plt.subplots()
    try:
        kmf = KaplanMeierFitter()
        for arm, arm_df in arm_frames:
            durations = arm_df['waiting_duration']
            events = arm_df['did_started_therapy'].astype(int)
            # The fitter is re-fit per arm and drawn immediately on the shared axes.
            kmf.fit(durations, event_observed=events, label=arm)
            kmf.plot_survival_function(ax=ax)

        ax.set_title(f"Kaplan-Meier survival (time to start therapy) — {out_dir.name}")
        ax.set_xlabel("waiting_duration")
        ax.set_ylabel("Survival probability (not yet started therapy)")
        fig.tight_layout()
        fig.savefig(fname)
    finally:
        # Release the figure even if fitting or saving fails.
        plt.close(fig)
    return True, fname



In [12]:
# Values of the 'group' column that are kept and compared in every summary and plot.
arms = ['CAU', 'Stepped Care']

In [13]:
# Keep only rows whose group is one of `arms`. The explicit .copy() makes the
# result an independent frame, so the column assignments performed by the
# helpers below never write into a slice of the loaded data.
df = df[df.group.isin(arms)].copy()
df = add_boolean_consort_columns(df, consort_groups)
df = coerce_numeric_columns(df, ['count', 'waiting_duration', 'did_started_therapy', 'suiteable_for_pp'])

# Maps each consort group name to the list of file paths written for it.
produced_files = {}

In [55]:

# NOTE(review): exploratory cell, executed out of order (In [55]). It leaves the
# "N" group's summary and per-arm frames in `summary_df` / `dfs` for interactive
# inspection; the export loop that follows reassigns `summary_df`.
for c in ["N"]:
    bool_col = c + "__bool"
    subset = df[df[bool_col]]
    if subset.empty:
        # skip empty consort groups
        continue
    # Use the shared `arms` list so this cell cannot drift from the rest of the notebook.
    summary_df, dfs = summarize_by_arm(subset, arms)


In [14]:

# Metrics drawn as one bar chart each: (summary column, axis/title label).
# Defined once, outside the loop, because it does not depend on the group.
metric_list = [
    ('count_sum', 'Count (sum or row-count)'),
    ('waiting_duration_mean', 'Mean waiting_duration'),
    ('did_started_therapy_mean', 'Mean did_started_therapy'),
    ('suiteable_for_pp_mean', 'Mean suiteable_for_pp')
]

# For every consort group with at least one flagged row, write a summary CSV,
# one bar chart per metric and a Kaplan-Meier plot into its own sub-folder,
# and record the produced paths in `produced_files`.
for c in consort_groups:
    bool_col = c + "__bool"
    subset = df[df[bool_col]]
    if subset.empty:
        # skip empty consort groups
        continue

    out_dir = output_base / c.replace(" ", "_")
    files_for_group = []

    # Summary CSV (same `arms` list as the survival plot below)
    summary_df, _ = summarize_by_arm(subset, arms)
    csv_path = save_summary_csv(summary_df, out_dir)
    files_for_group.append(str(csv_path))

    # Plots for metrics
    for metric_col, metric_label in metric_list:
        try:
            p = plot_metric_bar(summary_df, metric_col, metric_label, out_dir)
            files_for_group.append(str(p))
        except Exception as e:
            # deliberate best-effort: report and continue so one bad plot
            # does not abort the whole export
            print(f"Failed plotting {metric_col} for {c}: {e}")

    # Survival
    plotted, km_path = fit_and_plot_km(subset, arms, out_dir)
    if plotted:
        files_for_group.append(str(km_path))

    produced_files[c] = files_for_group
