In [2]:
# ------------------------------------------------------------
# 0.  Imports  (all are in the default Anaconda / Colab stack)
# ------------------------------------------------------------
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
)

# --------------------------------------------------------------------
# 1.  Load & basic clean‑up
#      * adjust path if you keep the csv elsewhere
#      * any rows missing the key fields are dropped
# --------------------------------------------------------------------
PATH = "simulated_patients.csv"          # <- change if needed

df = (
    pd.read_csv(PATH, dtype=str)         # read as str first to trim/clean
      .apply(lambda col: col.str.strip())# remove leading/trailing blanks
      .replace({"": np.nan})             # empty → NaN
)

# --- cast columns to proper types (edit names if yours differ) --------
num_cols   = ["AIS_admission", "AIS_discharge"]  # example column names
cat_cols   = ["Gender"]                          # must contain only two levels
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")
df[cat_cols] = df[cat_cols].astype("category")

# keep complete cases
df = df.dropna(subset=num_cols + cat_cols).reset_index(drop=True)

# --------------------------------------------------------------------
# 2.  Derive the binary outcome:  1 = improved; 0 = no/negative change
# --------------------------------------------------------------------
df["Improved"] = (df["AIS_discharge"] > df["AIS_admission"]).astype(int)

# quick sanity check
print(df["Improved"].value_counts(dropna=False))

# --------------------------------------------------------------------
# 3.  Exploratory plots
# --------------------------------------------------------------------
sns.set_theme(style="whitegrid")

# (a) bar chart of improvement rate by gender
improv_rate = (
    df.groupby("Gender")["Improved"]
      .agg(["mean", "count"])
      .rename(columns={"mean": "ImprovementRate"})
      .reset_index()
)

ax = sns.barplot(
    data=improv_rate,
    x="Gender", y="ImprovementRate",
    palette="Set2"
)
ax.set_ylim(0, 1)
ax.set_ylabel("Proportion improved")
ax.set_title("Improvement rate by gender")
for p, n in zip(ax.patches, improv_rate["count"]):
    ax.text(p.get_x()+p.get_width()/2, p.get_height()+.02, f"n={n}", 
            ha="center", va="bottom", fontsize=9)
plt.show()

# (b) distribution of score change by gender
df["Δscore"] = df["AIS_discharge"] - df["AIS_admission"]
sns.boxplot(data=df, x="Gender", y="Δscore", palette="Set2")
plt.title("Change in score (discharge – admission)")
plt.show()

# --------------------------------------------------------------------
# 4.  Statistical tests
#      • two‑sample t (or Mann‑Whitney) on Δscore
#      • chi‑square / Fisher on 2×2 table
# --------------------------------------------------------------------
male_delta  = df.loc[df["Gender"] == df["Gender"].cat.categories[0], "Δscore"]
female_delta= df.loc[df["Gender"] == df["Gender"].cat.categories[1], "Δscore"]

tstat, p_t = stats.ttest_ind(male_delta, female_delta, equal_var=False)
print(f"T‑test on Δscore by gender: t={tstat:.3g},  p={p_t:.3g}")

cont = pd.crosstab(df["Gender"], df["Improved"])
chi2, p_chi, *_ = stats.chi2_contingency(cont)         # fallback if any cell ≥5
print(f"Chi‑square on improvement×gender: χ²={chi2:.3g}, p={p_chi:.3g}")

# --------------------------------------------------------------------
# 5.  Simple logistic regression  (Improved ~ Gender)
# --------------------------------------------------------------------
X = pd.get_dummies(df["Gender"], drop_first=True)   # 0 = baseline gender
y = df["Improved"].values

model = LogisticRegression(solver="lbfgs")
model.fit(X, y)

# coefficients
coef = float(model.coef_[0])
odds_ratio = np.exp(coef)
print(f"Log‑odds coefficient (Gender = {X.columns[0]} vs. baseline): {coef:.3f}")
print(f"Odds ratio: {odds_ratio:.2f}")

# --------------------------------------------------------------------
# 6.  ROC & best threshold (Youden’s J) – optional visual
# --------------------------------------------------------------------
prob = model.predict_proba(X)[:, 1]
fpr, tpr, thresholds = roc_curve(y, prob)
roc_auc = auc(fpr, tpr)

# find threshold closest to (0,1) i.e. maximise TPR–FPR
j_idx = np.argmax(tpr - fpr)
best_thresh = thresholds[j_idx]

plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.scatter(fpr[j_idx], tpr[j_idx], marker="o", color="red",
            label=f"Best thr ({best_thresh:.2f})")
plt.plot([0,1],[0,1],"k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC – gender‑only model")
plt.legend()
plt.show()

# confusion matrix at best threshold
y_pred = (prob >= best_thresh).astype(int)
cm = confusion_matrix(y, y_pred, labels=[1,0])
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Improved","Not Improved"]) \
    .plot(cmap="Blues"); plt.show()


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/n2/t_fg5h7n7rb0ysjgfhyq_m7c0000gn/T/ipykernel_37369/1514557038.py", line 22, in <module>
    pd.read_csv(PATH, dtype=str)         # read as str first to trim/clean
  File "/opt/anaconda3/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/opt/anaconda3/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 620, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/opt/anaconda3/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 1620, in __init__
    self._engine = self._make_engine(f, self.engine)
  File "/opt/anaconda3/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 1880, in _make_engine
    self.handles = get_handle(
  File "/