# Fusion — Logistic Meta-Learner (Baseline)


Combine calibrated probabilities from VoiceNet and GaitNet using a simple **logistic regression** trained on your validation folds.
This notebook expects the CV prediction files produced by the other notebooks.


## Install/verify dependencies

In [None]:

import sys, subprocess, importlib

def ensure(pkg, import_name=None) -> None:
    """Import a package, installing it with pip first if it is missing.

    Args:
        pkg: Distribution name passed to ``pip install``.
        import_name: Module name to import when it differs from ``pkg``
            (e.g. ``sklearn`` for ``scikit-learn``). Defaults to ``pkg``.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
        ImportError: If the module still cannot be imported after install.
    """
    module_name = import_name or pkg
    try:
        importlib.import_module(module_name)
    except ImportError:
        print(f"Installing {pkg}...")
        # Install into the interpreter running this notebook, not whatever
        # "pip" happens to be first on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        # Import finders cache directory listings; without invalidating them
        # a package installed after interpreter start-up may stay invisible
        # to the retry below.
        importlib.invalidate_caches()
        importlib.import_module(module_name)
        print(f"{module_name} installed")
    else:
        print(f"{module_name} OK")

# (pip distribution name, importable module name) for each package this
# notebook needs. None means the module is imported under the pip name.
for dist_name, module_name in (
    ("pandas", None),
    ("numpy", None),
    ("scikit-learn", "sklearn"),
    ("matplotlib", None),
):
    ensure(dist_name, import_name=module_name)


## Load predictions

In [None]:

from pathlib import Path
import pandas as pd
import numpy as np

# Root folder holding the manifests; model outputs live in its "models" subfolder.
MANIFESTS = Path(r"C:\Users\muham\_Projects\PD New\manifests")
MODELS = MANIFESTS / "models"

# VoiceNet cross-validation predictions.
# Expected columns: path, subject_id, split2, y_true, p_voice
v = pd.read_csv(MODELS / "voicenet_cv_predictions.csv")

# If you also export gait CV preds, load them here. For now only the gait
# manifest is read; the plan is a subject-level gait aggregate (max prob per
# subject, to reflect worst-case FOG) once GaitNet exports its predictions.
# Loading is deliberately best-effort: the notebook keeps running without it.
try:
    g_meta = pd.read_csv(MANIFESTS / "gait_manifest_splits.csv")
except Exception as e:
    g_meta = None
    print("Could not load gait manifest splits:", e)

# Collapse the voice predictions to one row per subject: mean probability and
# the maximum label seen for that subject. Computing both in a single
# aggregation keeps the two columns aligned by construction instead of relying
# on two separate groupby calls returning rows in the same order.
sv = (
    v.groupby("subject_id", dropna=False)
    .agg(p_voice_mean=("p_voice", "mean"), y_true=("y_true", "max"))
    .reset_index()
)

# Placeholder hook for gait data: list the subjects in the gait "train" split.
if g_meta is not None and "subject_id" in g_meta.columns:
    if "split" in g_meta.columns:
        g_subjects = (
            g_meta[g_meta["split"] == "train"]
            .groupby("subject_id")
            .size()
            .reset_index(name="n")
        )
        # g_subjects has one row per subject and only the key column is
        # merged, so this left join leaves sv's rows and columns unchanged.
        # It marks where subject-level gait features will be joined later.
        sv = sv.merge(g_subjects[["subject_id"]], on="subject_id", how="left")
    else:
        # Without a "split" column the train subjects cannot be selected;
        # skip the placeholder step rather than fail with a KeyError.
        print("Gait manifest has no 'split' column; skipping gait subject lookup.")

# p_gait is not available yet at subject level; fill it once GaitNet exports
# subject-level predictions.
sv["p_gait"] = np.nan
print(sv.head())


## Train fusion model (handles missing p_gait)

In [None]:

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.pipeline import Pipeline

# Feature matrix: use whichever voice-probability columns exist in sv.
# As built by the loading cell, sv only carries "p_voice_mean".
feature_cols = [c for c in ("p_voice", "p_voice_mean") if c in sv.columns]
X = sv[feature_cols].copy()
# If gait probs available later, add: X["p_gait"] = sv["p_gait"]
y = sv["y_true"].astype(int).values

# Median imputation fills any missing feature values before the logistic fit.
pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("clf", LogisticRegression(max_iter=1000)),
])

# Stratified folds keep the class ratio in every split. A plain KFold ignores
# the labels, so with few subjects a training fold can end up with a single
# class, which makes LogisticRegression.fit raise.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
probs, trues = [], []
for tr, te in kf.split(X, y):
    pipe.fit(X.iloc[tr], y[tr])
    probs.append(pipe.predict_proba(X.iloc[te])[:, 1])  # P(class 1) on the held-out fold
    trues.append(y[te])
# Pool the out-of-fold predictions so the metrics are computed once over all subjects.
probs = np.concatenate(probs)
trues = np.concatenate(trues)

print("Fusion CV AUC:", roc_auc_score(trues, probs))
# Cast the thresholded predictions to int so they share the label type of y.
print("Fusion CV ACC:", accuracy_score(trues, (probs > 0.5).astype(int)))
