# GaitNet — FOG Detection (Baseline)


Window IMU CSVs into short segments, derive simple time+frequency features, and learn to detect **Freezing of Gait (FOG)**.
- Labels are derived from `events.csv` if available (FOG intervals per recording).
- Grouped by `subject_id` to avoid leakage.
- Uses `HistGradientBoostingClassifier` for tabular signal features.
> **Note:** If your event schema differs, adapt the `load_events()` mapper below.


## Install/verify dependencies

In [13]:

import sys, subprocess, importlib

def ensure(pkg, import_name=None):
    try:
        importlib.import_module(import_name or pkg)
        print(f"{import_name or pkg} OK")
    except ImportError:
        print(f"Installing {pkg}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        importlib.import_module(import_name or pkg)
        print(f"{import_name or pkg} installed")

ensure("pandas")
ensure("numpy")
ensure("scikit-learn", import_name="sklearn")
ensure("matplotlib")
ensure("scipy")


pandas OK
numpy OK
sklearn OK
matplotlib OK
scipy OK


## Config

In [14]:

from pathlib import Path

ROOT = Path(r"C:\Users\muham\_Projects\PD\data")
GAIT = ROOT / "gait"
MANIFESTS = Path(r"C:\Users\muham\_Projects\PD New\manifests")
OUT_DIR = MANIFESTS / "models"
OUT_DIR.mkdir(parents=True, exist_ok=True)

FS = 100  # Hz (change if your CSVs indicate a different rate)
WIN_S = 2.0
HOP_S = 1.0

print("GAIT:", GAIT.exists())
print("FS=", FS, "win_s=", WIN_S, "hop_s=", HOP_S)


GAIT: False
FS= 100 win_s= 2.0 hop_s= 1.0


In [15]:
# ===== Speed knobs =====
FAST_MODE = True               # quick run
DOWN_SAMPLE_TO = 50            # Hz target if a time column exists (try 25 for faster)
MAX_RECORDINGS = 100           # limit # of training recordings processed (raise later)
MAX_WINDOWS_PER_REC = 400      # cap windows per recording
USE_FREQ_FEATS = False         # time-domain stats only (faster). Set True later for accuracy.
PRINT_EVERY = 10               # status every N recordings

# Progress bar (auto installs if missing)
try:
    from tqdm.auto import tqdm
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "tqdm"])
    from tqdm.auto import tqdm


## Load manifests and events

In [16]:

import pandas as pd
import numpy as np

gman = pd.read_csv(MANIFESTS/"gait_manifest_splits.csv")
events_path = GAIT/"events.csv"
tasks_path = GAIT/"tasks.csv"
subjects_path = GAIT/"subjects.csv"

def load_events(path):
    # Try to normalize to: Id, start_time, end_time
    df = pd.read_csv(path)
    lower = {c.lower(): c for c in df.columns}
    idc = lower.get("id") or lower.get("recording_id") or lower.get("series_id") or lower.get("file_id")
    sc  = lower.get("start") or lower.get("start_time") or lower.get("onset")
    ec  = lower.get("end") or lower.get("end_time") or lower.get("offset")
    if idc is None or sc is None or ec is None:
        print("WARNING: Could not find standard event columns; events disabled.")
        return None
    out = df.rename(columns={idc:"Id", sc:"start_time", ec:"end_time"})[["Id","start_time","end_time"]]
    return out

events = load_events(events_path) if events_path.exists() else None
print("Events loaded:", events is not None, "rows=" if events is not None else "", len(events) if events is not None else 0)

# Keep only training recordings
gtrain = gman[gman["split"]=="train"].copy()
print("Train recordings:", len(gtrain))


Events loaded: False  0
Train recordings: 970


In [20]:
# --- Load & normalize events.csv (supports init/completion) ---
import pandas as pd
from pathlib import Path

events_path = Path(r"C:\Users\muham\_Projects\PD New\data\gait\events.csv")  # found path
ev = pd.read_csv(events_path)

# normalize column names
ev.columns = [c.strip().lower() for c in ev.columns]

# Map your schema -> unified schema
rename_map = {}
if "id" in ev.columns:
    rename_map["id"] = "Id"
if "init" in ev.columns:
    rename_map["init"] = "start_time"
if "completion" in ev.columns:
    rename_map["completion"] = "end_time"
ev = ev.rename(columns=rename_map)

# sanity check
required = {"Id","start_time","end_time"}
missing = required - set(ev.columns)
if missing:
    raise RuntimeError(f"events.csv missing required columns {missing}; have: {list(ev.columns)}")

# coerce times to numeric
ev["start_time"] = pd.to_numeric(ev["start_time"], errors="coerce")
ev["end_time"]   = pd.to_numeric(ev["end_time"], errors="coerce")
ev = ev.dropna(subset=["start_time","end_time"])
ev = ev[ev["end_time"] > ev["start_time"]]

# If looks like milliseconds, convert to seconds
if float(ev[["start_time","end_time"]].max().max()) > 1e4:
    ev[["start_time","end_time"]] = ev[["start_time","end_time"]] / 1000.0

# Make Id comparable to CSV stems (strip .csv just in case)
ev["Id"] = ev["Id"].astype(str).str.replace(".csv","", regex=False).str.strip()
events = ev.reset_index(drop=True)

print("[events] rows:", len(events), "| columns:", list(events.columns))
print(events.head(3))


[events] rows: 3544 | columns: ['Id', 'start_time', 'end_time', 'type', 'kinetic']
           Id  start_time  end_time  type  kinetic
0  003f117e14     8.61312   14.7731  Turn      1.0
1  009ee11563    11.38470   41.1847  Turn      1.0
2  009ee11563    54.66470   58.7847  Turn      1.0


## Feature extraction (windowed)

In [9]:
# --- Fast windowing + feature extraction ---
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.signal import welch
import time

NUMERIC_LIMIT = 128  # limit number of channels to keep feature size manageable

def window_signal(n_samples, fs, win_s, hop_s, cap=None):
    win = int(round(win_s*fs))
    hop = int(round(hop_s*fs))
    count = 0
    for start in range(0, max(1, n_samples - win + 1), hop):
        if cap is not None and count >= cap:
            break
        count += 1
        yield start, start+win

def downsample_if_needed(df, target_fs, time_col):
    """Round timestamps to 1/target_fs bins and average numeric cols."""
    if time_col is None or target_fs is None:
        return df, None
    tt = pd.to_numeric(df[time_col], errors="coerce").to_numpy()
    if np.isnan(tt).all() or tt.size < 2:
        return df, None
    dt = np.nanmedian(np.diff(tt))
    if not (dt > 0):
        return df, None
    fs_now = 1.0 / dt
    if fs_now <= target_fs + 1e-6:
        return df, fs_now
    bins = np.floor(tt * target_fs) / target_fs
    g = df.groupby(bins)
    df_ds = g.mean(numeric_only=True).reset_index(drop=True)
    return df_ds, target_fs

def extract_features(df):
    num_cols = [c for c in df.columns if np.issubdtype(df[c].dtype, np.number)]
    if not num_cols:
        return None, []
    num_cols = num_cols[:NUMERIC_LIMIT]
    Xs = [df[c].to_numpy(dtype=float, copy=False) for c in num_cols]
    Xmat = np.vstack(Xs)  # (C, T)
    return Xmat, num_cols

def summarize_time_only(x):
    # x: (C, W)
    mean = x.mean(axis=1)
    std  = x.std(axis=1)
    rms  = np.sqrt((x**2).mean(axis=1))
    return np.concatenate([mean, std, rms], axis=0)

def summarize_with_freq(x, fs):
    feats = [x.mean(axis=1), x.std(axis=1), np.sqrt((x**2).mean(axis=1))]
    f, Pxx = welch(x, fs=fs, axis=1, nperseg=min(128, x.shape[1]))
    def band_power(lo, hi):
        idx = (f >= lo) & (f < hi)
        return Pxx[:, idx].sum(axis=1)
    for lo, hi in [(0.1,0.5),(0.5,3),(3,8),(8,20)]:
        feats.append(band_power(lo, hi))
    return np.concatenate(feats, axis=0)

rows, labels, groups = [], [], []

# restrict to training split and optionally cap number of recordings
gtrain = gman[gman["split"]=="train"].copy()
rec_paths = gtrain["path"].astype(str).tolist()
if FAST_MODE and MAX_RECORDINGS is not None:
    rec_paths = rec_paths[:int(MAX_RECORDINGS)]

t0 = time.time()
for i, path_str in enumerate(tqdm(rec_paths, desc="Recordings")):
    p = Path(path_str)
    try:
        df = pd.read_csv(p)
    except Exception as e:
        print("Skip", p.name, "->", e, flush=True)
        continue

    # find a time column if present
    time_col = next((c for c in ["time","Time","t","timestamp","Timestamp"] if c in df.columns), None)

    # estimate sampling rate
    fs = FS
    if time_col is not None:
        tt = pd.to_numeric(df[time_col], errors="coerce").to_numpy()
        if tt.size > 1:
            dt = np.nanmedian(np.diff(tt))
            if dt and dt > 0:
                fs = float(round(1.0/dt))

    # optional downsample for speed
    if FAST_MODE and time_col is not None and DOWN_SAMPLE_TO is not None:
        df, fs2 = downsample_if_needed(df, DOWN_SAMPLE_TO, time_col)
        if fs2 is not None:
            fs = fs2

    Xmat, cols = extract_features(df)
    if Xmat is None:
        continue

    # windowing with cap
    cap = MAX_WINDOWS_PER_REC if FAST_MODE else None
    added = 0
    for s, e in window_signal(Xmat.shape[1], fs, WIN_S, HOP_S, cap=cap):
        xw = Xmat[:, s:e]
        if xw.shape[1] < int(WIN_S*fs):  # drop short last chunk
            continue

        if FAST_MODE and not USE_FREQ_FEATS:
            feats = summarize_time_only(xw)
        else:
            feats = summarize_with_freq(xw, fs)

        # label via events.csv if present (center of window inside any event)
        y = 0
        if 'events' in globals() and events is not None and time_col is not None:
            rid_arr = gtrain.loc[gtrain["path"]==path_str, "recording_id"].values
            if rid_arr.size:
                ev = events[events["Id"]==rid_arr[0]]
                if not ev.empty:
                    center_t = (s + (e - s)/2) / float(fs)
                    y = int(((ev["start_time"] <= center_t) & (center_t <= ev["end_time"])).any())

        rows.append(feats)
        labels.append(y)
        # group by subject to avoid leakage
        subj_arr = gtrain.loc[gtrain["path"]==path_str, "subject_id"].astype(str).values
        groups.append(subj_arr[0] if subj_arr.size else path_str)
        added += 1

    if (i+1) % max(1, PRINT_EVERY) == 0:
        elapsed = time.time() - t0
        print(f"[{i+1}/{len(rec_paths)}] {p.name}: {added} windows | fs={fs}Hz | {elapsed:.1f}s", flush=True)

X = np.asarray(rows, dtype=float)
y = np.asarray(labels, dtype=int)
groups = np.asarray(groups)
print("Feature matrix:", X.shape, "| positives:", int(y.sum()), "| negatives:", int((y==0).sum()))


Recordings:   9%|▉         | 9/100 [00:02<00:25,  3.60it/s]

[10/100] 15508c7f41.csv: 400 windows | fs=1.0Hz | 2.8s


Recordings:  19%|█▉        | 19/100 [00:05<00:23,  3.47it/s]

[20/100] 32d03020a9.csv: 400 windows | fs=1.0Hz | 5.6s


Recordings:  29%|██▉       | 29/100 [00:08<00:21,  3.23it/s]

[30/100] 4f613ccf88.csv: 400 windows | fs=1.0Hz | 8.6s


Recordings:  39%|███▉      | 39/100 [00:10<00:16,  3.76it/s]

[40/100] 6a20935af5.csv: 400 windows | fs=1.0Hz | 11.2s


Recordings:  49%|████▉     | 49/100 [00:13<00:14,  3.42it/s]

[50/100] 850748a138.csv: 400 windows | fs=1.0Hz | 14.1s


Recordings:  59%|█████▉    | 59/100 [00:16<00:10,  3.89it/s]

[60/100] a2f1a8ab76.csv: 400 windows | fs=1.0Hz | 17.0s


Recordings:  69%|██████▉   | 69/100 [00:19<00:08,  3.53it/s]

[70/100] be9d33541d.csv: 400 windows | fs=1.0Hz | 19.7s


Recordings:  79%|███████▉  | 79/100 [00:22<00:06,  3.43it/s]

[80/100] e1f92471b9.csv: 400 windows | fs=1.0Hz | 22.6s


Recordings:  89%|████████▉ | 89/100 [00:25<00:03,  3.57it/s]

[90/100] f9efef91fb.csv: 400 windows | fs=1.0Hz | 25.3s


Recordings:  99%|█████████▉| 99/100 [00:26<00:00,  5.52it/s]

[100/100] 0330ea6680.csv: 400 windows | fs=1.0Hz | 27.1s


Recordings: 100%|██████████| 100/100 [00:27<00:00,  3.69it/s]


Feature matrix: (40000, 21) | positives: 0 | negatives: 40000


## Train classifier

In [10]:

from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, accuracy_score
import numpy as np
import joblib
import matplotlib.pyplot as plt

gkf = GroupKFold(n_splits=5)
probs, trues = [], []

for fold, (tr, te) in enumerate(gkf.split(X, y, groups)):
    clf = HistGradientBoostingClassifier(max_depth=None, learning_rate=0.07, max_iter=300)
    clf.fit(X[tr], y[tr])
    p = clf.predict_proba(X[te])[:,1]
    probs.append(p); trues.append(y[te])
    print(f"Fold {fold+1} AUC={roc_auc_score(y[te], p):.3f} ACC={accuracy_score(y[te], p>0.5):.3f}")

probs = np.concatenate(probs); trues = np.concatenate(trues)
print("\\nCV AUC:", roc_auc_score(trues, probs))
print("CV ACC:", accuracy_score(trues, probs>0.5))

final_clf = HistGradientBoostingClassifier(max_depth=None, learning_rate=0.07, max_iter=400)
final_clf.fit(X, y)
joblib.dump({"model": final_clf, "meta":{"fs":FS,"win_s":WIN_S,"hop_s":HOP_S}}, OUT_DIR/"gaitnet_hgb.joblib")
print("Saved ->", OUT_DIR/"gaitnet_hgb.joblib")




Fold 1 AUC=nan ACC=1.000
Fold 2 AUC=nan ACC=1.000
Fold 3 AUC=nan ACC=1.000




Fold 4 AUC=nan ACC=1.000
Fold 5 AUC=nan ACC=1.000
\nCV AUC: nan
CV ACC: 1.0
Saved -> C:\Users\muham\_Projects\PD New\manifests\models\gaitnet_hgb.joblib


In [11]:
import numpy as np, pandas as pd
print("y counts:", pd.Series(y).value_counts(dropna=False).to_dict())
print("unique groups:", pd.Series(groups).nunique())
# Per-fold label counts (with your current CV splitter `cv`)
for k, (tr, va) in enumerate(cv.split(X, y, groups=groups), 1):
    yc = pd.Series(y[va]).value_counts()
    print(f"Fold {k} label counts:\n{yc}")


y counts: {0: 40000}
unique groups: 46


NameError: name 'cv' is not defined