## How to use (workflow)
1) Step 0: run installs once (or skip if already installed).
2) Step 1: pick audio from dropdown and click *Load file*.
3) Step 2: edit ground truth (CALM / NOT_CALM) and auto-align toggles.
4) Step 3: run analysis to see the plot.
5) Step 3b: run metrics on a 2s grid with auto-calibrated threshold.
6) Step 3c: run feature benchmark (rank features).
7) Step 4: transcript (optional).


In [None]:
!pip install librosa==0.10.1 numpy pandas matplotlib seaborn plotly scipy ipywidgets
!pip install sentencepiece


In [None]:
# Step 1 — Select audio and compute baseline (run after installs).
# Pick a file in the dropdown and click 'Load file'. Baseline uses first 15s.

# Universal loader for WAV/MP3 with forced 16 kHz mono
import numpy as _np

def load_audio_any(path, target_sr=16000):
    """Decode an audio file to mono float32 samples at ``target_sr`` (best effort).

    Args:
        path: File to decode; passed straight to ``librosa.load``.
        target_sr: Sample rate requested from ``librosa.load``.

    Returns:
        ``(samples, sample_rate)``. If anything goes wrong the error is
        printed and an empty float32 array is returned with ``target_sr``,
        so callers must check for a zero-length result.
    """
    try:
        samples, rate = librosa.load(path, sr=target_sr, mono=True)
        samples = samples.astype(_np.float32)
    except Exception as err:
        print(f"⚠️ Failed to load {path}: {err}")
        return _np.array([], dtype=_np.float32), target_sr
    else:
        return samples, rate


import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os, glob, warnings
from pathlib import Path
import ipywidgets as widgets
from IPython.display import display, clear_output

warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:.3f}'.format



def extract_features(audio, sr=16000, hop_length=512):
    """Summarise one audio window as a flat dict of scalar acoustic features.

    Args:
        audio: 1-D sequence of samples (or ``None``).
        sr: Sample rate handed to the librosa calls.
        hop_length: Hop size handed to the librosa calls.

    Returns:
        Dict with the keys ``rms``, ``rms_std``, ``pitch_jitter``,
        ``voiced_ratio``, ``pitch_mean``, ``pitch_std``, ``pitch_range``,
        ``pitch_slope``, ``spectral_centroid``, ``spectral_bandwidth``,
        ``spectral_rolloff``, ``spectral_flux`` and ``zcr``. Every value is
        0.0 when the input is ``None``, empty, or all (close to) zero.

    Notes:
        * ``voiced_ratio`` is the mean of pyin's per-frame voicing
          probabilities, not a count of frames flagged as voiced.
        * ``pitch_jitter`` is std/mean of the voiced f0 values, times 100.
        * ``pitch_slope`` is the linear-fit slope of f0 against the frame
          index (not against time).
        * A pyin failure degrades to "no pitch"; a failure anywhere in the
          spectral block zeroes all spectral features and ``zcr``.
    """
    feature_names = (
        'rms', 'rms_std', 'pitch_jitter', 'voiced_ratio', 'pitch_mean',
        'pitch_std', 'pitch_range', 'pitch_slope', 'spectral_centroid',
        'spectral_bandwidth', 'spectral_rolloff', 'spectral_flux', 'zcr',
    )
    silent = dict.fromkeys(feature_names, 0.0)

    if audio is None or len(audio) == 0:
        return silent

    # Scrub non-finite samples before any analysis touches them.
    samples = np.nan_to_num(
        np.asarray(audio, dtype=np.float32), nan=0.0, posinf=0.0, neginf=0.0
    )
    if np.allclose(samples, 0):
        return silent

    frame_rms = librosa.feature.rms(y=samples, hop_length=hop_length)[0]

    # --- pitch track -------------------------------------------------------
    try:
        f0, _voiced_flag, voicing_prob = librosa.pyin(
            samples, fmin=50, fmax=600, sr=sr, hop_length=hop_length
        )
    except Exception:
        # Treat the window as entirely unvoiced.
        f0 = np.full_like(frame_rms, np.nan)
        voicing_prob = np.zeros_like(frame_rms)

    finite_prob = voicing_prob[~np.isnan(voicing_prob)]
    if finite_prob.size > 0:
        voiced_ratio = float(np.mean(finite_prob))
    else:
        voiced_ratio = 0.0

    voiced_frames = np.where(~np.isnan(f0))[0]
    voiced_f0 = f0[voiced_frames]

    pitch_jitter = 0.0
    if voiced_f0.size > 1 and np.mean(voiced_f0) > 0:
        pitch_jitter = float(np.std(voiced_f0) / np.mean(voiced_f0) * 100)

    overall_mean = np.nanmean(f0)  # NaN when no frame is voiced
    pitch_mean = float(overall_mean) if np.isfinite(overall_mean) else 0.0

    if voiced_f0.size > 0:
        pitch_std = float(np.nanstd(voiced_f0))
        pitch_range = float(np.nanmax(voiced_f0) - np.nanmin(voiced_f0))
    else:
        pitch_std = 0.0
        pitch_range = 0.0

    slope = 0.0
    if voiced_frames.size > 1:
        try:
            slope = np.polyfit(voiced_frames, voiced_f0, 1)[0]
        except Exception:
            slope = 0.0
    pitch_slope = float(slope)

    # --- spectral shape ----------------------------------------------------
    try:
        magnitude = np.abs(librosa.stft(samples, n_fft=2048, hop_length=hop_length))
        centroid = librosa.feature.spectral_centroid(S=magnitude, sr=sr)[0]
        bandwidth = librosa.feature.spectral_bandwidth(S=magnitude, sr=sr)[0]
        rolloff = librosa.feature.spectral_rolloff(S=magnitude, sr=sr, roll_percent=0.85)[0]
        if magnitude.shape[1] > 1:
            # Euclidean distance between consecutive magnitude frames.
            frame_delta = np.diff(magnitude, axis=1)
            spectral_flux = float(np.mean(np.sqrt(np.sum(frame_delta ** 2, axis=0))))
        else:
            spectral_flux = 0.0
        crossings = librosa.feature.zero_crossing_rate(y=samples, hop_length=hop_length)[0]
    except Exception:
        centroid = bandwidth = rolloff = crossings = np.array([0.0])
        spectral_flux = 0.0

    return {
        'rms': float(np.mean(frame_rms)),
        'rms_std': float(np.std(frame_rms)),
        'pitch_jitter': pitch_jitter,
        'voiced_ratio': voiced_ratio,
        'pitch_mean': pitch_mean,
        'pitch_std': pitch_std,
        'pitch_range': pitch_range,
        'pitch_slope': pitch_slope,
        'spectral_centroid': float(np.mean(centroid)),
        'spectral_bandwidth': float(np.mean(bandwidth)),
        'spectral_rolloff': float(np.mean(rolloff)),
        'spectral_flux': float(spectral_flux),
        'zcr': float(np.mean(crossings)),
    }


def compute_baseline(audio, sr=16000, window_sec=15):
    """Average ``extract_features`` over consecutive ``window_sec`` chunks of ``audio``.

    The caller decides how much audio counts as "baseline" by slicing before
    calling; every chunk of the array passed in contributes. Chunks shorter
    than half a second are ignored, and if that leaves nothing the whole
    input is analysed as a single window.

    Args:
        audio: 1-D sample array, or ``None``.
        sr: Sample rate of ``audio``.
        window_sec: Chunk length in seconds.

    Returns:
        Dict of mean feature values. ``rms``, ``rms_std`` and ``pitch_jitter``
        are floored at small positive values. For ``audio is None`` a fixed
        five-key dict of floor values is returned instead.
    """
    if audio is None:
        return {'rms': 1e-6, 'rms_std': 1e-9, 'pitch_jitter': 1e-3, 'voiced_ratio': 0.0, 'pitch_mean': 0.0}

    chunk_len = max(int(window_sec * sr), 1)
    min_len = int(0.5 * sr)

    chunks = (audio[offset:offset + chunk_len] for offset in range(0, len(audio), chunk_len))
    per_window = [extract_features(chunk, sr) for chunk in chunks if len(chunk) >= min_len]
    if not per_window:
        per_window = [extract_features(audio, sr)]

    baseline = {
        name: float(np.nan_to_num(np.mean([feats.get(name, 0.0) for feats in per_window])))
        for name in per_window[0]
    }

    # Floors keep later ratio computations away from division by ~0.
    for name, floor in (('rms', 1e-6), ('rms_std', 1e-9), ('pitch_jitter', 1e-3)):
        baseline[name] = max(baseline.get(name, 0.0), floor)
    return baseline


def compute_agitation_score(features, baseline, prev_score=None, smoothing_alpha=0.5, spike_threshold=20.0, max_step=9.0):
    """Compute a smoothed agitation score in [0, 100] for one window.

    The raw score is a clipped weighted sum:
      * loudness: ``(rms / baseline_rms - 0.9)`` clipped to [0, 2], times 8
      * jitter: ``pitch_jitter`` clipped to [0, 40], times 1.3
      * volatility: ``rms_std / baseline_rms`` clipped to [0, 3], times 18
      * voicing: ``voiced_ratio`` clipped to [0, 1], times 5

    Every input feature (and the baseline RMS) is sanitised first: missing,
    non-numeric, NaN or infinite values count as 0 (baseline RMS as 1e-6),
    so one bad feature cannot turn the score into NaN.

    Args:
        features: Window features; reads ``rms``, ``rms_std``,
            ``pitch_jitter`` and ``voiced_ratio``.
        baseline: Baseline features; reads ``rms``.
        prev_score: Previous smoothed score. ``None`` or a non-finite value
            means "no history", and the raw score is returned unsmoothed.
        smoothing_alpha: Weight of the new raw score in the exponential blend.
        spike_threshold: If ``|raw - prev|`` exceeds this, the change is
            limited to ``max_step`` instead of being blended.
        max_step: Largest step allowed for such a spike.

    Returns:
        The new score, clipped to [0, 100] and rounded to one decimal.
    """
    def _finite(value, fallback=0.0):
        """Return ``value`` as a finite float, or ``fallback`` if it is not one."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return fallback
        return number if np.isfinite(number) else fallback

    baseline_rms = max(_finite(baseline.get('rms'), 1e-6), 1e-6)
    rms_ratio = max(_finite(features.get('rms', 0.0)) / baseline_rms, 0.0)
    rms_volatility = max(_finite(features.get('rms_std', 0.0)) / baseline_rms, 0.0)
    pitch_jitter = _finite(features.get('pitch_jitter', 0.0))
    voiced_ratio = _finite(features.get('voiced_ratio', 0.0))

    rms_term = np.clip((rms_ratio - 0.9), 0, 2.0) * 8.0
    jitter_term = np.clip(pitch_jitter, 0, 40) * 1.3
    volatility_term = np.clip(rms_volatility, 0, 3.0) * 18.0
    voiced_term = np.clip(voiced_ratio, 0, 1.0) * 5.0

    raw_score = float(np.clip(rms_term + jitter_term + volatility_term + voiced_term, 0, 100))

    if prev_score is None or not np.isfinite(prev_score):
        prev_score = raw_score

    delta_raw = raw_score - prev_score

    if abs(delta_raw) > spike_threshold:
        # Sharp jump: move toward the raw value by at most max_step.
        candidate = prev_score + np.sign(delta_raw) * max_step
    else:
        candidate = smoothing_alpha * raw_score + (1 - smoothing_alpha) * prev_score

    candidate = float(np.clip(candidate, 0, 100))
    return round(candidate, 1)


def get_mood_state(features, baseline, agitation_score, prev_state='CALM', prev_score=None, recent_scores=None, state_streak=1, pending_state=None, pending_count=0):
    """Pick CALM / TENSE / ESCALATING for one window, with debounced transitions.

    Candidate state for the current window:
      * ESCALATING if the score is >= 65, or it grew by more than 25 compared
        with the entry three back in ``recent_scores`` (or with the last
        reference score when fewer than three exist), or the previous state
        was ESCALATING and the score is still >= 60;
      * CALM if the score is < 30 and moved by < 10 since the reference;
      * TENSE if the score is >= 30; otherwise CALM.

    A candidate that differs from ``prev_state`` must be seen on consecutive
    calls before it takes over: 3 times for CALM<->TENSE, 4 times for any
    change involving ESCALATING. The caller carries the counters between
    calls by feeding back ``state_streak``, ``pending_state`` and
    ``pending_count`` from the previous result.

    Returns:
        Dict with ``state``, ``agitation_score``, ``rms_ratio``,
        ``pitch_jitter``, ``state_streak``, ``pending_state``,
        ``pending_count`` and ``borderline_state`` (True while a different
        candidate is pending but not yet confirmed).
    """
    history = recent_scores or []

    ref_rms = max(baseline.get('rms', 1e-6) or 1e-6, 1e-6)
    rms_ratio = float(np.clip((features.get('rms', 0.0) / ref_rms), 0, 10))
    pitch_jitter = float(np.nan_to_num(features.get('pitch_jitter', 0.0), nan=0.0, posinf=0.0, neginf=0.0))

    def _pack(state, streak, waiting_state, waiting_count, is_borderline):
        return {
            'state': state,
            'agitation_score': round(float(agitation_score), 1),
            'rms_ratio': round(rms_ratio, 2),
            'pitch_jitter': round(pitch_jitter, 1),
            'state_streak': streak,
            'pending_state': waiting_state,
            'pending_count': waiting_count,
            'borderline_state': is_borderline
        }

    # Reference score: explicit previous score first, else the last history entry.
    if prev_score is not None:
        reference = prev_score
    elif history:
        reference = history[-1]
    else:
        reference = None

    step = 0.0 if reference is None else abs(agitation_score - reference)
    if len(history) >= 3:
        growth = agitation_score - history[-3]
    elif reference is not None:
        growth = agitation_score - reference
    else:
        growth = 0.0

    holding_escalation = prev_state == 'ESCALATING' and agitation_score >= 60
    if holding_escalation or agitation_score >= 65 or growth > 25:
        candidate = 'ESCALATING'
    elif agitation_score < 30 and step < 10:
        candidate = 'CALM'
    elif agitation_score >= 30:
        candidate = 'TENSE'
    else:
        candidate = 'CALM'

    if candidate == prev_state:
        # Nothing to debounce: extend the streak and drop any pending switch.
        return _pack(prev_state, state_streak + 1, None, 0, False)

    confirmations_needed = 4 if 'ESCALATING' in (prev_state, candidate) else 3
    votes = pending_count + 1 if candidate == pending_state else 1

    if votes >= confirmations_needed:
        return _pack(candidate, 1, None, 0, False)
    return _pack(prev_state, state_streak + 1, candidate, votes, True)


print("📂 Scanning audio folder...")
audio_dir = str((Path.cwd() / 'audio_samples').resolve())
wav_files = glob.glob(os.path.join(audio_dir, '*.wav')) + glob.glob(os.path.join(audio_dir, '*.mp3'))

# Notebook-wide state filled in by the "Load file" callback and read by the
# later steps. `audio_ready` is the flag those steps check before analysing.
audio = None
baseline = None
duration = 0
sr = 16000
audio_ready = False
selected_path = None

if not wav_files:
    dropdown = None
    print("❌ No audio files found. Check path.")
else:
    file_names = [os.path.basename(f) for f in wav_files]
    dropdown = widgets.Dropdown(
        options=file_names,
        description='Pick audio:',
        style={'description_width': 'initial'}
    )
    load_btn = widgets.Button(description='Load file', button_style='primary')
    status = widgets.Output()

    def load_selected(_):
        """Button callback: load the chosen file and compute its baseline.

        On success the module-level ``audio``, ``sr``, ``duration``,
        ``baseline`` and ``selected_path`` are updated and ``audio_ready`` is
        set. On any failure (nothing selected, file missing, decode error)
        ``audio_ready`` is cleared so later steps refuse to run.
        """
        global audio, baseline, duration, sr, audio_ready, selected_path
        with status:
            status.clear_output()
            if dropdown.value is None:
                print("⚠️ Pick a file in the dropdown.")
                audio_ready = False
                return
            selected_file = os.path.join(audio_dir, dropdown.value)
            if not os.path.exists(selected_file):
                print("❌ File not found (maybe removed).")
                audio_ready = False
                return
            print(f"🔄 Loading: {dropdown.value}")
            audio, sr = load_audio_any(selected_file, target_sr=16000)
            if audio is None or len(audio) == 0:
                # load_audio_any signals a decode failure with an empty array
                # (it has already printed the reason). Do not report success
                # or leave state from a previously loaded file behind.
                baseline = None
                duration = 0
                selected_path = None
                audio_ready = False
                print("❌ Audio could not be decoded; nothing loaded.")
                return
            duration = len(audio) / sr if sr else 0
            # The baseline uses at most the first 15 seconds of the recording.
            baseline_audio = audio[:min(15 * sr, len(audio))]
            baseline = compute_baseline(baseline_audio, sr)
            readable = {k: round(v, 3) for k, v in baseline.items()}
            selected_path = selected_file
            audio_ready = True
            print(f"✅ Loaded: {duration:.1f} s, sr={sr}Hz")
            print("✅ Baseline:", readable)

    load_btn.on_click(load_selected)

    print(f"✅ Found {len(wav_files)} WAV files. Pick a file and click 'Load file'.")
    display(widgets.VBox([dropdown, load_btn, status]))


📂 Scanning audio folder...
✅ Found 14 WAV files. Pick a file and click 'Load file'.


VBox(children=(Dropdown(description='Pick audio:', options=('untitled #2.wav', 'good_interview.wav', 'untitled…

In [25]:

# Step 2 — Ground truth setup (2 classes)
from pathlib import Path

# Preset ground-truth segments per recording. The lookup key is the audio
# file's stem, lower-cased, with spaces replaced by underscores. Each segment
# is {"start": sec, "end": sec, "label": "CALM" | "NOT_CALM"}.
GT_PRESETS = {
    "test_emotions": [
        {"start": 0, "end": 9, "label": "CALM"},
        {"start": 9, "end": 39, "label": "NOT_CALM"},
    ],
}

# Optional auto-alignment of the ground truth to the model signal.
AUTO_ALIGN_GT = False
LAG_SEARCH_SEC = (-5, 5)  # lag search window, in seconds
LAG_STEPS = 41            # lags tried across the window (41 -> 0.25 s step for +-5 s)

# Derive the preset key from whatever Step 1 loaded (if anything).
if 'selected_path' in globals() and selected_path:
    audio_key = Path(selected_path).stem.lower().replace(" ", "_")
else:
    audio_key = None

if audio_key:
    manual_annotations = GT_PRESETS.get(audio_key, []).copy()
else:
    manual_annotations = []

print(
    f"🎯 GT for '{audio_key}': {manual_annotations}"
    if manual_annotations
    else "ℹ️ No preset GT found. Edit GT_PRESETS or set manual_annotations manually."
)


🎯 GT for 'test_emotions': [{'start': 0, 'end': 9, 'label': 'CALM'}, {'start': 9, 'end': 39, 'label': 'NOT_CALM'}]


In [26]:
# Step 2b — Optional text/ASR pipelines (leave as-is if no network)

# Guarded import: if either torch or transformers fails to import, `pipeline`
# is set to None and the semantic layer is disabled. Note that `torch` may
# then be undefined, which is why load_text_pipelines() checks for it in
# globals() before using it.
try:
    import torch
    from transformers import pipeline
except Exception:
    pipeline = None
    print("⚠️ transformers not available, semantic layer disabled")

# Lazily created by load_text_pipelines(); None means "not loaded (yet)".
asr_pipe = None
sent_pipe = None

def load_text_pipelines(asr_model="openai/whisper-small", cls_model="nlptown/bert-base-multilingual-uncased-sentiment"):
    """Create the ASR and text-classification pipelines on first use.

    Pipelines that are already loaded are kept; one that failed earlier is
    retried on every call. Load errors are printed, never raised.

    Args:
        asr_model: Model id for the "automatic-speech-recognition" pipeline.
        cls_model: Model id for the "text-classification" pipeline.

    Returns:
        True only if both module-level pipelines are available afterwards.
    """
    global asr_pipe, sent_pipe

    if pipeline is None:
        print("⚠️ transformers not installed")
        return False

    # Device 0 when CUDA is usable, otherwise -1.
    cuda_usable = "torch" in globals() and hasattr(torch, "cuda") and torch.cuda.is_available()
    device = 0 if cuda_usable else -1

    if asr_pipe is None:
        try:
            asr_pipe = pipeline("automatic-speech-recognition", model=asr_model, device=device)
        except Exception as err:
            print("⚠️ Failed to load ASR:", err)
            asr_pipe = None

    if sent_pipe is None:
        try:
            sent_pipe = pipeline("text-classification", model=cls_model, device=device)
        except Exception as err:
            print("⚠️ Failed to load text model:", err)
            sent_pipe = None

    return not (asr_pipe is None or sent_pipe is None)


def transcribe_and_classify(path, window_sec=5.0, hop_sec=5.0, sr=16000):
    """Transcribe an audio file window by window and score each transcript.

    Args:
        path: Audio file, loaded with ``librosa.load`` at ``sr``.
        window_sec: Window length in seconds.
        hop_sec: Step between window starts in seconds.
        sr: Sample rate used for loading and passed to the ASR pipeline.

    Returns:
        DataFrame with ``start_sec``, ``end_sec``, ``sentiment_score`` and
        ``text``, one row per window of at least 0.5 s. Empty when either
        pipeline is not loaded. A failing ASR call yields ``text == ""``; a
        failing classifier call yields ``sentiment_score == 0.0``.

    NOTE(review): ``sentiment_score`` is the ``"score"`` field of the
    classifier's first result. That is presumably the confidence of the
    predicted label, not a polarity value — confirm before treating it as
    "how negative" the text is.
    """
    if asr_pipe is None or sent_pipe is None:
        return pd.DataFrame()

    samples, _ = librosa.load(path, sr=sr)
    total = len(samples)
    win_len = int(window_sec * sr)
    step = int(hop_sec * sr)

    records = []
    for begin in range(0, total, step):
        stop = min(total, begin + win_len)
        segment = samples[begin:stop]
        if len(segment) < 0.5 * sr:
            continue  # trailing fragment too short to be worth transcribing

        try:
            transcript = asr_pipe({"array": segment, "sampling_rate": sr}).get("text", "")
        except Exception:
            transcript = ""

        try:
            top_result = sent_pipe(transcript)[0]
            if isinstance(top_result, dict):
                score = float(top_result.get("score", 0.0))
            else:
                score = 0.0
        except Exception:
            score = 0.0

        records.append({
            "start_sec": begin / sr,
            "end_sec": stop / sr,
            "sentiment_score": score,
            "text": transcript
        })

    return pd.DataFrame(records)


def fuse_audio_text(df_audio: pd.DataFrame, df_text: pd.DataFrame):
    """Late fusion of the acoustic score with the per-window text score.

    Each audio row is matched to the nearest text window (by mid-point time,
    within 3 seconds). The text score is rescaled to 0-100 if its maximum is
    <= 1, smoothed with a centred 3-point mean, floored at 20, and blended:
    ``fusion_score = 0.7 * acoustic + 0.3 * text``.

    The acoustic column is the first of ``vis_agitation``,
    ``agitation_score``, ``frustration_proxy`` that exists (0.0 otherwise).

    Args:
        df_audio: Per-window audio results. Time is taken from ``time_sec``,
            else the mean of ``start_sec``/``end_sec``, else the row position.
        df_text: Output of ``transcribe_and_classify`` (needs ``start_sec``,
            ``end_sec``, ``sentiment_score``). May be ``None`` or empty.

    Returns:
        A new DataFrame with ``fusion_score`` and ``sentiment_score`` added
        (plus ``mid_sec`` when text was merged). ``df_audio`` is returned
        unchanged if it is ``None`` or empty. Without text, ``fusion_score``
        equals the acoustic column and ``sentiment_score`` is 0.0.
    """
    if df_audio is None or df_audio.empty:
        return df_audio
    df_audio = df_audio.copy()
    if df_text is None or df_text.empty:
        df_audio["fusion_score"] = df_audio.get("vis_agitation", df_audio.get("agitation_score", 0.0))
        df_audio["sentiment_score"] = 0.0
        return df_audio

    df_text = df_text.copy()
    df_text["mid_sec"] = (df_text["start_sec"] + df_text["end_sec"]) / 2
    if "time_sec" in df_audio.columns:
        df_audio["mid_sec"] = df_audio["time_sec"]
    elif "start_sec" in df_audio.columns and "end_sec" in df_audio.columns:
        df_audio["mid_sec"] = (df_audio["start_sec"] + df_audio["end_sec"]) / 2
    else:
        df_audio["mid_sec"] = range(len(df_audio))

    # merge_asof requires both key columns to share one dtype; the positional
    # fallback above is integer while the text mid-points are float.
    df_audio["mid_sec"] = df_audio["mid_sec"].astype(float)
    df_text["mid_sec"] = df_text["mid_sec"].astype(float)

    # A sentiment column left over from an earlier fusion would be suffixed by
    # the merge (_x/_y) and break the lookup below, so drop it first.
    df_audio = df_audio.drop(columns=["sentiment_score"], errors="ignore")

    merged = pd.merge_asof(
        df_audio.sort_values("mid_sec"),
        df_text.sort_values("mid_sec")[["mid_sec", "sentiment_score"]],
        on="mid_sec", direction="nearest", tolerance=3
    )
    # Audio rows with no text window within the tolerance get a zero text score.
    merged["sentiment_score"] = merged["sentiment_score"].fillna(0.0)

    base = merged.get("vis_agitation", merged.get("agitation_score", merged.get("frustration_proxy", 0.0)))
    text_component = merged["sentiment_score"]
    if text_component.max() <= 1:
        text_component = text_component * 100

    # Smooth and clip to avoid short drops
    text_component = text_component.rolling(window=3, min_periods=1, center=True).mean()
    text_component = text_component.clip(lower=20)  # minimum text contribution
    merged["fusion_score"] = 0.7 * base + 0.3 * text_component

    return merged


In [27]:
# Step 3 — Analyze and visualize (run after GT setup)
if 'audio_ready' not in globals() or not audio_ready or audio is None or len(audio) == 0:
    print("⚠️ No audio to analyze. Load audio in Step 1 first.")
else:
    # Analysis grid: 3 s windows advanced in 1 s steps, at a fixed 16 kHz.
    window_sec = 3
    hop_sec = 1
    sr = 16000
    # Tunables are only given defaults when not already defined, so a value
    # set earlier in the notebook session is kept.
    if 'THRESHOLD_NOT_CALM' not in globals():
        THRESHOLD_NOT_CALM = 35
    if 'HYSTERESIS_DELTA' not in globals():
        HYSTERESIS_DELTA = 2.0
    # Visualization knobs (points; lower = more sensitive)
    if 'VIS_SMOOTH_WINDOW' not in globals():
        VIS_SMOOTH_WINDOW = 15
    if 'VIS_LONG_WINDOW' not in globals():
        VIS_LONG_WINDOW = 30

    # Hysteresis band around the threshold: enter NOT_CALM above it, leave below it.
    THRESHOLD_ENTER_NOT_CALM = THRESHOLD_NOT_CALM + HYSTERESIS_DELTA
    THRESHOLD_EXIT_NOT_CALM = max(0, THRESHOLD_NOT_CALM - HYSTERESIS_DELTA)
    window_samples = int(window_sec * sr)
    hop_samples = int(hop_sec * sr)

    # Window start offsets in samples. Audio shorter than one window still
    # yields a single start at 0.
    starts = list(range(0, max(len(audio) - window_samples, 0) + hop_samples, hop_samples))
    if not starts:
        starts = [0]

    print("🔄 Analyzing 3s windows...")
    progress = widgets.IntProgress(value=0, min=0, max=len(starts), description='⏳', bar_style='info')
    progress_label = widgets.HTML(value="⏳ Preparing...")
    display(widgets.VBox([progress, progress_label]))

    # Per-run state carried from window to window by the loop below.
    results = []
    prev_state = 'CALM'
    prev_agitation = None
    last_active_agitation = None   # score of the most recent non-pause window
    last_active_state = 'CALM'     # displayed state of the most recent non-pause window
    tension_trend = None           # slow exponential average of the score
    score_history = []
    trend_alpha = 0.03  # slow trend ~30-50s
    state_streak = 0
    pending_state = None
    pending_count = 0

    # A window counts as a pause when it is both weakly voiced and quieter
    # than pause_rms_scale times the baseline RMS.
    pause_voiced_thr = 0.12  # more sensitive to silence
    pause_rms_scale = 0.8

    # Main pass: one result row per analysis window.
    for idx, start_sample in enumerate(starts):
        end_sample = min(len(audio), start_sample + window_samples)
        window_audio = audio[start_sample:end_sample]

        # Drop a trailing fragment under 0.5 s, but only when the recording is
        # longer than one window (a short recording is analysed as it is).
        if len(window_audio) < int(0.5 * sr) and len(audio) > int(window_samples):
            continue

        features = extract_features(window_audio, sr)
        baseline_rms = max(baseline.get('rms', 1e-6) or 1e-6, 1e-6)
        voiced_ratio_val = float(np.nan_to_num(features.get('voiced_ratio', 0.0), nan=0.0, posinf=0.0, neginf=0.0))

        rms_val = float(np.nan_to_num(features.get('rms', 0.0), nan=0.0, posinf=0.0, neginf=0.0))
        rms_ratio_val = float(np.clip(rms_val / baseline_rms, 0, 10))
        pitch_jitter_val = float(np.nan_to_num(features.get('pitch_jitter', 0.0), nan=0.0, posinf=0.0, neginf=0.0))
        is_pause = (voiced_ratio_val < pause_voiced_thr) and (rms_val < baseline_rms * pause_rms_scale)

        # Raw feature columns stored with every row. Because out_row.update()
        # is applied last, these values replace the rounded 'rms_ratio' and
        # 'pitch_jitter' entries set in out_row below.
        feature_row = {
            "rms": float(features.get("rms", 0.0)),
            "rms_std": float(features.get("rms_std", 0.0)),
            "rms_ratio": rms_ratio_val,
            "voiced_ratio": voiced_ratio_val,
            "pitch_mean": float(features.get("pitch_mean", 0.0)),
            "pitch_std": float(features.get("pitch_std", 0.0)),
            "pitch_range": float(features.get("pitch_range", 0.0)),
            "pitch_slope": float(features.get("pitch_slope", 0.0)),
            "pitch_jitter": pitch_jitter_val,
            "spectral_centroid": float(features.get("spectral_centroid", 0.0)),
            "spectral_bandwidth": float(features.get("spectral_bandwidth", 0.0)),
            "spectral_rolloff": float(features.get("spectral_rolloff", 0.0)),
            "spectral_flux": float(features.get("spectral_flux", 0.0)),
            "zcr": float(features.get("zcr", 0.0)),
            "pause_flag": 1 if is_pause else 0,
        }

        if is_pause and last_active_agitation is not None:
            # Pause: repeat the last active score and state. The smoothing
            # state (prev_agitation), the trend and the streak/pending
            # counters are deliberately left untouched.
            agitation_score = last_active_agitation
            state_for_row = last_active_state
            if tension_trend is None:
                tension_trend = agitation_score
            dialogue_escalation = (tension_trend > 50) and (agitation_score > 65)
            out_row = {
                'state': state_for_row,
                'agitation_score': round(float(agitation_score), 1),
                'rms_ratio': round(rms_ratio_val, 2),
                'pitch_jitter': round(pitch_jitter_val, 1),
                'time_sec': round(((start_sample + end_sample) / 2) / sr, 2),
                'tension_trend': round(float(tension_trend), 1),
                'dialogue_escalation': bool(dialogue_escalation)
            }
            out_row.update(feature_row)
            results.append(out_row)
            score_history.append(agitation_score)
        else:
            # Active window (or a pause before any active window was seen):
            # score it, update the slow trend, then run the state machine.
            agitation_score = compute_agitation_score(
                features, baseline, prev_score=prev_agitation,
                smoothing_alpha=0.5, spike_threshold=20.0, max_step=9.0
            )

            if tension_trend is None:
                tension_trend = agitation_score
            else:
                tension_trend = trend_alpha * agitation_score + (1 - trend_alpha) * tension_trend

            mood_info = get_mood_state(
                features, baseline, agitation_score,
                prev_state=prev_state, prev_score=prev_agitation,
                recent_scores=score_history,
                state_streak=state_streak,
                pending_state=pending_state,
                pending_count=pending_count
            )
            # The three internal states collapse to two for display:
            # TENSE and ESCALATING are both shown as NOT_CALM.
            display_state = 'CALM' if mood_info['state'] == 'CALM' else 'NOT_CALM'

            dialogue_escalation = (tension_trend > 50) and (agitation_score > 65)

            out_row = {
                'state': display_state,
                'agitation_score': mood_info['agitation_score'],
                'rms_ratio': mood_info['rms_ratio'],
                'pitch_jitter': mood_info['pitch_jitter'],
                'time_sec': round(((start_sample + end_sample) / 2) / sr, 2),
                'tension_trend': round(float(tension_trend), 1),
                'dialogue_escalation': bool(dialogue_escalation)
            }
            out_row.update(feature_row)
            results.append(out_row)

            # Carry state forward; prev_state keeps the internal three-state
            # value, last_active_state keeps the displayed two-state value.
            score_history.append(agitation_score)
            prev_agitation = agitation_score
            prev_state = mood_info['state']
            state_streak = mood_info.get('state_streak', state_streak)
            pending_state = mood_info.get('pending_state', None)
            pending_count = mood_info.get('pending_count', 0)
            last_active_agitation = agitation_score
            last_active_state = display_state

        progress.value = idx + 1
        progress_label.value = (
            f"🔎 Window {idx+1}/{len(starts)} — t={round(((start_sample + end_sample) / 2) / sr, 2):.1f}s | {results[-1]['state']} | "
            f"score={results[-1]['agitation_score']:.1f} | trend={results[-1]['tension_trend']:.1f}"
        )

    # Safety net: if the loop produced no rows, analyse the whole recording as
    # a single window so the DataFrame built afterwards is never empty.
    if not results:
        fallback_features = extract_features(audio, sr)
        fallback_agitation = compute_agitation_score(fallback_features, baseline, prev_score=None)
        fallback_trend = fallback_agitation

        fallback_mood = get_mood_state(fallback_features, baseline, fallback_agitation)
        # Same columns as feature_row in the main loop. Unlike there,
        # 'rms_ratio' is not clipped and the values are not NaN-sanitised.
        fallback_feature_row = {
            "rms": float(fallback_features.get("rms", 0.0)),
            "rms_std": float(fallback_features.get("rms_std", 0.0)),
            "rms_ratio": float(fallback_features.get("rms", 0.0)) / max(baseline.get('rms', 1e-6) or 1e-6, 1e-6),
            "voiced_ratio": float(fallback_features.get("voiced_ratio", 0.0)),
            "pitch_mean": float(fallback_features.get("pitch_mean", 0.0)),
            "pitch_std": float(fallback_features.get("pitch_std", 0.0)),
            "pitch_range": float(fallback_features.get("pitch_range", 0.0)),
            "pitch_slope": float(fallback_features.get("pitch_slope", 0.0)),
            "pitch_jitter": float(fallback_features.get("pitch_jitter", 0.0)),
            "spectral_centroid": float(fallback_features.get("spectral_centroid", 0.0)),
            "spectral_bandwidth": float(fallback_features.get("spectral_bandwidth", 0.0)),
            "spectral_rolloff": float(fallback_features.get("spectral_rolloff", 0.0)),
            "spectral_flux": float(fallback_features.get("spectral_flux", 0.0)),
            "zcr": float(fallback_features.get("zcr", 0.0)),
            "pause_flag": 0,
        }
        out_row = {
            'state': 'CALM' if fallback_mood['state'] == 'CALM' else 'NOT_CALM',
            'agitation_score': fallback_mood['agitation_score'],
            'rms_ratio': fallback_mood['rms_ratio'],
            'pitch_jitter': fallback_mood['pitch_jitter'],
            # Timestamp at the middle of the recording.
            'time_sec': round(len(audio) / (2 * sr), 2),
            'tension_trend': round(float(fallback_trend), 1),
            'dialogue_escalation': bool(False)
        }
        out_row.update(fallback_feature_row)
        results.append(out_row)

    # One row per analysed window.
    df = pd.DataFrame(results)

    # Extra smoothing for visualization only (yellow line)
    # The windows are counted in rows (points); they match the "15s"/"30s"
    # names only with the default knob values and a 1 s hop.
    df['rolling_mean_15s'] = df['agitation_score'].rolling(window=VIS_SMOOTH_WINDOW, min_periods=1).mean()
    df['rolling_mean_30s'] = df['agitation_score'].rolling(window=VIS_LONG_WINDOW, min_periods=1).mean()

    # Session-level stats for agitation
    session_mean = float(df['agitation_score'].mean())
    session_std = float(df['agitation_score'].std(ddof=0) or 0.0)
    session_p90 = float(df['agitation_score'].quantile(0.9))
    # Stored in df.attrs so _prepare_agitation_vis can read them from the frame.
    df.attrs['session_mean_agitation'] = session_mean
    df.attrs['session_std_agitation'] = session_std
    df.attrs['session_p90_agitation'] = session_p90
    # A session is "calm" when both its 90th percentile and its mean are low.
    df.attrs['session_state'] = 'CALM_SESSION' if (session_p90 < 40 and session_mean < 35) else 'NORMAL_SESSION'

    print(f"✅ Analysis finished! {len(df)} time points")
    display(df.head())

    # Build the frame used for plotting
    def _prepare_agitation_vis(df_in: pd.DataFrame, window_points_30: int = VIS_LONG_WINDOW):
        """Return a copy of ``df_in`` with the columns used for plotting added.

        Adds ``ag_centered`` (score minus session mean), ``ag_z`` (centred
        score divided by session std, floored at 1e-6), ``ag_centered_30s``
        (rolling mean of the centred score over at least 3 rows) and
        ``vis_agitation``.

        ``vis_agitation`` depends on ``attrs['session_state']``: for a
        ``'CALM_SESSION'`` it is ``35 + ag_centered_30s`` clipped to
        [30, 40]; otherwise it is the existing ``rolling_mean_15s`` column.

        Session mean/std are read from ``attrs`` when present and computed
        from the scores otherwise. The input is returned untouched if it is
        empty or has no ``agitation_score`` column.
        """
        if df_in.empty or 'agitation_score' not in df_in.columns:
            return df_in

        out = df_in.copy()
        scores = out['agitation_score']
        center = out.attrs.get('session_mean_agitation', float(scores.mean()))
        spread = out.attrs.get('session_std_agitation', float(scores.std(ddof=0) or 0.0))

        out['ag_centered'] = scores - center
        out['ag_z'] = out['ag_centered'] / max(spread, 1e-6)

        span = max(3, int(window_points_30))
        out['ag_centered_30s'] = out['ag_centered'].rolling(window=span, min_periods=1).mean()

        if out.attrs.get('session_state', 'NORMAL_SESSION') == 'CALM_SESSION':
            # Calm sessions are drawn as a narrow band around a fixed level.
            calm_level, calm_band = 35.0, 5.0
            out['vis_agitation'] = (calm_level + out['ag_centered_30s']).clip(
                calm_level - calm_band, calm_level + calm_band
            )
        else:
            out['vis_agitation'] = out['rolling_mean_15s']
        return out

    df_vis = _prepare_agitation_vis(df)

    # Semantics: ASR + text (optional)
    # load_text_pipelines() is only called when a file path is known; it
    # returns False when either model is unavailable.
    df_text = pd.DataFrame()
    if 'selected_path' in globals() and selected_path and load_text_pipelines():
        df_text = transcribe_and_classify(selected_path, window_sec=5.0, hop_sec=5.0, sr=16000)
        if df_text.empty:
            print('⚠️ Semantic layer unavailable (ASR/classifier not loaded)')
    else:
        print('ℹ️ Semantic layer not active (no selected_path or model)')


    # With an empty df_text the fused score is just the acoustic score.
    df_fused = fuse_audio_text(df_vis, df_text)
    
    # Ground truth to plot (with optional auto-align)
    def _align_gt(manual_annotations, df_fused, score_col, search_range=(-5,5), steps=41):
        """Find the time shift that best lines the annotations up with the score.

        Each candidate lag (``steps`` values evenly spaced over
        ``search_range``, in seconds) shifts every segment, turns the
        segments into a 0/1 track sampled at ``df_fused["time_sec"]``
        (NOT_CALM = 1, CALM and unknown labels = 0) and correlates that track
        with the standardised ``score_col``. The lag with the highest
        correlation wins; the first one wins ties.

        Returns:
            ``(segments, lag, corr)``. With no annotations or an empty frame:
            ``(manual_annotations, 0.0, None)``. If no lag gives a usable
            correlation: the unshifted annotations, lag 0.0 and corr -1.0.
        """
        import numpy as _np

        if not manual_annotations or df_fused.empty:
            return manual_annotations, 0.0, None

        times = df_fused["time_sec"].to_numpy()
        signal = df_fused[score_col].to_numpy()
        signal = (signal - _np.nanmean(signal)) / (_np.nanstd(signal) + 1e-6)
        level_of = {"CALM": 0, "NOT_CALM": 1}

        def _gt_track(shift):
            # 0/1 ground-truth level at every score timestamp for this shift.
            track = _np.zeros_like(signal, dtype=float)
            for seg in manual_annotations:
                inside = (times >= seg["start"] + shift) & (times < seg["end"] + shift)
                track[inside] = level_of.get(seg["label"].upper(), 0)
            return track

        winner_shift = None
        winner_lag, winner_corr = 0.0, -1.0
        for shift in _np.linspace(search_range[0], search_range[1], steps):
            track = _gt_track(shift)
            if track.std() == 0:
                continue  # constant track: correlation is undefined
            corr = _np.corrcoef(signal, track)[0, 1]
            if _np.isnan(corr) or corr <= winner_corr:
                continue
            winner_shift, winner_lag, winner_corr = shift, float(shift), corr

        if winner_shift is None:
            shifted = manual_annotations
        else:
            shifted = [
                {"start": seg["start"] + winner_shift, "end": seg["end"] + winner_shift, "label": seg["label"]}
                for seg in manual_annotations
            ]
        return shifted, winner_lag, winner_corr
    
    # Work on a copy so Step 2's manual_annotations list is not replaced.
    annotations_for_plot = manual_annotations.copy() if 'manual_annotations' in globals() else []
    best_lag = 0.0
    best_corr = None
    # Signal used for alignment: fused score first, then the visual score,
    # then the raw agitation score.
    score_col = "fusion_score" if "fusion_score" in df_fused.columns else ("vis_agitation" if "vis_agitation" in df_fused.columns else "agitation_score")
    if annotations_for_plot and globals().get('AUTO_ALIGN_GT', False):
        annotations_for_plot, best_lag, best_corr = _align_gt(
            annotations_for_plot,
            df_fused,
            score_col=score_col,
            search_range=globals().get('LAG_SEARCH_SEC', (-5,5)),
            steps=globals().get('LAG_STEPS', 41),
        )
        # NOTE(review): _align_gt returns corr=None for an empty df_fused, which
        # this format spec cannot render; df_fused looks non-empty here — confirm.
        print(f"🔧 Auto-align GT: shift {best_lag:+.2f}s (corr={best_corr:.3f})")
    elif not annotations_for_plot:
        print("ℹ️ No GT annotations: edit Step 2.")
    else:
        print("ℹ️ GT alignment disabled.")
    
    
    
    # ===== MANUAL GROUND TRUTH =====

    # Lighter fills per label (not referenced by the shapes drawn below).
    label_colors = {
        "CALM": "rgba(0,170,90,0.30)",
        "NOT_CALM": "rgba(220,40,30,0.30)"
    }

    # ===== FIGURE: Reference-style (NO OVERLAP) =====
    fig = go.Figure()

    # =========================
    # 1. Ground Truth — separate Y2 axis
    # =========================
    GT_COLORS = {
        "CALM": "rgba(0, 170, 90, 0.70)",
        "NOT_CALM": "rgba(220, 40, 30, 0.70)",
    }

    # One filled rectangle plus a centred text label per ground-truth segment.
    for seg in annotations_for_plot:
        label = seg["label"].upper()

        fig.add_shape(
            type="rect",
            x0=seg["start"],
            x1=seg["end"],
            y0=0,
            y1=1,
            yref="y2",
            # A label outside the two known classes is drawn in neutral grey
            # instead of aborting the plot with a KeyError.
            fillcolor=GT_COLORS.get(label, "rgba(128, 128, 128, 0.40)"),
            line=dict(width=0),
            layer="below"
        )

        fig.add_annotation(
            x=(seg["start"] + seg["end"]) / 2,
            xref="x",
            y=0.99,
            yref="paper",
            text=label,
            showarrow=False,
            font=dict(size=12, color="#2c3e50"),
        )


    
def _score_to_label(score, threshold=THRESHOLD_NOT_CALM):
    return "NOT_CALM" if score >= threshold else "CALM"

def _apply_hysteresis(scores, thr_enter, thr_exit):
    if len(scores) == 0:
        return []
    state = "CALM" if scores[0] < thr_enter else "NOT_CALM"
    labels = []
    for s in scores:
        if state == "CALM" and s >= thr_enter:
            state = "NOT_CALM"
        elif state == "NOT_CALM" and s < thr_exit:
            state = "CALM"
        labels.append(state)
    return labels

# GT change markers: one dotted vertical line at every distinct segment edge.
if annotations_for_plot:
    gt_change_points = sorted(
        {seg[edge] for seg in annotations_for_plot for edge in ("start", "end")}
    )
    for x in gt_change_points:
        fig.add_vline(
            x=x,
            line_width=1,
            line_dash="dot",
            line_color="rgba(0,0,0,0.25)",
        )

# Model change markers (hysteresis)
# NOTE(review): every statement from here down to fig.show() is indented one
# level, so Python parses the traces, zones, layout and fig.show() as part of
# this `if` body. If score_col is not a column of df_fused, nothing is drawn.
# Confirm this nesting is intended.
if score_col in df_fused.columns:
    scores = df_fused[score_col].to_numpy()
    # Enter/exit thresholds are defined elsewhere in the notebook.
    pred_labels = _apply_hysteresis(scores, THRESHOLD_ENTER_NOT_CALM, THRESHOLD_EXIT_NOT_CALM)
    # Timestamps at which the predicted label differs from the previous one.
    change_times = [
        df_fused["time_sec"].iloc[i]
        for i in range(1, len(pred_labels))
        if pred_labels[i] != pred_labels[i - 1]
    ]
    # Dashed lines for model transitions (the GT markers above use dotted lines).
    for x in change_times:
        fig.add_vline(x=x, line_width=1, line_dash="dash", line_color="rgba(0,0,0,0.2)")

# 2. Model — main Y axis

    # =========================

    # Raw (background): thin grey line, excluded from hover
    fig.add_trace(
        go.Scatter(
            x=df_fused["time_sec"],
            y=df_fused["agitation_score"],
            name="Acoustic (raw)",
            mode="lines",
            line=dict(color="rgba(120,120,120,0.3)", width=1),
            hoverinfo="skip",
            yaxis="y"
        )
    )

    # Smoothed (main line), drawn only when the column exists
    if "vis_agitation" in df_fused:
        fig.add_trace(
            go.Scatter(
                x=df_fused["time_sec"],
                y=df_fused["vis_agitation"],
                name="Acoustic (smoothed)",
                mode="lines",
                line=dict(color="#E67E22", width=3),
                yaxis="y"
            )
        )

    # Fusion, drawn only when the column exists
    if "fusion_score" in df_fused:
        fig.add_trace(
            go.Scatter(
                x=df_fused["time_sec"],
                y=df_fused["fusion_score"],
                name="Fusion (audio + text)",
                mode="lines",
                line=dict(color="#C0392B", width=2.5, dash="dash"),
                yaxis="y"
            )
        )

    # =========================
    # 3. Emotion zones (model)
    # =========================
    # Green band from 0 up to the NOT_CALM threshold on the main Y axis.
    fig.add_hrect(
        y0=0, y1=THRESHOLD_NOT_CALM,
        fillcolor="rgba(0,170,90,0.18)",
        line_width=0,
        layer="below",
        yref="y"
    )

    # Red band from the threshold up to 100 (top of the score axis range).
    fig.add_hrect(
        y0=THRESHOLD_NOT_CALM, y1=100,
        fillcolor="rgba(220,40,30,0.18)",
        line_width=0,
        layer="below",
        yref="y"
    )

    # =========================
    # 4. Layout
    # =========================
    # Keep x-axis aligned to audio duration (avoid negative time from GT shift)
    # Fallback: last analysed timestamp; preferred: sample count / sample rate
    # when the global `audio` buffer is available.
    audio_duration_sec = float(df_fused['time_sec'].max()) if 'time_sec' in df_fused.columns else 0.0
    if 'audio' in globals() and audio is not None and len(audio) > 0:
        audio_duration_sec = len(audio) / sr
    audio_duration_sec = max(0.0, audio_duration_sec)

    fig.update_layout(
        title=f"Agitation vs Ground Truth — {dropdown.value}",
        xaxis=dict(title="Time (sec)", range=[0, audio_duration_sec]),

        # Main axis — model (lower 82% of the figure height)
        yaxis=dict(
            title="Agitation score (0–100)",
            range=[0, 100],
            domain=[0.0, 0.82]   # model below
        ),

        # Second axis — GT strip (hidden axis occupying the top of the figure)
        yaxis2=dict(
            range=[0, 1],
            domain=[0.86, 1.0],  # GT above
            visible=False
        ),

        hovermode="x unified",
        plot_bgcolor="white",
        paper_bgcolor="white",
        template="plotly_white",
        font=dict(size=13),

        # Horizontal legend placed above the plotting area
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.15,
            xanchor="left",
            x=0
        ),

        margin=dict(l=70, r=20, t=90, b=50),
        height=560
    )

    fig.show()

🔄 Analyzing 3s windows...


VBox(children=(IntProgress(value=0, bar_style='info', description='⏳', max=37), HTML(value='⏳ Preparing...')))

✅ Analysis finished! 37 time points


Unnamed: 0,state,agitation_score,rms_ratio,pitch_jitter,time_sec,tension_trend,dialogue_escalation,rms,rms_std,voiced_ratio,...,pitch_range,pitch_slope,spectral_centroid,spectral_bandwidth,spectral_rolloff,spectral_flux,zcr,pause_flag,rolling_mean_15s,rolling_mean_30s
0,CALM,33.2,1.046,14.261,1.5,33.2,False,0.006,0.004,0.085,...,70.037,-0.009,1291.501,1267.681,2368.767,3.63,0.102,0,33.2,33.2
1,CALM,36.6,1.085,18.188,2.5,33.3,False,0.007,0.005,0.108,...,82.715,-0.203,1229.8,1302.908,2278.923,3.615,0.086,0,34.9,34.9
2,NOT_CALM,34.4,1.094,14.263,3.5,33.3,False,0.007,0.004,0.158,...,71.909,0.184,1172.717,1261.648,2162.234,3.491,0.085,0,34.733,34.733
3,NOT_CALM,32.7,0.986,12.859,4.5,33.3,False,0.006,0.004,0.186,...,42.943,0.27,1188.192,1288.301,2183.926,3.12,0.078,0,34.225,34.225
4,NOT_CALM,32.4,1.017,13.485,5.5,33.3,False,0.006,0.004,0.258,...,56.373,-0.263,1061.046,1151.995,1865.691,2.987,0.073,0,33.86,33.86


ℹ️ GT alignment disabled.


In [19]:

# Step 3b — Metrics on a 2s grid (2 classes + calibration)
import numpy as np
import pandas as pd

# Width of one evaluation bin; time_sec is divided by this to get the bin index.
BIN_SIZE = 2.0
# Starting NOT_CALM threshold; the calibration code below may overwrite it.
THRESHOLD_NOT_CALM = 35
# Lower bound applied to the threshold after calibration.
MIN_NOT_CALM_THRESHOLD = 35
CALIBRATION_METHOD = "calm_percentile"  # "calm_percentile", "f1", "none"
# "calm_percentile": threshold = this percentile of CALM-bin scores + CALM_OFFSET.
CALM_PERCENTILE = 95
CALM_OFFSET = 0.0
# "f1": search radius around THRESHOLD_NOT_CALM, and number of candidates tried.
CALIBRATION_RANGE = 10
THRESHOLD_SEARCH_STEPS = 101
# Hysteresis band: enter at threshold + delta, exit at max(0, threshold - delta).
HYSTERESIS_DELTA = 2.0
USE_HYSTERESIS = True

# Numeric encoding of the two classes, used for correlation and error metrics.
LABEL_TO_CODE = {"CALM": 0, "NOT_CALM": 1}


def _gt_label_at(t, annotations):
    for seg in annotations:
        if t >= seg["start"] and t < seg["end"]:
            return seg["label"].upper()
    return None


def build_bins(df, feature, bin_size):
    """Average one feature over fixed-width time bins.

    Args:
        df: Frame with a ``time_sec`` column and the requested feature column.
        feature: Column to aggregate. The special name ``"pause_ratio"`` is
            computed as the per-bin mean of the ``pause_flag`` column.
        bin_size: Bin width, in the same unit as ``time_sec``.

    Returns:
        A frame with columns ``time_sec`` (mean time of the rows in the bin)
        and ``score`` (mean feature value), one row per non-empty bin, or
        ``None`` when ``df`` is missing/empty or the source column is absent.
    """
    if df is None or df.empty:
        return None

    source_col = "pause_flag" if feature == "pause_ratio" else feature
    if source_col not in df.columns:
        return None

    subset = df[["time_sec", source_col]].copy()
    # Truncating division result assigns each row to its bin.
    subset["bin_idx"] = (subset["time_sec"] / bin_size).astype(int)
    binned = subset.groupby("bin_idx").agg(
        time_sec=("time_sec", "mean"),
        score=(source_col, "mean"),
    )
    return binned.reset_index(drop=True)


def _predict_label(score, threshold):
    return "NOT_CALM" if score >= threshold else "CALM"


def _apply_hysteresis(scores, thr_enter, thr_exit, start_label="CALM"):
    if len(scores) == 0:
        return []
    state = start_label
    labels = []
    for s in scores:
        if state == "CALM" and s >= thr_enter:
            state = "NOT_CALM"
        elif state == "NOT_CALM" and s < thr_exit:
            state = "CALM"
        labels.append(state)
    return labels


def calibrate_threshold(scores, gt_labels, steps=101, center=None, radius=None):
    """Grid-search the threshold that maximises F1 for the NOT_CALM class.

    Args:
        scores: Per-bin scores (converted to a float array).
        gt_labels: Per-bin ground-truth labels; "NOT_CALM" is the positive class.
        steps: Number of evenly spaced candidate thresholds.
        center: Optional centre of the search window.
        radius: Optional half-width of the search window. The window is only
            applied when both ``center`` and ``radius`` are given, and is
            intersected with the observed score range.

    Returns:
        ``(threshold, stats)`` where ``stats`` has the keys ``f1_not_calm``,
        ``precision`` and ``recall``. The threshold is ``None`` for empty
        input; when the search range is degenerate (non-finite or narrower
        than 1e-9) its lower bound is returned with all-zero stats.
    """
    score_arr = np.asarray(scores, dtype=float)
    label_arr = np.asarray(gt_labels)

    def _zero_stats():
        return {"f1_not_calm": 0.0, "precision": 0.0, "recall": 0.0}

    if score_arr.size == 0:
        return None, _zero_stats()

    lo = float(np.nanmin(score_arr))
    hi = float(np.nanmax(score_arr))
    if not (np.isfinite(lo) and np.isfinite(hi)) or hi - lo < 1e-9:
        return lo, _zero_stats()

    if center is not None and radius is not None:
        lo = max(lo, center - radius)
        hi = min(hi, center + radius)
        if hi - lo < 1e-9:
            return lo, _zero_stats()

    candidates = np.linspace(lo, hi, steps)
    is_not_calm = label_arr == "NOT_CALM"

    chosen_thr = candidates[0]
    # (f1, accuracy): a higher F1 wins, ties are broken by strictly higher accuracy.
    chosen_rank = (-1.0, -1.0)
    chosen_precision = 0.0
    chosen_recall = 0.0

    for cand in candidates:
        flagged = score_arr >= cand
        tp = int((is_not_calm & flagged).sum())
        fp = int((~is_not_calm & flagged).sum())
        fn = int((is_not_calm & ~flagged).sum())

        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        denom = precision + recall
        f1 = 2 * precision * recall / denom if denom else 0.0
        acc = float((flagged == is_not_calm).mean())

        if (f1, acc) > chosen_rank:
            chosen_rank = (f1, acc)
            chosen_precision = precision
            chosen_recall = recall
            chosen_thr = float(cand)

    return chosen_thr, {
        "f1_not_calm": chosen_rank[0],
        "precision": chosen_precision,
        "recall": chosen_recall,
    }


def compute_metrics_from_bins(df_bins, threshold, use_hysteresis=True, hysteresis_delta=2.0):
    """Score binned predictions against ground-truth labels.

    Args:
        df_bins: Frame with ``score`` and ``gt_label`` columns (not modified;
            the function works on a copy).
        threshold: NOT_CALM decision threshold on ``score``.
        use_hysteresis: When true, labels come from ``_apply_hysteresis`` with
            enter/exit levels ``threshold ± hysteresis_delta`` (exit floored at
            0) and the state seeded from the first ground-truth label.
            Otherwise each bin is thresholded independently.
        hysteresis_delta: Half-width of the hysteresis band.

    Returns:
        ``(metrics, cm)``: a dict of accuracy, Pearson/Spearman correlation,
        MAE/RMSE on min-max-scaled scores, NOT_CALM precision/recall/F1/IoU and
        the bin count; and a 2x2 confusion matrix (rows GT, columns Pred)
        ordered CALM, NOT_CALM.
    """
    frame = df_bins.copy()
    score_values = frame["score"].to_numpy()
    seed_label = frame["gt_label"].iloc[0] if len(frame) else "CALM"

    if not use_hysteresis:
        predicted = ["NOT_CALM" if s >= threshold else "CALM" for s in score_values]
    else:
        enter_level = threshold + hysteresis_delta
        exit_level = max(0, threshold - hysteresis_delta)
        predicted = _apply_hysteresis(score_values, enter_level, exit_level, start_label=seed_label)

    frame["gt_code"] = frame["gt_label"].map(LABEL_TO_CODE)
    frame["pred_label"] = predicted
    frame["pred_code"] = frame["pred_label"].map(LABEL_TO_CODE)

    gt_col = frame["gt_label"]
    pred_col = frame["pred_label"]
    score_col_values = frame["score"]
    gt_code = frame["gt_code"]

    accuracy = float((pred_col == gt_col).mean())

    # Detection metrics with NOT_CALM as the positive class.
    actual_pos = gt_col == "NOT_CALM"
    flagged = pred_col == "NOT_CALM"
    tp = int((actual_pos & flagged).sum())
    fp = int((~actual_pos & flagged).sum())
    fn = int((actual_pos & ~flagged).sum())
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum else 0.0
    union = tp + fp + fn
    iou = tp / union if union else 0.0

    # Correlation of the raw score with the 0/1 GT code; undefined unless the
    # GT code takes more than one distinct value.
    pearson = float(score_col_values.corr(gt_code)) if gt_code.nunique() > 1 else np.nan
    try:
        if gt_code.nunique() > 1:
            spearman = float(score_col_values.corr(gt_code, method="spearman"))
        else:
            spearman = np.nan
    except Exception:
        spearman = np.nan

    # Error of the min-max-scaled score (0–1) against the 0/1 GT code.
    lowest = float(score_col_values.min())
    highest = float(score_col_values.max())
    spread = highest - lowest
    if spread > 1e-9:
        scaled = (score_col_values - lowest) / spread
    else:
        scaled = score_col_values * 0.0
    residual = scaled - gt_code
    mae = float(np.mean(np.abs(residual)))
    rmse = float(np.sqrt(np.mean(residual ** 2)))

    metrics = {
        "accuracy": accuracy,
        "pearson": pearson,
        "spearman": spearman,
        "mae_0_1": mae,
        "rmse_0_1": rmse,
        "precision_not_calm": precision,
        "recall_not_calm": recall,
        "f1_not_calm": f1,
        "iou_not_calm": iou,
        "n_bins": int(len(frame)),
    }

    class_order = ["CALM", "NOT_CALM"]
    counts = pd.crosstab(
        gt_col,
        pred_col,
        rownames=["GT"],
        colnames=["Pred"],
        dropna=False,
    )
    cm = counts.reindex(index=class_order, columns=class_order, fill_value=0)

    return metrics, cm


# Step 3b driver: bin the selected score, attach GT labels, calibrate the
# NOT_CALM threshold, then print metrics and show the confusion matrix.
# Requires df_fused and annotations_for_plot from earlier cells.
if 'df_fused' not in globals() or df_fused is None or df_fused.empty:
    print("⚠️ No data to evaluate. Run Step 3 first.")
elif 'annotations_for_plot' not in globals() or not annotations_for_plot:
    print("⚠️ No GT annotations. Edit Step 2 first.")
else:
    # Prefer the fused score, then the smoothed acoustic score, then the raw one.
    score_col = "fusion_score" if "fusion_score" in df_fused.columns else ("vis_agitation" if "vis_agitation" in df_fused.columns else "agitation_score")
    df_bins = build_bins(df_fused, score_col, BIN_SIZE)
    if df_bins is None or df_bins.empty:
        print("⚠️ No bins for evaluation.")
    else:
        # Label each bin by its mean time; bins outside every GT segment are dropped.
        df_bins["gt_label"] = df_bins["time_sec"].apply(lambda t: _gt_label_at(t, annotations_for_plot))
        df_bins = df_bins[df_bins["gt_label"].notna()].reset_index(drop=True)
        if df_bins.empty:
            print("⚠️ No GT coverage on this grid.")
        else:
            scores = df_bins["score"].to_numpy()
            labels = df_bins["gt_label"].to_numpy()

            # Any other CALIBRATION_METHOD value (e.g. "none") matches neither
            # branch and keeps the current THRESHOLD_NOT_CALM.
            if CALIBRATION_METHOD == "calm_percentile":
                calm_scores = df_bins[df_bins["gt_label"] == "CALM"]["score"].to_numpy()
                if calm_scores.size > 0:
                    # Percentile of CALM-bin scores plus offset, clipped to the
                    # observed score range.
                    thr = float(np.percentile(calm_scores, CALM_PERCENTILE) + CALM_OFFSET)
                    thr = float(np.clip(thr, np.nanmin(scores), np.nanmax(scores)))
                    THRESHOLD_NOT_CALM = thr
                else:
                    print("ℹ️ No CALM bins for percentile calibration; using default threshold.")
            elif CALIBRATION_METHOD == "f1":
                # F1 search limited to THRESHOLD_NOT_CALM ± CALIBRATION_RANGE.
                best_thr, _ = calibrate_threshold(
                    scores,
                    labels,
                    steps=THRESHOLD_SEARCH_STEPS,
                    center=THRESHOLD_NOT_CALM,
                    radius=CALIBRATION_RANGE,
                )
                if best_thr is not None:
                    THRESHOLD_NOT_CALM = best_thr

            # Apply the floor, then derive the hysteresis enter/exit levels.
            # These module-level names are also read by the Step 3 plotting code.
            THRESHOLD_NOT_CALM = max(THRESHOLD_NOT_CALM, MIN_NOT_CALM_THRESHOLD)
            THRESHOLD_ENTER_NOT_CALM = THRESHOLD_NOT_CALM + HYSTERESIS_DELTA
            THRESHOLD_EXIT_NOT_CALM = max(0, THRESHOLD_NOT_CALM - HYSTERESIS_DELTA)

            metrics_summary, cmatrix = compute_metrics_from_bins(
                df_bins,
                threshold=THRESHOLD_NOT_CALM,
                use_hysteresis=USE_HYSTERESIS,
                hysteresis_delta=HYSTERESIS_DELTA,
            )
            # Alignment results from Step 3, if it ran; best_corr may be None
            # and is printed unformatted.
            best_lag_val = globals().get("best_lag", 0.0)
            best_corr_val = globals().get("best_corr", None)

            print(f"✅ Metrics on {BIN_SIZE:.1f}s grid using '{score_col}'")
            print(f"GT shift: {best_lag_val:+.2f}s | align corr: {best_corr_val}")
            print(f"Calibration method: {CALIBRATION_METHOD}")
            print(f"Threshold NOT_CALM: {THRESHOLD_NOT_CALM:.2f} (min {MIN_NOT_CALM_THRESHOLD:.2f}, enter {THRESHOLD_ENTER_NOT_CALM:.2f}, exit {THRESHOLD_EXIT_NOT_CALM:.2f})")
            # Floats get three decimals; everything else (e.g. n_bins) is printed as-is.
            for k, v in metrics_summary.items():
                if isinstance(v, float):
                    print(f"- {k}: {v:.3f}")
                else:
                    print(f"- {k}: {v}")
            display(cmatrix)


✅ Metrics on 2.0s grid using 'fusion_score'
GT shift: +4.75s | align corr: 0.2516122151828242
Calibration method: calm_percentile
Threshold NOT_CALM: 38.43 (min 35.00, enter 40.43, exit 36.43)
- accuracy: 0.647
- pearson: 0.277
- spearman: 0.184
- mae_0_1: 0.476
- rmse_0_1: 0.580
- precision_not_calm: 1.000
- recall_not_calm: 0.500
- f1_not_calm: 0.667
- iou_not_calm: 0.500
- n_bins: 17


Pred,CALM,NOT_CALM
GT,Unnamed: 1_level_1,Unnamed: 2_level_1
CALM,5,0
NOT_CALM,6,6


In [20]:

# Step 3c — Feature benchmark (2s grid, 2 classes)
import numpy as np
import pandas as pd

# Candidate features to benchmark. Names that build_bins cannot resolve to a
# df_fused column are skipped by the loop below; "pause_ratio" is derived by
# build_bins from the pause_flag column.
USE_FEATURES = [
    "agitation_score",
    "vis_agitation",
    "fusion_score",
    "rms_ratio",
    "rms_std",
    "voiced_ratio",
    "pitch_mean",
    "pitch_std",
    "pitch_range",
    "pitch_slope",
    "pitch_jitter",
    "spectral_centroid",
    "spectral_bandwidth",
    "spectral_rolloff",
    "spectral_flux",
    "zcr",
    "pause_ratio",
    "sentiment_score",
]

# Column of the results table used for descending sort.
SORT_BY = "f1_not_calm"  # or "pearson"

# Reuse the Step 3b hysteresis settings when that cell has run, else defaults.
USE_HYSTERESIS = globals().get("USE_HYSTERESIS", True)
HYSTERESIS_DELTA = globals().get("HYSTERESIS_DELTA", 2.0)

# Step 3c driver: for every candidate feature, bin it, label the bins from GT,
# min-max scale to 0–100, pick the F1-optimal threshold and collect metrics.
# The helpers (and BIN_SIZE / THRESHOLD_SEARCH_STEPS) come from the Step 3b cell.
if "build_bins" not in globals() or "compute_metrics_from_bins" not in globals() or "_gt_label_at" not in globals() or "calibrate_threshold" not in globals():
    print("⚠️ Run Step 3b first to define helper functions.")
    # Stops execution of this cell.
    raise SystemExit

if 'df_fused' not in globals() or df_fused is None or df_fused.empty:
    print("⚠️ No data to benchmark. Run Step 3 first.")
elif 'annotations_for_plot' not in globals() or not annotations_for_plot:
    print("⚠️ No GT annotations. Edit Step 2 first.")
else:
    rows = []
    for feat in USE_FEATURES:
        df_bins = build_bins(df_fused, feat, BIN_SIZE)
        if df_bins is None or df_bins.empty:
            continue

        # Optional per-feature auto-align (uses original manual_annotations)
        # NOTE(review): this condition does not consult AUTO_ALIGN_GT, so the
        # alignment runs for every feature that is a df_fused column even when
        # Step 3 reported alignment as disabled — confirm that is intended.
        use_annotations = annotations_for_plot
        best_lag_feat = None
        best_corr_feat = None
        if '_align_gt' in globals() and 'manual_annotations' in globals() and manual_annotations and feat in df_fused.columns:
            try:
                shifted, best_lag_feat, best_corr_feat = _align_gt(
                    manual_annotations,
                    df_fused,
                    score_col=feat,
                    search_range=globals().get('LAG_SEARCH_SEC', (-5, 5)),
                    steps=globals().get('LAG_STEPS', 41),
                )
                use_annotations = shifted
            except Exception:
                # NOTE(review): any alignment failure is swallowed silently and
                # the Step 3 annotations are used instead.
                use_annotations = annotations_for_plot

        # Label bins by mean time; drop bins not covered by any GT segment.
        df_bins["gt_label"] = df_bins["time_sec"].apply(lambda t: _gt_label_at(t, use_annotations))
        df_bins = df_bins[df_bins["gt_label"].notna()].reset_index(drop=True)
        if df_bins.empty:
            continue

        # Normalize feature to 0–100 for fair thresholding
        # (features that are constant or non-finite over the bins are skipped)
        vals = df_bins["score"].to_numpy()
        vmin = float(np.nanmin(vals))
        vmax = float(np.nanmax(vals))
        if not np.isfinite(vmin) or not np.isfinite(vmax) or vmax - vmin < 1e-6:
            continue
        df_bins["score"] = (vals - vmin) / (vmax - vmin) * 100.0

        # No center/radius: the search spans the full normalized score range.
        # NOTE(review): the threshold is tuned on the same bins that are scored
        # below, so the reported metrics are optimistic.
        best_thr, _ = calibrate_threshold(
                df_bins["score"].to_numpy(),
                df_bins["gt_label"].to_numpy(),
                steps=THRESHOLD_SEARCH_STEPS,
            )
        if best_thr is None:
            continue

        metrics, _ = compute_metrics_from_bins(
                df_bins,
                threshold=best_thr,
                use_hysteresis=USE_HYSTERESIS,
                hysteresis_delta=HYSTERESIS_DELTA,
            )
        # Attach per-feature bookkeeping to the metrics row.
        metrics["feature"] = feat
        metrics["best_threshold"] = best_thr
        metrics["best_lag"] = best_lag_feat
        metrics["align_corr"] = best_corr_feat
        rows.append(metrics)

    if not rows:
        print("⚠️ No features to benchmark.")
    else:
        # One row per feature, best SORT_BY value first.
        df_rank = pd.DataFrame(rows)
        if SORT_BY in df_rank.columns:
            df_rank = df_rank.sort_values(SORT_BY, ascending=False)
        display(df_rank)


Unnamed: 0,accuracy,pearson,spearman,mae_0_1,rmse_0_1,precision_not_calm,recall_not_calm,f1_not_calm,iou_not_calm,n_bins,feature,best_threshold,best_lag,align_corr
14,1.0,0.861,0.736,0.228,0.289,1.0,1.0,1.0,1.0,17,spectral_flux,18.0,3.75,0.805
12,1.0,0.764,0.707,0.28,0.323,1.0,1.0,1.0,1.0,19,spectral_bandwidth,39.0,-0.25,0.781
6,1.0,0.845,0.777,0.225,0.287,1.0,1.0,1.0,1.0,18,pitch_mean,42.0,2.75,0.847
7,1.0,0.651,0.707,0.419,0.485,1.0,1.0,1.0,1.0,19,pitch_std,18.0,-1.25,0.616
8,0.947,0.673,0.684,0.365,0.428,0.938,1.0,0.968,0.938,19,pitch_range,23.0,-1.25,0.633
11,0.947,0.733,0.707,0.317,0.368,1.0,0.933,0.966,0.933,19,spectral_centroid,38.0,-0.25,0.729
3,0.941,0.733,0.679,0.272,0.328,0.929,1.0,0.963,0.929,17,rms_ratio,45.0,3.75,0.681
1,0.941,0.752,0.791,0.262,0.378,1.0,0.917,0.957,0.917,17,vis_agitation,16.0,4.75,0.756
9,0.882,0.025,0.186,0.451,0.506,0.882,1.0,0.938,0.882,17,pitch_slope,0.0,-5.0,0.075
13,0.895,0.74,0.707,0.291,0.338,0.882,1.0,0.938,0.882,19,spectral_rolloff,34.0,-0.25,0.748


In [18]:
# Step 4 — Full transcript (original language)
# Requires selected_path from Step 1. load_text_pipelines and
# transcribe_and_classify are defined elsewhere in the notebook.
if 'selected_path' not in globals() or not selected_path:
    print("⚠️ Load audio in Step 1 first.")
elif not load_text_pipelines():
    print("⚠️ Semantic models are not loaded.")
else:
    # window_sec == hop_sec, i.e. consecutive 5 s windows with no overlap.
    df_text = transcribe_and_classify(selected_path, window_sec=5.0, hop_sec=5.0, sr=16000)
    if df_text.empty:
        print("⚠️ Could not get transcript.")
    else:
        # Join the per-window texts in chronological order.
        df_text = df_text.sort_values("start_sec")
        full_text = " ".join(df_text["text"].tolist())
        print("📜 Transcript (original language):\n")
        print(full_text)
        # Preview of the first windows with their time spans.
        display(df_text[["start_sec", "end_sec", "text"]].head())


📜 Transcript (original language):

 My wife's family, they live in Gaza. They have cousins and uncles.  there and their house also was bombed. The question is  what is a proportionate response. It has been different from one tier to another. So if you look to this graph  for example, this is the death of Israeli and Palestinians, and it's changed from one year to a year.  It's like fluctuating like crypto. The lying son of a bitch lied to me. I told him you  don't understand. Ben Shapiro and Ron DeSantis keep saying that Israel warned you and Hamas asked you to  to keep to stay put. So I told you he's a loser. He never kept a job. He even like  and all of the interviews to become like a human shield, I would believe


Unnamed: 0,start_sec,end_sec,text
0,0.0,5.0,"My wife's family, they live in Gaza. They hav..."
1,5.0,10.0,there and their house also was bombed. The qu...
2,10.0,15.0,what is a proportionate response. It has been...
3,15.0,20.0,"for example, this is the death of Israeli and..."
4,20.0,25.0,It's like fluctuating like crypto. The lying ...
