## How to use (workflow)
1) Step 0: run installs once (or skip if already installed).
2) Step 1: pick audio from dropdown and click *Load file*.
3) Step 2: edit ground truth (CALM / NOT_CALM) and auto-align toggles.
4) Step 3: run analysis to see the plot.
5) Step 3b: run metrics on a 2s grid with fixed threshold.
6) Step 3c (optional): short feature ranking table.


In [4]:
!pip install librosa==0.10.1 numpy pandas matplotlib seaborn plotly scipy ipywidgets
!pip install sentencepiece




In [None]:
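# Step 1 — Select audio and compute baseline (run after installs).
# Pick a file in the dropdown and click 'Load file'. The baseline is auto-calibrated from the first CALIBRATION_SEC seconds (voiced windows preferred); low-energy filtering applies only when BASELINE_LOW_QUANTILE is set.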

# Universal loader for WAV/MP3 with forced 16 kHz mono
import numpy as _np

# Loader / calibration settings used by the Step 1 cell.
NORMALIZE_AUDIO = True        # when True, load_audio_any() rescales each clip
TARGET_RMS = 0.06             # default RMS level aimed for by _normalize_audio()
MAX_PEAK = 0.98               # default ceiling for |sample| after rescaling
CALIBRATION_SEC = 8.0         # passed to compute_baseline() as calibration_sec
BASELINE_LOW_QUANTILE = None  # passed to compute_baseline() as low_quantile


def _normalize_audio(audio, target_rms=TARGET_RMS, max_peak=MAX_PEAK):
    """Rescale *audio* toward ``target_rms`` without letting the peak exceed ``max_peak``.

    Args:
        audio: Sequence of samples, or ``None``.
        target_rms: RMS level the gain is chosen for.
        max_peak: Largest absolute sample value allowed after scaling.

    Returns:
        ``audio`` unchanged when it is ``None`` or empty; the float32 copy
        unscaled when its RMS is non-finite or below 1e-6; otherwise the
        float32 samples multiplied by a single gain factor.
    """
    if audio is None or len(audio) == 0:
        return audio

    samples = _np.asarray(audio, dtype=_np.float32)
    # Accumulate the mean in float64 so long clips do not lose precision.
    level = _np.sqrt(_np.mean(_np.square(samples), dtype=_np.float64))
    if not _np.isfinite(level) or level < 1e-6:
        # Near-silent or corrupt input: any gain would just amplify noise.
        return samples

    gain = float(target_rms / level)
    peak = float(_np.abs(samples).max())
    if peak * gain > max_peak:
        # Peak-limit instead: pick the gain that puts the peak at max_peak.
        gain = max_peak / (peak + 1e-6)
    return samples * gain

def load_audio_any(path, target_sr=16000):
    """Load an audio file as mono float32 samples resampled to ``target_sr``.

    Decoding is delegated to ``librosa.load``. When ``NORMALIZE_AUDIO`` is
    truthy the samples are passed through ``_normalize_audio``.

    Args:
        path: File to read.
        target_sr: Sample rate requested from librosa.

    Returns:
        ``(samples, sample_rate)``. On any failure a message is printed and
        ``(empty float32 array, target_sr)`` is returned instead of raising.
    """
    try:
        samples, rate = librosa.load(path, sr=target_sr, mono=True)
        samples = samples.astype(_np.float32)
        if NORMALIZE_AUDIO:
            samples = _normalize_audio(samples)
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty clip.
        print(f"‚ö†Ô∏è Failed to load {path}: {e}")
        return _np.array([], dtype=_np.float32), target_sr
    return samples, rate

import librosa

import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os, glob, warnings
from pathlib import Path
import ipywidgets as widgets
from IPython.display import display, clear_output

# Silence every Python warning for the rest of the session (this also hides
# numerical warnings raised by the feature code below).
warnings.filterwarnings('ignore')
# Render floats in pandas output with three decimal places.
pd.options.display.float_format = '{:.3f}'.format



def extract_features(audio, sr=16000, hop_length=512):
    """Summarise one audio window as a flat dict of energy, pitch and spectral statistics.

    Args:
        audio: Samples of the window (sequence or array), or ``None``.
        sr: Sample rate handed to the librosa calls.
        hop_length: Hop size used for every frame-based feature.

    Returns:
        Dict with float values for ``rms``, ``rms_std``, ``pitch_jitter``
        (std/mean of voiced f0, in percent), ``voiced_ratio``, ``pitch_mean``,
        ``pitch_std``, ``pitch_range``, ``spectral_centroid``,
        ``spectral_bandwidth``, ``spectral_rolloff``, ``spectral_flux`` and
        ``zcr``. All values are 0.0 when the input is ``None``, empty, or
        numerically all-zero after NaN/inf are replaced by 0.
    """
    feature_names = (
        'rms', 'rms_std', 'pitch_jitter', 'voiced_ratio',
        'pitch_mean', 'pitch_std', 'pitch_range',
        'spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff',
        'spectral_flux', 'zcr',
    )
    silent = dict.fromkeys(feature_names, 0.0)
    if audio is None or len(audio) == 0:
        return silent

    signal = np.nan_to_num(
        np.asarray(audio, dtype=np.float32), nan=0.0, posinf=0.0, neginf=0.0
    )
    if np.allclose(signal, 0):
        return silent

    frame_rms = librosa.feature.rms(y=signal, hop_length=hop_length)[0]

    # --- pitch track -------------------------------------------------------
    try:
        f0, voiced_flag, voiced_probs = librosa.pyin(
            signal, fmin=50, fmax=600, sr=sr, hop_length=hop_length
        )
    except Exception:
        # Pitch tracking failed: behave as if every frame were unvoiced.
        f0 = np.full_like(frame_rms, np.nan)
        voiced_probs = np.zeros_like(frame_rms)
        voiced_flag = None

    if voiced_flag is None:
        finite_probs = voiced_probs[~np.isnan(voiced_probs)]
        voiced_ratio = float(np.mean(finite_probs)) if finite_probs.size > 0 else 0.0
    else:
        voiced_ratio = float(np.mean(voiced_flag))

    voiced_f0 = f0[np.where(~np.isnan(f0))[0]]

    pitch_jitter = 0.0
    if voiced_f0.size > 1:
        f0_mean = np.mean(voiced_f0)
        if f0_mean > 0:
            pitch_jitter = float(np.std(voiced_f0) / f0_mean * 100)

    # nanmean of an all-NaN track is NaN; report 0.0 in that case.
    f0_overall = np.nanmean(f0)
    pitch_mean = float(f0_overall) if np.isfinite(f0_overall) else 0.0

    if voiced_f0.size > 0:
        pitch_std = float(np.nanstd(voiced_f0))
        pitch_range = float(np.nanmax(voiced_f0) - np.nanmin(voiced_f0))
    else:
        pitch_std = 0.0
        pitch_range = 0.0

    # --- spectral shape ----------------------------------------------------
    try:
        magnitude = np.abs(librosa.stft(signal, n_fft=2048, hop_length=hop_length))
        spectral_centroid = librosa.feature.spectral_centroid(S=magnitude, sr=sr)[0]
        spectral_bandwidth = librosa.feature.spectral_bandwidth(S=magnitude, sr=sr)[0]
        spectral_rolloff = librosa.feature.spectral_rolloff(S=magnitude, sr=sr, roll_percent=0.85)[0]
        if magnitude.shape[1] > 1:
            # Flux = mean Euclidean distance between consecutive magnitude frames.
            frame_steps = np.sqrt(np.sum(np.diff(magnitude, axis=1) ** 2, axis=0))
            spectral_flux = float(np.mean(frame_steps))
        else:
            spectral_flux = 0.0
        zcr = librosa.feature.zero_crossing_rate(y=signal, hop_length=hop_length)[0]
    except Exception:
        # Any failure zeroes the whole spectral group, not just the failing item.
        spectral_centroid = np.array([0.0])
        spectral_bandwidth = np.array([0.0])
        spectral_rolloff = np.array([0.0])
        spectral_flux = 0.0
        zcr = np.array([0.0])

    return {
        'rms': float(np.mean(frame_rms)),
        'rms_std': float(np.std(frame_rms)),
        'pitch_jitter': pitch_jitter,
        'voiced_ratio': voiced_ratio,
        'pitch_mean': pitch_mean,
        'pitch_std': pitch_std,
        'pitch_range': pitch_range,
        'spectral_centroid': float(np.mean(spectral_centroid)),
        'spectral_bandwidth': float(np.mean(spectral_bandwidth)),
        'spectral_rolloff': float(np.mean(spectral_rolloff)),
        'spectral_flux': float(spectral_flux),
        'zcr': float(np.mean(zcr)),
    }

def compute_baseline(audio, sr=16000, window_sec=3.0, low_quantile=None, calibration_sec=None):
    """Derive per-speaker reference statistics from the start of a recording.

    The clip (or only its first ``calibration_sec`` seconds, when given) is cut
    into consecutive ``window_sec`` windows; windows shorter than 0.5 s are
    skipped. With more than two windows, those with ``voiced_ratio >= 0.2`` are
    preferred (if at least two exist), and ``low_quantile`` can further restrict
    the set to the quietest windows by RMS (again only if at least two remain).

    Args:
        audio: Samples of the whole recording, or ``None``.
        sr: Sample rate of ``audio``.
        window_sec: Length of each calibration window in seconds.
        low_quantile: Optional RMS quantile used as an upper bound for windows.
        calibration_sec: Optional length of the calibration period in seconds.

    Returns:
        Dict holding the mean of every feature from ``extract_features`` plus
        ``<feature>_median`` / ``<feature>_mad`` entries. ``rms``, ``rms_std``
        and ``pitch_jitter`` are floored at small positive values. For
        ``audio is None`` a minimal five-key dict is returned.
    """
    if audio is None:
        return {'rms': 1e-6, 'rms_std': 1e-9, 'pitch_jitter': 1e-3, 'voiced_ratio': 0.0, 'pitch_mean': 0.0}

    win_len = max(int(window_sec * sr), 1)
    min_len = int(0.5 * sr)
    horizon = int(calibration_sec * sr) if calibration_sec else len(audio)

    window_feats, window_rms, window_voiced = [], [], []
    for offset in range(0, min(len(audio), horizon), win_len):
        chunk = audio[offset:offset + win_len]
        if len(chunk) < min_len:
            continue
        feats = extract_features(chunk, sr)
        window_feats.append(feats)
        window_rms.append(float(feats.get('rms', 0.0)))
        window_voiced.append(float(feats.get('voiced_ratio', 0.0)))

    if not window_feats:
        # Clip too short for even one window: describe it as a whole.
        whole = extract_features(audio, sr)
        window_feats.append(whole)
        window_rms = [float(whole.get('rms', 0.0))]

    if len(window_feats) > 2:
        voiced = [
            (feats, level)
            for feats, level, ratio in zip(window_feats, window_rms, window_voiced)
            if ratio >= 0.2
        ]
        if len(voiced) >= 2:
            window_feats = [feats for feats, _ in voiced]
            window_rms = [level for _, level in voiced]
        if low_quantile is not None:
            cutoff = float(np.quantile(window_rms, low_quantile))
            quiet = [feats for feats, level in zip(window_feats, window_rms) if level <= cutoff]
            if len(quiet) >= 2:
                window_feats = quiet

    baseline = {
        name: float(np.nan_to_num(np.mean([feats.get(name, 0.0) for feats in window_feats])))
        for name in window_feats[0].keys()
    }

    # (feature, fraction of its mean, absolute floor): the MAD stored for each
    # feature never drops below max(mean * fraction, floor).
    mad_floors = (
        ('rms', 0.15, 1e-6),
        ('rms_std', 0.2, 1e-7),
        ('pitch_jitter', 0.3, 0.5),
        ('pitch_mean', 0.1, 10.0),
        ('pitch_std', 0.2, 5.0),
        ('pitch_range', 0.2, 15.0),
        ('spectral_centroid', 0.2, 50.0),
        ('spectral_bandwidth', 0.2, 50.0),
        ('spectral_rolloff', 0.2, 50.0),
        ('spectral_flux', 0.2, 1e-3),
        ('zcr', 0.2, 1e-3),
        ('voiced_ratio', 0.2, 0.05),
    )
    for name, fraction, floor in mad_floors:
        min_mad = max(baseline.get(name, 0.0) * fraction, floor)
        samples = np.array([feats.get(name, 0.0) for feats in window_feats], dtype=float)
        if samples.size == 0:
            samples = np.array([baseline.get(name, 0.0)], dtype=float)
        center = float(np.median(samples))
        spread = float(np.median(np.abs(samples - center))) + 1e-6
        baseline[f'{name}_median'] = center
        baseline[f'{name}_mad'] = max(spread, min_mad)

    # Keep the means that are later used as divisors strictly positive.
    for name, floor in (('rms', 1e-6), ('rms_std', 1e-9), ('pitch_jitter', 1e-3)):
        baseline[name] = max(baseline.get(name, 0.0), floor)
    return baseline

def compute_agitation_score(features, baseline, prev_score=None, smoothing_alpha=0.5, spike_threshold=20.0, max_step=9.0, score_gain=1.0):
    """Turn one window's features into a smoothed agitation score in [0, 100].

    Each feature is converted to a one-sided robust z-score against the
    baseline (values below the baseline median count as 0, values are capped
    at 3), weighted, summed and rescaled so that "all features at the cap"
    maps to 100. All terms except ``rms`` and ``rms_std`` are attenuated by a
    gate derived from ``voiced_ratio`` (between 0.3 and 1.0).

    Args:
        features: Feature dict for the current window; missing or non-finite
            entries are treated as 0.
        baseline: Dict with ``<name>_median`` / ``<name>_mad`` entries; the
            plain ``<name>`` mean is the fallback for a missing median.
        prev_score: Previous smoothed score, or ``None`` for the first window.
        smoothing_alpha: Weight of the new raw score in the exponential blend.
        spike_threshold: Raw jump size above which the step limiter is used.
        max_step: Largest change allowed when the step limiter is active.
        score_gain: Multiplier applied to the raw score when positive.

    Returns:
        The smoothed score rounded to one decimal place.
    """
    z_cap = 3.0

    def _clean(name):
        # Missing / NaN / inf feature values all collapse to 0.0.
        return float(np.nan_to_num(features.get(name, 0.0), nan=0.0, posinf=0.0, neginf=0.0))

    def _excess(name):
        center = baseline.get(f'{name}_median', baseline.get(name, 0.0))
        spread = baseline.get(f'{name}_mad', 1e-6)
        # 1.4826 * MAD approximates one standard deviation for normal data.
        return np.clip((_clean(name) - center) / (1.4826 * spread + 1e-6), 0, z_cap)

    # (feature key, weight, attenuated by the voicing gate?)
    terms = (
        ('rms', 3.0, False),
        ('rms_std', 2.0, False),
        ('pitch_mean', 22.0, True),
        ('pitch_std', 20.0, True),
        ('pitch_range', 20.0, True),
        ('pitch_jitter', 6.0, True),
        ('spectral_centroid', 14.0, True),
        ('spectral_bandwidth', 14.0, True),
        ('spectral_rolloff', 14.0, True),
        ('spectral_flux', 6.0, True),
        ('zcr', 6.0, True),
    )

    voiced_gate = np.clip((_clean('voiced_ratio') - 0.2) / 0.5, 0.3, 1.0)

    weighted = 0.0
    for name, weight, gated in terms:
        z = _excess(name)
        if gated:
            z = z * voiced_gate
        weighted = weighted + z * weight

    max_weighted = z_cap * sum(weight for _, weight, _ in terms)
    raw_score = float(np.clip((weighted / max_weighted) * 100.0, 0, 100))
    if score_gain > 0:
        raw_score = float(np.clip(raw_score * score_gain, 0, 100))

    if prev_score is None or not np.isfinite(prev_score):
        # No usable history: smoothing against itself leaves the raw score.
        prev_score = raw_score

    jump = raw_score - prev_score
    if abs(jump) > spike_threshold:
        # Large jump: move at most max_step toward the new value.
        smoothed = prev_score + np.sign(jump) * max_step
    else:
        smoothed = smoothing_alpha * raw_score + (1 - smoothing_alpha) * prev_score

    return round(float(np.clip(smoothed, 0, 100)), 1)

def get_mood_state(features, baseline, agitation_score, prev_state='CALM', prev_score=None, recent_scores=None, state_streak=1, pending_state=None, pending_count=0, calm_threshold=40.0, escalate_threshold=65.0):
    """Map a smoothed agitation score to CALM / TENSE / ESCALATING with debouncing.

    A candidate state is chosen from the score:
    - ESCALATING when the score is >= ``escalate_threshold``, when it grew by
      more than 25 relative to the third-last entry of ``recent_scores`` (or
      relative to the previous score when fewer than three entries exist), or
      when ``prev_state`` is already ESCALATING and the score is still
      >= ``escalate_threshold - 5``.
    - TENSE when the score is >= ``calm_threshold``.
    - CALM otherwise.

    The candidate only replaces ``prev_state`` after it has been proposed in
    enough consecutive calls: 3 for CALM<->TENSE, 4 for any switch involving
    ESCALATING. Until then the previous state is kept and the result is
    flagged as borderline.

    Returns:
        Dict with ``state``, ``agitation_score``, ``rms_ratio`` (window RMS
        over baseline RMS, clipped to [0, 10]), ``pitch_jitter``,
        ``state_streak``, ``pending_state``, ``pending_count`` and
        ``borderline_state``; the counters are meant to be fed back into the
        next call.
    """
    history = recent_scores or []

    rms_floor = max(baseline.get('rms', 1e-6) or 1e-6, 1e-6)
    rms_ratio = float(np.clip((features.get('rms', 0.0) / rms_floor), 0, 10))
    pitch_jitter = float(np.nan_to_num(features.get('pitch_jitter', 0.0), nan=0.0, posinf=0.0, neginf=0.0))

    # Reference score for "growth" when the history is shorter than 3 entries.
    if prev_score is not None:
        reference = prev_score
    elif history:
        reference = history[-1]
    else:
        reference = None

    if len(history) >= 3:
        growth = agitation_score - history[-3]
    elif reference is not None:
        growth = agitation_score - reference
    else:
        growth = 0.0

    still_escalating = prev_state == 'ESCALATING' and agitation_score >= (escalate_threshold - 5)
    newly_escalating = agitation_score >= escalate_threshold or growth > 25
    if still_escalating or newly_escalating:
        candidate = 'ESCALATING'
    elif agitation_score >= calm_threshold:
        candidate = 'TENSE'
    else:
        candidate = 'CALM'

    if candidate == prev_state:
        # Nothing to debounce: extend the streak and drop any pending switch.
        final_state = prev_state
        streak = state_streak + 1
        queued_state, queued_count = None, 0
        borderline = False
    else:
        confirmations = 4 if 'ESCALATING' in (prev_state, candidate) else 3
        queued_state = candidate
        queued_count = pending_count + 1 if candidate == pending_state else 1

        if queued_count >= confirmations:
            final_state = candidate
            streak = 1
            queued_state, queued_count = None, 0
            borderline = False
        else:
            final_state = prev_state
            streak = state_streak + 1
            borderline = True

    return {
        'state': final_state,
        'agitation_score': round(float(agitation_score), 1),
        'rms_ratio': round(rms_ratio, 2),
        'pitch_jitter': round(pitch_jitter, 1),
        'state_streak': streak,
        'pending_state': queued_state,
        'pending_count': queued_count,
        'borderline_state': borderline
    }


# Step 1 UI: list the recordings in ./audio_samples and let the user load one.
print("üìÇ Scanning audio folder...")
audio_dir = str((Path.cwd() / 'audio_samples').resolve())
wav_files = glob.glob(os.path.join(audio_dir, '*.wav')) + glob.glob(os.path.join(audio_dir, '*.mp3'))

# Notebook-wide state; load_selected() fills it in and later steps read it.
audio = None            # samples of the loaded recording
baseline = None         # dict returned by compute_baseline()
duration = 0            # length of the recording in seconds
sr = 16000              # sample rate of `audio`
audio_ready = False     # True only after a successful load + baseline
selected_path = None    # full path of the loaded file

if not wav_files:
    dropdown = None
    print("‚ùå No audio files found. Check path.")
else:
    file_names = [os.path.basename(f) for f in wav_files]
    dropdown = widgets.Dropdown(
        options=file_names,
        description='Pick audio:',
        style={'description_width': 'initial'}
    )
    load_btn = widgets.Button(description='Load file', button_style='primary')
    status = widgets.Output()

    def load_selected(_):
        """Button callback: load the chosen file and calibrate its baseline.

        On success the module-level ``audio``, ``sr``, ``duration``,
        ``baseline`` and ``selected_path`` are replaced and ``audio_ready`` is
        set to True. On any failure (nothing selected, file missing, or the
        loader returned no samples) ``audio_ready`` is set to False and the
        previously loaded data is left untouched.
        """
        global audio, baseline, duration, sr, audio_ready, selected_path
        with status:
            status.clear_output()
            if dropdown.value is None:
                print("‚ö†Ô∏è Pick a file in the dropdown.")
                audio_ready = False
                return
            selected_file = os.path.join(audio_dir, dropdown.value)
            if not os.path.exists(selected_file):
                print("‚ùå File not found (maybe removed).")
                audio_ready = False
                return
            print(f"üîÑ Loading: {dropdown.value}")
            loaded_audio, loaded_sr = load_audio_any(selected_file, target_sr=16000)
            if loaded_audio is None or len(loaded_audio) == 0:
                # load_audio_any() reports a decode failure by returning an
                # empty array; do not calibrate on it or mark the audio ready.
                print("Load failed: no samples decoded. Pick another file.")
                audio_ready = False
                return
            audio, sr = loaded_audio, loaded_sr
            duration = len(audio) / sr if sr else 0
            baseline = compute_baseline(audio, sr, low_quantile=BASELINE_LOW_QUANTILE, calibration_sec=CALIBRATION_SEC)
            readable = {k: round(v, 3) for k, v in baseline.items()}
            selected_path = selected_file
            audio_ready = True
            print(f"‚úÖ Loaded: {duration:.1f} s, sr={sr}Hz")
            print("‚úÖ Baseline:", readable)

    load_btn.on_click(load_selected)

    print(f"‚úÖ Found {len(wav_files)} WAV files. Pick a file and click 'Load file'.")
    display(widgets.VBox([dropdown, load_btn, status]))


üìÇ Scanning audio folder...
‚úÖ Found 15 WAV files. Pick a file and click 'Load file'.


VBox(children=(Dropdown(description='Pick audio:', options=('untitled #2.wav', 'good_interview.wav', 'good ass‚Ä¶

In [54]:
# Step 2 — Ground truth setup (2 classes)
from pathlib import Path

# Manually annotated segments (start/end in seconds) per recording.
# Keys are the audio file stem, lower-cased, with spaces turned into underscores.
GT_PRESETS = {
    # test emotions
    "test_emotions": [
        {"start": 0, "end": 9, "label": "CALM"},
        {"start": 9, "end": 39, "label": "NOT_CALM"},
    ],
}

# Alternative preset kept for reference:
# GT_PRESETS = {"good_assist": [
#         {"start": 0, "end": 49, "label": "CALM"},]}

# Manual GT only (no auto-alignment): look up the preset for the file loaded in Step 1.
audio_key = None
if globals().get('selected_path'):
    audio_key = Path(selected_path).stem.lower().replace(" ", "_")

manual_annotations = []
if audio_key:
    # Shallow copy so later edits to the list do not alter the preset table.
    manual_annotations = GT_PRESETS.get(audio_key, []).copy()

if not manual_annotations:
    available = list(GT_PRESETS.keys())
    print(f"‚ÑπÔ∏è No preset GT found for '{audio_key}'. Available keys: {available}")
else:
    print(f"üéØ GT for '{audio_key}': {manual_annotations}")


üéØ GT for 'test_emotions': [{'start': 0, 'end': 9, 'label': 'CALM'}, {'start': 9, 'end': 39, 'label': 'NOT_CALM'}]


In [55]:
# Step 2b — Optional text/ASR pipelines (leave as-is if no network)

# `pipeline` stays None unless both torch and transformers import cleanly;
# load_text_pipelines() checks it before trying to build any model.
pipeline = None
try:
    import torch
    from transformers import pipeline
except Exception:
    print("‚ö†Ô∏è transformers not available, semantic layer disabled")

# Lazily created by load_text_pipelines(); None means "not loaded".
asr_pipe = sent_pipe = None

def load_text_pipelines(asr_model="openai/whisper-small", cls_model="nlptown/bert-base-multilingual-uncased-sentiment"):
    """Create the speech-recognition and text-classification pipelines on first use.

    Pipelines that already exist in the module-level ``asr_pipe`` /
    ``sent_pipe`` are reused. A pipeline that fails to build is reported with
    a printed message and left as ``None``; nothing is raised.

    Args:
        asr_model: Model id for the "automatic-speech-recognition" pipeline.
        cls_model: Model id for the "text-classification" pipeline.

    Returns:
        True when both pipelines are available afterwards, otherwise False
        (including when transformers could not be imported).
    """
    global asr_pipe, sent_pipe
    if pipeline is None:
        print("‚ö†Ô∏è transformers not installed")
        return False

    # Device index 0 when CUDA is usable, -1 (CPU) otherwise.
    cuda_ok = "torch" in globals() and hasattr(torch, "cuda") and torch.cuda.is_available()
    device = 0 if cuda_ok else -1

    def _build(task, model_name, failure_prefix):
        try:
            return pipeline(task, model=model_name, device=device)
        except Exception as e:
            print(failure_prefix, e)
            return None

    if asr_pipe is None:
        asr_pipe = _build("automatic-speech-recognition", asr_model, "‚ö†Ô∏è Failed to load ASR:")
    if sent_pipe is None:
        sent_pipe = _build("text-classification", cls_model, "‚ö†Ô∏è Failed to load text model:")
    return not (asr_pipe is None or sent_pipe is None)


def transcribe_and_classify(path, window_sec=5.0, hop_sec=5.0, sr=16000):
    """Transcribe an audio file window by window and score each transcript's negativity.

    Args:
        path: Audio file, loaded with ``librosa.load`` at ``sr``.
        window_sec: Window length in seconds.
        hop_sec: Step between window starts in seconds.
        sr: Sample rate used for loading and passed to the ASR pipeline.

    Returns:
        DataFrame with ``start_sec``, ``end_sec``, ``sentiment_score`` and
        ``text`` per window (windows shorter than 0.5 s are skipped). An empty
        DataFrame is returned when either pipeline is not loaded.
        ``sentiment_score`` is ``(5 - stars) / 4 * 100 * confidence`` when the
        classifier label starts with an integer from 1 to 5, else 0.0.
    """
    if asr_pipe is None or sent_pipe is None:
        return pd.DataFrame()

    def _transcribe(samples):
        # ASR failures degrade to an empty transcript.
        try:
            return asr_pipe({"array": samples, "sampling_rate": sr}).get("text", "")
        except Exception:
            return ""

    def _negativity(text):
        # Any classifier or parsing problem yields a neutral 0.0.
        try:
            top = sent_pipe(text)[0]
            if not isinstance(top, dict):
                return 0.0
            label = str(top.get("label", "")).strip()
            score = float(top.get("score", 0.0))
            stars = None
            if label:
                try:
                    stars = int(label.split()[0])
                except Exception:
                    stars = None
            if stars is None or not 1 <= stars <= 5:
                return 0.0
            negativity = (5 - stars) / 4.0
            return float(negativity * 100.0 * score)
        except Exception:
            return 0.0

    audio, _ = librosa.load(path, sr=sr)
    total = len(audio)
    win = int(window_sec * sr)
    hop = int(hop_sec * sr)

    rows = []
    for start in range(0, total, hop):
        end = min(total, start + win)
        chunk = audio[start:end]
        if len(chunk) < 0.5 * sr:
            continue
        text = _transcribe(chunk)
        sentiment_score = _negativity(text)
        rows.append({
            "start_sec": start / sr,
            "end_sec": end / sr,
            "sentiment_score": sentiment_score,
            "text": text
        })
    return pd.DataFrame(rows)


def fuse_audio_text(df_audio: pd.DataFrame, df_text: pd.DataFrame, text_weight=None):
    """Late fusion of the per-window audio score with the text sentiment score.

    Each audio row is matched to the nearest text window (by mid-point time,
    within 3 seconds) and ``fusion_score`` is computed as
    ``(1 - w) * audio_score + w * text_score``.

    Args:
        df_audio: Per-window audio results. The audio score is taken from
            ``vis_agitation``, else ``agitation_score``, else
            ``frustration_proxy``, else 0. The time axis is ``time_sec``, else
            the mean of ``start_sec``/``end_sec``, else the row position.
        df_text: Output of ``transcribe_and_classify`` (needs ``start_sec``,
            ``end_sec``, ``sentiment_score``); may be ``None`` or empty.
        text_weight: Weight ``w`` of the text score, clamped to [0, 1]. When
            ``None`` the module-level ``TEXT_WEIGHT`` is used if defined,
            otherwise 0.3.

    Returns:
        ``df_audio`` itself when it is ``None`` or empty. Otherwise a new
        DataFrame with ``sentiment_score`` and ``fusion_score`` columns; when
        text data is present it also has ``mid_sec`` and is sorted by it.
    """
    if df_audio is None or df_audio.empty:
        return df_audio
    df_audio = df_audio.copy()
    if df_text is None or df_text.empty:
        df_audio["fusion_score"] = df_audio.get("vis_agitation", df_audio.get("agitation_score", 0.0))
        df_audio["sentiment_score"] = 0.0
        return df_audio

    df_text = df_text.copy()
    df_text["mid_sec"] = (df_text["start_sec"] + df_text["end_sec"]) / 2
    if "time_sec" in df_audio.columns:
        df_audio["mid_sec"] = df_audio["time_sec"]
    elif "start_sec" in df_audio.columns and "end_sec" in df_audio.columns:
        df_audio["mid_sec"] = (df_audio["start_sec"] + df_audio["end_sec"]) / 2
    else:
        df_audio["mid_sec"] = range(len(df_audio))

    # merge_asof refuses keys of different dtypes (e.g. the int row-position
    # fallback vs. float text mid-points), so force both sides to float.
    df_audio["mid_sec"] = df_audio["mid_sec"].astype(float)
    df_text["mid_sec"] = df_text["mid_sec"].astype(float)

    # A sentiment column left over from an earlier fusion would be suffixed
    # (_x/_y) by the merge and break the lookup below; the merge replaces it.
    df_audio = df_audio.drop(columns=["sentiment_score"], errors="ignore")

    merged = pd.merge_asof(
        df_audio.sort_values("mid_sec"),
        df_text.sort_values("mid_sec")[["mid_sec", "sentiment_score"]],
        on="mid_sec", direction="nearest", tolerance=3
    )
    merged["sentiment_score"] = merged["sentiment_score"].fillna(0.0)

    base = merged.get("vis_agitation", merged.get("agitation_score", merged.get("frustration_proxy", 0.0)))
    text_component = merged["sentiment_score"]
    if text_component.max() <= 1:
        # Scores that never exceed 1 are treated as a 0-1 scale and brought to 0-100.
        text_component = text_component * 100

    # Smooth and clip to avoid short drops
    text_component = text_component.rolling(window=3, min_periods=1, center=True).mean()
    text_component = text_component.clip(lower=0)

    if text_weight is None:
        text_weight = globals().get("TEXT_WEIGHT", 0.3)
    text_weight = min(max(float(text_weight), 0.0), 1.0)
    merged["fusion_score"] = (1.0 - text_weight) * base + text_weight * text_component

    return merged


In [56]:
# Step 3 — Analyze and visualize (run after GT setup)
if 'audio_ready' not in globals() or not audio_ready or audio is None or len(audio) == 0:
    print("‚ö†Ô∏è No audio to analyze. Load audio in Step 1 first.")
else:
    # Core analysis settings (fixed zone threshold)
    window_sec = 3
    hop_sec = 2
    sr = 16000

    SMOOTHING_ALPHA = 0.4
    SPIKE_THRESHOLD = 25.0
    MAX_STEP = 7.0
    HYSTERESIS_DELTA = 3.0
    THRESHOLD_NOT_CALM = 40.0
    SCORE_GAIN = 1.8
    TEXT_WEIGHT = 0.25
    VIS_SMOOTH_WINDOW = max(3, int(6 / hop_sec))
    VIS_LONG_WINDOW = max(8, int(16 / hop_sec))

    THRESHOLD_ENTER_NOT_CALM = THRESHOLD_NOT_CALM + HYSTERESIS_DELTA
    THRESHOLD_EXIT_NOT_CALM = max(0, THRESHOLD_NOT_CALM - HYSTERESIS_DELTA)
    window_samples = int(window_sec * sr)
    hop_samples = int(hop_sec * sr)

    starts = list(range(0, max(len(audio) - window_samples, 0) + hop_samples, hop_samples))
    if not starts:
        starts = [0]

    print("üîÑ Analyzing 3s windows...")
    progress = widgets.IntProgress(value=0, min=0, max=len(starts), description='‚è≥', bar_style='info')
    progress_label = widgets.HTML(value="‚è≥ Preparing...")
    display(widgets.VBox([progress, progress_label]))

    results = []
    prev_state = 'CALM'
    prev_agitation = None
    last_active_agitation = None
    last_active_state = 'CALM'
    tension_trend = None
    score_history = []
    trend_alpha = 0.03  # slow trend ~30-50s
    state_streak = 0
    pending_state = None
    pending_count = 0

    pause_voiced_thr = 0.12  # more sensitive to silence
    pause_rms_scale = 0.8

    for idx, start_sample in enumerate(starts):
        end_sample = min(len(audio), start_sample + window_samples)
        window_audio = audio[start_sample:end_sample]

        if len(window_audio) < int(0.5 * sr) and len(audio) > int(window_samples):
            continue

        features = extract_features(window_audio, sr)
        baseline_rms = max(baseline.get('rms', 1e-6) or 1e-6, 1e-6)
        voiced_ratio_val = float(np.nan_to_num(features.get('voiced_ratio', 0.0), nan=0.0, posinf=0.0, neginf=0.0))

        rms_val = float(np.nan_to_num(features.get('rms', 0.0), nan=0.0, posinf=0.0, neginf=0.0))
        rms_ratio_val = float(np.clip(rms_val / baseline_rms, 0, 10))
        pitch_jitter_val = float(np.nan_to_num(features.get('pitch_jitter', 0.0), nan=0.0, posinf=0.0, neginf=0.0))
        is_pause = (voiced_ratio_val < pause_voiced_thr) and (rms_val < baseline_rms * pause_rms_scale)

        feature_row = {
            "rms": float(features.get("rms", 0.0)),
            "rms_std": float(features.get("rms_std", 0.0)),
            "rms_ratio": rms_ratio_val,
            "voiced_ratio": voiced_ratio_val,
            "pitch_mean": float(features.get("pitch_mean", 0.0)),
            "pitch_std": float(features.get("pitch_std", 0.0)),
            "pitch_range": float(features.get("pitch_range", 0.0)),
            "pitch_jitter": pitch_jitter_val,
            "spectral_centroid": float(features.get("spectral_centroid", 0.0)),
            "spectral_bandwidth": float(features.get("spectral_bandwidth", 0.0)),
            "spectral_rolloff": float(features.get("spectral_rolloff", 0.0)),
            "spectral_flux": float(features.get("spectral_flux", 0.0)),
            "zcr": float(features.get("zcr", 0.0)),
            "pause_flag": 1 if is_pause else 0,
        }

        if is_pause and last_active_agitation is not None:
            agitation_score = last_active_agitation
            state_for_row = last_active_state
            if tension_trend is None:
                tension_trend = agitation_score
            dialogue_escalation = (tension_trend > 50) and (agitation_score > 65)
            out_row = {
                'state': state_for_row,
                'agitation_score': round(float(agitation_score), 1),
                'rms_ratio': round(rms_ratio_val, 2),
                'pitch_jitter': round(pitch_jitter_val, 1),
                'time_sec': round(((start_sample + end_sample) / 2) / sr, 2),
                'tension_trend': round(float(tension_trend), 1),
                'dialogue_escalation': bool(dialogue_escalation)
            }
            out_row.update(feature_row)
            results.append(out_row)
            score_history.append(agitation_score)
        else:
            agitation_score = compute_agitation_score(
                features, baseline, prev_score=prev_agitation,
                smoothing_alpha=SMOOTHING_ALPHA, spike_threshold=SPIKE_THRESHOLD, max_step=MAX_STEP, score_gain=SCORE_GAIN
            )

            if tension_trend is None:
                tension_trend = agitation_score
            else:
                tension_trend = trend_alpha * agitation_score + (1 - trend_alpha) * tension_trend

            mood_info = get_mood_state(
                features, baseline, agitation_score,
                prev_state=prev_state, prev_score=prev_agitation,
                recent_scores=score_history,
                state_streak=state_streak,
                pending_state=pending_state,
                pending_count=pending_count,
                calm_threshold=THRESHOLD_NOT_CALM
            )
            display_state = 'CALM' if mood_info['state'] == 'CALM' else 'NOT_CALM'

            dialogue_escalation = (tension_trend > 50) and (agitation_score > 65)

            out_row = {
                'state': display_state,
                'agitation_score': mood_info['agitation_score'],
                'rms_ratio': mood_info['rms_ratio'],
                'pitch_jitter': mood_info['pitch_jitter'],
                'time_sec': round(((start_sample + end_sample) / 2) / sr, 2),
                'tension_trend': round(float(tension_trend), 1),
                'dialogue_escalation': bool(dialogue_escalation)
            }
            out_row.update(feature_row)
            results.append(out_row)

            score_history.append(agitation_score)
            prev_agitation = agitation_score
            prev_state = mood_info['state']
            state_streak = mood_info.get('state_streak', state_streak)
            pending_state = mood_info.get('pending_state', None)
            pending_count = mood_info.get('pending_count', 0)
            last_active_agitation = agitation_score
            last_active_state = display_state

        progress.value = idx + 1
        progress_label.value = (
            f"üîé Window {idx+1}/{len(starts)} ‚Äî t={round(((start_sample + end_sample) / 2) / sr, 2):.1f}s | {results[-1]['state']} | "
            f"score={results[-1]['agitation_score']:.1f} | trend={results[-1]['tension_trend']:.1f}"
        )

    if not results:
        fallback_features = extract_features(audio, sr)
        fallback_agitation = compute_agitation_score(fallback_features, baseline, prev_score=None, smoothing_alpha=SMOOTHING_ALPHA, spike_threshold=SPIKE_THRESHOLD, max_step=MAX_STEP, score_gain=SCORE_GAIN)
        fallback_trend = fallback_agitation

        fallback_mood = get_mood_state(fallback_features, baseline, fallback_agitation)
        fallback_feature_row = {
            "rms": float(fallback_features.get("rms", 0.0)),
            "rms_std": float(fallback_features.get("rms_std", 0.0)),
            "rms_ratio": float(fallback_features.get("rms", 0.0)) / max(baseline.get('rms', 1e-6) or 1e-6, 1e-6),
            "voiced_ratio": float(fallback_features.get("voiced_ratio", 0.0)),
            "pitch_mean": float(fallback_features.get("pitch_mean", 0.0)),
            "pitch_std": float(fallback_features.get("pitch_std", 0.0)),
            "pitch_range": float(fallback_features.get("pitch_range", 0.0)),
            "pitch_jitter": float(fallback_features.get("pitch_jitter", 0.0)),
            "spectral_centroid": float(fallback_features.get("spectral_centroid", 0.0)),
            "spectral_bandwidth": float(fallback_features.get("spectral_bandwidth", 0.0)),
            "spectral_rolloff": float(fallback_features.get("spectral_rolloff", 0.0)),
            "spectral_flux": float(fallback_features.get("spectral_flux", 0.0)),
            "zcr": float(fallback_features.get("zcr", 0.0)),
            "pause_flag": 0,
        }
        out_row = {
            'state': 'CALM' if fallback_mood['state'] == 'CALM' else 'NOT_CALM',
            'agitation_score': fallback_mood['agitation_score'],
            'rms_ratio': fallback_mood['rms_ratio'],
            'pitch_jitter': fallback_mood['pitch_jitter'],
            'time_sec': round(len(audio) / (2 * sr), 2),
            'tension_trend': round(float(fallback_trend), 1),
            'dialogue_escalation': bool(False)
        }
        out_row.update(fallback_feature_row)
        results.append(out_row)

    df = pd.DataFrame(results)

    # Extra smoothing for visualization only (yellow line)
    df['rolling_mean_15s'] = df['agitation_score'].rolling(window=VIS_SMOOTH_WINDOW, min_periods=1).mean()
    df['rolling_mean_30s'] = df['agitation_score'].rolling(window=VIS_LONG_WINDOW, min_periods=1).mean()

    # Session-level stats for agitation
    session_mean = float(df['agitation_score'].mean())
    session_std = float(df['agitation_score'].std(ddof=0) or 0.0)
    session_p90 = float(df['agitation_score'].quantile(0.9))
    df.attrs['session_mean_agitation'] = session_mean
    df.attrs['session_std_agitation'] = session_std
    df.attrs['session_p90_agitation'] = session_p90
    df.attrs['session_state'] = 'CALM_SESSION' if (session_p90 < 40 and session_mean < 35) else 'NORMAL_SESSION'

    print(f"‚úÖ Analysis finished! {len(df)} time points")
    display(df.head())

    # Prepare visualization df
    def _prepare_agitation_vis(df_in: pd.DataFrame, window_points_30: int = VIS_LONG_WINDOW):
        if df_in.empty or 'agitation_score' not in df_in.columns:
            return df_in
        dfv = df_in.copy()
        mean_fp = dfv.attrs.get('session_mean_agitation', float(dfv['agitation_score'].mean()))
        std_fp = dfv.attrs.get('session_std_agitation', float(dfv['agitation_score'].std(ddof=0) or 0.0))
        eps = 1e-6
        dfv['ag_centered'] = dfv['agitation_score'] - mean_fp
        dfv['ag_z'] = dfv['ag_centered'] / max(std_fp, eps)
        window = max(3, int(window_points_30))
        dfv['ag_centered_30s'] = dfv['ag_centered'].rolling(window=window, min_periods=1).mean()
        dfv['vis_agitation'] = dfv['rolling_mean_15s']
        return dfv

    df_vis = _prepare_agitation_vis(df)
    df_vis = _prepare_agitation_vis(df)

    # Semantics: ASR + text (optional)
    df_text = pd.DataFrame()
    if 'selected_path' in globals() and selected_path and load_text_pipelines():
        df_text = transcribe_and_classify(selected_path, window_sec=5.0, hop_sec=5.0, sr=16000)
        if df_text.empty:
            print('‚ö†Ô∏è Semantic layer unavailable (ASR/classifier not loaded)')
    else:
        print('‚ÑπÔ∏è Semantic layer not active (no selected_path or model)')


    df_fused = fuse_audio_text(df_vis, df_text)
    
    # Ground truth to plot (manual only)
    annotations_for_plot = manual_annotations.copy() if 'manual_annotations' in globals() else []
    best_lag = 0.0
    best_corr = None
    score_col = "fusion_score" if "fusion_score" in df_fused.columns else ("vis_agitation" if "vis_agitation" in df_fused.columns else "agitation_score")
    if not annotations_for_plot:
        print("‚ÑπÔ∏è No GT annotations: edit Step 2.")
    else:
        print("‚ÑπÔ∏è Using manual GT (no auto-alignment or calibration).")

    # ===== MANUAL GROUND TRUTH =====

    label_colors = {
        "CALM": "rgba(0,170,90,0.30)",
        "NOT_CALM": "rgba(220,40,30,0.30)"
    }

    # ===== FIGURE: Reference-style (NO OVERLAP) =====
    fig = go.Figure()

    # =========================
    # 1. Ground Truth — separate Y2 axis
    # =========================
    GT_COLORS = {
        "CALM": "rgba(0, 170, 90, 0.70)",
        "NOT_CALM": "rgba(220, 40, 30, 0.70)",
    }

    for seg in annotations_for_plot:
        label = seg["label"].upper()

        fig.add_shape(
            type="rect",
            x0=seg["start"],
            x1=seg["end"],
            y0=0,
            y1=1,
            yref="y2",
            fillcolor=GT_COLORS[label],
            line=dict(width=0),
            layer="below"
        )

        fig.add_annotation(
            x=(seg["start"] + seg["end"]) / 2,
            xref="x",
            y=0.99,
            yref="paper",
            text=label,
            showarrow=False,
            font=dict(size=12, color="#2c3e50"),
        )

    def _score_to_label(score, threshold=THRESHOLD_NOT_CALM):
        return "NOT_CALM" if score >= threshold else "CALM"

    def _apply_hysteresis(scores, thr_enter, thr_exit):
        if len(scores) == 0:
            return []
        state = "CALM" if scores[0] < thr_enter else "NOT_CALM"
        labels = []
        for s in scores:
            if state == "CALM" and s >= thr_enter:
                state = "NOT_CALM"
            elif state == "NOT_CALM" and s < thr_exit:
                state = "CALM"
            labels.append(state)
        return labels

    # GT change markers
    if annotations_for_plot:
        gt_change_points = sorted({seg["start"] for seg in annotations_for_plot} | {seg["end"] for seg in annotations_for_plot})
        for x in gt_change_points:
            fig.add_vline(x=x, line_width=1, line_dash="dot", line_color="rgba(0,0,0,0.25)")

    # Model change markers (hysteresis)
    if score_col in df_fused.columns:
        scores = df_fused[score_col].to_numpy()
        pred_labels = _apply_hysteresis(scores, THRESHOLD_ENTER_NOT_CALM, THRESHOLD_EXIT_NOT_CALM)
        change_times = [
            df_fused["time_sec"].iloc[i]
            for i in range(1, len(pred_labels))
            if pred_labels[i] != pred_labels[i - 1]
        ]
        for x in change_times:
            fig.add_vline(x=x, line_width=1, line_dash="dash", line_color="rgba(0,0,0,0.2)")

    # =========================
    # 2. Model — main Y axis
    # =========================

    # Raw (background)
    fig.add_trace(
        go.Scatter(
            x=df_fused["time_sec"],
            y=df_fused["agitation_score"],
            name="Acoustic (raw)",
            mode="lines",
            line=dict(color="rgba(120,120,120,0.3)", width=1),
            hoverinfo="skip",
            yaxis="y"
        )
    )

    # Smoothed (main line)
    if "vis_agitation" in df_fused:
        fig.add_trace(
            go.Scatter(
                x=df_fused["time_sec"],
                y=df_fused["vis_agitation"],
                name="Acoustic (smoothed)",
                mode="lines",
                line=dict(color="#E67E22", width=3),
                yaxis="y"
            )
        )

    # Fusion
    if "fusion_score" in df_fused:
        fig.add_trace(
            go.Scatter(
                x=df_fused["time_sec"],
                y=df_fused["fusion_score"],
                name="Fusion (audio + text)",
                mode="lines",
                line=dict(color="#C0392B", width=2.5, dash="dash"),
                yaxis="y"
            )
        )

    # =========================
    # 3. Emotion zones (model)
    # =========================
    fig.add_hrect(
        y0=0, y1=THRESHOLD_NOT_CALM,
        fillcolor="rgba(0,170,90,0.18)",
        line_width=0,
        layer="below",
        yref="y"
    )

    fig.add_hrect(
        y0=THRESHOLD_NOT_CALM, y1=100,
        fillcolor="rgba(220,40,30,0.18)",
        line_width=0,
        layer="below",
        yref="y"
    )

    # =========================
    # 4. Layout
    # =========================
    # Keep x-axis aligned to audio duration (avoid negative time from GT shift)
    audio_duration_sec = float(df_fused['time_sec'].max()) if 'time_sec' in df_fused.columns else 0.0
    if 'audio' in globals() and audio is not None and len(audio) > 0:
        audio_duration_sec = len(audio) / sr
    audio_duration_sec = max(0.0, audio_duration_sec)

    fig.update_layout(
        title=f"Agitation vs Ground Truth ‚Äî {dropdown.value}",
        xaxis=dict(title="Time (sec)", range=[0, audio_duration_sec]),

        # Main axis ‚Äî model
        yaxis=dict(
            title="Agitation score (0‚Äì100)",
            range=[0, 100],
            domain=[0.0, 0.82]   # model below
        ),

        # Second axis ‚Äî GT
        yaxis2=dict(
            range=[0, 1],
            domain=[0.86, 1.0],  # GT above
            visible=False
        ),

        hovermode="x unified",
        plot_bgcolor="white",
        paper_bgcolor="white",
        template="plotly_white",
        font=dict(size=13),

        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.15,
            xanchor="left",
            x=0
        ),

        margin=dict(l=70, r=20, t=90, b=50),
        height=560
    )

    fig.show()

üîÑ Analyzing 3s windows...


VBox(children=(IntProgress(value=0, bar_style='info', description='‚è≥', max=19), HTML(value='‚è≥ Preparing...')))

‚úÖ Analysis finished! 19 time points


Unnamed: 0,state,agitation_score,rms_ratio,pitch_jitter,time_sec,tension_trend,dialogue_escalation,rms,rms_std,voiced_ratio,pitch_mean,pitch_std,pitch_range,spectral_centroid,spectral_bandwidth,spectral_rolloff,spectral_flux,zcr,pause_flag,rolling_mean_15s,rolling_mean_30s
0,CALM,12.5,1.023,14.261,1.5,12.5,False,0.048,0.033,0.585,97.081,13.845,70.037,1291.501,1267.681,2368.767,27.228,0.102,0,12.5,12.5
1,CALM,12.0,1.07,14.263,3.5,12.5,False,0.05,0.029,0.745,97.859,13.958,71.909,1172.717,1261.648,2162.234,26.189,0.085,0,12.25,12.25
2,CALM,9.3,0.995,13.485,5.5,12.4,False,0.046,0.031,0.745,98.985,13.348,56.373,1061.046,1151.995,1865.691,22.406,0.073,0,11.267,11.267
3,CALM,5.6,1.012,3.781,7.5,12.2,False,0.047,0.028,0.691,89.147,3.37,15.475,1044.877,1090.138,1741.689,23.302,0.081,0,8.967,9.85
4,CALM,12.6,0.724,23.283,9.5,12.2,False,0.034,0.023,0.628,123.564,28.77,108.718,1333.482,1364.543,2469.914,19.715,0.096,0,9.167,10.4


‚ÑπÔ∏è Using manual GT (no auto-alignment or calibration).


In [59]:

# Step 3b — Metrics on a 2s grid (2 classes, fixed threshold)
import numpy as np
import pandas as pd

# Width of one evaluation bin, in seconds of `time_sec`.
BIN_SIZE = 2.0
# Reuse a threshold already present in the notebook session; otherwise default to 40.0.
THRESHOLD_NOT_CALM = globals().get("THRESHOLD_NOT_CALM", 40.0)
# Number of candidate thresholds passed as `steps` to calibrate_threshold (Step 3c).
THRESHOLD_SEARCH_STEPS = 101
# Half-width of the hysteresis band: enter = threshold + delta, exit = threshold - delta.
HYSTERESIS_DELTA = 3.0
# When True, metrics use hysteresis labels instead of a plain threshold comparison.
USE_HYSTERESIS = True

# Numeric class codes used for correlations and MAE/RMSE against the ground truth.
LABEL_TO_CODE = {"CALM": 0, "NOT_CALM": 1}


def _gt_label_at(t, annotations):
    for seg in annotations:
        if t >= seg["start"] and t < seg["end"]:
            return seg["label"].upper()
    return None


def build_bins(df, feature, bin_size):
    """Average one feature over fixed-width time bins.

    Args:
        df: Frame with a ``time_sec`` column and the requested feature column.
        feature: Column to aggregate. The special name ``"pause_ratio"`` is
            computed as the per-bin mean of the ``pause_flag`` column.
        bin_size: Bin width, in the same unit as ``time_sec``.

    Returns:
        A frame with one row per non-empty bin and two columns, ``time_sec``
        (mean time of the rows in the bin) and ``score`` (mean feature value),
        or ``None`` when *df* is ``None``/empty or the source column is absent.
    """
    if df is None or df.empty:
        return None

    # "pause_ratio" is not a stored column; it is derived from pause_flag.
    source_col = "pause_flag" if feature == "pause_ratio" else feature
    if source_col not in df.columns:
        return None

    frame = df[["time_sec", source_col]].copy()
    # Truncating the quotient assigns each row to the bin that contains it.
    frame["bin_idx"] = (frame["time_sec"] / bin_size).astype(int)
    binned = frame.groupby("bin_idx").agg(
        time_sec=("time_sec", "mean"),
        score=(source_col, "mean"),
    )
    return binned.reset_index(drop=True)


def _predict_label(score, threshold):
    return "NOT_CALM" if score >= threshold else "CALM"


def _apply_hysteresis(scores, thr_enter, thr_exit, start_label="CALM"):
    if len(scores) == 0:
        return []
    state = start_label
    labels = []
    for s in scores:
        if state == "CALM" and s >= thr_enter:
            state = "NOT_CALM"
        elif state == "NOT_CALM" and s < thr_exit:
            state = "CALM"
        labels.append(state)
    return labels


def calibrate_threshold(scores, gt_labels, steps=101, center=None, radius=None):
    """Grid-search the score threshold that maximises F1 for the NOT_CALM class.

    Candidate thresholds are evenly spaced between the smallest and largest
    score (optionally clipped to ``center +/- radius``). A sample is predicted
    NOT_CALM when ``score >= threshold``. Ties on F1 are broken by accuracy;
    on a full tie the lowest threshold wins.

    Args:
        scores: Sequence of numeric scores. NaN entries never satisfy
            ``score >= threshold`` and are therefore predicted CALM.
        gt_labels: Sequence of labels aligned with *scores*; entries equal to
            ``"NOT_CALM"`` are the positive class.
        steps: Number of candidate thresholds. Values below 1 are treated as 1
            so the search always has at least one candidate.
        center: Optional centre of the search window (used only with *radius*).
        radius: Optional half-width of the search window.

    Returns:
        ``(threshold, metrics)`` where *metrics* has the keys ``f1_not_calm``,
        ``precision`` and ``recall``. The threshold is ``None`` when there is
        no finite score to search over. When the scores (or the clipped window)
        span no range, the lower bound is returned with all-zero metrics.
    """
    no_fit = {"f1_not_calm": 0.0, "precision": 0.0, "recall": 0.0}

    scores = np.asarray(scores, dtype=float)
    gt_labels = np.asarray(gt_labels)
    # np.nanmin on all-NaN input warns and yields NaN, which callers testing
    # `threshold is None` would mistake for a usable threshold.
    if scores.size == 0 or np.isnan(scores).all():
        return None, dict(no_fit)

    vmin = float(np.nanmin(scores))
    vmax = float(np.nanmax(scores))
    if not np.isfinite(vmin) or not np.isfinite(vmax):
        return (vmin if np.isfinite(vmin) else None), dict(no_fit)
    if vmax - vmin < 1e-9:
        return vmin, dict(no_fit)

    if center is not None and radius is not None:
        vmin = max(vmin, center - radius)
        vmax = min(vmax, center + radius)
        if vmax - vmin < 1e-9:
            return vmin, dict(no_fit)

    # Guard against steps <= 0, which would produce an empty grid.
    thresholds = np.linspace(vmin, vmax, max(1, int(steps)))
    best_thr = float(thresholds[0])
    best_f1 = -1.0
    best_precision = 0.0
    best_recall = 0.0
    best_acc = -1.0

    gt_pos = gt_labels == "NOT_CALM"

    for thr in thresholds:
        pred_pos = scores >= thr
        tp = int((gt_pos & pred_pos).sum())
        fp = int((~gt_pos & pred_pos).sum())
        fn = int((gt_pos & ~pred_pos).sum())
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        acc = float((pred_pos == gt_pos).mean())
        # Strict comparisons keep the first (lowest) threshold among equals.
        if f1 > best_f1 or (f1 == best_f1 and acc > best_acc):
            best_f1 = f1
            best_precision = precision
            best_recall = recall
            best_thr = float(thr)
            best_acc = acc

    return best_thr, {"f1_not_calm": best_f1, "precision": best_precision, "recall": best_recall}


def compute_metrics_from_bins(df_bins, threshold, use_hysteresis=True, hysteresis_delta=2.0):
    """Score binned predictions against ground-truth CALM / NOT_CALM labels.

    Args:
        df_bins: Frame with a numeric ``score`` column and a ``gt_label``
            column. The frame is copied; the caller's object is not modified.
        threshold: Decision threshold applied to ``score``.
        use_hysteresis: When True, labels come from ``_apply_hysteresis`` with
            ``enter = threshold + hysteresis_delta`` and
            ``exit = max(0, threshold - hysteresis_delta)``. Otherwise a score
            is NOT_CALM when ``score >= threshold``.
        hysteresis_delta: Half-width of the hysteresis band.

    Returns:
        ``(metrics, cm)``. *metrics* is a dict with accuracy, Pearson/Spearman
        correlation of score vs. ground-truth code, MAE/RMSE of the min-max
        scaled score vs. ground-truth code, precision/recall/F1/IoU for the
        NOT_CALM class and the bin count. *cm* is a 2x2 confusion table
        (rows = GT, columns = Pred) ordered CALM, NOT_CALM.
    """
    df_bins = df_bins.copy()
    scores = df_bins["score"].to_numpy()

    if use_hysteresis:
        thr_enter = threshold + hysteresis_delta
        thr_exit = max(0, threshold - hysteresis_delta)
        # Seed the state from the first score only. Seeding it from the first
        # ground-truth label would leak the answer into the predictions that
        # are being evaluated.
        if len(scores) == 0 or scores[0] < thr_enter:
            start_label = "CALM"
        else:
            start_label = "NOT_CALM"
        pred_labels = _apply_hysteresis(scores, thr_enter, thr_exit, start_label=start_label)
    else:
        pred_labels = ["NOT_CALM" if s >= threshold else "CALM" for s in scores]

    df_bins["gt_code"] = df_bins["gt_label"].map(LABEL_TO_CODE)
    df_bins["pred_label"] = pred_labels

    accuracy = float((df_bins["pred_label"] == df_bins["gt_label"]).mean())

    # NOT_CALM metrics
    gt_pos = df_bins["gt_label"] == "NOT_CALM"
    pred_pos = df_bins["pred_label"] == "NOT_CALM"
    tp = int((gt_pos & pred_pos).sum())
    fp = int((~gt_pos & pred_pos).sum())
    fn = int((gt_pos & ~pred_pos).sum())
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    iou = tp / (tp + fp + fn) if (tp + fp + fn) else 0.0

    # Correlations (score vs GT code); undefined when only one class is present.
    has_both_classes = df_bins["gt_code"].nunique() > 1
    pearson = float(df_bins["score"].corr(df_bins["gt_code"])) if has_both_classes else np.nan
    try:
        spearman = (
            float(df_bins["score"].corr(df_bins["gt_code"], method="spearman"))
            if has_both_classes
            else np.nan
        )
    except Exception:
        # Best effort: a failing rank correlation must not abort the report.
        spearman = np.nan

    # MAE/RMSE on a 0-1 scale (score normalized by its min/max)
    s_min = float(df_bins["score"].min())
    s_max = float(df_bins["score"].max())
    if s_max - s_min > 1e-9:
        score_scaled = (df_bins["score"] - s_min) / (s_max - s_min)
    else:
        # Constant score: treat every bin as 0 rather than dividing by ~0.
        score_scaled = df_bins["score"] * 0.0
    mae = float(np.mean(np.abs(score_scaled - df_bins["gt_code"])))
    rmse = float(np.sqrt(np.mean((score_scaled - df_bins["gt_code"]) ** 2)))

    metrics = {
        "accuracy": accuracy,
        "pearson": pearson,
        "spearman": spearman,
        "mae_0_1": mae,
        "rmse_0_1": rmse,
        "precision_not_calm": precision,
        "recall_not_calm": recall,
        "f1_not_calm": f1,
        "iou_not_calm": iou,
        "n_bins": int(len(df_bins)),
    }

    # Reindex so both classes always appear, even if one is never seen/predicted.
    cm = pd.crosstab(
        df_bins["gt_label"],
        df_bins["pred_label"],
        rownames=["GT"],
        colnames=["Pred"],
        dropna=False,
    ).reindex(index=["CALM", "NOT_CALM"], columns=["CALM", "NOT_CALM"], fill_value=0)

    return metrics, cm


# Driver for Step 3b: evaluate the fixed-threshold classifier on the binned score.
# Relies on notebook globals: `df_fused` and `annotations_for_plot` are expected
# to have been produced by earlier cells.
if 'df_fused' not in globals() or df_fused is None or df_fused.empty:
    print("‚ö†Ô∏è No data to evaluate. Run Step 3 first.")
elif 'annotations_for_plot' not in globals() or not annotations_for_plot:
    print("‚ö†Ô∏è No GT annotations. Edit Step 2 first.")
else:
    # Prefer the fused score, then the smoothed acoustic score, then the raw score.
    score_col = "fusion_score" if "fusion_score" in df_fused.columns else ("vis_agitation" if "vis_agitation" in df_fused.columns else "agitation_score")
    df_bins = build_bins(df_fused, score_col, BIN_SIZE)
    if df_bins is None or df_bins.empty:
        print("‚ö†Ô∏è No bins for evaluation.")
    else:
        # Label each bin by the annotation covering its mean time; drop uncovered bins.
        df_bins["gt_label"] = df_bins["time_sec"].apply(lambda t: _gt_label_at(t, annotations_for_plot))
        df_bins = df_bins[df_bins["gt_label"].notna()].reset_index(drop=True)
        if df_bins.empty:
            print("‚ö†Ô∏è No GT coverage on this grid.")
        else:

            # Published as globals; they are only (re)assigned when this branch runs.
            THRESHOLD_ENTER_NOT_CALM = THRESHOLD_NOT_CALM + HYSTERESIS_DELTA
            THRESHOLD_EXIT_NOT_CALM = max(0, THRESHOLD_NOT_CALM - HYSTERESIS_DELTA)

            metrics_summary, cmatrix = compute_metrics_from_bins(
                df_bins,
                threshold=THRESHOLD_NOT_CALM,
                use_hysteresis=USE_HYSTERESIS,
                hysteresis_delta=HYSTERESIS_DELTA,
            )
            # Alignment info is optional; fall back to "no shift" when it is not a global.
            best_lag_val = globals().get("best_lag", 0.0)
            best_corr_val = globals().get("best_corr", None)

            print(f"‚úÖ Metrics on {BIN_SIZE:.1f}s grid using '{score_col}'")
            print(f"GT shift: {best_lag_val:+.2f}s | align corr: {best_corr_val}")
            print(f"Fixed threshold NOT_CALM: {THRESHOLD_NOT_CALM:.2f} (enter {THRESHOLD_ENTER_NOT_CALM:.2f}, exit {THRESHOLD_EXIT_NOT_CALM:.2f})")
            # Floats are shown with three decimals; other values (e.g. n_bins) as-is.
            for k, v in metrics_summary.items():
                if isinstance(v, float):
                    print(f"- {k}: {v:.3f}")
                else:
                    print(f"- {k}: {v}")
            display(cmatrix)


‚úÖ Metrics on 2.0s grid using 'fusion_score'
GT shift: +0.00s | align corr: None
Fixed threshold NOT_CALM: 40.00 (enter 43.00, exit 37.00)
- accuracy: 0.684
- pearson: 0.593
- spearman: 0.613
- mae_0_1: 0.347
- rmse_0_1: 0.466
- precision_not_calm: 1.000
- recall_not_calm: 0.600
- f1_not_calm: 0.750
- iou_not_calm: 0.600
- n_bins: 19


Pred,CALM,NOT_CALM
GT,Unnamed: 1_level_1,Unnamed: 2_level_1
CALM,4,0
NOT_CALM,6,9


In [60]:

# Step 3c — Feature benchmark (2s grid, 2 classes)
import numpy as np
import pandas as pd

# Columns considered for the benchmark; only those present in df_fused are used.
# "pause_ratio" is handled by build_bins as the per-bin mean of pause_flag.
FEATURE_CANDIDATES = [
    "agitation_score",
    "vis_agitation",
    "fusion_score",
    "rms_ratio",
    "rms_std",
    "voiced_ratio",
    "pitch_mean",
    "pitch_std",
    "pitch_range",
    "pitch_jitter",
    "spectral_centroid",
    "spectral_bandwidth",
    "spectral_rolloff",
    "spectral_flux",
    "zcr",
    "pause_ratio",
    "sentiment_score",
]

# Metric used to rank features (descending) and number of rows displayed.
SORT_BY = "f1_not_calm"
MAX_ROWS = 6
USE_HYSTERESIS = True
HYSTERESIS_DELTA = 3.0

# The helpers below are defined in the Step 3b cell; stop this cell if they are missing.
if "build_bins" not in globals() or "compute_metrics_from_bins" not in globals() or "_gt_label_at" not in globals() or "calibrate_threshold" not in globals():
    print("‚ö†Ô∏è Run Step 3b first to define helper functions.")
    raise SystemExit

if 'df_fused' not in globals() or df_fused is None or df_fused.empty:
    print("‚ö†Ô∏è No data to benchmark. Run Step 3 first.")
elif 'annotations_for_plot' not in globals() or not annotations_for_plot:
    print("‚ö†Ô∏è No GT annotations. Edit Step 2 first.")
else:
    feature_cols = [f for f in FEATURE_CANDIDATES if f in df_fused.columns]
    if not feature_cols:
        print("‚ö†Ô∏è No feature columns found.")
    else:
        rows = []
        for feat in feature_cols:
            df_bins = build_bins(df_fused, feat, BIN_SIZE)
            if df_bins is None or df_bins.empty:
                continue

            # Keep only bins whose mean time falls inside a ground-truth segment.
            df_bins["gt_label"] = df_bins["time_sec"].apply(lambda t: _gt_label_at(t, annotations_for_plot))
            df_bins = df_bins[df_bins["gt_label"].notna()].reset_index(drop=True)
            if df_bins.empty:
                continue

            # Normalize feature to 0-100 for fair thresholding
            vals = df_bins["score"].to_numpy()
            vmin = float(np.nanmin(vals))
            vmax = float(np.nanmax(vals))
            # Skip features that are non-finite or constant over the labelled bins.
            if not np.isfinite(vmin) or not np.isfinite(vmax) or vmax - vmin < 1e-6:
                continue
            df_bins["score"] = (vals - vmin) / (vmax - vmin) * 100.0

            # NOTE(review): the threshold is tuned and then evaluated on the same
            # bins, so the reported metrics are optimistic (no held-out data).
            best_thr, _ = calibrate_threshold(
                df_bins["score"].to_numpy(),
                df_bins["gt_label"].to_numpy(),
                steps=THRESHOLD_SEARCH_STEPS,
            )
            if best_thr is None:
                continue

            metrics, _ = compute_metrics_from_bins(
                df_bins,
                threshold=best_thr,
                use_hysteresis=USE_HYSTERESIS,
                hysteresis_delta=HYSTERESIS_DELTA,
            )
            metrics["feature"] = feat
            metrics["best_threshold"] = best_thr
            rows.append(metrics)

        if not rows:
            print("‚ö†Ô∏è No features to benchmark.")
        else:
            df_rank = pd.DataFrame(rows)
            if SORT_BY in df_rank.columns:
                df_rank = df_rank.sort_values(SORT_BY, ascending=False)
            display(df_rank.head(MAX_ROWS))


Unnamed: 0,accuracy,pearson,spearman,mae_0_1,rmse_0_1,precision_not_calm,recall_not_calm,f1_not_calm,iou_not_calm,n_bins,feature,best_threshold
6,1.0,0.822,0.707,0.231,0.311,1.0,1.0,1.0,1.0,19,pitch_mean,9.0
7,1.0,0.73,0.707,0.285,0.342,1.0,1.0,1.0,1.0,19,pitch_std,29.0
8,1.0,0.71,0.707,0.353,0.394,1.0,1.0,1.0,1.0,19,pitch_range,32.0
10,1.0,0.75,0.707,0.294,0.355,1.0,1.0,1.0,1.0,19,spectral_centroid,39.0
11,1.0,0.814,0.707,0.256,0.299,1.0,1.0,1.0,1.0,19,spectral_bandwidth,36.0
12,1.0,0.778,0.707,0.283,0.328,1.0,1.0,1.0,1.0,19,spectral_rolloff,45.0


In [61]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Candidate columns for the correlation tables; names absent from the data
# frame are skipped by the loops that consume this list.
FEATURES = [
    "rms","rms_std","rms_ratio","voiced_ratio",
    "pitch_mean","pitch_std","pitch_range","pitch_slope","pitch_jitter",
    "spectral_centroid","spectral_bandwidth","spectral_rolloff","spectral_flux","zcr",
    "pause_flag","sentiment_score"
]

def _label_at(t, ann):
    for seg in ann:
        if t >= seg["start"] and t < seg["end"]:
            return seg["label"].upper()
    return None

# Correlation report: for every available feature, show its Pearson/Spearman
# correlation with (1) the manual ground truth and (2) the model's agitation_score.
# Relies on notebook globals `df_fused` and `manual_annotations`.
if 'df_fused' not in globals() or df_fused is None or df_fused.empty:
    print("Run Step 3 first.")
elif 'manual_annotations' not in globals() or not manual_annotations:
    print("No manual_annotations. Edit Step 2.")
else:
    # NOTE: this rebinds the notebook-level name `df` to a labelled copy of df_fused.
    df = df_fused.copy()
    df["gt"] = df["time_sec"].apply(lambda t: _label_at(t, manual_annotations))
    # Rows outside every annotated segment carry no label and are dropped.
    df = df[df["gt"].notna()].copy()
    df["gt_code"] = df["gt"].map({"CALM": 0, "NOT_CALM": 1})

    # Correlation with GT (CALM/NOT_CALM)
    rows = []
    for feat in FEATURES:
        if feat not in df.columns:
            continue
        # Non-numeric entries become NaN instead of raising.
        x = pd.to_numeric(df[feat], errors="coerce")
        y = df["gt_code"].astype(float)
        # A feature with fewer than two distinct values has no defined correlation.
        if x.nunique() < 2:
            continue
        pearson = x.corr(y)
        try:
            spearman = x.corr(y, method="spearman")
        except Exception:
            spearman = np.nan
        rows.append({"feature": feat, "pearson": pearson, "spearman": spearman})

    df_corr_gt = pd.DataFrame(rows)
    if df_corr_gt.empty:
        print("No valid features for GT correlation.")
    else:
        # Rank by correlation strength regardless of sign.
        df_corr_gt["abs_pearson"] = df_corr_gt["pearson"].abs()
        df_corr_gt = df_corr_gt.sort_values("abs_pearson", ascending=False)
        display(df_corr_gt)

        fig = go.Figure()
        fig.add_trace(go.Bar(x=df_corr_gt["feature"], y=df_corr_gt["pearson"], name="Pearson vs GT"))
        fig.update_layout(title="Feature correlation with GT (CALM/NOT_CALM)",
                          xaxis_title="Feature", yaxis_title="Correlation",
                          height=360)
        fig.show()

    # Correlation with model output (agitation_score)
    if "agitation_score" in df.columns:
        rows = []
        for feat in FEATURES:
            if feat not in df.columns:
                continue
            x = pd.to_numeric(df[feat], errors="coerce")
            y = pd.to_numeric(df["agitation_score"], errors="coerce")
            # Same constant-feature guard as in the ground-truth table.
            if x.nunique() < 2:
                continue
            pearson = x.corr(y)
            try:
                spearman = x.corr(y, method="spearman")
            except Exception:
                spearman = np.nan
            rows.append({"feature": feat, "pearson": pearson, "spearman": spearman})

        df_corr_model = pd.DataFrame(rows)
        if not df_corr_model.empty:
            df_corr_model["abs_pearson"] = df_corr_model["pearson"].abs()
            df_corr_model = df_corr_model.sort_values("abs_pearson", ascending=False)
            display(df_corr_model)

            fig = go.Figure()
            fig.add_trace(go.Bar(x=df_corr_model["feature"], y=df_corr_model["pearson"], name="Pearson vs agitation_score"))
            fig.update_layout(title="Feature correlation with model output",
                              xaxis_title="Feature", yaxis_title="Correlation",
                              height=360)
            fig.show()


Unnamed: 0,feature,pearson,spearman,abs_pearson
4,pitch_mean,0.822,0.707,0.822
9,spectral_bandwidth,0.814,0.707,0.814
10,spectral_rolloff,0.778,0.707,0.778
8,spectral_centroid,0.75,0.707,0.75
5,pitch_std,0.73,0.707,0.73
6,pitch_range,0.71,0.707,0.71
12,zcr,0.591,0.566,0.591
7,pitch_jitter,0.505,0.377,0.505
11,spectral_flux,0.473,0.471,0.473
0,rms,0.216,0.377,0.216


Unnamed: 0,feature,pearson,spearman,abs_pearson
4,pitch_mean,0.855,0.851,0.855
8,spectral_centroid,0.653,0.681,0.653
9,spectral_bandwidth,0.62,0.623,0.62
10,spectral_rolloff,0.617,0.589,0.617
6,pitch_range,0.573,0.575,0.573
5,pitch_std,0.561,0.512,0.561
11,spectral_flux,0.495,0.368,0.495
12,zcr,0.422,0.426,0.422
2,rms_ratio,0.356,0.282,0.356
0,rms,0.356,0.282,0.356


In [None]:
# Step 4 — Per-feature plots (visual only)
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Step 4 preconditions: a non-empty fused feature table and a ground-truth
# annotation list must already exist in the notebook namespace.
if globals().get('df_fused') is None or df_fused.empty:
    print("Run Step 3 first.")
elif 'annotations_for_plot' not in globals():
    print("No GT annotations. Run Step 2.")
else:
    # Feature columns to plot, in display order. Columns that are absent from
    # df_fused are skipped by the plotting loop further down.
    FEATURES_TO_PLOT = [
        "pitch_mean", "pitch_std", "pitch_range",
        "spectral_centroid", "spectral_bandwidth", "spectral_rolloff",
        "spectral_flux", "zcr", "pitch_jitter",
        "rms", "rms_std", "rms_ratio",
        "voiced_ratio", "pause_flag", "sentiment_score",
    ]

    # Scaling mode for the 0-100 rescale: when USE_QUANTILES is true the
    # bounds are the LOW_Q / HIGH_Q quantiles, otherwise the min / max.
    USE_QUANTILES = True
    LOW_Q, HIGH_Q = 0.05, 0.95

    def _scale_to_0_100(series):
        s = pd.to_numeric(series, errors="coerce")
        if USE_QUANTILES:
            lo = float(s.quantile(LOW_Q))
            hi = float(s.quantile(HIGH_Q))
        else:
            lo = float(s.min())
            hi = float(s.max())
        if not np.isfinite(lo) or not np.isfinite(hi) or hi - lo < 1e-9:
            return None
        return ((s - lo) / (hi - lo) * 100.0).clip(0, 100)

    def _apply_hysteresis(scores, thr_enter, thr_exit):
        if len(scores) == 0:
            return []
        state = "CALM" if scores[0] < thr_enter else "NOT_CALM"
        labels = []
        for s in scores:
            if state == "CALM" and s >= thr_enter:
                state = "NOT_CALM"
            elif state == "NOT_CALM" and s < thr_exit:
                state = "CALM"
            labels.append(state)
        return labels

    # Keep the x-axis aligned to the audio duration: start from the last
    # analysis timestamp and prefer the real sample count when the waveform
    # and a usable sample rate are available.
    has_time_axis = 'time_sec' in df_fused.columns
    audio_duration_sec = float(df_fused['time_sec'].max()) if has_time_axis else 0.0
    sample_rate = globals().get('sr')
    if 'audio' in globals() and audio is not None and len(audio) > 0 and sample_rate:
        audio_duration_sec = len(audio) / sample_rate
    audio_duration_sec = max(0.0, audio_duration_sec)
    # A [0, 0] range has zero width; fall back to Plotly's autorange instead.
    x_range = [0, audio_duration_sec] if audio_duration_sec > 0 else None

    # ---- Loop-invariant settings (identical for every feature) ----
    GT_COLORS = {
        "CALM": "rgba(0, 170, 90, 0.70)",
        "NOT_CALM": "rgba(220, 40, 30, 0.70)",
    }
    # Neutral fill for any ground-truth label outside GT_COLORS, so an
    # unexpected label does not abort the whole plotting run with a KeyError.
    GT_FALLBACK_COLOR = "rgba(150, 150, 150, 0.50)"

    # rolling() rejects windows smaller than 1, so clamp the user setting.
    smooth_win = max(1, int(globals().get("VIS_SMOOTH_WINDOW", 3)))

    thr_enter = THRESHOLD_NOT_CALM + HYSTERESIS_DELTA
    thr_exit = max(0, THRESHOLD_NOT_CALM - HYSTERESIS_DELTA)

    # Every distinct segment boundary in the ground truth, in time order.
    gt_change_points = sorted(
        {t for seg in annotations_for_plot for t in (seg["start"], seg["end"])}
    )

    # Without a time axis nothing can be plotted; report it once instead of
    # failing with a KeyError on the first feature.
    if has_time_axis:
        time_axis = df_fused["time_sec"].to_numpy()
        plottable = [f for f in FEATURES_TO_PLOT if f in df_fused.columns]
    else:
        print("df_fused has no 'time_sec' column. Re-run Step 3.")
        time_axis = None
        plottable = []

    for feat in plottable:
        scaled = _scale_to_0_100(df_fused[feat])
        if scaled is None:
            print(f"Skipping {feat}: not enough variance")
            continue

        df_plot = pd.DataFrame({
            "time_sec": time_axis,
            "raw": scaled.to_numpy(),
        })
        df_plot["smoothed"] = df_plot["raw"].rolling(window=smooth_win, min_periods=1).mean()

        fig = go.Figure()

        # GT — drawn as a coloured strip on the separate y2 axis
        for seg in annotations_for_plot:
            label = seg["label"].upper()
            fig.add_shape(
                type="rect",
                x0=seg["start"],
                x1=seg["end"],
                y0=0,
                y1=1,
                yref="y2",
                fillcolor=GT_COLORS.get(label, GT_FALLBACK_COLOR),
                line=dict(width=0),
                layer="below",
            )
            fig.add_annotation(
                x=(seg["start"] + seg["end"]) / 2,
                xref="x",
                y=0.99,
                yref="paper",
                text=label,
                showarrow=False,
                font=dict(size=12, color="#2c3e50"),
            )

        # GT change markers (dotted)
        for x in gt_change_points:
            fig.add_vline(x=x, line_width=1, line_dash="dot", line_color="rgba(0,0,0,0.25)")

        # Feature raw + smoothed
        fig.add_trace(
            go.Scatter(
                x=df_plot["time_sec"],
                y=df_plot["raw"],
                name=f"{feat} (raw)",
                mode="lines",
                line=dict(color="rgba(120,120,120,0.3)", width=1),
                hoverinfo="skip",
                yaxis="y",
            )
        )
        fig.add_trace(
            go.Scatter(
                x=df_plot["time_sec"],
                y=df_plot["smoothed"],
                name=f"{feat} (smoothed)",
                mode="lines",
                line=dict(color="#E67E22", width=3),
                yaxis="y",
            )
        )

        # Model change markers (dashed): hysteresis applied to the smoothed
        # feature; a marker is placed at each sample whose label differs from
        # the previous sample's.
        scores = df_plot["smoothed"].to_numpy()
        pred_labels = _apply_hysteresis(scores, thr_enter, thr_exit)
        change_times = [
            t
            for t, prev, cur in zip(time_axis[1:], pred_labels, pred_labels[1:])
            if cur != prev
        ]
        for x in change_times:
            fig.add_vline(x=x, line_width=1, line_dash="dash", line_color="rgba(0,0,0,0.2)")

        # Emotion zones (reference only): below / above the fixed threshold
        fig.add_hrect(
            y0=0, y1=THRESHOLD_NOT_CALM,
            fillcolor="rgba(0,170,90,0.18)",
            line_width=0,
            layer="below",
            yref="y",
        )
        fig.add_hrect(
            y0=THRESHOLD_NOT_CALM, y1=100,
            fillcolor="rgba(220,40,30,0.18)",
            line_width=0,
            layer="below",
            yref="y",
        )

        fig.update_layout(
            title=f"Feature vs Ground Truth — {feat} — {dropdown.value}",
            xaxis=dict(title="Time (sec)", range=x_range),
            yaxis=dict(
                title="Scaled feature (0–100)",
                range=[0, 100],
                domain=[0.0, 0.82],
            ),
            yaxis2=dict(
                range=[0, 1],
                domain=[0.86, 1.0],
                visible=False,
            ),
            hovermode="x unified",
            plot_bgcolor="white",
            paper_bgcolor="white",
            template="plotly_white",
            font=dict(size=13),
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.15,
                xanchor="left",
                x=0,
            ),
            margin=dict(l=70, r=20, t=90, b=50),
            height=560,
        )
        fig.show()
