In [225]:
!pip install librosa==0.10.1 numpy pandas matplotlib seaborn plotly scipy ipywidgets
!pip install sentencepiece




In [None]:

# Universal loader for WAV/MP3 with forced conversion to 16 kHz mono
import numpy as _np

def load_audio_any(path, target_sr=16000):
    """Load an audio file as mono float32 samples resampled to ``target_sr``.

    Decoding is delegated to ``librosa.load(path, sr=target_sr, mono=True)``.
    Any exception raised while loading or converting is reported with
    ``print`` and swallowed, so callers never see it.

    Args:
        path: Path of the audio file to read.
        target_sr: Sample rate requested from librosa.

    Returns:
        A ``(samples, sample_rate)`` tuple. On failure ``samples`` is an
        empty float32 array and ``sample_rate`` is ``target_sr``.
    """
    try:
        samples, rate = librosa.load(path, sr=target_sr, mono=True)
        outcome = (samples.astype(_np.float32), rate)
    except Exception as err:
        # Best-effort loader: report the problem and hand back an empty signal.
        print(f"‚ö†Ô∏è –ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å {path}: {err}")
        outcome = (_np.array([], dtype=_np.float32), target_sr)
    return outcome


import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os, glob, warnings
import ipywidgets as widgets
from IPython.display import display, clear_output

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Render pandas floats with three decimal places.
pd.options.display.float_format = '{:.3f}'.format


def extract_features(audio, sr=16000, hop_length=512):
    """Compute loudness and pitch descriptors for one audio window.

    Args:
        audio: 1-D sequence of samples, or ``None``.
        sr: Sample rate passed to ``librosa.pyin``.
        hop_length: Hop size used for both the RMS and the pYIN frames.

    Returns:
        Dict with keys ``rms`` / ``rms_std`` (mean and standard deviation of
        the frame-wise RMS), ``pitch_jitter`` (std / mean * 100 of the
        non-NaN f0 frames), ``voiced_ratio`` (mean of the non-NaN pYIN voiced
        probabilities) and ``pitch_mean`` (NaN-ignoring mean f0).
        All values are 0.0 when ``audio`` is ``None``, empty, or numerically
        all-zero after NaN/inf have been replaced by 0.
    """
    silent = {
        'rms': 0.0,
        'rms_std': 0.0,
        'pitch_jitter': 0.0,
        'voiced_ratio': 0.0,
        'pitch_mean': 0.0
    }
    if audio is None or len(audio) == 0:
        return silent

    samples = np.nan_to_num(
        np.asarray(audio, dtype=np.float32), nan=0.0, posinf=0.0, neginf=0.0
    )
    if np.allclose(samples, 0):
        return silent

    frame_rms = librosa.feature.rms(y=samples, hop_length=hop_length)[0]

    try:
        f0, _voiced_flag, voicing = librosa.pyin(
            samples, fmin=50, fmax=600, sr=sr, hop_length=hop_length
        )
    except Exception:
        # Pitch tracking failed: behave as if the window were fully unvoiced.
        f0 = np.full_like(frame_rms, np.nan)
        voicing = np.zeros_like(frame_rms)

    usable_voicing = voicing[~np.isnan(voicing)]
    if usable_voicing.size > 0:
        voiced_ratio = float(np.mean(usable_voicing))
    else:
        voiced_ratio = 0.0

    voiced_f0 = f0[~np.isnan(f0)]
    pitch_jitter = 0.0
    if voiced_f0.size > 1 and np.mean(voiced_f0) > 0:
        pitch_jitter = float(np.std(voiced_f0) / np.mean(voiced_f0) * 100)

    # nanmean is NaN when every frame is unvoiced; report 0.0 in that case.
    mean_f0 = np.nanmean(f0)
    pitch_mean = float(mean_f0) if np.isfinite(mean_f0) else 0.0

    return {
        'rms': float(np.mean(frame_rms)),
        'rms_std': float(np.std(frame_rms)),
        'pitch_jitter': pitch_jitter,
        'voiced_ratio': voiced_ratio,
        'pitch_mean': pitch_mean
    }


def compute_baseline(audio, sr=16000, window_sec=15):
    """Average ``extract_features`` over consecutive windows of ``audio``.

    The signal is cut into non-overlapping chunks of ``window_sec`` seconds;
    chunks shorter than half a second are skipped. If no chunk qualifies
    (short file), features of the whole signal are used instead. The function
    walks the entire array it receives, so the caller decides how much audio
    counts as "baseline".

    Args:
        audio: 1-D sample array, or ``None``.
        sr: Sample rate of ``audio``.
        window_sec: Chunk length in seconds.

    Returns:
        Dict with the same keys as ``extract_features``. ``rms``, ``rms_std``
        and ``pitch_jitter`` are floored at 1e-6, 1e-9 and 1e-3 respectively
        so they can be used safely as divisors.
    """
    if audio is None:
        return {'rms': 1e-6, 'rms_std': 1e-9, 'pitch_jitter': 1e-3, 'voiced_ratio': 0.0, 'pitch_mean': 0.0}

    step = max(int(window_sec * sr), 1)
    shortest_ok = int(0.5 * sr)

    chunks = (audio[offset:offset + step] for offset in range(0, len(audio), step))
    per_window = [extract_features(chunk, sr) for chunk in chunks if len(chunk) >= shortest_ok]

    if not per_window:
        # Too short for even one usable chunk: fall back to the whole signal.
        per_window = [extract_features(audio, sr)]

    averaged = {
        name: float(np.nan_to_num(np.mean([feats.get(name, 0.0) for feats in per_window])))
        for name in per_window[0]
    }

    # Keep the divisors strictly positive.
    for name, floor in (('rms', 1e-6), ('rms_std', 1e-9), ('pitch_jitter', 1e-3)):
        averaged[name] = max(averaged.get(name, 0.0), floor)
    return averaged


def compute_agitation_score(features, baseline, prev_score=None, smoothing_alpha=0.5, spike_threshold=20.0, max_step=9.0):
    """Compute a smoothed, spike-limited agitation score in the range 0-100.

    The raw score is the clipped sum of four terms: loudness relative to the
    baseline, pitch jitter, loudness volatility and voiced ratio. It is then
    blended with the previous score.

    Args:
        features: Dict of window features (``rms``, ``rms_std``,
            ``pitch_jitter``, ``voiced_ratio``). Missing, ``None``, NaN or
            infinite entries are treated as 0.0 so one bad value cannot turn
            the whole score into NaN.
        baseline: Dict holding the reference ``rms``; non-finite or
            non-positive values are replaced by 1e-6.
        prev_score: Score of the previous window. ``None`` or a non-finite
            value means "no history", in which case the raw score is used as
            the reference.
        smoothing_alpha: Share of the new raw score in the blend; the rest
            comes from ``prev_score``.
        spike_threshold: If ``|raw - prev|`` exceeds this value the blend is
            skipped and the score moves by exactly ``max_step``.
        max_step: Step applied towards the raw score when a spike is detected.

    Returns:
        The new score as a float rounded to one decimal place.
    """
    def _finite(value, fallback=0.0):
        """Coerce ``value`` to float, substituting ``fallback`` for None/NaN/inf."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return fallback
        return number if np.isfinite(number) else fallback

    baseline_rms = max(_finite(baseline.get('rms'), 1e-6), 1e-6)
    rms_ratio = max(_finite(features.get('rms')) / baseline_rms, 0.0)
    rms_volatility = max(_finite(features.get('rms_std')) / baseline_rms, 0.0)
    pitch_jitter = _finite(features.get('pitch_jitter'))
    voiced_ratio = _finite(features.get('voiced_ratio'))

    # Loudness only contributes once it exceeds 90% of the baseline.
    rms_term = np.clip((rms_ratio - 0.9), 0, 2.0) * 8.0
    jitter_term = np.clip(pitch_jitter, 0, 40) * 1.3
    volatility_term = np.clip(rms_volatility, 0, 3.0) * 18.0
    voiced_term = np.clip(voiced_ratio, 0, 1.0) * 5.0

    raw_score = float(np.clip(rms_term + jitter_term + volatility_term + voiced_term, 0, 100))

    if prev_score is None or not np.isfinite(prev_score):
        prev_score = raw_score

    delta_raw = raw_score - prev_score

    if abs(delta_raw) > spike_threshold:
        # Anti-spike: move a fixed step in the direction of the jump.
        candidate = prev_score + np.sign(delta_raw) * max_step
    else:
        candidate = smoothing_alpha * raw_score + (1 - smoothing_alpha) * prev_score

    candidate = float(np.clip(candidate, 0, 100))
    return round(candidate, 1)


def get_mood_state(features, baseline, agitation_score, prev_state='CALM', prev_score=None, recent_scores=None, state_streak=1, pending_state=None, pending_count=0):
    """Classify a window as CALM / TENSE / ESCALATING with hysteresis.

    The candidate state is derived from the smoothed ``agitation_score``:
    ESCALATING when the score is >= 65, when it grew by more than 25 relative
    to the third-last entry of ``recent_scores`` (or to the previous score
    when fewer than three are available), or when the previous state was
    ESCALATING and the score is still >= 60; TENSE when the score is >= 30;
    CALM otherwise. Note that the ``step_change < 10`` guard on the CALM
    branch never changes the outcome, because any score below 30 ends up CALM.

    A candidate that differs from ``prev_state`` must be observed on
    consecutive calls before it is adopted: 3 times for CALM/TENSE switches
    and 4 times for any switch involving ESCALATING.

    Args:
        features: Window features; ``rms`` and ``pitch_jitter`` are read.
        baseline: Baseline features; ``rms`` is read (floored at 1e-6).
        agitation_score: Smoothed score for the current window.
        prev_state: State returned for the previous window.
        prev_score: Score of the previous window, if any.
        recent_scores: Earlier scores, oldest first.
        state_streak: Number of windows spent in ``prev_state`` so far.
        pending_state: Candidate currently waiting to be confirmed.
        pending_count: How many consecutive windows voted for ``pending_state``.

    Returns:
        Dict with ``state``, ``agitation_score``, ``rms_ratio``,
        ``pitch_jitter``, the updated ``state_streak`` / ``pending_state`` /
        ``pending_count`` counters and ``borderline_state`` (True while a
        switch is pending but not yet confirmed).
    """
    history = recent_scores or []

    floor_rms = max(baseline.get('rms', 1e-6) or 1e-6, 1e-6)
    loudness_ratio = float(np.clip((features.get('rms', 0.0) / floor_rms), 0, 10))
    jitter = float(np.nan_to_num(features.get('pitch_jitter', 0.0), nan=0.0, posinf=0.0, neginf=0.0))

    # Reference score for the window-to-window change.
    if prev_score is not None:
        reference = prev_score
    elif history:
        reference = history[-1]
    else:
        reference = None

    step_change = 0.0 if reference is None else abs(agitation_score - reference)

    if len(history) >= 3:
        growth = agitation_score - history[-3]
    elif reference is not None:
        growth = agitation_score - reference
    else:
        growth = 0.0

    held_escalation = (prev_state == 'ESCALATING' and agitation_score >= 60)

    if held_escalation or (agitation_score >= 65) or (growth > 25):
        target = 'ESCALATING'
    elif (agitation_score < 30) and (step_change < 10):
        target = 'CALM'
    elif agitation_score >= 30:
        target = 'TENSE'
    else:
        target = 'CALM'

    if target == prev_state:
        # Nothing to confirm: extend the streak and drop any pending switch.
        final_state, streak_out = prev_state, state_streak + 1
        pending_out, votes_out, borderline = None, 0, False
    else:
        if {prev_state, target} == {'CALM', 'TENSE'}:
            votes_needed = 3
        elif 'ESCALATING' in (prev_state, target):
            votes_needed = 4
        else:
            votes_needed = 3

        votes = pending_count + 1 if target == pending_state else 1

        if votes >= votes_needed:
            final_state, streak_out = target, 1
            pending_out, votes_out, borderline = None, 0, False
        else:
            final_state, streak_out = prev_state, state_streak + 1
            pending_out, votes_out, borderline = target, votes, True

    return {
        'state': final_state,
        'agitation_score': round(float(agitation_score), 1),
        'rms_ratio': round(loudness_ratio, 2),
        'pitch_jitter': round(jitter, 1),
        'state_streak': streak_out,
        'pending_state': pending_out,
        'pending_count': votes_out,
        'borderline_state': borderline
    }


# --- Audio file picker -------------------------------------------------------
# Scans a directory for audio files and shows a dropdown + button that load the
# selected file and compute its baseline. The results are published through
# the module-level variables below, which later cells read.
print("üìÇ –°–∫–∞–Ω–∏—Ä—É–µ–º –∞—É–¥–∏–æ –ø–∞–ø–∫—É...")
# Hard-coded sample directory; point this at your own recordings.
audio_dir = '/Users/zentrovoy/Documents/Insight Genie/VOICE-MODEL/Real-Time Emotion Detection/audio_samples'
# Despite the name, this list holds both *.wav and *.mp3 matches.
wav_files = glob.glob(os.path.join(audio_dir, '*.wav')) + glob.glob(os.path.join(audio_dir, '*.mp3'))

# Notebook-wide state, filled in by load_selected().
audio = None
baseline = None
duration = 0
sr = 16000
audio_ready = False
selected_path = None

if not wav_files:
    dropdown = None
    print("‚ùå WAV —Ñ–∞–π–ª—ã –Ω–µ –Ω–∞–π–¥–µ–Ω—ã! –ü—Ä–æ–≤–µ—Ä—å –ø—É—Ç—å.")
else:
    file_names = [os.path.basename(f) for f in wav_files]
    dropdown = widgets.Dropdown(
        options=file_names,
        description='–í—ã–±–µ—Ä–∏ –∞—É–¥–∏–æ:',
        style={'description_width': 'initial'}
    )
    load_btn = widgets.Button(description='–ó–∞–≥—Ä—É–∑–∏—Ç—å —Ñ–∞–π–ª', button_style='primary')
    status = widgets.Output()

    def load_selected(_):
        """Button callback: load the chosen file and compute its baseline.

        Updates the module-level ``audio``, ``sr``, ``duration``, ``baseline``,
        ``selected_path`` and ``audio_ready``. ``audio_ready`` is True only
        when a non-empty signal was decoded. All messages go to the ``status``
        output widget.
        """
        global audio, baseline, duration, sr, audio_ready, selected_path
        with status:
            status.clear_output()
            if dropdown.value is None:
                print("‚ö†Ô∏è –í—ã–±–µ—Ä–∏ —Ñ–∞–π–ª –≤ –≤—ã–ø–∞–¥–∞—é—â–µ–º —Å–ø–∏—Å–∫–µ.")
                audio_ready = False
                return
            selected_file = os.path.join(audio_dir, dropdown.value)
            if not os.path.exists(selected_file):
                print("‚ùå –§–∞–π–ª –Ω–µ –Ω–∞–π–¥–µ–Ω (–≤–æ–∑–º–æ–∂–Ω–æ —É–¥–∞–ª–µ–Ω).")
                audio_ready = False
                return
            print(f"üîÑ –ó–∞–≥—Ä—É–∂–∞–µ–º: {dropdown.value}")
            audio, sr = load_audio_any(selected_file, target_sr=16000)
            if len(audio) == 0:
                # load_audio_any() returns an empty array when decoding fails
                # (it has already printed the reason). Reset the shared state
                # so later cells neither report success for an empty signal
                # nor reuse the baseline/path of a previously loaded file.
                baseline = None
                duration = 0
                selected_path = None
                audio_ready = False
                return
            duration = len(audio) / sr if sr else 0
            # The baseline is computed from the first 15 seconds (or less).
            baseline_audio = audio[:min(15 * sr, len(audio))]
            baseline = compute_baseline(baseline_audio, sr)
            readable = {k: round(v, 3) for k, v in baseline.items()}
            selected_path = selected_file
            audio_ready = True
            print(f"‚úÖ –ó–∞–≥—Ä—É–∂–µ–Ω–æ: {duration:.1f} —Å–µ–∫, sr={sr}Hz")
            print("‚úÖ Baseline:", readable)

    load_btn.on_click(load_selected)

    print(f"‚úÖ –ù–∞–π–¥–µ–Ω–æ {len(wav_files)} WAV —Ñ–∞–π–ª–æ–≤. –í—ã–±–µ—Ä–∏ –∏ –Ω–∞–∂–º–∏ '–ó–∞–≥—Ä—É–∑–∏—Ç—å —Ñ–∞–π–ª'.")
    display(widgets.VBox([dropdown, load_btn, status]))


📂 Сканируем аудио папку...
✅ Найдено 13 WAV файлов. Выбери и нажми 'Загрузить файл'.


VBox(children=(Dropdown(description='–í—ã–±–µ—Ä–∏ –∞—É–¥–∏–æ:', options=('untitled #2.wav', 'good_interview.wav', 'untitl‚Ä¶

In [231]:

# Optional dependency: the semantic (ASR + sentiment) layer needs torch and
# transformers. The notebook keeps working without them.
try:
    import torch
    from transformers import pipeline
except Exception:
    # `pipeline = None` is the flag load_text_pipelines() checks before it
    # tries to build any model.
    pipeline = None
    print("‚ö†Ô∏è transformers –Ω–µ –¥–æ—Å—Ç—É–ø–µ–Ω, —Å–µ–º–∞–Ω—Ç–∏—á–µ—Å–∫–∏–π —Å–ª–æ–π –±—É–¥–µ—Ç –æ—Ç–∫–ª—é—á–µ–Ω")

# Filled in lazily by load_text_pipelines(); None means "not loaded yet".
asr_pipe = None
sent_pipe = None

def load_text_pipelines(asr_model="openai/whisper-small", cls_model="nlptown/bert-base-multilingual-uncased-sentiment"):
    """Lazily build the ASR and text-classification pipelines.

    Pipelines that are already loaded are reused. A failure to build either
    one (for example no network or missing model) is printed and leaves that
    pipeline as ``None`` without raising.

    Args:
        asr_model: Model name for the "automatic-speech-recognition" pipeline.
        cls_model: Model name for the "text-classification" pipeline.

    Returns:
        True only when both module-level ``asr_pipe`` and ``sent_pipe`` are
        available after the call.
    """
    global asr_pipe, sent_pipe
    if pipeline is None:
        print("‚ö†Ô∏è transformers –Ω–µ —É—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω")
        return False

    # Device 0 when torch was imported and reports CUDA, otherwise -1.
    cuda_ok = "torch" in globals() and hasattr(torch, "cuda") and torch.cuda.is_available()
    device = 0 if cuda_ok else -1

    def _try_build(task, model_name, failure_note):
        """Return a pipeline for ``task``, or None (after printing) on any error."""
        try:
            return pipeline(task, model=model_name, device=device)
        except Exception as err:
            print(failure_note, err)
            return None

    if asr_pipe is None:
        asr_pipe = _try_build(
            "automatic-speech-recognition", asr_model,
            "‚ö†Ô∏è –ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å ASR:"
        )
    if sent_pipe is None:
        sent_pipe = _try_build(
            "text-classification", cls_model,
            "‚ö†Ô∏è –ù–µ —É–¥–∞–ª–æ—Å—å –∑–∞–≥—Ä—É–∑–∏—Ç—å —Ç–µ–∫—Å—Ç–æ–≤—É—é –º–æ–¥–µ–ª—å:"
        )
    return not (asr_pipe is None or sent_pipe is None)


def transcribe_and_classify(path, window_sec=5.0, hop_sec=5.0, sr=16000):
    """Transcribe an audio file window by window and score each transcript.

    The file is loaded with ``librosa.load`` at ``sr`` and cut into windows of
    ``window_sec`` seconds taken every ``hop_sec`` seconds; windows shorter
    than half a second are skipped. Each window is transcribed with the
    module-level ``asr_pipe`` and the transcript is scored with ``sent_pipe``.
    A failing ASR call yields an empty transcript. A failing classifier call,
    or an empty or whitespace-only transcript, yields a score of 0.0.

    Args:
        path: Audio file to process.
        window_sec: Window length in seconds.
        hop_sec: Distance between window starts in seconds.
        sr: Sample rate used for loading and passed to the ASR pipeline.

    Returns:
        DataFrame with columns ``start_sec``, ``end_sec``, ``sentiment_score``
        and ``text`` (one row per window), or an empty DataFrame when either
        pipeline is not loaded.
    """
    if asr_pipe is None or sent_pipe is None:
        return pd.DataFrame()
    audio, _ = librosa.load(path, sr=sr)
    # Clamp to at least one sample: range() raises ValueError on a zero step,
    # which would happen for a hop_sec that truncates to 0 samples.
    win = max(int(window_sec * sr), 1)
    hop = max(int(hop_sec * sr), 1)
    min_len = 0.5 * sr
    rows = []
    for start in range(0, len(audio), hop):
        end = min(len(audio), start + win)
        chunk = audio[start:end]
        if len(chunk) < min_len:
            continue
        try:
            text = asr_pipe({"array": chunk, "sampling_rate": sr}).get("text", "")
        except Exception:
            text = ""
        sentiment_score = 0.0
        # Nothing was recognised: do not ask the classifier to score an empty
        # string, its answer would not describe anything that was said.
        if text and text.strip():
            try:
                senti = sent_pipe(text)[0]
                # NOTE(review): "score" looks like the confidence of the
                # predicted label rather than a polarity value - confirm this
                # is what the fusion step is meant to consume.
                if isinstance(senti, dict):
                    sentiment_score = float(senti.get("score", 0.0))
            except Exception:
                sentiment_score = 0.0
        rows.append({
            "start_sec": start / sr,
            "end_sec": end / sr,
            "sentiment_score": sentiment_score,
            "text": text
        })
    return pd.DataFrame(rows)


def fuse_audio_text(df_audio: pd.DataFrame, df_text: pd.DataFrame):
    """Late fusion of the acoustic score with the text sentiment score.

    Every audio row is matched to the text window whose midpoint is nearest
    (within 3 seconds); unmatched rows get a sentiment of 0.0. The text
    component is scaled to 0-100 when its maximum is <= 1, smoothed with a
    centred 3-row rolling mean and floored at 20. The result is
    ``fusion_score = 0.7 * audio + 0.3 * text``, where the audio component is
    the first available column among ``vis_agitation``, ``agitation_score``
    and ``frustration_proxy``.

    Args:
        df_audio: Per-window acoustic results. The time axis is taken from
            ``time_sec``, else the midpoint of ``start_sec``/``end_sec``,
            else the row position.
        df_text: Output of the transcription step with ``start_sec``,
            ``end_sec`` and ``sentiment_score`` columns. May be ``None`` or
            empty, in which case ``fusion_score`` equals the audio component
            and ``sentiment_score`` is 0.0.

    Returns:
        A new DataFrame (sorted by ``mid_sec`` when text is present) with
        ``sentiment_score`` and ``fusion_score`` columns, carrying the same
        ``attrs`` as ``df_audio``. ``df_audio`` itself is returned unchanged
        when it is ``None`` or empty.
    """
    if df_audio is None or df_audio.empty:
        return df_audio
    df_audio = df_audio.copy()
    if df_text is None or df_text.empty:
        df_audio["fusion_score"] = df_audio.get("vis_agitation", df_audio.get("agitation_score", 0.0))
        df_audio["sentiment_score"] = 0.0
        return df_audio

    # Session-level metadata (e.g. 'session_state') lives in attrs; keep a copy
    # because propagation through a merge is not guaranteed.
    session_attrs = dict(df_audio.attrs)

    df_text = df_text.copy()
    df_text["mid_sec"] = ((df_text["start_sec"] + df_text["end_sec"]) / 2).astype(float)
    if "time_sec" in df_audio.columns:
        df_audio["mid_sec"] = df_audio["time_sec"]
    elif "start_sec" in df_audio.columns and "end_sec" in df_audio.columns:
        df_audio["mid_sec"] = (df_audio["start_sec"] + df_audio["end_sec"]) / 2
    else:
        df_audio["mid_sec"] = range(len(df_audio))
    # merge_asof requires both keys to share one dtype; the positional fallback
    # (and an all-integer time column) would otherwise clash with the float
    # text midpoints.
    df_audio["mid_sec"] = df_audio["mid_sec"].astype(float)

    # A sentiment column left over from an earlier fusion would collide with
    # the one coming from df_text and be renamed with merge suffixes.
    df_audio = df_audio.drop(columns=["sentiment_score"], errors="ignore")

    merged = pd.merge_asof(
        df_audio.sort_values("mid_sec"),
        df_text.sort_values("mid_sec")[["mid_sec", "sentiment_score"]],
        on="mid_sec", direction="nearest", tolerance=3
    )
    merged.attrs = session_attrs
    merged["sentiment_score"] = merged["sentiment_score"].fillna(0.0)

    base = merged.get("vis_agitation", merged.get("agitation_score", merged.get("frustration_proxy", 0.0)))
    text_component = merged["sentiment_score"]
    if text_component.max() <= 1:
        # Scores look like 0-1 values: bring them onto the 0-100 scale.
        text_component = text_component * 100

    # Smooth and apply a floor to avoid brief dips.
    text_component = text_component.rolling(window=3, min_periods=1, center=True).mean()
    text_component = text_component.clip(lower=20)  # minimum text contribution
    merged["fusion_score"] = 0.7 * base + 0.3 * text_component

    return merged


In [232]:
# Analysis and chart
# Slides a 3 s window (1 s hop) over the loaded audio, computes an agitation
# score and mood state per window, optionally fuses the result with text
# sentiment, and plots everything with Plotly.
# Guard: run only after the loader cell has produced a non-empty `audio` array.
if 'audio_ready' not in globals() or not audio_ready or audio is None or len(audio) == 0:
    print("‚ö†Ô∏è –ù–µ—Ç –∞—É–¥–∏–æ –¥–ª—è –∞–Ω–∞–ª–∏–∑–∞. –°–Ω–∞—á–∞–ª–∞ –≤—ã–±–µ—Ä–∏ –∏ –∑–∞–≥—Ä—É–∑–∏—Ç–µ —Ñ–∞–π–ª –≤ –ø–µ—Ä–≤–æ–π —è—á–µ–π–∫–µ.")
else:
    window_sec = 3
    hop_sec = 1
    # Overwrites the module-level sr set by the loader cell.
    sr = 16000
    window_samples = int(window_sec * sr)
    hop_samples = int(hop_sec * sr)

    # Window start offsets, one per hop, up to the last position at which a
    # full window still fits (a single start at 0 for audio shorter than one window).
    starts = list(range(0, max(len(audio) - window_samples, 0) + hop_samples, hop_samples))
    if not starts:
        starts = [0]

    print("üîÑ –ê–Ω–∞–ª–∏–∑ –ø–æ 3—Å –æ–∫–Ω–∞–º...")
    progress = widgets.IntProgress(value=0, min=0, max=len(starts), description='‚è≥', bar_style='info')
    progress_label = widgets.HTML(value="‚è≥ –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞...")
    display(widgets.VBox([progress, progress_label]))

    # Per-window rows plus the state carried from one window to the next.
    results = []
    prev_state = 'CALM'
    prev_agitation = None
    # Score/state of the most recent non-pause window; repeated during pauses.
    last_active_agitation = None
    last_active_state = 'CALM'
    tension_trend = None
    score_history = []
    trend_alpha = 0.03  # slow trend ~30-50 s
    # Starts at 0 here, whereas get_mood_state() defaults state_streak to 1.
    state_streak = 0
    pending_state = None
    pending_count = 0

    pause_voiced_thr = 0.12  # more sensitive to silence
    pause_rms_scale = 0.8

    for idx, start_sample in enumerate(starts):
        end_sample = min(len(audio), start_sample + window_samples)
        window_audio = audio[start_sample:end_sample]

        # Skip a trailing fragment shorter than 0.5 s, unless the whole file is
        # no longer than one window. Skipped windows do not update the progress bar.
        if len(window_audio) < int(0.5 * sr) and len(audio) > int(window_samples):
            continue

        features = extract_features(window_audio, sr)
        baseline_rms = max(baseline.get('rms', 1e-6) or 1e-6, 1e-6)
        voiced_ratio_val = float(np.nan_to_num(features.get('voiced_ratio', 0.0), nan=0.0, posinf=0.0, neginf=0.0))
        rms_val = float(np.nan_to_num(features.get('rms', 0.0), nan=0.0, posinf=0.0, neginf=0.0))
        # A pause is a window that is both mostly unvoiced and quieter than
        # pause_rms_scale times the baseline loudness.
        is_pause = (voiced_ratio_val < pause_voiced_thr) and (rms_val < baseline_rms * pause_rms_scale)

        if is_pause and last_active_agitation is not None:
            # Pause: repeat the last active score and state. The smoothing
            # reference, the hysteresis counters and the slow trend are left
            # untouched (the trend is only initialised if it is still None).
            agitation_score = last_active_agitation
            state_for_row = last_active_state
            rms_ratio_val = float(np.clip(rms_val / baseline_rms, 0, 10))
            pitch_jitter_val = float(np.nan_to_num(features.get('pitch_jitter', 0.0), nan=0.0, posinf=0.0, neginf=0.0))
            if tension_trend is None:
                tension_trend = agitation_score
            dialogue_escalation = (tension_trend > 50) and (agitation_score > 65)
            out_row = {
                'state': state_for_row,
                'agitation_score': round(float(agitation_score), 1),
                'rms_ratio': round(rms_ratio_val, 2),
                'pitch_jitter': round(pitch_jitter_val, 1),
                'time_sec': round(((start_sample + end_sample) / 2) / sr, 2),
                'tension_trend': round(float(tension_trend), 1),
                'dialogue_escalation': bool(dialogue_escalation)
            }
            results.append(out_row)
            score_history.append(agitation_score)
        else:
            # Active window (or a pause before any active window was seen).
            agitation_score = compute_agitation_score(
                features, baseline, prev_score=prev_agitation,
                smoothing_alpha=0.5, spike_threshold=20.0, max_step=9.0
            )

            # Exponential moving average of the score with weight trend_alpha.
            if tension_trend is None:
                tension_trend = agitation_score
            else:
                tension_trend = trend_alpha * agitation_score + (1 - trend_alpha) * tension_trend

            # score_history does not yet contain the current score at this point.
            mood_info = get_mood_state(
                features, baseline, agitation_score,
                prev_state=prev_state, prev_score=prev_agitation,
                recent_scores=score_history,
                state_streak=state_streak,
                pending_state=pending_state,
                pending_count=pending_count
            )

            dialogue_escalation = (tension_trend > 50) and (agitation_score > 65)

            out_row = {
                'state': mood_info['state'],
                'agitation_score': mood_info['agitation_score'],
                'rms_ratio': mood_info['rms_ratio'],
                'pitch_jitter': mood_info['pitch_jitter'],
                'time_sec': round(((start_sample + end_sample) / 2) / sr, 2),
                'tension_trend': round(float(tension_trend), 1),
                'dialogue_escalation': bool(dialogue_escalation)
            }
            results.append(out_row)

            # Carry the state over to the next window.
            score_history.append(agitation_score)
            prev_agitation = agitation_score
            prev_state = mood_info['state']
            state_streak = mood_info.get('state_streak', state_streak)
            pending_state = mood_info.get('pending_state', None)
            pending_count = mood_info.get('pending_count', 0)
            last_active_agitation = agitation_score
            last_active_state = mood_info['state']

        progress.value = idx + 1
        progress_label.value = (
            f"üîé –û–∫–Ω–æ {idx+1}/{len(starts)} ‚Äî t={round(((start_sample + end_sample) / 2) / sr, 2):.1f}—Å | {results[-1]['state']} | "
            f"score={results[-1]['agitation_score']:.1f} | trend={results[-1]['tension_trend']:.1f}"
        )

    # Fallback: no window produced a row, so score the whole signal once.
    if not results:
        fallback_features = extract_features(audio, sr)
        fallback_agitation = compute_agitation_score(fallback_features, baseline, prev_score=None)
        fallback_trend = fallback_agitation
        fallback_mood = get_mood_state(fallback_features, baseline, fallback_agitation)
        out_row = {
            'state': fallback_mood['state'],
            'agitation_score': fallback_mood['agitation_score'],
            'rms_ratio': fallback_mood['rms_ratio'],
            'pitch_jitter': fallback_mood['pitch_jitter'],
            'time_sec': round(len(audio) / (2 * sr), 2),
            'tension_trend': round(float(fallback_trend), 1),
            'dialogue_escalation': bool(False)
        }
        results.append(out_row)

    df = pd.DataFrame(results)

    # Extra smoothing for visualization only.
    # The windows are counted in rows; they span 15 / 30 seconds only because
    # hop_sec is 1.
    df['rolling_mean_15s'] = df['agitation_score'].rolling(window=15, min_periods=1).mean()
    df['rolling_mean_30s'] = df['agitation_score'].rolling(window=30, min_periods=1).mean()

    # Session-level stats for agitation
    session_mean = float(df['agitation_score'].mean())
    session_std = float(df['agitation_score'].std(ddof=0) or 0.0)
    session_p90 = float(df['agitation_score'].quantile(0.9))
    df.attrs['session_mean_agitation'] = session_mean
    df.attrs['session_std_agitation'] = session_std
    df.attrs['session_p90_agitation'] = session_p90
    # A session counts as calm when both its 90th percentile and its mean are low.
    df.attrs['session_state'] = 'CALM_SESSION' if (session_p90 < 40 and session_mean < 35) else 'NORMAL_SESSION'

    print(f"‚úÖ –ê–Ω–∞–ª–∏–∑ –∑–∞–≤–µ—Ä—à–µ–Ω! {len(df)} –≤—Ä–µ–º–µ–Ω–Ω—ã—Ö —Ç–æ—á–µ–∫")
    display(df.head())

    # Prepare the visualization df
    def _prepare_agitation_vis(df_in: pd.DataFrame, window_points_30: int = 30):
        """Add centred / z-scored columns and the plotted `vis_agitation` curve.

        For a CALM_SESSION the curve is the 30-point rolling mean of the
        mean-centred score, shifted to 35 and clipped to 35 +/- 5. Otherwise
        it is the existing `rolling_mean_15s` column. Returns a copy; the
        input is returned as is when it is empty or lacks `agitation_score`.
        """
        if df_in.empty or 'agitation_score' not in df_in.columns:
            return df_in
        dfv = df_in.copy()
        # Prefer the session stats stored in attrs; recompute them if missing.
        mean_fp = dfv.attrs.get('session_mean_agitation', float(dfv['agitation_score'].mean()))
        std_fp = dfv.attrs.get('session_std_agitation', float(dfv['agitation_score'].std(ddof=0) or 0.0))
        eps = 1e-6
        dfv['ag_centered'] = dfv['agitation_score'] - mean_fp
        # eps keeps the division defined when the score never varies.
        dfv['ag_z'] = dfv['ag_centered'] / max(std_fp, eps)
        window = max(3, int(window_points_30))
        dfv['ag_centered_30s'] = dfv['ag_centered'].rolling(window=window, min_periods=1).mean()
        session_state_loc = dfv.attrs.get('session_state', 'NORMAL_SESSION')
        calm_level = 35.0
        calm_band = 5.0
        if session_state_loc == 'CALM_SESSION':
            vis = calm_level + dfv['ag_centered_30s']
            vis = vis.clip(calm_level - calm_band, calm_level + calm_band)
            dfv['vis_agitation'] = vis
        else:
            dfv['vis_agitation'] = dfv['rolling_mean_15s']
        return dfv

    df_vis = _prepare_agitation_vis(df)

    # Semantics: ASR + text (optional)
    df_text = pd.DataFrame()
    if 'selected_path' in globals() and selected_path and load_text_pipelines():
        df_text = transcribe_and_classify(selected_path, window_sec=5.0, hop_sec=5.0, sr=16000)
        if df_text.empty:
            print('‚ö†Ô∏è –°–µ–º–∞–Ω—Ç–∏–∫–∞ –Ω–µ –ø–æ–ª—É—á–µ–Ω–∞ (ASR/–∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ç–æ—Ä –Ω–µ–¥–æ—Å—Ç—É–ø–Ω—ã)')
    else:
        print('‚ÑπÔ∏è –°–µ–º–∞–Ω—Ç–∏—á–µ—Å–∫–∏–π —Å–ª–æ–π –Ω–µ –∞–∫—Ç–∏–≤–∏—Ä–æ–≤–∞–Ω (–Ω–µ—Ç selected_path –∏–ª–∏ –º–æ–¥–µ–ª–∏)')

    df_fused = fuse_audio_text(df_vis, df_text)

    # One panel with up to three curves: raw score, session-adjusted score, fusion.
    fig = make_subplots(rows=1, cols=1, subplot_titles=('Fusion: –∞—É–¥–∏–æ + —Å–µ–º–∞–Ω—Ç–∏–∫–∞',), specs=[[{"secondary_y": False}]], vertical_spacing=0.08)

    fig.add_trace(go.Scatter(
        x=df_fused.get('time_sec', df_fused.index), y=df_fused.get('agitation_score', df_fused.index),
        mode='lines', name='–ê–∫—É—Å—Ç–∏–∫–∞ (raw)',
        line=dict(color='rgba(120,120,120,0.4)', width=1.2),
        line_shape='spline', marker=dict(size=0)
    ), row=1, col=1)

    if 'vis_agitation' in df_fused:
        fig.add_trace(go.Scatter(
            x=df_fused.get('time_sec', df_fused.index), y=df_fused['vis_agitation'],
            mode='lines', name='–ê–∫—É—Å—Ç–∏–∫–∞ (session-adjusted)',
            line=dict(color='orange', width=3),
            line_shape='spline', marker=dict(size=0)
        ), row=1, col=1)

    if 'fusion_score' in df_fused:
        fig.add_trace(go.Scatter(
            x=df_fused.get('time_sec', df_fused.index), y=df_fused['fusion_score'],
            mode='lines', name='Fusion (audio+text)',
            line=dict(color='red', width=3, dash='dash'),
            line_shape='spline', marker=dict(size=0)
        ), row=1, col=1)

    # Background bands: red for scores of 50-100, green for 0-50.
    fig.add_hrect(y0=50, y1=100, line_width=0, fillcolor='rgba(255,0,0,0.08)', annotation_text='–ü—Ä–æ–±–ª–µ–º–∞/–∑–ª–∏—Ç—Å—è')
    fig.add_hrect(y0=0, y1=50, line_width=0, fillcolor='rgba(0,200,0,0.08)', annotation_text='–°–ø–æ–∫–æ–π–Ω–æ')

    # Relies on fuse_audio_text() returning a frame that still carries df.attrs;
    # otherwise the default 'NORMAL_SESSION' is shown.
    session_state = df_fused.attrs.get('session_state', 'NORMAL_SESSION')
    # NOTE(review): the title uses the dropdown's current selection, which can
    # differ from the file that was actually loaded (selected_path).
    fig.update_layout(height=520, title=f"Fusion —ç–º–æ—Ü–∏–π: {dropdown.value} ({session_state})", xaxis_title="–í—Ä–µ–º—è (—Å–µ–∫)",
                      hovermode='x unified', template='plotly_white')
    fig.update_yaxes(title_text="Score (0-100)", row=1, col=1)
    fig.show()


🔄 Анализ по 3с окнам...


VBox(children=(IntProgress(value=0, bar_style='info', description='‚è≥', max=445), HTML(value='‚è≥ –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞...')‚Ä¶

✅ Анализ завершен! 445 временных точек


Unnamed: 0,state,agitation_score,rms_ratio,pitch_jitter,time_sec,tension_trend,dialogue_escalation,rolling_mean_15s,rolling_mean_30s
0,CALM,33.9,1.11,12.0,1.5,33.9,False,33.9,33.9
1,CALM,34.5,1.03,13.5,2.5,33.9,False,34.2,34.2
2,TENSE,33.7,1.06,12.6,3.5,33.9,False,34.033,34.033
3,TENSE,31.8,0.74,13.6,4.5,33.8,False,33.475,33.475
4,TENSE,31.0,0.92,12.5,5.5,33.8,False,32.98,32.98


In [233]:
# Full audio transcript (source language)
# Prints the concatenated ASR text of the loaded file and shows the first rows
# of the per-window table. Note that this runs transcribe_and_classify() on its
# own, independently of the analysis cell.
if 'selected_path' not in globals() or not selected_path:
    print("‚ö†Ô∏è –°–Ω–∞—á–∞–ª–∞ –≤—ã–±–µ—Ä–∏ –∏ –∑–∞–≥—Ä—É–∑–∏—Ç–µ —Ñ–∞–π–ª –≤ –ø–µ—Ä–≤–æ–π —è—á–µ–π–∫–µ.")
elif not load_text_pipelines():
    print("‚ö†Ô∏è –°–µ–º–∞–Ω—Ç–∏—á–µ—Å–∫–∏–µ –º–æ–¥–µ–ª–∏ –Ω–µ –∑–∞–≥—Ä—É–∂–µ–Ω—ã.")
else:
    df_text = transcribe_and_classify(selected_path, window_sec=5.0, hop_sec=5.0, sr=16000)
    if df_text.empty:
        print("‚ö†Ô∏è –ù–µ —É–¥–∞–ª–æ—Å—å –ø–æ–ª—É—á–∏—Ç—å —Ç—Ä–∞–Ω—Å–∫—Ä–∏–ø—Ç.")
    else:
        # Join the window transcripts in chronological order.
        df_text = df_text.sort_values("start_sec")
        full_text = " ".join(df_text["text"].tolist())
        print("üìú –¢—Ä–∞–Ω—Å–∫—Ä–∏–ø—Ç (–æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–π —è–∑—ã–∫):\n")
        print(full_text)
        display(df_text[["start_sec", "end_sec", "text"]].head())


📜 Транскрипт (оригинальный язык):

 –ù—É –≤–æ—Ç, –≤ —Å–µ—Ä–∏–∞–ª–µ ¬´–°–ª–µ–¥¬ª —è —Å–Ω–∏–º–∞–ª–∞—Å—å, —É—á–∏—Ç–µ–ª—å –∑–∞–∫–æ–Ω–∞.  –ö—É–ª–∞–≥–∏–Ω –∏ –ø–∞—Ä—Ç–Ω—ë—Ä—ã. –ú–æ—Ä—Å–∫–∏–µ –¥—å—è–≤–æ–ª—ã.  –Ø —Å–µ–π—á–∞—Å –≤—ã–ø—É—è —Å —Å–µ–±—è.  –ê–π, –Ω—É –Ω–∞–¥–æ –º–µ–Ω—è, –≤—ã —Å–∞–º–∏ —É—Å–ø–æ–∫–æ–π—Ç–µ—Å—å —Ç–æ–∂–µ. –†–µ–±—è—Ç, –Ω—É —ç—Ç–æ –Ω–µ –¥–µ–ª–æ. –í—ã –º–µ–Ω—è –≤–¥–∞–≤–∏–ª–∏ –ø—Ä–æ —ç—Ç–æ.  –Ø –æ—Ç–ø—Ä–æ—Å–∏–ª—Å—è —Å —Ä–µ–ø–µ—Ç–∏—Ü–∏–∏. –í —á–µ—Ç—ã—Ä–µ —á–∞—Å–∞ —è –¥–æ–ª–∂–µ–Ω –±—ã–ª –∑–¥–µ—Å—å. –Ø –±—ã–ª –≤ —á–µ—Ç—ã—Ä–µ —á–∞—Å–∞. –í—Ä–µ–º—è —à–µ—Å—Ç—å.  –ù—É —á—Ç–æ —ç—Ç–æ —Ç–∞–∫–æ–µ? –ß—Ç–æ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç? –î–µ–ª–æ –≤ —Ç–æ–º, —á—Ç–æ –¥—Ä—É–≥–∏—Ö –Ω–µ—Ç, –∞ —Ç–æ –∂–∏–∑–Ω—å –∏–¥–µ—Ç.  –î–∞ –Ω–µ—Ç, –Ω—É —Ö–æ—Ä–æ—à–æ, –º—ã –Ω–µ –±—É–¥–µ–º.  –ù–µ –ø—Ä–æ—Ç–∏–≤, –ø—Ä–∏—Å–∞–∂–∏–≤–∞–π—Ç–µ—Å—å. –î–µ–≤—É—à–∫–∞, –±–æ–ª—å—à–æ–µ —Å–ø–∞—Å–∏–±–æ. –ò–∑–≤–∏–Ω–∏—Ç–µ –º–µ–Ω—è, –ø–æ–∂–∞–ª—É–π—Å—Ç–∞.  –†–∞–¥–∏ –ë–æ–≥–∞, –Ω—É –ø—Ä–∞–≤–¥–∞, –Ω—É‚Ä¶ –ù—É –º–µ–Ω—è –ø

Unnamed: 0,start_sec,end_sec,text
0,0.0,5.0,"–ù—É –≤–æ—Ç, –≤ —Å–µ—Ä–∏–∞–ª–µ ¬´–°–ª–µ–¥¬ª —è —Å–Ω–∏–º–∞–ª–∞—Å—å, —É—á–∏—Ç–µ–ª—å..."
1,5.0,10.0,–ö—É–ª–∞–≥–∏–Ω –∏ –ø–∞—Ä—Ç–Ω—ë—Ä—ã. –ú–æ—Ä—Å–∫–∏–µ –¥—å—è–≤–æ–ª—ã.
2,10.0,15.0,–Ø —Å–µ–π—á–∞—Å –≤—ã–ø—É—è —Å —Å–µ–±—è.
3,15.0,20.0,"–ê–π, –Ω—É –Ω–∞–¥–æ –º–µ–Ω—è, –≤—ã —Å–∞–º–∏ —É—Å–ø–æ–∫–æ–π—Ç–µ—Å—å —Ç–æ–∂–µ. –†..."
4,20.0,25.0,–Ø –æ—Ç–ø—Ä–æ—Å–∏–ª—Å—è —Å —Ä–µ–ø–µ—Ç–∏—Ü–∏–∏. –í —á–µ—Ç—ã—Ä–µ —á–∞—Å–∞ —è –¥–æ–ª...
