# 🧠 L-MAC Evaluation (SpeechOcean762)

Questo notebook calcola **AI/AD** e genera esempi ascoltabili per L-MAC.

**Supporta ambienti:**
- 🖥️ Local
- ☁️ Google Colab  
- 📊 Kaggle (Dataset + Modelli da input)

**Dataset:** SpeechOcean762 (full)  
**Backbone:** HuBERT Large o Early Fusion

In [None]:
import sys, subprocess
from pathlib import Path
import os

# Fix audio decoding: monkey-patch prima di qualsiasi uso di datasets
import soundfile as sf
import io
import datasets
import datasets.features.audio as audio_module

def decode_audio_with_soundfile(self, value, token_per_repo_id=None):
    """Decode an audio example with ``soundfile``.

    The signature mirrors a bound ``decode_example`` method so the function
    can be assigned onto the ``Audio`` feature class; ``self`` and
    ``token_per_repo_id`` are accepted for compatibility and not used.

    Args:
        self: Feature instance the method is bound to (unused).
        value: Stored example. When it is a dict, a non-``None`` ``"bytes"``
            entry is decoded first; otherwise a non-empty ``"path"`` is read.
        token_per_repo_id: Ignored.

    Returns:
        A dict with ``"array"``, ``"sampling_rate"`` and ``"path"`` keys when
        ``value`` carries audio bytes or a path; otherwise ``value`` unchanged.
    """
    if not isinstance(value, dict):
        return value

    # A key can be present but hold None (e.g. {"bytes": None, "path": ...}).
    # Testing only for key presence would feed None to BytesIO and never
    # reach the path branch, so check the stored values instead.
    audio_bytes = value.get("bytes")
    if audio_bytes is not None:
        audio, sr = sf.read(io.BytesIO(audio_bytes))
        return {"array": audio, "sampling_rate": sr, "path": value.get("path", "")}

    audio_path = value.get("path")
    if audio_path:
        audio, sr = sf.read(audio_path)
        return {"array": audio, "sampling_rate": sr, "path": audio_path}

    # Nothing decodable in the dict: hand it back untouched.
    return value

# Monkey-patch: rebind `decode_example` on the `Audio` feature class to the
# soundfile-based function defined in this cell. This runs at import time of
# the cell, before any dataset is loaded by the notebook.
audio_module.Audio.decode_example = decode_audio_with_soundfile
print("‚úì Audio decoder patched to use soundfile")

def detect_environment():
    """Classify the runtime as ``'colab'``, ``'kaggle'`` or ``'local'``.

    Colab is checked first (``COLAB_GPU`` environment variable or an
    already-imported ``google.colab`` module), then Kaggle (``/kaggle`` in the
    working directory or the ``KAGGLE_KERNEL_RUN_TYPE`` environment variable).
    Anything else is reported as ``'local'``.
    """
    env_vars = os.environ
    if 'COLAB_GPU' in env_vars or 'google.colab' in sys.modules:
        return 'colab'
    on_kaggle = '/kaggle' in os.getcwd() or 'KAGGLE_KERNEL_RUN_TYPE' in env_vars
    return 'kaggle' if on_kaggle else 'local'

# Detect the runtime once; ENV is one of 'colab', 'kaggle' or 'local'
# and is printed upper-cased for the user.
ENV = detect_environment()
print(f'üñ•Ô∏è Ambiente: {ENV.upper()}')

In [None]:
# Install dependencies + clone repo
pkgs = [
    'transformers>=4.38',
    'datasets>=2.18',
    'evaluate',
    'jiwer',
    'soundfile',
    'librosa',
    'safetensors',
    'accelerate',
    'tqdm',
    'pyyaml',
    'pandas',
]

# Best-effort install: check=False keeps the notebook running when pip fails,
# but a non-zero exit code is reported instead of being silently dropped.
pip_result = subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', *pkgs], check=False)
if pip_result.returncode != 0:
    print(f'WARNING: pip install exited with code {pip_result.returncode}; some packages may be missing')

import torch
print(f'üî• PyTorch {torch.__version__}, CUDA: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'üìä GPU: {torch.cuda.get_device_name(0)}')
    print(f'üíæ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

# Clone repo
IS_KAGGLE = Path('/kaggle').exists()
# Set DL_PHONEME_SKIP_CLONE to 1/true/yes to skip cloning on Kaggle.
SKIP_CLONE = str(os.environ.get('DL_PHONEME_SKIP_CLONE', '')).strip().lower() in ('1', 'true', 'yes')
REPO_URL = 'https://github.com/maurocarlu/pronuncIAtion.git'

# Local runs: locate the project root by walking up from the working directory
# until a folder containing scripts/analysis (the package this notebook imports
# from) is found. The plain "two levels up" rule is kept as the fallback; on its
# own it drifts upward when this cell is re-run after the os.chdir below.
_cwd = Path.cwd()
_local_root = next(
    (d for d in (_cwd, *_cwd.parents) if (d / 'scripts' / 'analysis').is_dir()),
    _cwd.parent.parent,
)
PROJECT_DIR = Path('/kaggle/working/pronuncIAtion') if IS_KAGGLE else _local_root

if IS_KAGGLE and (not SKIP_CLONE) and REPO_URL:
    if not PROJECT_DIR.exists():
        print('Cloning repo:', REPO_URL)
        clone_result = subprocess.run(['git', 'clone', REPO_URL, str(PROJECT_DIR)], check=False)
        # Still best-effort, but make a failed clone visible.
        if clone_result.returncode != 0:
            print(f'WARNING: git clone failed (exit code {clone_result.returncode})')
    else:
        print('Repo gi√† presente:', PROJECT_DIR)

if PROJECT_DIR.exists():
    os.chdir(PROJECT_DIR)
    # Avoid stacking duplicate entries when the cell is executed more than once.
    if str(PROJECT_DIR) not in sys.path:
        sys.path.insert(0, str(PROJECT_DIR))
else:
    print(f'WARNING: PROJECT_DIR does not exist: {PROJECT_DIR}; project imports will fail')
print('CWD:', os.getcwd())
print('PROJECT_DIR:', PROJECT_DIR)

In [None]:
# ====== Kaggle: data symlink and model paths ======
DATA_INPUT = Path('/kaggle/input/pronunciation-data/data')
DATA_TARGET = Path(PROJECT_DIR) / 'data'

# Symlink the Kaggle input dataset into the project as `data/`
if Path('/kaggle').exists() and DATA_INPUT.exists():
    try:
        # A dangling link makes exists() return False while os.symlink still
        # fails with "file exists"; drop the stale link first.
        if DATA_TARGET.is_symlink() and not DATA_TARGET.exists():
            DATA_TARGET.unlink()
        if not DATA_TARGET.exists():
            os.symlink(str(DATA_INPUT), str(DATA_TARGET))
            print('‚úì data symlink creato')
    except OSError as e:
        # Filesystem errors are reported but not fatal: the notebook continues.
        print('‚ö†Ô∏è Symlink fallito:', e)

# ====== Model paths ======
# Prefer the Kaggle input dataset when it is mounted; otherwise use the
# project's local backup folder.
KAGGLE_MODELS_PATH = Path('/kaggle/input/late-fusion/LateFusion')
LOCAL_MODELS_PATH = PROJECT_DIR / 'outputs' / 'backup'

kaggle_models_found = KAGGLE_MODELS_PATH.exists()
MODELS_ROOT = KAGGLE_MODELS_PATH if kaggle_models_found else LOCAL_MODELS_PATH
source_label = 'Kaggle' if kaggle_models_found else 'local'
print(f'‚úì Using {source_label} models: {MODELS_ROOT}')

# List up to 10 model folders, i.e. directories that contain a config.json
print('\nModelli disponibili:')
config_files = sorted(MODELS_ROOT.glob('**/config.json'))
for config_file in config_files[:10]:
    print(f'  ‚úì {config_file.parent.name}')

## ⚙️ Configuration

Configura:
- `BACKBONE`: tipo di backbone (`hubert` o `early_fusion`)
- `TARGET_PHONEME`: fonema IPA target per L-MAC
- `MODEL_PATH`: path al modello fine-tuned

In [None]:
# === CONFIG ===
BACKBONE = "hubert"  # or "early_fusion"
# NOTE(review): this literal looks mis-encoded (probably meant to be the IPA
# symbol U+026A) — confirm the file encoding before relying on it.
TARGET_PHONEME = "…™"  # example IPA phoneme

# Auto-detect the model path under MODELS_ROOT.
# NOTE(review): the probed folder names are hubert-specific regardless of
# BACKBONE — confirm the expected layout for "early_fusion".
if (MODELS_ROOT / 'final_model_hubert').exists():
    MODEL_PATH = str(MODELS_ROOT / 'final_model_hubert')
elif (MODELS_ROOT / 'hubert_large' / 'final_model_hubert').exists():
    MODEL_PATH = str(MODELS_ROOT / 'hubert_large' / 'final_model_hubert')
else:
    # Fallback: first folder holding a config.json. The matches are sorted so
    # the pick is reproducible instead of depending on directory scan order.
    model_candidates = sorted(MODELS_ROOT.glob('**/config.json'))
    MODEL_PATH = str(model_candidates[0].parent) if model_candidates else ""

print(f'BACKBONE: {BACKBONE}')
print(f'MODEL_PATH: {MODEL_PATH}')
print(f'TARGET_PHONEME: {TARGET_PHONEME}')
if not MODEL_PATH:
    # An empty path would only fail later, far from the cause; say so now.
    print(f'WARNING: no model found under {MODELS_ROOT}; set MODEL_PATH manually before continuing')

# Auto-find decoder checkpoint (if already trained).
# NOTE(review): sorted() is lexicographic, so "decoder_10.pt" sorts before
# "decoder_9.pt" — verify this matches the checkpoint naming scheme.
decoder_root = PROJECT_DIR / "outputs" / "lmac" / BACKBONE / TARGET_PHONEME
candidates = sorted(decoder_root.glob("decoder_*.pt"))
DECODER_CKPT = str(candidates[-1]) if candidates else ""
print(f'DECODER_CKPT: {DECODER_CKPT or "Not found (will train)"}')

In [None]:
# === IMPORTS ===
from torch.utils.data import DataLoader

# Import dal progetto (già in sys.path)
from scripts.analysis.lmac_core import (
    LMACBackboneConfig,
    LMACSpeechOceanDataset,
    LMACWrapper,
    collate_audio_batch,
    compute_ai_ad,
    generate_listenable_map,
)

print('‚úì L-MAC imports loaded')

## 🎯 Train Decoder (se non presente)

Se il decoder L-MAC non è già stato addestrato per il fonema target, lo alleniamo qui.

In [None]:
# Fix audio decoding: use soundfile instead of torchcodec
import datasets
# Overwrite the flag on the already-imported `datasets.config` module.
# NOTE(review): presumably this makes `datasets` skip its torchcodec decoding
# path — confirm against the installed `datasets` version.
datasets.config.TORCHCODEC_AVAILABLE = False

# If that still does not work, force soundfile:
import os
# NOTE(review): confirm that the installed `datasets` version actually reads
# this environment variable; nothing in this notebook reads it.
os.environ["HF_DATASETS_AUDIO_DECODER"] = "soundfile"

In [None]:
# === TRAIN (only when no decoder checkpoint was found) ===
from types import SimpleNamespace
from importlib import reload
from scripts.analysis import train_lmac_decoder
reload(train_lmac_decoder)  # reload to pick up the latest edits to the module

if not DECODER_CKPT:
    print('üèãÔ∏è Training L-MAC decoder...')
    # Namespace standing in for the training script's parsed arguments.
    args = SimpleNamespace(
        model_path=MODEL_PATH,
        backbone=BACKBONE,
        target_phoneme=TARGET_PHONEME,
        layer_ids="6,12,18,24",
        epochs=10,
        batch_size=2,
        lr=2e-4,
        lambda_out=1.0,
        lambda_reg=1e-4,
        max_samples=None,
        log_interval=50,
        output_dir=str(PROJECT_DIR / "outputs" / "lmac"),
    )
    train_lmac_decoder.train_lmac(args)
    # Re-scan for the checkpoint; train_lmac is expected to write decoder_*.pt
    # under decoder_root — TODO confirm against the training script.
    candidates = sorted(decoder_root.glob("decoder_*.pt"))
    DECODER_CKPT = str(candidates[-1]) if candidates else ""
    if DECODER_CKPT:
        print(f'‚úì Decoder trained: {DECODER_CKPT}')
    else:
        # Do not report success when nothing was produced.
        print(f'WARNING: training finished but no decoder_*.pt was found in {decoder_root}')
else:
    print(f'‚úì Using existing decoder: {DECODER_CKPT}')

In [None]:
# === Load decoder + backbone ===
# Fail early with a clear message when no checkpoint path is available.
if not DECODER_CKPT:
    raise FileNotFoundError(f"Decoder checkpoint non trovato in {decoder_root}")

# NOTE(review): layer_ids repeats the "6,12,18,24" string passed to training in
# this notebook; the two must be kept in sync by hand.
config = LMACBackboneConfig(
    backbone_type=BACKBONE,
    model_path=MODEL_PATH,
    layer_ids=(6, 12, 18, 24),
)
wrapper = LMACWrapper(config=config, target_phoneme=TARGET_PHONEME)
# NOTE(security): torch.load relies on pickle; depending on the PyTorch
# version's `weights_only` default it can execute code from the file, so only
# load checkpoints from a trusted source. Tensors are mapped to CPU on load.
ckpt = torch.load(DECODER_CKPT, map_location="cpu")
# Only the decoder weights are restored here, from the "decoder_state" entry.
wrapper.decoder.load_state_dict(ckpt["decoder_state"])
wrapper.eval()
print('‚úì L-MAC Wrapper loaded')

## 📊 Evaluation: AI / AD Metrics

Calcola le metriche **Average Increase (AI)** e **Average Drop (AD)** su SpeechOcean762.

In [None]:
# === AI / AD on SpeechOcean762 (test split) ===
print('üìä Computing AI/AD metrics on SpeechOcean762...')
test_ds = LMACSpeechOceanDataset(
    split="test",
    target_phoneme=TARGET_PHONEME,
    full=True,
)
test_loader = DataLoader(
    test_ds,
    batch_size=2,
    shuffle=False,
    collate_fn=collate_audio_batch,
)
# max_batches=None is passed explicitly (presumably "no cap on the number of
# batches" — confirm in compute_ai_ad).
metrics = compute_ai_ad(wrapper, test_loader, max_batches=None)
print('\nüìà Results:')
for metric_name, metric_value in metrics.items():
    # Each value is formatted with four decimals.
    print(f'  {metric_name}: {metric_value:.4f}')

## 🔊 Listenable Maps

Genera audio modificati per visualizzare/ascoltare le aree attribuite al fonema target.

In [None]:
# === Listenable example ===
# Pick one item from the dataset and generate its listenable map
sample = test_ds[0]
audio_field = sample.get("audio")

# The audio entry may be a decoded dict ("array" / "sampling_rate" / "path")
# or a bare waveform; pull out the pieces accordingly.
if isinstance(audio_field, dict):
    audio_path = audio_field.get("path") or None
    waveform = audio_field.get("array")
    sample_rate = audio_field.get("sampling_rate") or 16000
else:
    audio_path = None
    waveform = audio_field
    sample_rate = 16000  # default used when the sample carries no rate

# Fallback: write the waveform to a temporary WAV when there is no usable file.
# A path string alone is not enough: audio decoded from in-memory bytes can
# carry a path that does not exist on disk.
has_audio_file = audio_path is not None and Path(audio_path).is_file()
if not has_audio_file and waveform is not None:
    import soundfile as sf
    tmp_path = Path(PROJECT_DIR) / "outputs" / "lmac" / "tmp_audio.wav"
    tmp_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(tmp_path, waveform, sample_rate)
    audio_path = str(tmp_path)
elif audio_path is None:
    raise ValueError("Sample has neither an audio path nor waveform data")

out_dir = str(PROJECT_DIR / "outputs" / "lmac" / "listenable_maps")
out = generate_listenable_map(wrapper, audio_path, out_dir=out_dir)
print(f'\nüîä Generated listenable map: {out}')

## 🧹 Cleanup (Kaggle)

Libera spazio disco rimuovendo cache HuggingFace.

In [None]:
# Cleanup disk (Kaggle): remove the HuggingFace cache directory to free space.
import shutil
# Runs only when the environment detected at the top of the notebook is Kaggle.
if ENV == 'kaggle':
    for f in ['/root/.cache/huggingface']:
        # Skip missing paths and symlinks; only a real directory is deleted.
        if os.path.exists(f) and not os.path.islink(f):
            shutil.rmtree(f)
            print(f'üóëÔ∏è Cleaned: {f}')
    # Check disk space (IPython shell escape: valid only when run as a notebook cell)
    !df -h /kaggle/working