# MotherTongue vs Spoken Language
Riconoscimento tramite video della lingua e vedere se questa lingua è la lingua madre di una persona o una semplice lingua parlata. \
Il metodo utilizzato è attraverso la Knowledge Distillation, approccio che vede l'utilizzo di un Teacher che impara la distinzione tra i due tipi di lingua con supporto audio-video, questa sua conoscenza la poi 'distilla' ad un suo Student, che tramite determinati approcci (embeddings) imparati dal Teacher prova a risolvere la stessa task solamente vedendo il labiale di una persona, quindi solamente con supporto video SENZA audio.



https://colab.research.google.com/drive/18LNEeYOwP_03wEUgJWYwfV0_fz95I4d1?usp=sharing - based on this code 

## STEP 1: Fattibilità del problema
Il modello Teacher, in questo caso Whisper-small, è in grado di poter classificare lingua madre da lingua parlata?

In [21]:
# creazione del dataset multimodale, estrazione dell'audio e csv associato con i path dei vari video
import glob
import os
import subprocess
import pandas as pd
from datasets import Dataset, Audio

dataset_dir    = "/Users/ludovicagenovese/Documents/GitHub/mothertongueVSspoken/EMODB"
audio_dir      = os.path.join(dataset_dir, "audio_wav")
csv_path       = os.path.join(dataset_dir, "dataset.csv")
os.makedirs(audio_dir, exist_ok=True)

# raccolta dei video nella cartella dataset
video_paths = []
for ext in ("*.mp4", "*.MP4", "*.mov", "*.MOV"):
    video_paths.extend(glob.glob(os.path.join(dataset_dir, "**", ext), recursive=True))

rows = []
for vp in video_paths:
    fname = os.path.basename(vp)
    label = 1 if "EN" in fname.upper() else 0

    # costruzione per i file audio
    base_no_ext = os.path.splitext(fname)[0]
    wav_path    = os.path.join(audio_dir, base_no_ext + ".wav")

    # estrazione audio con la funzione ffmpeg
    if not os.path.exists(wav_path):
        subprocess.run([
            "ffmpeg", "-i", vp,
            "-ar", "16000", "-ac", "1",    
            "-f", "wav", "-vn", wav_path 
        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    rows.append({
        "audio_path": wav_path,
        "video_path": vp,
        "label":      label
    })


df = pd.DataFrame(rows)
df.to_csv(csv_path, index=False)
print(f"Creato CSV con {len(df)} righe in {csv_path}")


ds = Dataset.from_pandas(df)


ds = ds.cast_column("audio_path", Audio(sampling_rate=16_000))

print(ds)


Creato CSV con 3373 righe in /Users/ludovicagenovese/Documents/GitHub/mothertongueVSspoken/EMODB/dataset.csv
Dataset({
    features: ['audio_path', 'video_path', 'label'],
    num_rows: 3373
})


In [22]:
# fine tuning di Whisper
# os.environ["TRANSFORMERS_NO_TF"] = "1"

from datasets import load_dataset, Audio
import evaluate


ds = load_dataset("csv", data_files="/Users/ludovicagenovese/Documents/GitHub/mothertongueVSspoken/EMODB/dataset.csv", split="train")
ds = ds.class_encode_column("label")

split_ds = ds.train_test_split(
    test_size=0.3,
    seed=42,
    stratify_by_column='label',
)
train_ds = split_ds["train"]
test_ds = split_ds["test"]


#imposta la colonna audio come waveform a 16 kHz
train_ds = train_ds.cast_column("audio_path", Audio(sampling_rate=16_000))
test_ds  = test_ds.cast_column("audio_path",  Audio(sampling_rate=16_000))

#print(train_ds[0])
#print(test_ds[0])


Generating train split: 0 examples [00:00, ? examples/s]

Stringifying the column:   0%|          | 0/3373 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/3373 [00:00<?, ? examples/s]

In [None]:
import types
from peft import LoraConfig, get_peft_model, TaskType
from dataclasses import dataclass
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import torch
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    TrainingArguments,
    Trainer,
    set_seed,
)

os.environ["ACCELERATE_MIXED_PRECISION"] = "no"   # evita override fp16
os.environ.pop("USE_FP16", None)
os.environ.setdefault("PYTORCH_MPS_HIGH_WATERMARK_RATIO", "0.0")


device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Device active: {device}")


set_seed(42)
MODEL_NAME = "openai/whisper-tiny"   # usa tiny/base per ridurre VRAM

processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model     = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
model.gradient_checkpointing_enable()  # taglia VRAM a costo di +latency

model.config.forced_decoder_ids = None
model.config.suppress_tokens    = []
model.classifier = torch.nn.Linear(model.config.d_model, 2)

#forward cambiato da trascrizione a classificazione

def forward_with_classification(self, input_features=None, labels=None, **kwargs):
    if input_features is None:
        raise ValueError("input_features cannot be None")
    enc_out = self.model.encoder(input_features)[0]
    pooled  = enc_out.mean(dim=1)
    logits  = self.classifier(pooled)
    loss = torch.nn.functional.cross_entropy(logits, labels) if labels is not None else None
    return {"loss": loss, "logits": logits}

model.forward = types.MethodType(forward_with_classification, model)

for p in model.model.encoder.parameters():
    p.requires_grad = False

# --------------------------------------------------
# 5) LoRA per ridurre la computazione
# --------------------------------------------------
peft_cfg = LoraConfig(
    task_type      = TaskType.SEQ_CLS,
    inference_mode = False,
    r              = 8,
    lora_alpha     = 16,
    lora_dropout   = 0.05,
    target_modules = ["q_proj", "k_proj", "v_proj", "out_proj"],
    bias           = "none",
)
model = get_peft_model(model, peft_cfg)
model.to(device)

# --------------------------------------------------
# 6) Dataset preprocessing (definisci train_ds/test_ds)
# --------------------------------------------------

def preprocess(batch):
    audio = batch["audio_path"]["array"]
    feats = processor.feature_extractor(audio, sampling_rate=16_000, return_tensors="pt")
    batch["input_features"] = feats.input_features[0]  # (80, F)
    batch["labels"]         = batch["label"]
    return batch

train_ds = train_ds.map(preprocess, remove_columns=train_ds.column_names, num_proc=4)
test_ds  = test_ds.map(preprocess,  remove_columns=test_ds.column_names,  num_proc=4)

# --------------------------------------------------
# 7) Data collator
# --------------------------------------------------
@dataclass
class DataCollatorWhisperCls:
    dtype = torch.float32  # for MPS evita mismatch Half vs Float
    def __call__(self, features):
        tensors = [torch.tensor(f["input_features"], dtype=torch.float32) for f in features]
        max_len = max(t.shape[-1] for t in tensors)
        padded  = [torch.nn.functional.pad(t, (0, max_len - t.shape[-1])) for t in tensors]
        input_features = torch.stack(padded)
        labels         = torch.tensor([f["labels"] for f in features], dtype=torch.long)
        return {"input_features": input_features, "labels": labels}

data_collator = DataCollatorWhisperCls()

# --------------------------------------------------
# 8) Metriche
# --------------------------------------------------
accuracy_metric = evaluate.load("accuracy")
f1_metric       = evaluate.load("f1")
recall_metric       = evaluate.load("recall")
precision_metric      = evaluate.load("precision")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        "f1":       f1_metric.compute(predictions=preds, references=labels, average="weighted")["f1"],
        "recall": recall_metric.compute(predictions=preds, references=labels)["recall"],
        "precision": precision_metric.compute(predictions=preds, references=labels)["precision"],
    }

# --------------------------------------------------
# 9) TrainingArguments (batch minimale, accumulo alto)
# --------------------------------------------------
training_args = TrainingArguments(
    output_dir="./whisper_cls_tiny_lora_mps",
    per_device_train_batch_size=2,    # ↓ batch 2 per contenere VRAM
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,   # eff. batch 32
    learning_rate=1e-4,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=50,
    report_to="none",
    fp16=False,                       # fp16 non usato ma input è half
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

#stampa metriche e confusion matrix
metrics = trainer.evaluate(test_ds)
print("Metriche calcolate")
for k, v in metrics.items():
    print(f"{k:>10}: {v:.4f}")


pred_out = trainer.predict(test_ds)
y_true   = pred_out.label_ids
y_pred   = pred_out.predictions.argmax(axis=-1)


cm = confusion_matrix(y_true, y_pred)
ConfusionMatrixDisplay(cm).plot()


Device active: mps


Map (num_proc=4):   0%|          | 0/2361 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1012 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss


NameError: name 'recall_metric' is not defined

L'accuracy sembra comunque essere buona, ma il numero di falsi negativi vede una difficoltà a capire se la lingua

## Teacher: Whisper-medium di OpenAI - Student: AV-HuBERT (Large)