In [113]:
import evaluate
import torch
from sklearn.metrics import balanced_accuracy_score
from tqdm import tqdm

def evaluate_model(model, dataloader, device=None):
    """Run `model` over `dataloader` and compute classification metrics.

    Args:
        model: Module whose forward pass accepts the non-label batch entries as
            keyword arguments and returns an object exposing `.logits`.
        dataloader: Iterable of dict batches; each batch must contain a
            "labels" tensor, every other entry is forwarded to the model.
        device: Target device. When None, the second GPU is preferred (as
            before), falling back to the first GPU and then to the CPU.

    Returns:
        Dict with "accuracy", "balanced_accuracy", "precision", "recall" and
        "f1"; precision, recall and F1 are macro-averaged.
    """
    if device is None:
        if torch.cuda.is_available():
            # "cuda:1" only exists on hosts with at least two GPUs; moving the
            # model there on a single-GPU machine fails, so use cuda:0 instead.
            gpu_index = 1 if torch.cuda.device_count() > 1 else 0
            device = torch.device(f"cuda:{gpu_index}")
        else:
            device = torch.device("cpu")
    model.to(device)
    model.eval()

    accuracy_metric  = evaluate.load("accuracy")
    precision_metric = evaluate.load("precision")
    recall_metric    = evaluate.load("recall")
    f1_metric        = evaluate.load("f1")

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Labels are kept out of the forward call so the model does not
            # compute a loss during evaluation.
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch["labels"].to(device)

            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=-1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_metric.compute(predictions=all_preds, references=all_labels)["accuracy"]
    bal_acc = balanced_accuracy_score(all_labels, all_preds)
    prec = precision_metric.compute(predictions=all_preds, references=all_labels, average="macro")["precision"]
    rec = recall_metric.compute(predictions=all_preds, references=all_labels, average="macro")["recall"]
    f1 = f1_metric.compute(predictions=all_preds, references=all_labels, average="macro")["f1"]

    return {
        "accuracy": acc,
        "balanced_accuracy": bal_acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
    }

# Model classes

In [4]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

@dataclass
class DataCollatorForEncoderClassification:
    """Collate encoder inputs and integer labels into one padded batch.

    Padding is delegated to `processor.feature_extractor.pad`; the per-example
    "labels" values are stacked into a `torch.long` tensor and stored under
    the "labels" key of the padded batch.
    """

    # Object exposing a `feature_extractor` with a `pad(...)` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        padded = self.processor.feature_extractor.pad(
            [{'input_features': item['input_features']} for item in features],
            return_tensors='pt',
        )
        label_ids = [item['labels'] for item in features]
        padded['labels'] = torch.tensor(label_ids, dtype=torch.long)
        return padded

In [5]:
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import AutoConfig, PreTrainedModel, WhisperProcessor, WhisperModel
import torch.nn as nn 

class WhisperClassifier(nn.Module):
    """Pooling + MLP head that maps encoder states to class logits.

    The time axis (dim 1) is mean-pooled, optionally restricted to positions
    where `attention_mask` is non-zero, then passed through LayerNorm, dropout
    and a three-layer MLP (Linear -> GELU -> Dropout -> LayerNorm, twice,
    followed by a final Linear producing `num_labels` logits).

    Args:
        hidden_size: Size of the last dimension of the incoming hidden states.
        num_labels: Number of output classes.
        dropout: Dropout probability used before and inside the MLP.
    """

    def __init__(self, hidden_size, num_labels=5, dropout=0.2):
        super().__init__()
        self.pool_norm = nn.LayerNorm(hidden_size)
        self.pre_dropout = nn.Dropout(dropout)

        # Bottleneck widths shrink with the hidden size but never drop below a
        # small multiple of the number of classes.
        mid1 = max(hidden_size // 2, num_labels * 4)
        mid2 = max(hidden_size // 4, num_labels * 2)

        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, mid1),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.LayerNorm(mid1),
            nn.Linear(mid1, mid2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.LayerNorm(mid2),
            nn.Linear(mid2, num_labels),
        )

    def forward(self, hidden_states, attention_mask=None):
        """Return logits of shape (batch, num_labels).

        Args:
            hidden_states: Tensor pooled over dim 1; its last dimension must
                equal `hidden_size`.
            attention_mask: Optional (batch, time) mask; positions with 0 are
                excluded from the mean. When None, a plain mean is used.
        """
        if attention_mask is not None:
            # Clamp so a fully masked row yields a zero vector instead of
            # 0/0 = NaN, which would otherwise propagate into the logits.
            lengths = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
            masked = hidden_states * attention_mask.unsqueeze(-1)
            pooled = masked.sum(dim=1) / lengths
        else:
            pooled = hidden_states.mean(dim=1)
        x = self.pool_norm(pooled)
        x = self.pre_dropout(x)
        logits = self.classifier(x)
        return logits

class WhisperForEmotionClassification(PreTrainedModel):
    """Whisper encoder followed by a `WhisperClassifier` head.

    Args:
        config: Model configuration; `config.hidden_size` sizes the head.
        model_name: Identifier passed to `WhisperModel.from_pretrained`; only
            the `.encoder` of the loaded model is kept.
        num_labels: Number of output classes.
        dropout: Dropout probability used by the classification head.
    """

    config_class = AutoConfig

    def __init__(self, config, model_name, num_labels=5, dropout=0.2):
        super().__init__(config)
        self.encoder = WhisperModel.from_pretrained(model_name).encoder
        self.classifier = WhisperClassifier(
            config.hidden_size, num_labels=num_labels, dropout=dropout
        )
        self.post_init()

    def forward(self, input_features, attention_mask=None, labels=None):
        """Encode the input, pool it, and return logits (plus loss if labelled).

        NOTE(review): the same `attention_mask` is handed to both the encoder
        and the pooling head — confirm its time dimension matches the encoder's
        `last_hidden_state` when a mask is supplied.
        """
        encoded = self.encoder(
            input_features=input_features,
            attention_mask=attention_mask,
            return_dict=True,
        )
        logits = self.classifier(
            encoded.last_hidden_state, attention_mask=attention_mask
        )

        if labels is None:
            loss = None
        else:
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))

        return SequenceClassifierOutput(loss=loss, logits=logits)

In [44]:
from typing import Union, Tuple, Dict, Optional
import torch
import gigaam

class GigaEmotionInferencer:
    """
    Wrapper for GigaAM emotion classification model.
    Methods:
      - infer: return full probability dict
      - predict_emotion: return top emotion name and its probability
    """
    def __init__(
        self,
        model_name: str = "emo",
        device: Union[str, torch.device] = "cpu",
        sample_rate: Optional[int] = None,
    ):
        """Load the GigaAM model named `model_name`.

        Raises:
            RuntimeError: If `gigaam.load_model` fails; the original exception
                is attached as the cause.
        """
        # torch.device accepts both strings and existing torch.device objects.
        # NOTE(review): the device is only recorded here; this class never
        # moves the loaded model to it.
        self.device = torch.device(device)
        try:
            self.model = gigaam.load_model(model_name)
        except Exception as e:
            raise RuntimeError(f"Failed to load GigaAM model '{model_name}': {e}") from e
        self.sample_rate = sample_rate

    def infer(
        self,
        input: Union[str, torch.Tensor]
    ) -> Dict[str, float]:
        """Return the model's probabilities as `{emotion_name: float}`.

        `input` is forwarded unchanged to `self.model.get_probs`.

        Raises:
            RuntimeError: If `get_probs` fails; the original exception is
                attached as the cause.
        """
        try:
            probs = self.model.get_probs(input)
        except Exception as e:
            raise RuntimeError(f"Emotion inference failed: {e}") from e

        return {emotion: float(prob) for emotion, prob in probs.items()}

    def predict_emotion(
        self,
        input: Union[str, torch.Tensor]
    ) -> Tuple[str, float]:
        """Return `(emotion_name, probability)` for the most likely emotion."""
        probs = self.infer(input)
        top_emotion = max(probs, key=probs.get)
        return top_emotion, probs[top_emotion]

In [7]:
from transformers import (
    PreTrainedModel,
    AutoConfig,
    AutoModel,
    AutoProcessor,
)
from transformers.modeling_outputs import SequenceClassifierOutput
import torch
import torch.nn as nn

class EmotionClassifier(nn.Module):
    """Pooling + MLP head that maps encoder states to class logits.

    The time axis (dim 1) is mean-pooled, optionally restricted to positions
    where `attention_mask` is non-zero, then passed through LayerNorm, dropout
    and a three-layer MLP ending in a Linear layer with `num_labels` outputs.

    Args:
        hidden_size: Size of the last dimension of the incoming hidden states.
        num_labels: Number of output classes.
        dropout: Dropout probability used before and inside the MLP.
    """

    def __init__(self, hidden_size, num_labels=5, dropout=0.2):
        super().__init__()
        self.pool_norm = nn.LayerNorm(hidden_size)
        self.pre_dropout = nn.Dropout(dropout)

        # Bottleneck widths shrink with the hidden size but never drop below a
        # small multiple of the number of classes.
        mid1 = max(hidden_size // 2, num_labels * 4)
        mid2 = max(hidden_size // 4, num_labels * 2)

        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, mid1),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.LayerNorm(mid1),
            nn.Linear(mid1, mid2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.LayerNorm(mid2),
            nn.Linear(mid2, num_labels),
        )

    def forward(self, hidden_states, attention_mask=None):
        """Return logits of shape (batch, num_labels).

        Args:
            hidden_states: Tensor pooled over dim 1; its last dimension must
                equal `hidden_size`.
            attention_mask: Optional (batch, time) mask; positions with 0 are
                excluded from the mean. When None, a plain mean is used.
        """
        if attention_mask is not None:
            # Clamp so a fully masked row yields a zero vector instead of
            # 0/0 = NaN, which would otherwise propagate into the logits.
            lengths = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
            masked = hidden_states * attention_mask.unsqueeze(-1)
            pooled = masked.sum(dim=1) / lengths
        else:
            pooled = hidden_states.mean(dim=1)
        x = self.pool_norm(pooled)
        x = self.pre_dropout(x)
        logits = self.classifier(x)
        return logits
    
class ModelForEmotionClassification(PreTrainedModel):
    """Pretrained encoder (taken from `AutoModel(...).model.encoder`) with an
    `EmotionClassifier` head.

    Args:
        config: Model configuration; `config.encoder['d_model']` sizes the head.
        model_name: Identifier passed to `AutoModel.from_pretrained` with
            `trust_remote_code=True`.
        num_labels: Number of output classes.
        dropout: Dropout probability used by the classification head.
    """

    config_class = AutoConfig

    def __init__(self, config, model_name, num_labels=5, dropout=0.2):
        super().__init__(config)
        self.encoder = AutoModel.from_pretrained(model_name, trust_remote_code=True).model.encoder
        self.classifier = EmotionClassifier(
            config.encoder['d_model'], num_labels=num_labels, dropout=dropout
        )
        self.post_init()

    def forward(
        self,
        input_features: torch.Tensor,
        input_lengths: torch.Tensor,
        attention_mask: torch.Tensor = None,
        labels: torch.Tensor = None
    ) -> SequenceClassifierOutput:
        """Encode, pool over valid frames, and return logits (plus loss if labelled)."""
        encoded, out_lens = self.encoder(input_features, input_lengths)
        # The head pools over dim 1, so swap the encoder's last two axes first.
        hidden_states = encoded.transpose(1, 2)

        if attention_mask is None:
            # Derive the mask from the encoder's output lengths: position t of
            # row b is kept when t < out_lens[b].
            positions = torch.arange(hidden_states.size(1), device=out_lens.device)
            attention_mask = positions.unsqueeze(0).lt(out_lens.unsqueeze(1)).long()

        logits = self.classifier(hidden_states, attention_mask=attention_mask)

        if labels is None:
            loss = None
        else:
            loss = nn.CrossEntropyLoss()(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)

In [71]:
from torch.utils.data import DataLoader
@dataclass
class DataCollatorForEncoderClassificationGigaAM:
    """Stack equally sized feature tensors, lengths and labels into a batch.

    No padding is performed: every example's "input_features" must share the
    same first dimension, otherwise an AssertionError is raised.
    """

    # Kept for interface parity with the other collator; not used in __call__.
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        # Convert list-like features to float32 tensors; leave tensors untouched.
        feature_tensors = []
        for item in features:
            value = item["input_features"]
            if not isinstance(value, torch.Tensor):
                value = torch.tensor(value, dtype=torch.float32)
            feature_tensors.append(value)

        distinct_lengths = {t.shape[0] for t in feature_tensors}
        assert len(distinct_lengths) == 1, "Все sequences в батче должны быть одинаковой длины"
        stacked = torch.stack(feature_tensors, dim=0)  # shape: [B, T, feat_in]

        label_ids = torch.tensor([item["labels"] for item in features], dtype=torch.long)
        lengths = torch.tensor([item["input_lengths"] for item in features], dtype=torch.long)

        return {
            "input_features": stacked,
            "input_lengths": lengths,
            "labels": label_ids,
        }

# dusha test

In [2]:
from datasets import load_dataset

# Test split of the balanced Dusha dataset; every Dusha evaluation below reads from it.
dusha_test_dataset = load_dataset("nixiieee/dusha_balanced", split="test")

In [3]:
dusha_test_dataset

Dataset({
    features: ['audio', 'emotion'],
    num_rows: 3601
})

In [4]:
dusha_test_dataset['audio'][0]

{'path': 'c7ecb2e501e4bcf41b714cd93e720368.wav',
 'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.83105469e-04, 6.10351562e-05, 1.22070312e-04], shape=(105600,)),
 'sampling_rate': 16000}

## Whisper-based classifier

### whisper small simple classifier

In [7]:
from transformers import AutoModelForAudioClassification, WhisperProcessor

# Load the fine-tuned audio-classification checkpoint from local disk.
# NOTE(review): this is an absolute, machine-specific path; the cell cannot be
# re-run elsewhere without changing it.
model = AutoModelForAudioClassification.from_pretrained(
    "/home/llm_agent/video_audio_pipeline/emotion_dusha/whisper-emotion-classfication-layernorm/checkpoint-14094"
)

# Feature extraction uses the stock whisper-small processor.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

In [8]:
# label_mapping = {'neutral': 0, 'angry': 1, 'positive': 2, 'sad': 3, 'other': 4}

def prepare_dataset(batch):
    """Add Whisper `input_features` and `labels` to a single dataset example.

    Uses the notebook-level `processor`. The `emotion` value is copied into
    `labels` unchanged (presumably already an integer class id — verify
    against the dataset).
    """
    clip = batch['audio']
    encoded = processor(clip['array'], sampling_rate=clip['sampling_rate'])
    batch['input_features'] = encoded.input_features[0]
    batch['labels'] = batch["emotion"]
    return batch

# Featurise every test example with 32 worker processes. The original
# `audio` and `emotion` columns are kept alongside the new ones.
processed_dusha = dusha_test_dataset.map(
    prepare_dataset,
    num_proc=32
)

Map (num_proc=32):   0%|          | 0/3601 [00:00<?, ? examples/s]

In [15]:
from torch.utils.data import DataLoader
# Batch the processed Dusha examples (16 per batch) using the padding collator.
data_collator = DataCollatorForEncoderClassification(processor)
dataloader = DataLoader(processed_dusha, batch_size=16, collate_fn=data_collator)

In [16]:
# Score the currently loaded `model` on `dataloader` and print each metric as a percentage.
metrics = evaluate_model(model, dataloader)
print(f"Accuracy:           {metrics['accuracy']:.2%}")
print(f"Balanced accuracy:  {metrics['balanced_accuracy']:.2%}")
print(f"Precision:          {metrics['precision']:.2%}")
print(f"Recall:             {metrics['recall']:.2%}")
print(f"F1 Score:           {metrics['f1']:.2%}")

Evaluating: 100%|██████████| 226/226 [09:31<00:00,  2.53s/it]


Accuracy:           71.37%
Balanced accuracy:  70.58%
Precision:          82.59%
Recall:             70.58%
F1 Score:           74.14%


### whisper small mlp classifier (dropout=0.2)

In [19]:
# Load the MLP-head Whisper-small classifier from the Hub.
model_name = "nixiieee/whisper-small-emotion-classifier-dusha"
# The processor is reloaded with `return_attention_mask=True`.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", return_attention_mask=True)
# NOTE(review): `config` is loaded but not passed to `from_pretrained` below.
config = AutoConfig.from_pretrained(model_name)
# `model_name` is passed twice: once as the checkpoint to load and once as the
# extra constructor argument of WhisperForEmotionClassification.
model = WhisperForEmotionClassification.from_pretrained(pretrained_model_name_or_path=model_name, model_name=model_name, num_labels=5, dropout=0.2)

Some weights of WhisperModel were not initialized from the model checkpoint at nixiieee/whisper-small-emotion-classifier-dusha and are newly initialized: ['decoder.embed_positions.weight', 'decoder.embed_tokens.weight', 'decoder.layer_norm.bias', 'decoder.layer_norm.weight', 'decoder.layers.0.encoder_attn.k_proj.weight', 'decoder.layers.0.encoder_attn.out_proj.bias', 'decoder.layers.0.encoder_attn.out_proj.weight', 'decoder.layers.0.encoder_attn.q_proj.bias', 'decoder.layers.0.encoder_attn.q_proj.weight', 'decoder.layers.0.encoder_attn.v_proj.bias', 'decoder.layers.0.encoder_attn.v_proj.weight', 'decoder.layers.0.encoder_attn_layer_norm.bias', 'decoder.layers.0.encoder_attn_layer_norm.weight', 'decoder.layers.0.fc1.bias', 'decoder.layers.0.fc1.weight', 'decoder.layers.0.fc2.bias', 'decoder.layers.0.fc2.weight', 'decoder.layers.0.final_layer_norm.bias', 'decoder.layers.0.final_layer_norm.weight', 'decoder.layers.0.self_attn.k_proj.weight', 'decoder.layers.0.self_attn.out_proj.bias', 'de

In [21]:
# Score the currently loaded `model` on `dataloader` and print each metric as a percentage.
metrics = evaluate_model(model, dataloader)
print(f"Accuracy:           {metrics['accuracy']:.2%}")
print(f"Balanced accuracy:  {metrics['balanced_accuracy']:.2%}")
print(f"Precision:          {metrics['precision']:.2%}")
print(f"Recall:             {metrics['recall']:.2%}")
print(f"F1 Score:           {metrics['f1']:.2%}")

Evaluating: 100%|██████████| 226/226 [10:00<00:00,  2.66s/it]


Accuracy:           74.70%
Balanced accuracy:  77.39%
Precision:          79.01%
Recall:             77.39%
F1 Score:           77.93%


### whisper small mlp classifier (dropout=0.1)

In [22]:
# Evaluate the dropout=0.1 variant of the Whisper-small MLP classifier on Dusha.
model_name = "nixiieee/whisper-small-emotion-classifier-dusha2"
# WhisperForEmotionClassification.__init__ requires `model_name` (it has no
# default), so it must be forwarded through from_pretrained like in the other
# loading cells; without it the constructor call fails on a fresh kernel.
model = WhisperForEmotionClassification.from_pretrained(pretrained_model_name_or_path=model_name, model_name=model_name, num_labels=5, dropout=0.1)
metrics = evaluate_model(model, dataloader)
print(f"Accuracy:           {metrics['accuracy']:.2%}")
print(f"Balanced accuracy:  {metrics['balanced_accuracy']:.2%}")
print(f"Precision:          {metrics['precision']:.2%}")
print(f"Recall:             {metrics['recall']:.2%}")
print(f"F1 Score:           {metrics['f1']:.2%}")

config.json:   0%|          | 0.00/2.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/354M [00:00<?, ?B/s]

Evaluating: 100%|██████████| 226/226 [10:05<00:00,  2.68s/it]


Accuracy:           76.87%
Balanced accuracy:  79.30%
Precision:          81.04%
Recall:             79.30%
F1 Score:           79.99%


### whisper large v3 turbo mlp classifier

In [8]:
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3-turbo")

def prepare_dataset(batch):
    """Add Whisper `input_features` and `labels` to a single dataset example.

    Uses the notebook-level `processor`; `emotion` is copied into `labels`
    unchanged.
    """
    clip = batch['audio']
    encoded = processor(clip['array'], sampling_rate=clip['sampling_rate'])
    batch['input_features'] = encoded.input_features[0]
    batch['labels'] = batch["emotion"]
    return batch

# Featurise every test example with 32 worker processes and drop the raw
# `audio` / `emotion` columns so only the derived columns remain.
processed_dusha = dusha_test_dataset.map(
    prepare_dataset,
    num_proc=32,
    remove_columns=["audio", "emotion"],
)

Map (num_proc=32):   0%|          | 0/3601 [00:00<?, ? examples/s]

In [9]:
from torch.utils.data import DataLoader
# Batch the processed Dusha examples (16 per batch) using the padding collator.
data_collator = DataCollatorForEncoderClassification(processor)
dataloader = DataLoader(processed_dusha, batch_size=16, collate_fn=data_collator)

In [20]:
# Load the large-v3 MLP classifier from the Hub and evaluate it on Dusha.
model_name = "nixiieee/whisper-large-v3-emotion-classifier-dusha"
# config = AutoConfig.from_pretrained(model_name)
# `model_name` is passed both as the checkpoint to load and as the extra
# constructor argument of WhisperForEmotionClassification.
model = WhisperForEmotionClassification.from_pretrained(pretrained_model_name_or_path=model_name, model_name=model_name, num_labels=5, dropout=0.05)

# Print each metric as a percentage.
metrics = evaluate_model(model, dataloader)
print(f"Accuracy:           {metrics['accuracy']:.2%}")
print(f"Balanced accuracy:  {metrics['balanced_accuracy']:.2%}")
print(f"Precision:          {metrics['precision']:.2%}")
print(f"Recall:             {metrics['recall']:.2%}")
print(f"F1 Score:           {metrics['f1']:.2%}")

Some weights of WhisperModel were not initialized from the model checkpoint at nixiieee/whisper-large-v3-emotion-classifier-dusha and are newly initialized: ['decoder.embed_positions.weight', 'decoder.embed_tokens.weight', 'decoder.layer_norm.bias', 'decoder.layer_norm.weight', 'decoder.layers.0.encoder_attn.k_proj.weight', 'decoder.layers.0.encoder_attn.out_proj.bias', 'decoder.layers.0.encoder_attn.out_proj.weight', 'decoder.layers.0.encoder_attn.q_proj.bias', 'decoder.layers.0.encoder_attn.q_proj.weight', 'decoder.layers.0.encoder_attn.v_proj.bias', 'decoder.layers.0.encoder_attn.v_proj.weight', 'decoder.layers.0.encoder_attn_layer_norm.bias', 'decoder.layers.0.encoder_attn_layer_norm.weight', 'decoder.layers.0.fc1.bias', 'decoder.layers.0.fc1.weight', 'decoder.layers.0.fc2.bias', 'decoder.layers.0.fc2.weight', 'decoder.layers.0.final_layer_norm.bias', 'decoder.layers.0.final_layer_norm.weight', 'decoder.layers.0.self_attn.k_proj.weight', 'decoder.layers.0.self_attn.out_proj.bias', 

Accuracy:           78.87%
Balanced accuracy:  81.55%
Precision:          80.71%
Recall:             81.55%
F1 Score:           81.05%





## Gigaam-EMO

In [5]:
# Pick the GPU when available and build the GigaAM emotion wrapper.
# NOTE(review): GigaEmotionInferencer only stores `device`; it does not move
# the loaded model to it.
device = "cuda" if torch.cuda.is_available() else "cpu"
audio_emotion_model = GigaEmotionInferencer(device=device)

  checkpoint = torch.load(model_path, map_location="cpu")


In [None]:
import evaluate
import torch
from sklearn.metrics import balanced_accuracy_score
from tqdm import tqdm
import soundfile as sf
import tempfile

# Load the four `evaluate` metrics used to score the GigaAM-EMO predictions.
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

# Parallel lists of predicted and reference class ids, one entry per sample.
all_predictions = []
all_labels = []

# Maps the emotion names returned by the model to integer class ids.
EMOTION_LABELS = {'neutral' : 0, 'angry' : 1, 'positive' : 2, 'sad' : 3, 'other' : 4}

with torch.no_grad():
    for sample in tqdm(dusha_test_dataset):
        audio = sample['audio']['array']
        sr = sample['audio']['sampling_rate']
        # The wrapper is given a file path, so each clip is written to a
        # temporary .wav that is removed when the `with` block exits.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp:
            wav_tmp_path = tmp.name
            sf.write(wav_tmp_path, audio, samplerate=sr)
            emo, _ = audio_emotion_model.predict_emotion(wav_tmp_path)
        # Dataset labels are used as-is; predictions are mapped name -> id.
        labels = sample["emotion"]
        all_predictions.append(EMOTION_LABELS[emo])
        all_labels.append(labels)

# Precision, recall and F1 are macro-averaged over classes.
accuracy = accuracy_metric.compute(predictions=all_predictions, references=all_labels)
balanced_accuracy = balanced_accuracy_score(all_labels, all_predictions)
precision = precision_metric.compute(predictions=all_predictions, references=all_labels, average="macro")
recall = recall_metric.compute(predictions=all_predictions, references=all_labels, average="macro")
f1 = f1_metric.compute(predictions=all_predictions, references=all_labels, average="macro")

print(f"Accuracy: {accuracy['accuracy']:.2%}")
print(f"Balanced accuracy: {balanced_accuracy:.2%}")
print(f"Precision: {precision['precision']:.2%}")
print(f"Recall: {recall['recall']:.2%}")
print(f"F1 Score: {f1['f1']:.2%}")

100%|██████████| 3601/3601 [04:28<00:00, 13.39it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 86.81%
Balanced accuracy: 71.85%
Precision: 69.85%
Recall: 71.85%
F1 Score: 70.81%


## Gigaam mlp classifier

In [None]:
# Load processor, config and model for the GigaAM MLP classifier.
# `trust_remote_code=True` lets the Hub repository's own code run locally.
model_name = "nixiieee/gigaam-rnnt-emotion-classifier-dusha"
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
# The explicit `config` is passed here, together with `model_name` for the constructor.
model = ModelForEmotionClassification.from_pretrained(model_name, config=config, model_name=model_name)

In [22]:
def prepare_dataset(batch):
    """Add `input_features`, `input_lengths` and `labels` to one example.

    Uses the notebook-level `processor`; the first element of each processor
    output is stored, and `emotion` is copied into `labels` unchanged.
    """
    clip = batch["audio"]
    outputs = processor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = outputs["input_features"][0]
    batch["input_lengths"] = outputs["input_lengths"][0]
    batch["labels"] = batch["emotion"]
    return batch

# Featurise the test split in a single process and drop the raw columns.
processed_dusha = dusha_test_dataset.map(prepare_dataset, remove_columns=['audio','emotion'], num_proc=1)

Map:   0%|          | 0/3601 [00:00<?, ? examples/s]

In [None]:
# Batch the processed examples (16 per batch) with the GigaAM collator, which
# stacks features without padding.
data_collator = DataCollatorForEncoderClassificationGigaAM(processor)
dataloader = DataLoader(processed_dusha, batch_size=16, collate_fn=data_collator)

In [33]:
# Score the currently loaded `model` on `dataloader` and print each metric as a percentage.
metrics = evaluate_model(model, dataloader)
print(f"Accuracy:           {metrics['accuracy']:.2%}")
print(f"Balanced accuracy:  {metrics['balanced_accuracy']:.2%}")
print(f"Precision:          {metrics['precision']:.2%}")
print(f"Recall:             {metrics['recall']:.2%}")
print(f"F1 Score:           {metrics['f1']:.2%}")

Evaluating: 100%|██████████| 226/226 [08:36<00:00,  2.29s/it]


Accuracy:           81.53%
Balanced accuracy:  83.68%
Precision:          84.44%
Recall:             83.68%
F1 Score:           84.03%


# resd

In [114]:
from datasets import load_dataset, concatenate_datasets

# Load RESD and merge its train and test splits into one evaluation set.
resd_dataset = load_dataset("Aniemore/resd")
resd_dataset = concatenate_datasets([resd_dataset["train"], resd_dataset["test"]])

In [115]:
resd_dataset[0]

{'name': '32_happiness_enthusiasm_h_120',
 'path': 'happiness_enthusiasm_32/32_happiness_enthusiasm_h_120.wav',
 'emotion': 'happiness',
 'speech': {'path': '32_happiness_enthusiasm_h_120.wav',
  'array': array([-0.00018311, -0.00061035, -0.00076294, ...,  0.00085449,
          0.00048828,  0.00030518], shape=(82211,)),
  'sampling_rate': 16000}}

In [116]:
import numpy as np
np.unique(resd_dataset['emotion'])

array(['anger', 'disgust', 'enthusiasm', 'fear', 'happiness', 'neutral',
       'sadness'], dtype='<U10')

In [117]:
from collections import Counter

def get_label_dist(ds, label="emotion"):
    """Print a `Counter` of the values in column `label` of `ds`; returns None."""
    print(Counter(ds[label]))

get_label_dist(resd_dataset)

Counter({'fear': 223, 'anger': 219, 'happiness': 218, 'enthusiasm': 198, 'neutral': 191, 'disgust': 185, 'sadness': 162})


In [118]:
def remap_emotion(example):
    """Relabel fear/disgust/enthusiasm examples from their folder name.

    For those three emotions the first path component (e.g.
    'happiness_enthusiasm_32') is split on '_' and its second token replaces
    `example["emotion"]`. Other emotions are left untouched. The example is
    modified in place and returned.
    """
    current = example["emotion"]
    file_path = example["path"]

    if current not in ["fear", "disgust", "enthusiasm"]:
        return example

    # First path component, e.g. 'happiness_enthusiasm_32'.
    folder = file_path.split("/")[0]
    tokens = folder.split("_")

    # The first two tokens are assumed to be emotion names; take the second.
    if len(tokens) >= 2:
        example["emotion"] = tokens[1]

    return example

# Apply the relabelling and show the new class counts.
# NOTE: this rebinds `resd_dataset`, so re-running the cell relabels the
# already-relabelled data.
resd_dataset = resd_dataset.map(remap_emotion)
get_label_dist(resd_dataset)

Counter({'happiness': 269, 'anger': 237, 'neutral': 231, 'sadness': 203, 'fear': 172, 'disgust': 161, 'enthusiasm': 123})


In [119]:
def limit_class_samples(ds, label_field='label', class_limits=None, seed=42):
    """Cap the number of examples kept for selected classes.

    Args:
        ds: Dataset supporting `filter`, `select` and `len`.
        label_field: Column holding the class value.
        class_limits: Mapping `{class_value: max_count}`. Classes not listed
            are kept in full; a limit of 0 removes the class. None (the
            default) means no limits.
        seed: Seed for the final shuffle.

    Returns:
        The concatenation of the capped and uncapped subsets, shuffled.
    """
    # The default used to crash on `None.items()`; treat it as "no limits".
    if class_limits is None:
        class_limits = {}

    filtered_subsets = []

    for class_value, max_count in class_limits.items():
        subset = ds.filter(lambda x: x[label_field] == class_value)
        # Keep the first `max_count` rows of the class (or all, if fewer).
        subset = subset.select(range(min(len(subset), max_count)))
        filtered_subsets.append(subset)

    # Everything belonging to a class without a limit passes through untouched.
    all_limited_classes = set(class_limits.keys())
    remaining = ds.filter(lambda x: x[label_field] not in all_limited_classes)
    filtered_subsets.append(remaining)

    new_split = concatenate_datasets(filtered_subsets).shuffle(seed=seed)

    return new_split

# A limit of 0 removes the "disgust" and "fear" classes entirely.
resd_dataset = limit_class_samples(resd_dataset, label_field='emotion', class_limits={ "disgust" : 0, "fear" : 0})
get_label_dist(resd_dataset)

Filter:   0%|          | 0/1396 [00:00<?, ? examples/s]

Counter({'happiness': 269, 'anger': 237, 'neutral': 231, 'sadness': 203, 'enthusiasm': 123})


## Whisper models

### whisper small simple classifier

In [120]:
from transformers import AutoModelForAudioClassification, WhisperProcessor

# Load the fine-tuned audio-classification checkpoint from local disk.
# NOTE(review): this is an absolute, machine-specific path; the cell cannot be
# re-run elsewhere without changing it.
model = AutoModelForAudioClassification.from_pretrained(
    "/home/llm_agent/video_audio_pipeline/emotion_dusha/whisper-emotion-classfication-layernorm/checkpoint-14094"
)

# Feature extraction uses the stock whisper-small processor.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

In [121]:
label_mapping = {'neutral': 0, 'anger': 1, 'happiness': 2, 'enthusiasm' : 2, 'sadness': 3}

def preprocess_function(example):
    """Add Whisper `input_features` and integer `labels` to one RESD example.

    Uses the notebook-level `processor` and `label_mapping`; an emotion that
    is missing from `label_mapping` raises KeyError.
    """
    audio = example['speech']
    # Pass the clip's own sampling rate (as the Dusha preprocessing does)
    # rather than a hard-coded 16000, so a clip recorded at another rate is
    # not silently treated as 16 kHz.
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"])
    example["input_features"] = inputs.input_features[0]
    example["labels"] = label_mapping[example["emotion"]]
    return example

# Featurise RESD in a single process; the original columns are kept.
processed_resd = resd_dataset.map(preprocess_function, num_proc=1)

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [122]:
get_label_dist(processed_resd, label='labels')

Counter({2: 392, 1: 237, 0: 231, 3: 203})


In [123]:
from torch.utils.data import DataLoader
# Batch the processed RESD examples (8 per batch) using the padding collator.
data_collator = DataCollatorForEncoderClassification(processor)
dataloader = DataLoader(processed_resd, batch_size=8, collate_fn=data_collator)

In [124]:
# Score the currently loaded `model` on `dataloader` and print each metric as a percentage.
metrics = evaluate_model(model, dataloader)
print(f"Accuracy:           {metrics['accuracy']:.2%}")
print(f"Balanced accuracy:  {metrics['balanced_accuracy']:.2%}")
print(f"Precision:          {metrics['precision']:.2%}")
print(f"Recall:             {metrics['recall']:.2%}")
print(f"F1 Score:           {metrics['f1']:.2%}")

Evaluating: 100%|██████████| 133/133 [03:41<00:00,  1.66s/it]

Accuracy:           35.37%
Balanced accuracy:  39.10%
Precision:          33.34%
Recall:             31.28%
F1 Score:           29.36%



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### whisper small mlp classifier (dropout=0.2)

In [125]:
# Load the dropout=0.2 Whisper-small MLP classifier and evaluate it on RESD.
model_name = "nixiieee/whisper-small-emotion-classifier-dusha"
# `model_name` is passed both as the checkpoint to load and as the extra
# constructor argument of WhisperForEmotionClassification.
model = WhisperForEmotionClassification.from_pretrained(pretrained_model_name_or_path=model_name, model_name=model_name, num_labels=5, dropout=0.2)

# Print each metric as a percentage.
metrics = evaluate_model(model, dataloader)
print(f"Accuracy:           {metrics['accuracy']:.2%}")
print(f"Balanced accuracy:  {metrics['balanced_accuracy']:.2%}")
print(f"Precision:          {metrics['precision']:.2%}")
print(f"Recall:             {metrics['recall']:.2%}")
print(f"F1 Score:           {metrics['f1']:.2%}")

Some weights of WhisperModel were not initialized from the model checkpoint at nixiieee/whisper-small-emotion-classifier-dusha and are newly initialized: ['decoder.embed_positions.weight', 'decoder.embed_tokens.weight', 'decoder.layer_norm.bias', 'decoder.layer_norm.weight', 'decoder.layers.0.encoder_attn.k_proj.weight', 'decoder.layers.0.encoder_attn.out_proj.bias', 'decoder.layers.0.encoder_attn.out_proj.weight', 'decoder.layers.0.encoder_attn.q_proj.bias', 'decoder.layers.0.encoder_attn.q_proj.weight', 'decoder.layers.0.encoder_attn.v_proj.bias', 'decoder.layers.0.encoder_attn.v_proj.weight', 'decoder.layers.0.encoder_attn_layer_norm.bias', 'decoder.layers.0.encoder_attn_layer_norm.weight', 'decoder.layers.0.fc1.bias', 'decoder.layers.0.fc1.weight', 'decoder.layers.0.fc2.bias', 'decoder.layers.0.fc2.weight', 'decoder.layers.0.final_layer_norm.bias', 'decoder.layers.0.final_layer_norm.weight', 'decoder.layers.0.self_attn.k_proj.weight', 'decoder.layers.0.self_attn.out_proj.bias', 'de

Accuracy:           34.81%
Balanced accuracy:  37.83%
Precision:          33.64%
Recall:             30.27%
F1 Score:           27.15%



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### whisper small mlp classifier (dropout=0.1)

In [126]:
# Load the dropout=0.1 Whisper-small MLP classifier and evaluate it on RESD.
model_name = "nixiieee/whisper-small-emotion-classifier-dusha2"
# `model_name` is passed both as the checkpoint to load and as the extra
# constructor argument of WhisperForEmotionClassification.
model = WhisperForEmotionClassification.from_pretrained(pretrained_model_name_or_path=model_name, model_name=model_name, num_labels=5, dropout=0.1)

# Print each metric as a percentage.
metrics = evaluate_model(model, dataloader)
print(f"Accuracy:           {metrics['accuracy']:.2%}")
print(f"Balanced accuracy:  {metrics['balanced_accuracy']:.2%}")
print(f"Precision:          {metrics['precision']:.2%}")
print(f"Recall:             {metrics['recall']:.2%}")
print(f"F1 Score:           {metrics['f1']:.2%}")

Some weights of WhisperModel were not initialized from the model checkpoint at nixiieee/whisper-small-emotion-classifier-dusha2 and are newly initialized: ['decoder.embed_positions.weight', 'decoder.embed_tokens.weight', 'decoder.layer_norm.bias', 'decoder.layer_norm.weight', 'decoder.layers.0.encoder_attn.k_proj.weight', 'decoder.layers.0.encoder_attn.out_proj.bias', 'decoder.layers.0.encoder_attn.out_proj.weight', 'decoder.layers.0.encoder_attn.q_proj.bias', 'decoder.layers.0.encoder_attn.q_proj.weight', 'decoder.layers.0.encoder_attn.v_proj.bias', 'decoder.layers.0.encoder_attn.v_proj.weight', 'decoder.layers.0.encoder_attn_layer_norm.bias', 'decoder.layers.0.encoder_attn_layer_norm.weight', 'decoder.layers.0.fc1.bias', 'decoder.layers.0.fc1.weight', 'decoder.layers.0.fc2.bias', 'decoder.layers.0.fc2.weight', 'decoder.layers.0.final_layer_norm.bias', 'decoder.layers.0.final_layer_norm.weight', 'decoder.layers.0.self_attn.k_proj.weight', 'decoder.layers.0.self_attn.out_proj.bias', 'd

Accuracy:           36.22%
Balanced accuracy:  39.22%
Precision:          34.69%
Recall:             31.38%
F1 Score:           28.32%



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### whisper large v3 turbo mlp classifier

In [127]:
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3-turbo")

def prepare_dataset(example):
    """Add Whisper `input_features` and integer `labels` to one RESD example.

    Uses the notebook-level `processor` and `label_mapping`; an emotion that
    is missing from `label_mapping` raises KeyError.
    """
    audio = example['speech']
    # Pass the clip's own sampling rate (as the Dusha preprocessing does)
    # rather than a hard-coded 16000, so a clip recorded at another rate is
    # not silently treated as 16 kHz.
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"])
    example["input_features"] = inputs.input_features[0]
    example["labels"] = label_mapping[example["emotion"]]
    return example

# Featurise RESD in a single process with the large-v3-turbo processor; the
# original columns are kept.
processed_resd = resd_dataset.map(
    prepare_dataset,
    num_proc=1,
)

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [128]:
from torch.utils.data import DataLoader
# Batch the processed RESD examples (16 per batch) using the padding collator.
data_collator = DataCollatorForEncoderClassification(processor)
dataloader = DataLoader(processed_resd, batch_size=16, collate_fn=data_collator)

In [None]:
# Load the large-v3 MLP classifier from the Hub and evaluate it on RESD.
model_name = "nixiieee/whisper-large-v3-emotion-classifier-dusha"
# config = AutoConfig.from_pretrained(model_name)
# `model_name` is passed both as the checkpoint to load and as the extra
# constructor argument of WhisperForEmotionClassification.
model = WhisperForEmotionClassification.from_pretrained(pretrained_model_name_or_path=model_name, model_name=model_name, num_labels=5, dropout=0.05)

# Print each metric as a percentage. (A stray trailing "=" after the Recall
# line made this cell a syntax error; it has been removed.)
metrics = evaluate_model(model, dataloader)
print(f"Accuracy:           {metrics['accuracy']:.2%}")
print(f"Balanced accuracy:  {metrics['balanced_accuracy']:.2%}")
print(f"Precision:          {metrics['precision']:.2%}")
print(f"Recall:             {metrics['recall']:.2%}")
print(f"F1 Score:           {metrics['f1']:.2%}")

Some weights of WhisperModel were not initialized from the model checkpoint at nixiieee/whisper-large-v3-emotion-classifier-dusha and are newly initialized: ['decoder.embed_positions.weight', 'decoder.embed_tokens.weight', 'decoder.layer_norm.bias', 'decoder.layer_norm.weight', 'decoder.layers.0.encoder_attn.k_proj.weight', 'decoder.layers.0.encoder_attn.out_proj.bias', 'decoder.layers.0.encoder_attn.out_proj.weight', 'decoder.layers.0.encoder_attn.q_proj.bias', 'decoder.layers.0.encoder_attn.q_proj.weight', 'decoder.layers.0.encoder_attn.v_proj.bias', 'decoder.layers.0.encoder_attn.v_proj.weight', 'decoder.layers.0.encoder_attn_layer_norm.bias', 'decoder.layers.0.encoder_attn_layer_norm.weight', 'decoder.layers.0.fc1.bias', 'decoder.layers.0.fc1.weight', 'decoder.layers.0.fc2.bias', 'decoder.layers.0.fc2.weight', 'decoder.layers.0.final_layer_norm.bias', 'decoder.layers.0.final_layer_norm.weight', 'decoder.layers.0.self_attn.k_proj.weight', 'decoder.layers.0.self_attn.out_proj.bias', 

Accuracy:           27.38%
Balanced accuracy:  26.95%
Precision:          35.55%
Recall:             21.56%
F1 Score:           26.01%



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Gigaam-EMO

In [130]:
# Pick the GPU when available and build the GigaAM emotion wrapper.
# NOTE(review): GigaEmotionInferencer only stores `device`; it does not move
# the loaded model to it.
device = "cuda" if torch.cuda.is_available() else "cpu"
audio_emotion_model = GigaEmotionInferencer(device=device)

  checkpoint = torch.load(model_path, map_location="cpu")


In [131]:
import os
import tempfile

import evaluate
import soundfile as sf
import torch
from sklearn.metrics import balanced_accuracy_score
from tqdm import tqdm

# Metric objects from the `evaluate` library; balanced accuracy is computed
# separately with scikit-learn further down.
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

all_predictions = []
all_labels = []

# Class ids for the emotion strings returned by `predict_emotion`.
EMOTION_LABELS = {'neutral' : 0, 'angry' : 1, 'positive' : 2, 'sad' : 3, 'other' : 4}
# Class ids for the dataset's emotion strings; several dataset emotions share
# one class id so that both vocabularies land in the same label space.
label_mapping = {'neutral': 0, 'anger': 1, 'happiness': 2, 'enthusiasm' : 2, 'sadness': 3, 'disgust' : 4, 'fear' : 4}

# `predict_emotion` is called with a file path, so every clip is written to
# disk first. A single temporary directory with one reusable wav file is used
# instead of a per-sample NamedTemporaryFile: reopening a NamedTemporaryFile by
# name while it is still open is platform-dependent and fails on Windows.
with torch.no_grad(), tempfile.TemporaryDirectory() as tmp_dir:
    wav_tmp_path = os.path.join(tmp_dir, "sample.wav")
    for sample in tqdm(resd_dataset):
        audio = sample['speech']['array']
        sr = sample['speech']['sampling_rate']
        # Each write replaces the previous clip at the same path.
        sf.write(wav_tmp_path, audio, samplerate=sr)
        emo, _ = audio_emotion_model.predict_emotion(wav_tmp_path)
        all_predictions.append(EMOTION_LABELS[emo])
        all_labels.append(label_mapping[sample["emotion"]])

# Precision, recall and F1 are macro-averaged over classes.
accuracy = accuracy_metric.compute(predictions=all_predictions, references=all_labels)
balanced_accuracy = balanced_accuracy_score(all_labels, all_predictions)
precision = precision_metric.compute(predictions=all_predictions, references=all_labels, average="macro")
recall = recall_metric.compute(predictions=all_predictions, references=all_labels, average="macro")
f1 = f1_metric.compute(predictions=all_predictions, references=all_labels, average="macro")

print(f"Accuracy: {accuracy['accuracy']:.2%}")
print(f"Balanced accuracy: {balanced_accuracy:.2%}")
print(f"Precision: {precision['precision']:.2%}")
print(f"Recall: {recall['recall']:.2%}")
print(f"F1 Score: {f1['f1']:.2%}")

100%|██████████| 1063/1063 [01:25<00:00, 12.36it/s]


Accuracy: 53.34%
Balanced accuracy: 53.68%
Precision: 57.65%
Recall: 53.68%
F1 Score: 50.84%


## GigaAM MLP classifier

In [132]:
# Checkpoint shared by the processor, the config and the classifier below.
model_name = "nixiieee/gigaam-rnnt-emotion-classifier-dusha"

# The processor and config are loaded with trust_remote_code=True.
processor = AutoProcessor.from_pretrained(
    model_name,
    trust_remote_code=True,
)
config = AutoConfig.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# NOTE(review): the load warning captured for this cell lists
# `model.encoder.layers.*` weights as newly initialized — confirm that the
# encoder weights are really restored from the checkpoint before trusting the
# metrics computed with this model.
model = ModelForEmotionClassification.from_pretrained(
    model_name,
    config=config,
    model_name=model_name,
)

Some weights of GigaAMRNNTHF were not initialized from the model checkpoint at nixiieee/gigaam-rnnt-emotion-classifier-dusha and are newly initialized: ['model.encoder.layers.0.conv.batch_norm.bias', 'model.encoder.layers.0.conv.batch_norm.num_batches_tracked', 'model.encoder.layers.0.conv.batch_norm.running_mean', 'model.encoder.layers.0.conv.batch_norm.running_var', 'model.encoder.layers.0.conv.batch_norm.weight', 'model.encoder.layers.0.conv.depthwise_conv.bias', 'model.encoder.layers.0.conv.depthwise_conv.weight', 'model.encoder.layers.0.conv.pointwise_conv1.bias', 'model.encoder.layers.0.conv.pointwise_conv1.weight', 'model.encoder.layers.0.conv.pointwise_conv2.bias', 'model.encoder.layers.0.conv.pointwise_conv2.weight', 'model.encoder.layers.0.feed_forward1.linear1.bias', 'model.encoder.layers.0.feed_forward1.linear1.weight', 'model.encoder.layers.0.feed_forward1.linear2.bias', 'model.encoder.layers.0.feed_forward1.linear2.weight', 'model.encoder.layers.0.feed_forward2.linear1.bi

In [133]:
def prepare_dataset(batch):
    """Add model inputs and an integer label to one dataset example.

    The example's ``speech`` entry (its ``array`` and ``sampling_rate``) is
    passed to the module-level ``processor``. The first element of the returned
    ``input_features`` and ``input_lengths`` is stored on the example, and the
    ``emotion`` string is translated to a class id via the module-level
    ``label_mapping``.

    Returns the same ``batch`` object, updated in place.
    """
    speech = batch["speech"]
    encoded = processor(speech["array"], sampling_rate=speech["sampling_rate"])
    # The processor output is indexed with [0]: only its first entry is kept.
    for key in ("input_features", "input_lengths"):
        batch[key] = encoded[key][0]
    batch["labels"] = label_mapping[batch["emotion"]]
    return batch

# Apply prepare_dataset to the dataset in a single process, passing the raw
# columns as `remove_columns`.
processed_resd = resd_dataset.map(
    prepare_dataset,
    remove_columns=['speech', 'name', 'path', 'emotion'],
    num_proc=1,
)

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [134]:
from torch.nn.utils.rnn import pad_sequence

class DataCollatorForEncoderClassificationGigaAM:
    """Collate per-example feature dicts into a single zero-padded batch.

    Every element of ``features`` must provide ``input_features``, ``labels``
    and ``input_lengths``. The result is a dict with the same three keys:
    ``input_features`` holds the padded feature tensor, while ``labels`` and
    ``input_lengths`` are ``torch.long`` tensors with one entry per example.
    """

    def __call__(self, features):
        # Transpose each example so the axis that pad_sequence pads comes first.
        transposed = [torch.tensor(item["input_features"]).T for item in features]

        # Zero-pad along that leading axis up to the longest example in the
        # batch; the padded tensor is then [B, max_length, feat_dim].
        stacked = pad_sequence(transposed, batch_first=True, padding_value=0.0)

        def _as_long(key):
            # Gather one scalar per example into a torch.long tensor.
            return torch.tensor([item[key] for item in features], dtype=torch.long)

        return {
            # Swap the last two axes back to the per-example layout.
            "input_features": torch.transpose(stacked, 1, 2),
            "labels": _as_long("labels"),
            "input_lengths": _as_long("input_lengths"),
        }

In [135]:
# Build batches of 16 examples with the GigaAM collator.
data_collator = DataCollatorForEncoderClassificationGigaAM()
dataloader = DataLoader(
    processed_resd,
    batch_size=16,
    collate_fn=data_collator,
)

In [136]:
# With no `device` argument, evaluate_model picks "cuda:1" when CUDA is
# available and the CPU otherwise. That matches the previously hard-coded
# device on GPU hosts while still running on machines without CUDA.
metrics = evaluate_model(model, dataloader)

# Report each metric as a percentage.
print(f"Accuracy:           {metrics['accuracy']:.2%}")
print(f"Balanced accuracy:  {metrics['balanced_accuracy']:.2%}")
print(f"Precision:          {metrics['precision']:.2%}")
print(f"Recall:             {metrics['recall']:.2%}")
print(f"F1 Score:           {metrics['f1']:.2%}")

Evaluating: 100%|██████████| 67/67 [09:48<00:00,  8.79s/it]

Accuracy:           28.03%
Balanced accuracy:  26.50%
Precision:          36.45%
Recall:             21.20%
F1 Score:           26.60%



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
