In [11]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch
import librosa
import os
from tqdm import tqdm
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import umap
import plotly.express as px
import numpy as np
import openl3
import soundfile as sf
import numpy as np

In [26]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# One checkpoint name feeds both loaders, so the processor and the encoder
# always come from the same pretrained wav2vec 2.0 checkpoint.
_W2V_CHECKPOINT = "facebook/wav2vec2-base"
audio_processor = Wav2Vec2Processor.from_pretrained(_W2V_CHECKPOINT)
audio_model = Wav2Vec2Model.from_pretrained(_W2V_CHECKPOINT)



In [3]:
# Names (not full paths) of the entries in data/audio. os.listdir returns them
# in arbitrary order; the order of this list becomes the row order of the
# embeddings computed from it in the extraction loop.
audio_files = os.listdir('data/audio')

In [18]:
# Represent every clip by one fixed-size vector: the encoder's last hidden
# state averaged over dim 1.
audio_embeddings = []
for file in tqdm(audio_files):
    # Resample on load to 16 kHz, the rate that is passed to the processor below.
    audio, sr = librosa.load(f'data/audio/{file}', sr=16000)
    inputs = audio_processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():  # feature extraction only, no autograd graph needed
        outputs = audio_model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1)
    # Drop the batch axis and keep a plain NumPy vector, so the collected list
    # stacks with np.array independently of the device the tensor lived on.
    audio_embeddings.append(embedding[0].cpu().numpy())

100%|██████████| 7442/7442 [16:44<00:00,  7.41it/s]


In [21]:
# Stack the per-file vectors into one array and persist it, so the extraction
# loop does not have to be repeated.
np.save(file="audio_embeddings.npy", arr=np.array(audio_embeddings))

In [22]:
# Replace the in-memory list with the array stored on disk.
audio_embeddings = np.load(file="audio_embeddings.npy")

In [23]:
# Sanity check of the loaded array: one row per audio file, one column per
# embedding dimension.
np.shape(audio_embeddings)

(7442, 768)

# Klasyfikacja

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

In [25]:
class MLPClassifier(nn.Module):
    """Feed-forward classifier: a stack of Linear+ReLU hidden layers followed
    by a final Linear layer that produces one output per class."""

    def __init__(self, input_size, layer_sizes, num_classes):
        """
        Args:
        - input_size (int): Size of the input vector (e.g. 768).
        - layer_sizes (list of int): Size of each hidden layer, in order.
        - num_classes (int): Number of classes to predict.
        """
        super().__init__()

        # Consecutive pairs of widths describe each hidden Linear layer.
        widths = [input_size, *layer_sizes]
        hidden = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            hidden += [nn.Linear(fan_in, fan_out), nn.ReLU()]

        # The output layer has no activation; it returns the raw class scores.
        head = nn.Linear(widths[-1], num_classes)

        self.model = nn.Sequential(*hidden, head)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return its output."""
        scores = self.model(x)
        return scores

In [27]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs):
    """
    Train a model and print per-epoch loss and accuracy for both data splits.

    The model is moved to the notebook-level ``device``. Every epoch runs one
    optimisation pass over ``train_loader`` followed by a gradient-free pass
    over ``val_loader``. The reported loss is the mean of the per-batch loss
    values; the accuracy is computed over all samples seen in the pass.
    Nothing is returned.

    Args:
    - model: PyTorch model to train.
    - train_loader: DataLoader with the training data.
    - val_loader: DataLoader with the validation data.
    - criterion: Loss function (e.g. CrossEntropyLoss).
    - optimizer: Optimizer (e.g. Adam).
    - epochs: Number of epochs.
    """
    model.to(device)

    def _hard_labels(logits):
        # Index of the largest score in each row, as a NumPy array on the host.
        return torch.argmax(logits, dim=1).cpu().numpy()

    for epoch_idx in range(epochs):
        # ---- optimisation pass ----
        model.train()
        loss_sum_train = 0
        labels_train, guesses_train = [], []
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            logits = model(batch_x)

            batch_loss = criterion(logits, batch_y)
            batch_loss.backward()
            optimizer.step()

            loss_sum_train += batch_loss.item()
            guesses_train.extend(_hard_labels(logits))
            labels_train.extend(batch_y.cpu().numpy())

        train_accuracy = accuracy_score(labels_train, guesses_train)

        # ---- evaluation pass (no weight updates, no autograd graph) ----
        model.eval()
        loss_sum_val = 0
        labels_val, guesses_val = [], []
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)
                logits = model(batch_x)

                loss_sum_val += criterion(logits, batch_y).item()

                guesses_val.extend(_hard_labels(logits))
                labels_val.extend(batch_y.cpu().numpy())

        val_accuracy = accuracy_score(labels_val, guesses_val)

        # ---- report ----
        print(f"Epoch {epoch_idx + 1}/{epochs}")
        mean_train_loss = loss_sum_train / len(train_loader)
        print(f"Train Loss: {mean_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        mean_val_loss = loss_sum_val / len(val_loader)
        print(f"Val Loss: {mean_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
        print("-" * 50)

In [66]:
# Seed PyTorch's global RNG before the model is built, so the weight
# initialisation (and later torch-driven randomness such as DataLoader
# shuffling) is repeatable across kernel restarts.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

input_size = 768               # Input size (embedding dimension)
layer_sizes = [512, 256, 128]  # Hidden layer sizes
num_classes = 6                # Number of classes

model = MLPClassifier(input_size, layer_sizes, num_classes)

# Training hyper-parameters used by the data loaders and the optimizer.
batch_size = 32
learning_rate = 0.001
epochs = 20

In [29]:
# Reuse the listing the embeddings were computed from instead of scanning the
# directory a second time: os.listdir gives no ordering guarantee, and the
# labels parsed from these names must line up row by row with the embeddings.
files = list(audio_files)

In [31]:
# File names are underscore-separated: field 0 is taken as the speaker ID and
# field 2 as the emotion label.
speaker, y = [], []
for file in files:
    fields = file.split('_')
    emo, sp = fields[2], fields[0]
    y.append(emo)
    speaker.append(sp)

In [34]:
from collections import Counter

# Number of files per speaker ID; keys keep the order of first appearance.
c = Counter()
c.update(speaker)

In [42]:
# Files contributed by the first 70 speakers (in Counter order).
sum(n_files for n_files in list(c.values())[:70])

5721

In [43]:
# Files contributed by speakers 70..82 (in Counter order).
sum(n_files for n_files in list(c.values())[70:83])

1065

In [44]:
# Files contributed by the remaining speakers, from index 83 on (in Counter order).
sum(n_files for n_files in list(c.values())[83:])

656

In [55]:
# Integer class index for each emotion code; the position in the tuple is the index.
y_map = {
    code: index
    for index, code in enumerate(('NEU', 'HAP', 'SAD', 'ANG', 'FEA', 'DIS'))
}

In [61]:
# Speaker-independent split: the first 70 speakers (in order of first
# appearance) form the training set, the next 13 the validation set and the
# remaining ones the test set. Rows are selected by speaker membership rather
# than by fixed row offsets, so every speaker ends up in exactly one split even
# when the file listing is not grouped by speaker. For a listing that is
# grouped by speaker this equals slicing at rows 5721 and 6786 (the recorded
# per-speaker totals).
N_TRAIN_SPEAKERS = 70
N_VAL_SPEAKERS = 13

# Embeddings, speaker IDs and labels must describe the same files row by row.
if not (len(audio_embeddings) == len(speaker) == len(y)):
    raise ValueError(
        f"Embeddings and labels are misaligned: {len(audio_embeddings)} embeddings, "
        f"{len(speaker)} speaker ids, {len(y)} labels"
    )

# dict.fromkeys de-duplicates while keeping first-seen order.
speaker_order = list(dict.fromkeys(speaker))
train_speakers = speaker_order[:N_TRAIN_SPEAKERS]
val_speakers = speaker_order[N_TRAIN_SPEAKERS:N_TRAIN_SPEAKERS + N_VAL_SPEAKERS]

speaker_ids = np.asarray(speaker)
train_mask = np.isin(speaker_ids, train_speakers)
val_mask = np.isin(speaker_ids, val_speakers)
test_mask = ~(train_mask | val_mask)

# float32 features and int64 class indices, the dtypes the original
# torch.Tensor(...) / .long() conversions produced.
features = np.asarray(audio_embeddings, dtype=np.float32)
labels = np.asarray([y_map[a] for a in y], dtype=np.int64)

x_train = torch.from_numpy(features[train_mask])
x_val = torch.from_numpy(features[val_mask])
x_test = torch.from_numpy(features[test_mask])
y_train = torch.from_numpy(labels[train_mask])
y_val = torch.from_numpy(labels[val_mask])
y_test = torch.from_numpy(labels[test_mask])

In [62]:
# Pair each feature row with its label. Only the training loader reshuffles;
# validation batches are served in a fixed order.
train_dataset, val_dataset = (
    torch.utils.data.TensorDataset(x_train, y_train),
    torch.utils.data.TensorDataset(x_val, y_val),
)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, shuffle=True, batch_size=batch_size
)
val_loader = torch.utils.data.DataLoader(
    dataset=val_dataset, shuffle=False, batch_size=batch_size
)

In [67]:
# Cross-entropy loss on the model outputs, optimised with Adam over all model
# parameters; the per-epoch metrics are printed by train_model.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=epochs,
)

Epoch 1/20
Train Loss: 1.4534, Train Accuracy: 0.4036
Val Loss: 1.5624, Val Accuracy: 0.4254
--------------------------------------------------
Epoch 2/20
Train Loss: 1.2893, Train Accuracy: 0.4837
Val Loss: 1.3892, Val Accuracy: 0.4695
--------------------------------------------------
Epoch 3/20
Train Loss: 1.2099, Train Accuracy: 0.5268
Val Loss: 1.6170, Val Accuracy: 0.4423
--------------------------------------------------
Epoch 4/20
Train Loss: 1.1649, Train Accuracy: 0.5508
Val Loss: 1.5867, Val Accuracy: 0.4479
--------------------------------------------------
Epoch 5/20
Train Loss: 1.1128, Train Accuracy: 0.5770
Val Loss: 1.5235, Val Accuracy: 0.4498
--------------------------------------------------
Epoch 6/20
Train Loss: 1.0820, Train Accuracy: 0.5903
Val Loss: 1.5010, Val Accuracy: 0.4948
--------------------------------------------------
Epoch 7/20
Train Loss: 1.0556, Train Accuracy: 0.5966
Val Loss: 1.4963, Val Accuracy: 0.4657
-------------------------------------------