In [1]:
import os
import torch
import numpy as np 
import matplotlib.pyplot as pyplot

In [2]:
import torch
import torch.nn as nn
from speechbrain.lobes.models.ECAPA_TDNN import AttentiveStatisticsPooling

from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Hub id of the checkpoint loaded with Wav2Vec2ForCTC.from_pretrained; only its
# hidden states are consumed downstream as speech features.
BASE_MODEL = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
# Hub id of the checkpoint loaded with Wav2Vec2Processor.from_pretrained to
# preprocess raw waveforms before they reach the feature model.
PROC_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
# Currently disabled alternative model id, kept for reference:
# LINGUISTICS_MODEL = "jonatasgrosman/wav2vec2-large-xlsr-53-english"

INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
2024-11-17 23:34:11.399130: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731882851.420468  319509 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731882851.426954  319509 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-17 23:34:11.449985: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the

In [3]:
from torch.utils.data import DataLoader
from utils.datasets import InTheWildDataset


# Root folder of the In-the-Wild dataset. The default is the path this notebook
# was originally run with; export INTHEWILD_ROOT to point somewhere else
# without editing the cell.
INTHEWILD_ROOT = os.environ.get(
    "INTHEWILD_ROOT", "/home/infres/amathur-23/DADA/datasets/InTheWild"
)

# Training split served as triplets: a later cell reads the "anchor",
# "positive" and "negative" entries of each batch.
# NOTE(review): max_duration is presumably in seconds (4 s at 16 kHz) -- confirm
# against InTheWildDataset.
train_dataset = InTheWildDataset(
    root_dir=INTHEWILD_ROOT,
    metadata_file='meta.csv',
    include_spoofs=False,
    bonafide_label="bona-fide",
    filename_col="file",
    sampling_rate=16000,
    max_duration=4,
    split="train",
    config='configs/data/inthewild_toy.yaml',
    mode="triplet",
)

# Small shuffled batches are enough for the shape/smoke checks below.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [4]:
# Expose only the first GPU to this process, then fall back to the CPU when no
# CUDA device can be used.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [5]:
# Grab a single batch for the smoke test and move every entry to the selected
# device. The result keeps the name `input` (which shadows the builtin) because
# a later cell passes it to the model under that name.
first_batch = next(iter(train_loader))
input = {key: tensor.to(device) for key, tensor in first_batch.items()}

In [None]:
class BottleNeck(nn.Module):
    """Project features down to ``F_out``, regularise, and project back to ``F_in``.

    The pipeline is ``Linear(F_in -> F_out) -> BatchNorm1d(K) -> LeakyReLU ->
    Dropout -> Linear(F_out -> F_in)``, so the output has the same last
    dimension as the input.

    Args:
        K: ``num_features`` of the batch-norm layer. Because the norm is applied
            directly to the output of the first linear layer, dimension 1 of
            the input has to equal ``K``.
        F_in: Size of the last input dimension (and of the output).
        F_out: Width of the bottleneck.
        bottleneck_dropout: Dropout probability used inside the bottleneck.
    """

    def __init__(self, K=199, F_in=1024, F_out=256, bottleneck_dropout=0.5):
        super().__init__()
        self.bottleneck_dropout = bottleneck_dropout

        # Layers are created in the order they are applied in forward().
        self.lin1 = nn.Linear(in_features=F_in, out_features=F_out)
        self.bn1 = nn.BatchNorm1d(num_features=K)
        self.relu = nn.LeakyReLU()
        self.drop = nn.Dropout(p=bottleneck_dropout)
        self.lin2 = nn.Linear(in_features=F_out, out_features=F_in)

    def forward(self, x):
        """Return the reconstruction of ``x`` after the squeeze/expand round trip."""
        squeezed = self.bn1(self.lin1(x))
        activated = self.drop(self.relu(squeezed))
        return self.lin2(activated)

class CompressionModule(nn.Module):
    """Collapse the trailing axis, refine with a residual bottleneck, and embed.

    ``forward`` averages the input over its last dimension, passes the result
    through a ``BottleNeck``, adds the pooled tensor back as a residual, maps
    it to ``F_out`` features with the head, and L2-normalises the last
    dimension.

    Args:
        K: Forwarded to ``BottleNeck`` (size of the batch-norm layer there).
        F_in: Feature size of the pooled input.
        F_out: Size of the produced embedding (and bottleneck width).
        bottleneck_dropout: Dropout probability inside the bottleneck.
        head_dropout: Dropout probability applied before the head projection.
    """

    def __init__(
        self, K=199, F_in=1024, F_out=256, bottleneck_dropout=0.5, head_dropout=0.5
    ):
        super(CompressionModule, self).__init__()
        self.K = K
        self.bottleneck_dropout = bottleneck_dropout
        self.head_dropout = head_dropout

        self.bottleneck = BottleNeck(K, F_in, F_out, bottleneck_dropout)

        self.head = nn.Sequential(
            nn.Dropout(self.head_dropout),
            nn.LeakyReLU(),
            nn.Linear(F_in, F_out),
        )

    def pool(self, x):
        """Average ``x`` over its last dimension.

        Defined as a method rather than a lambda stored on the instance so the
        module stays picklable (e.g. for ``torch.save(model)``).
        """
        return torch.mean(x, dim=-1)

    def forward(self, x):
        """Return unit-norm embeddings computed from ``x``.

        NOTE(review): the caller in this notebook appears to pass stacked
        hidden states shaped (batch, K, F_in, n_layers) -- confirm.
        """
        x_pool = self.pool(x)
        x = self.bottleneck(x_pool)
        # Residual connection around the bottleneck before the projection head.
        x = self.head(x + x_pool)
        return nn.functional.normalize(x, p=2, dim=-1)

In [7]:
class SpeechEmbedder(nn.Module):
    """Embed triplets of waveforms with a frozen wav2vec2 model and a trainable head.

    Raw audio is preprocessed by a ``Wav2Vec2Processor``, run through a
    ``Wav2Vec2ForCTC`` model whose hidden states are collected without
    gradients, compressed by a ``CompressionModule`` and finally averaged over
    the time axis to give one vector per utterance.

    Args:
        feature_layers: ``(start, stop)`` slice bounds selecting which hidden
            states of the feature model are stacked.
        K, F_in, F_out, bottleneck_dropout, head_dropout: Forwarded unchanged
            to ``CompressionModule``.
        device: Device recorded on the module; ``to()`` keeps it up to date.
    """

    def __init__(
        self,
        feature_layers=(0, 10),
        K=199,
        F_in=1024,
        F_out=256,
        bottleneck_dropout=0.5,
        head_dropout=0.5,
        device = 'cuda'
    ):
        super(SpeechEmbedder, self).__init__()
        self.processor = Wav2Vec2Processor.from_pretrained(PROC_ID)
        self.feature_model = Wav2Vec2ForCTC.from_pretrained(BASE_MODEL)
        # The feature model is only ever used for inference.
        self.feature_model.eval()
        self.compression = CompressionModule(
            K=K,
            F_in=F_in,
            F_out=F_out,
            bottleneck_dropout=bottleneck_dropout,
            head_dropout=head_dropout,
        )
        self.feature_layers = feature_layers
        self.device = device

    def get_features(self, x):
        """Return the selected hidden states of the feature model, stacked on a new last axis.

        Runs entirely under ``torch.no_grad()``, so no gradient reaches the
        feature model.
        """
        # Follow the weights' actual device so the input always lands next to them.
        model_device = next(self.feature_model.parameters()).device
        with torch.no_grad():
            # NOTE(review): `[0]` presumably strips an extra leading axis the
            # processor adds when handed a batched tensor, and `device` may be
            # ignored by the processor -- confirm against the installed
            # transformers version.
            x = self.processor(x, return_tensors="pt", padding=True, sampling_rate=16_000, device=self.device).input_values[0]
            x = x.to(model_device)
            out = self.feature_model(x, output_hidden_states=True, return_dict=True)
            feat = torch.stack(out.hidden_states[self.feature_layers[0] : self.feature_layers[1]], dim =-1)
        return feat

    def to(self, device):
        """Move all sub-modules to ``device`` and remember it in ``self.device``.

        Returns:
            ``self``, as ``nn.Module.to`` does.
        """
        # nn.Module.to already moves every registered sub-module.
        moved = super().to(device)
        self.device = device
        return moved

    def train(self, mode=True):
        """Set training mode on the trainable parts while keeping the feature model in eval mode.

        Args:
            mode: ``True`` for training mode, ``False`` for evaluation mode.

        Returns:
            ``self``, matching the ``nn.Module.train`` contract.
        """
        super().train(mode)
        # The wav2vec2 feature model must never leave inference mode.
        self.feature_model.eval()
        return self

    def eval(self):
        """Switch to evaluation mode and return ``self``."""
        return self.train(False)

    def _embed(self, waveform):
        """Features -> compression -> mean over the time axis for one batch of waveforms."""
        frames = self.compression(self.get_features(waveform))
        # Average over axis 1 (time) of the (batch, time, feature) tensor; this
        # uses plain torch and needs no einops import.
        return frames.mean(dim=1)

    def forward(self, input):
        """Embed the three members of a triplet batch.

        Args:
            input: Mapping with ``"anchor"``, ``"positive"`` and ``"negative"``
                entries holding the waveforms.

        Returns:
            Dict with the same three keys, each mapped to its embedding tensor.
        """
        return {
            "anchor": self._embed(input["anchor"]),
            "positive": self._embed(input["positive"]),
            "negative": self._embed(input["negative"]),
        }


In [8]:
# Pass the selected device to the constructor as well as to .to(): the class
# defaults to device='cuda' and moves its inputs to that recorded device, so
# relying on the default would break the CPU fallback.
model = SpeechEmbedder(device=device).to(device)

# Smoke test: embed the sample triplet batch fetched earlier.
output = model(input)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Shape of each embedding returned for the triplet, in dict order.
[embedding.shape for embedding in output.values()]

[torch.Size([4, 256]), torch.Size([4, 256]), torch.Size([4, 256])]