In [3]:
# Extract embeddings (feature vectors) from the audio files.

import os
import json
import librosa
import torch
from tqdm import tqdm
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import numpy as np


In [17]:
import math
def getMaximumScore(arr, k):
    """Return the best total obtainable from ``k`` picks.

    Each pick takes the current largest value, adds it to the score and puts
    ``ceil(value / 3)`` back in its place. A value of 1 stays 1 after a pick,
    so it can be picked again; the score therefore always consists of exactly
    ``k`` picks whenever ``arr`` holds at least one positive value.

    Args:
        arr: Iterable of integers. Values that are not positive are ignored.
        k: Number of picks. Zero or negative means no picks.

    Returns:
        The maximum achievable score, or 0 when there is nothing to pick.
    """
    import heapq  # local import keeps this cell runnable on its own

    # heapq is a min-heap, so store negated values to pop the maximum first.
    heap = [-value for value in arr if value > 0]
    if not heap:
        return 0
    heapq.heapify(heap)

    score = 0
    remaining = k
    while remaining > 0:
        largest = -heap[0]
        if largest == 1:
            # The maximum is 1 and stays 1 after every pick, so each of the
            # remaining picks contributes exactly 1. Avoids looping k times.
            score += remaining
            break
        score += largest
        # Integer ceiling division: exact for arbitrarily large ints, unlike
        # math.ceil(largest / 3), which goes through a float.
        reduced = -(-largest // 3)
        heapq.heapreplace(heap, -reduced)
        remaining -= 1
    return score

# Example run: the three largest picks are 18, 6 (= ceil(18 / 3)) and 5, for a total of 29.
arr = [4,4,5,18,1]
k=3
print(getMaximumScore(arr, k))

29


In [None]:
def getCount(binary):
    """Count the distinct non-empty subsequences of ``binary`` without a leading zero.

    The single string ``"0"`` is counted (once) when the input contains a
    ``'0'``; every other counted subsequence starts with a character other
    than ``'0'``. The strings themselves are never built: only the number of
    distinct subsequences ending in each character is tracked, so the running
    time is linear in ``len(binary)`` instead of exponential.

    Args:
        binary: A string (or iterable of single characters), typically made
            of ``'0'`` and ``'1'``.

    Returns:
        The number of distinct qualifying subsequences; 0 for empty input.
    """
    # ends_with[c]: distinct counted strings ending in c, excluding the lone "0".
    ends_with = {}
    # Total of ends_with.values(); every such string may be extended by the
    # next character without creating a leading zero.
    extendable = 0
    saw_zero = False

    for ch in binary:
        if ch == '0':
            # A lone "0" is tracked by the flag; appending '0' to every
            # extendable string yields all strings that end in '0'.
            saw_zero = True
            updated = extendable
        else:
            # Every extendable string plus ch, and ch on its own.
            updated = extendable + 1
        extendable += updated - ends_with.get(ch, 0)
        ends_with[ch] = updated

    return extendable + (1 if saw_zero else 0)

In [None]:
def countDiceSequences(N, rollMax):
    """Count length-``N`` sequences of six-sided die rolls that respect streak limits.

    Face ``f`` (1-based) may appear at most ``rollMax[f - 1]`` times in a row.
    The result is reported modulo 10**9 + 7.

    Args:
        N: Number of rolls; must be at least 1.
        rollMax: Sequence whose first six entries are the maximum allowed
            consecutive occurrences of faces 1..6.

    Returns:
        The number of valid sequences modulo 10**9 + 7.

    Raises:
        ValueError: If ``N`` is smaller than 1.
    """
    MOD = 10**9 + 7
    FACES = 6

    if N < 1:
        raise ValueError("N must be at least 1")

    limits = [rollMax[face] for face in range(FACES)]

    # streaks[f][k]: sequences of the current length that end in exactly k
    # consecutive rolls of face f (index 0 is unused). Only two lengths are
    # ever needed at once, so rows are rolled instead of keeping all N.
    streaks = [[0] * (limit + 1) for limit in limits]
    for face, limit in enumerate(limits):
        if limit >= 1:
            streaks[face][1] = 1

    for _ in range(2, N + 1):
        per_face = [sum(row) % MOD for row in streaks]
        everything = sum(per_face) % MOD
        following = [[0] * (limit + 1) for limit in limits]
        for face, limit in enumerate(limits):
            if limit < 1:
                continue
            # Starting a new streak of `face`: the previous roll may be any
            # OTHER face with any streak length that face itself allows.
            following[face][1] = (everything - per_face[face]) % MOD
            # Extending a streak of `face` from length k - 1 to k.
            for k in range(2, limit + 1):
                following[face][k] = streaks[face][k - 1]
        streaks = following

    return sum(sum(row) for row in streaks) % MOD


In [4]:

# Load the pre-trained Wav2Vec2 processor and model
model_name = "facebook/wav2vec2-large-xlsr-53"
# Use the GPU when one is present; otherwise fall back to the CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)

path_name = 'audio/final'  # Path to the folder containing audio files
files = os.listdir(path_name)

# Hyperparameters for windowing
window_size = 2  # Window size in seconds
stride = 1  # Stride in seconds

# Maps file id -> embedding averaged over all windows of that file
result = {}

for file in tqdm(files):
    audio_file = os.path.join(path_name, file)  # Create the full file path
    file_id = file.split('.')[0]  # Everything before the first dot in the file name

    # Load audio file using librosa, resampled to 16 kHz
    input_audio, sample_rate = librosa.load(audio_file, sr=16000)

    window_length = window_size * sample_rate  # Number of samples in each window
    stride_length = stride * sample_rate  # Number of samples to move between windows

    # Split audio into full-length windows; a trailing partial window is dropped
    windows = [
        input_audio[start:start + window_length]
        for start in range(0, len(input_audio) - window_length + 1, stride_length)
    ]

    if not windows:
        if len(input_audio) == 0:
            # Nothing to embed; averaging an empty list would store NaN,
            # which is not valid JSON.
            tqdm.write(f"Skipping {file}: no audio samples")
            continue
        # Clip is shorter than one window: zero-pad it to a single full window
        # so the file still receives an embedding.
        windows = [np.pad(input_audio, (0, window_length - len(input_audio)))]

    embeddings = []

    # Process each window
    for window in windows:
        # Normalize the raw samples and wrap them as PyTorch tensors on the model's device
        inputs = processor(window, return_tensors="pt", sampling_rate=sample_rate).to(device)

        # Forward pass without tracking gradients
        with torch.no_grad():
            outputs = model(inputs.input_values)

        # Mean over dim 1 of the last hidden state gives one vector per window
        window_embedding = torch.mean(outputs.last_hidden_state, dim=1).squeeze().to('cpu').tolist()
        embeddings.append(window_embedding)

    # Mean pool across all window embeddings for a single vector per file
    result[file_id] = np.mean(embeddings, axis=0).tolist()

# Save the extracted features as a JSON file
with open('audio_features.json', 'w') as f:
    json.dump(result, f, indent=4)

print("Audio features successfully extracted and saved!")


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from transformers import AutoProcessor, Wav2Vec2BertModel
from torch import nn

class RegressionHead(nn.Module):
    r"""Two-layer projection head applied to pooled features.

    Sizes and the dropout probability are read from ``config``
    (``hidden_size``, ``final_dropout``, ``num_labels``).
    """

    def __init__(self, config):
        super().__init__()

        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        r"""Apply dropout -> dense -> tanh -> dropout -> output projection.

        Extra keyword arguments are accepted and ignored.
        """
        activated = torch.tanh(self.dense(self.dropout(features)))
        return self.out_proj(self.dropout(activated))
# Folder that holds the audio files to embed, and the names of its entries
path_name = 'audio/final'
files = os.listdir(path_name)
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

def process_func(
    x: np.ndarray,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Run a raw audio signal through the module-level ``processor`` and ``model``.

    Args:
        x: Raw audio samples.
        sampling_rate: Sampling rate of ``x``, forwarded to the processor.
        embeddings: When True, return element 0 of the model output;
            otherwise return element 1.

    Returns:
        The selected model output as a NumPy array on the CPU.
    """

    # Place the input on whatever device the model's weights live on, so the
    # function works for CPU-only and GPU setups alike.
    device = next(model.parameters()).device

    # run through processor to normalize signal
    # always returns a batch, so we just get the first entry
    # then we put it on the device
    y = processor(x, sampling_rate=sampling_rate)
    y = y['input_values'][0]
    y = y.reshape(1, -1)  # single-item batch: one row holding all samples
    y = torch.from_numpy(y).to(device)

    # run through model
    with torch.no_grad():
        y = model(y)[0 if embeddings else 1]

    # convert to numpy
    return y.detach().cpu().numpy()

class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Wav2Vec2 encoder followed by a ``RegressionHead`` on mean-pooled states."""

    def __init__(self, config):
        super().__init__(config)

        self.config = config
        # Attribute names are kept as-is; renaming them would change the
        # module's parameter names.
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(self, input_values):
        r"""Return ``(pooled_hidden_states, head_output)`` for ``input_values``."""
        encoder_outputs = self.wav2vec2(input_values)
        # First encoder output, averaged over dim 1
        pooled = encoder_outputs[0].mean(dim=1)
        return pooled, self.classifier(pooled)
# Load the fine-tuned emotion checkpoint and its processor
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
# Use the GPU when one is present; otherwise fall back to the CPU instead of crashing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name).to(device)

# Maps file id -> pooled hidden-state embedding of the whole file
result = {}
for file in tqdm(files):
    # Load the waveform with librosa (resampled to 16 kHz), normalize it and extract the feature
    audio_file = os.path.join(path_name, file)
    file_id = file.split('.')[0]  # Everything before the first dot in the file name
    input_audio, sample_rate = librosa.load(audio_file, sr=16000)

    # The processor returns a batch; take its only entry and reshape it to one row
    y = processor(input_audio, sampling_rate=sample_rate)
    y = y['input_values'][0]
    y = y.reshape(1, -1)
    y = torch.from_numpy(y).to(device)

    # run through model; element 0 of EmotionModel's output is the pooled hidden states
    with torch.no_grad():
        y = model(y)[0]

    # convert to numpy
    y = y.detach().cpu().numpy()
    result[file_id] = y[0].tolist()

with open('audio_features_wav2vec2_bert.json', 'w') as f:
    json.dump(result, f, indent=4)

Some weights of EmotionModel were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 690/690 [05:46<00:00,  1.99it/s]


In [16]:
# Length of the first row of `y`, which must already exist in the kernel
# (1024 in the recorded output). NOTE(review): relies on leftover state from an earlier cell.
len(y[0])

1024