Convert audio files to embeddings with the VGGish network (a CNN)

In [None]:
!git clone https://github.com/harritaylor/torchvggish.git
%cd torchvggish
!pip install librosa
!pip install soundfile
!pip install resampy

In [None]:
# Load the Colab Drive helper and mount Google Drive at /content/drive,
# the root of the dataset paths configured for this notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Pickle file with the VGGish embeddings (read back with pickle.load)
embeddings_src = '/content/drive/MyDrive/mostafavi/embeddings.pkl'
# Folder whose audio files are converted to embeddings
audio_folder_path = '/content/drive/MyDrive/mostafavi/record'
# CSV file with the target scores (its 'ExaminationScore' column is used)
scores_filepath = '/content/drive/MyDrive/mostafavi/dataset.csv'

We start by extracting embeddings with VGGish and saving them to a .pkl file.

In [None]:
import librosa
import numpy as np
import soundfile as sf
import tempfile
import os
import pickle

def preprocess_audio(audio_path):
    """Re-encode an audio file as a temporary WAV file and return its path.

    The audio is decoded with ``soundfile`` and written back with the same
    samples and sample rate, in WAV format.

    Args:
        audio_path: Path of the audio file to convert.

    Returns:
        Path of the newly created temporary ``.wav`` file. The file is NOT
        removed automatically; the caller should delete it once it is no
        longer needed.

    Raises:
        Any exception raised by ``soundfile.read`` / ``soundfile.write``.
        The temporary file is removed before the exception propagates.
    """
    # mkstemp returns an open descriptor; close it immediately so the file is
    # not held open while soundfile writes to the same path.
    fd, wav_path = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    try:
        y, sr = sf.read(audio_path)
        sf.write(wav_path, y, sr, format='wav')
    except Exception:
        # Do not leave a stray temp file behind when decoding/encoding fails.
        if os.path.exists(wav_path):
            os.remove(wav_path)
        raise
    return wav_path

import torch


# Fetch the VGGish model through torch.hub and put it in evaluation mode.
# NOTE(review): the name `model` is rebound to the LSTM regressor in the
# training cell, so extract_embeddings() should not be called after that
# cell has run without reloading VGGish first.
model = torch.hub.load('harritaylor/torchvggish', 'vggish')
model.eval()
def extract_embeddings(audio_path):
    """Compute VGGish embeddings for one audio file.

    The file is converted to a temporary WAV with ``preprocess_audio``, fed
    to the global VGGish ``model``, and the temporary file is deleted
    afterwards.

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        The embeddings as a NumPy array. The temporary path and the
        embedding shape are printed for inspection.
    """
    wav_path = preprocess_audio(audio_path)
    try:
        print("#", wav_path)
        # Inference only: no need to record the autograd graph.
        with torch.no_grad():
            embeddings = model.forward(wav_path)
        print(embeddings.shape)
        # .cpu() is needed before .numpy() if the model produced a CUDA
        # tensor; it is a no-op for CPU tensors.
        return embeddings.detach().cpu().numpy()
    finally:
        # preprocess_audio leaves the temporary WAV on disk; remove it here so
        # a long run over many files does not fill the temp directory.
        if os.path.exists(wav_path):
            os.remove(wav_path)

import os,re

# Embed every audio file of the folder, ordered by the first number that
# appears in its file name (numeric order, so "10" comes after "2").
embeddings = []
audio_filenames = os.listdir(audio_folder_path)

# Entries without any digit (hidden files, notes, ...) or that are not regular
# files cannot be ordered/embedded; skip them instead of failing on [0].
number_pattern = re.compile(r"\d+")
numbered_filenames = []
for filename in audio_filenames:
    is_file = os.path.isfile(os.path.join(audio_folder_path, filename))
    if is_file and number_pattern.search(filename):
        numbered_filenames.append(filename)
    else:
        print("Skipping entry without a number or not a file:", filename)

for audio_path in sorted(numbered_filenames, key=lambda filename: int(number_pattern.search(filename).group())):
    filepath = os.path.join(audio_folder_path, audio_path)
    embedding = extract_embeddings(filepath)
    embeddings.append(embedding)



# Persist the list of per-file embeddings with pickle.
# Replace 'embeddings.pkl' with the name that you want.
# NOTE(review): this relative path is written to the current working
# directory, whereas the loading cell reads `embeddings_src`; presumably the
# file must be copied there (or this path changed) before loading - confirm.
with open('embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

Now load the saved embeddings and extract MFCCs from them with librosa.

In [None]:
import pickle
# Read the pickled embeddings back from `embeddings_src`.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickle files that you created yourself or otherwise trust.
with open(embeddings_src, 'rb') as f:
    loaded_embeddings = pickle.load(f)

# Number of loaded embeddings (shown as the cell output)
len(loaded_embeddings)

In [None]:
import numpy as np
import librosa
from sklearn.preprocessing import StandardScaler

# Define a function to convert VGGish embeddings to MFCCs
def vggish_to_mfccs(vggish_embeddings, n_mfcc=26, sr=44100):
    """Derive MFCC-style features from a collection of VGGish embeddings.

    Each embedding is standardised and handed to ``librosa.feature.mfcc``
    through its ``S`` argument, i.e. it is treated as if it were a
    spectrogram with one column per frame.

    Args:
        vggish_embeddings: Iterable of NumPy arrays, one per audio file;
            presumably shaped (frames, embedding_dim) - TODO confirm. A 1-D
            array is treated as a single frame.
        n_mfcc: Number of coefficients requested from librosa.
        sr: Sample rate forwarded to librosa.

    Returns:
        A list with one librosa MFCC array per input embedding
        (coefficients on the first axis, frames on the second).
    """
    mfccs = []
    for embedding in vggish_embeddings:
        # A single-frame embedding may arrive 1-D; promote it to (1, dim) so
        # the scaler below receives a 2-D array.
        embedding = np.atleast_2d(np.asarray(embedding))

        # librosa expects (features, frames): move the frames to the last axis.
        embedding = embedding.T

        # StandardScaler normalises column-wise, so here every frame gets
        # zero mean and unit variance across its features.
        embedding_scaled = StandardScaler().fit_transform(embedding)

        # Compute the coefficients from the scaled matrix.
        mfcc = librosa.feature.mfcc(S=embedding_scaled, sr=sr, n_mfcc=n_mfcc)
        mfccs.append(mfcc)

    return mfccs

# Convert the VGGish embeddings loaded from disk into MFCC feature matrices
mfccs = vggish_to_mfccs(loaded_embeddings)

Because there are 580 audio tracks but only 299 rows in the dataset, we truncate the features so that both have the same size.

In [None]:
import torch
import pandas as pd
# Read the target scores from the CSV file.
scores_df = pd.read_csv(scores_filepath)
scores = scores_df['ExaminationScore'].tolist()
print(len(scores))
# Keep as many feature matrices as there are scores. Using len(scores)
# instead of a hard-coded 299 keeps features and targets aligned when the
# CSV changes. Features and scores are paired by position.
mfccs = mfccs[:len(scores)]
print(len(mfccs))
# One tensor per feature matrix
embeddings_tensors = [torch.tensor(embedding) for embedding in mfccs]

Now we need to normalize the embedding tensors and resize them so that they all have the same size.

In [None]:
import torch

def min_max_normalization(embeddings):
    """Resize 2-D tensors to a common shape and min-max scale them jointly.

    Every tensor is resized with nearest-neighbour interpolation to the
    largest first and second dimension found in the input. The values are
    then mapped to [0, 1] using ONE global minimum and maximum taken over all
    resized tensors (not a per-tensor range).

    Args:
        embeddings: Iterable of 2-D floating-point tensors.

    Returns:
        List of tensors, all of the same shape. An empty input yields an
        empty list. If every value is identical (zero range) the result is
        all zeros instead of NaN.
    """
    embeddings = list(embeddings)
    if not embeddings:
        return []

    # Target shape: the largest extent along each of the two axes.
    target_rows = max(embedding.size(0) for embedding in embeddings)
    target_cols = max(embedding.size(1) for embedding in embeddings)

    # interpolate() works on (batch, channel, H, W), hence the two temporary
    # leading dimensions that are removed again right after resizing.
    resized_embeddings = [
        torch.nn.functional.interpolate(
            embedding.unsqueeze(0).unsqueeze(0),
            size=(target_rows, target_cols),
            mode='nearest',
        ).squeeze(0).squeeze(0)
        for embedding in embeddings
    ]

    # Global extrema, computed without concatenating all tensors into one copy.
    min_value = torch.stack([embedding.min() for embedding in resized_embeddings]).min()
    max_value = torch.stack([embedding.max() for embedding in resized_embeddings]).max()
    value_range = max_value - min_value

    # Constant input: avoid dividing by zero.
    if value_range == 0:
        return [torch.zeros_like(embedding) for embedding in resized_embeddings]

    return [(embedding - min_value) / value_range for embedding in resized_embeddings]


# Resize all feature tensors to a common shape and min-max scale them
normalized_embeddings = min_max_normalization(embeddings_tensors)
# Show the result as the cell output
normalized_embeddings

Now we loop over the first few embeddings to check that they all have the same size after normalization.

In [None]:
# Print the shape of the first ten normalized tensors, numbered from 1
for position, tensor in enumerate(normalized_embeddings[:10], start=1):
    print(f"Tensor {position} shape: {tensor.shape}")

Now we should normalize the scores too.

In [None]:
def min_max_normalization_scores(scores):
    """Rescale scores linearly to the range [0, 1].

    Args:
        scores: Sequence of numeric scores.

    Returns:
        List of floats in which the smallest score maps to 0.0 and the
        largest to 1.0. An empty input yields an empty list; if all scores
        are equal, every entry is 0.0 (instead of NaN from a 0/0 division).
    """
    scores_tensor = torch.tensor(scores, dtype=torch.float32)

    # torch.min / torch.max raise on an empty tensor.
    if scores_tensor.numel() == 0:
        return []

    min_score = torch.min(scores_tensor)
    score_range = torch.max(scores_tensor) - min_score

    # All scores identical: there is no range to scale by.
    if score_range == 0:
        return torch.zeros_like(scores_tensor).tolist()

    normalized_scores = (scores_tensor - min_score) / score_range
    return normalized_scores.tolist()  # Convert tensor back to list


# Assuming scores is a list of scores between 0 and 100
normalized_scores = min_max_normalization_scores(scores)
# Keep a reference to the raw (un-normalized) scores
scores_before_romalization = scores
# Show the normalized scores as the cell output
normalized_scores

Here we create a dataset from the scores and the embeddings.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader


# Targets as a float32 tensor
normalized_scores_tensor = torch.tensor(normalized_scores, dtype=torch.float32)

# Independent copies of the normalized embeddings. torch.tensor() on an
# existing tensor emits a copy-construct warning; detach().clone() is the
# documented way to copy and also works for array input via as_tensor.
embeddings_tensors = [torch.as_tensor(embedding).detach().clone() for embedding in normalized_embeddings]

# Longest first dimension among the embeddings
max_length = max(embedding.shape[0] for embedding in embeddings_tensors)

# Zero-pad at the end of the first dimension up to max_length.
# The pad tuple is (last_dim_left, last_dim_right, first_dim_top, first_dim_bottom).
padded_embeddings = [
    torch.nn.functional.pad(embedding, (0, 0, 0, max_length - embedding.shape[0]))
    for embedding in embeddings_tensors
]

# Single tensor holding all padded embeddings
padded_embeddings_tensor = torch.stack(padded_embeddings)

# Define a custom dataset class
class AudioDataset(Dataset):
    """Map-style dataset pairing each embedding with its score by position."""

    def __init__(self, embeddings, scores):
        # Both containers are kept as given; nothing is copied or validated.
        self.scores = scores
        self.embeddings = embeddings

    def __len__(self):
        # The dataset size follows the embeddings container.
        sample_count = len(self.embeddings)
        return sample_count

    def __getitem__(self, idx):
        # Return the (embedding, score) pair stored at position idx.
        sample = self.embeddings[idx]
        target = self.scores[idx]
        return sample, target

# Wrap the embedding tensors and the normalized targets in a dataset
dataset = AudioDataset(embeddings_tensors, normalized_scores_tensor)

# One sample per batch, reshuffled on every pass over the data
dataloader = DataLoader(dataset, shuffle=True, batch_size=1)

# Walk through the loader once and print what every batch looks like
for embeddings_batch, scores_batch in dataloader:
    print("Embeddings Batch Shape:", embeddings_batch.shape)
    print("Scores Batch:", scores_batch)

Build the LSTM model

In [None]:
import torch
import torch.nn as nn

class AudioModel(nn.Module):
    """LSTM regressor mapping a sequence of feature vectors to one score.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state (per direction).
        num_layers: Number of stacked LSTM layers.
        dropout_prob: Dropout probability applied to the LSTM output.
        bidirectional: Whether the LSTM processes the sequence in both
            directions.
        weight_decay: Coefficient used by ``l2_regularization_loss``.
        output_scale: Factor applied to the sigmoid output, so predictions
            lie in (0, output_scale). The default of 100.0 reproduces the
            previously hard-coded scaling.
            NOTE(review): the training targets must be on the same scale;
            use output_scale=1.0 for targets normalized to [0, 1].
    """

    def __init__(self, input_size=37, hidden_size=128, num_layers=1, dropout_prob=0.0, bidirectional=True, weight_decay=0.0, output_scale=100.0):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout_prob)  # Dropout on the LSTM output
        # A bidirectional LSTM concatenates both directions, doubling the width.
        self.fc = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, 1)  # Output score
        self.weight_decay = weight_decay  # L2 regularization strength
        self.output_scale = output_scale  # Upper bound of the predicted score

    def forward(self, x):
        """Predict one score per sequence.

        Args:
            x: Tensor of shape (batch_size, seq_length, input_size).

        Returns:
            Tensor of shape (batch_size, 1) with values in (0, output_scale).
        """
        # lstm_out shape: (batch_size, seq_length, hidden_size * num_directions)
        lstm_out, _ = self.lstm(x)
        lstm_out = self.dropout(lstm_out)
        # Only the output at the last time step feeds the linear head.
        # NOTE(review): with bidirectional=True the reverse direction has
        # processed just one step at this position - confirm this is intended.
        output = self.fc(lstm_out[:, -1, :])
        output = torch.sigmoid(output) * self.output_scale
        return output

    def l2_regularization_loss(self):
        """Return weight_decay times the sum of squared parameter values.

        Returns:
            A 0-dim tensor on the same device as the model parameters.
        """
        if self.weight_decay == 0:
            # Nothing to add: skip the pass over all parameters.
            return next(self.parameters()).new_zeros(())
        squared_sum = sum(param.pow(2).sum() for param in self.parameters())
        return self.weight_decay * squared_sum

Now we train the model.
For GPU training, you just need to run the cell below.

In [None]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Show the selected device as the cell output
device

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from sklearn.model_selection import GridSearchCV


# Hold out a fifth of the samples for testing; the rest is used for training
dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
test_size = dataset_size - train_size
train_dataset, test_dataset = random_split(dataset, lengths=[train_size, test_size])

# Loaders yield one sample per batch; only the training data is shuffled
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=1)

# Define hyperparameters
# The feature dimension is read from the first sample instead of being
# hard-coded (it used to be 37), so the LSTM always matches the tensors
# produced by the preprocessing cells.
input_size = dataset[0][0].shape[-1]  # Dimensionality of input embeddings
hidden_size = 256  # Size of LSTM hidden states 32-512
num_layers = 10   # Number of LSTM layers
learning_rate = 0.001
num_epochs = 70
dropout_prob = 0.0
weight_decay = 0.0  # L2 regularization strength, applied through the optimizer


# Initialize the model
# NOTE(review): weight_decay is not passed to AudioModel, so
# model.l2_regularization_loss() uses the model's own default coefficient.
model = AudioModel(input_size, hidden_size, num_layers, dropout_prob=dropout_prob)
if torch.cuda.is_available():
    model.to(device)
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)  # Add L2 regularization

# Training loop
# NOTE(review): AudioModel.forward scales its sigmoid output (by 100 in the
# model definition) while the dataset targets are min-max normalized; confirm
# that predictions and targets are meant to be on the same scale.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    # The loop variables are deliberately not named `embeddings`/`scores`, so
    # the notebook-level lists with those names are not overwritten.
    for batch_embeddings, batch_scores in train_dataloader:
        optimizer.zero_grad()
        if torch.cuda.is_available():
            batch_embeddings = batch_embeddings.to(device)
            batch_scores = batch_scores.to(device)

        # Forward pass
        outputs = model(batch_embeddings)
        # Targets arrive as (batch,) while the model returns (batch, 1);
        # reshape so the loss is element-wise for any batch size
        # (expand_as only worked for a batch size of 1).
        batch_targets = batch_scores.reshape(outputs.shape)

        # Calculate loss
        loss = criterion(outputs, batch_targets)

        # Add L2 regularization loss
        loss += model.l2_regularization_loss()

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        total_loss += loss.item()

    # Calculate average loss for the epoch
    average_loss = total_loss / len(train_dataloader)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {average_loss:.4f}")

# Evaluate the model on the test set
model.eval()
test_loss = 0.0
with torch.no_grad():
    # Loop variables are named so that the notebook-level `embeddings` and
    # `scores` lists are not overwritten.
    for batch_embeddings, batch_scores in test_dataloader:
        if torch.cuda.is_available():
            batch_embeddings = batch_embeddings.to(device)
            batch_scores = batch_scores.to(device)
        outputs = model(batch_embeddings)
        # Give the targets the exact shape of the outputs. squeeze() on a
        # (1, 1) output yields a 0-dim tensor that no longer matches the (1,)
        # target and makes MSELoss warn about broadcasting.
        batch_targets = batch_scores.float().reshape(outputs.shape)
        test_loss += criterion(outputs, batch_targets).item()

test_loss /= len(test_dataloader)
print(f"Test Loss: {test_loss:.4f}")

# After training, you can save the model if needed
#torch.save(model.state_dict(), 'audio_model.pth')

Calculate the accuracy

In [None]:
predictions = []
targets = []

# Make sure dropout is disabled even when this cell is run on its own.
model.eval()
with torch.no_grad():
    # Loop variables are named so that the notebook-level `embeddings` and
    # `scores` lists are not overwritten.
    for batch_embeddings, batch_scores in test_dataloader:
        if torch.cuda.is_available():
            batch_embeddings = batch_embeddings.to(device)
            batch_scores = batch_scores.to(device)
        outputs = model(batch_embeddings)
        print(outputs)
        # Flatten to plain Python floats; unlike .item() this also works
        # when a batch holds more than one sample.
        predictions.extend(outputs.reshape(-1).tolist())

        print(batch_scores)
        targets.extend(batch_scores.reshape(-1).tolist())


# Calculate accuracy
def calculate_accuracy(predictions, targets, tolerance=0.09):
    """Return the fraction of predictions within *tolerance* of their target.

    Predictions and targets are paired by position; a prediction counts as
    correct when its absolute difference to the target is at most
    ``tolerance``.

    Args:
        predictions: Sequence of predicted values.
        targets: Sequence of reference values.
        tolerance: Largest absolute error still counted as correct.

    Returns:
        Share of correct predictions as a float in [0, 1], relative to
        ``len(predictions)``. Returns 0.0 when there are no predictions
        (instead of raising ZeroDivisionError).
    """
    total_predictions = len(predictions)
    if total_predictions == 0:
        return 0.0

    correct_predictions = sum(
        1
        for pred, target in zip(predictions, targets)
        if abs(pred - target) <= tolerance
    )
    return correct_predictions / total_predictions

# Share of test predictions within the default tolerance, printed as a percentage
accuracy = calculate_accuracy(predictions, targets)
print("Accuracy:", accuracy * 100)