In [1]:
import librosa

import os
import torch
import numpy as np

from torchvision import transforms, datasets
from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import librosa.display
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

import random

from google.colab import drive

In [2]:
# Initialise CUDA eagerly through the public API instead of the private
# torch._C hook, and skip it on CPU-only runtimes where it would raise.
if torch.cuda.is_available():
    torch.cuda.init()

In [3]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Mount Google Drive under /content/drive. The dataset and checkpoint paths
# used further down ('drive/MyDrive/...') are relative, so they only resolve
# when the working directory is /content.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
def feature_extract(file):
    """Load an audio file and return its mel-scaled spectrogram.

    Args:
        file: Path of the audio file; passed straight to ``librosa.load``.

    Returns:
        The array produced by ``librosa.feature.melspectrogram`` using
        128 mel bands and an upper frequency bound of 11025 Hz.
    """
    waveform, sample_rate = librosa.load(file)
    return librosa.feature.melspectrogram(
        y=waveform, sr=sample_rate, n_mels=128, fmax=11025
    )

In [6]:
def plot_spectogram(sp, sr, label):
    """Plot a mel spectrogram on a decibel scale.

    Args:
        sp: Mel *power* spectrogram, as returned by
            ``librosa.feature.melspectrogram`` with its default ``power=2.0``
            (which is how this notebook computes its features).
        sr: Sampling rate used to label the time and frequency axes.
        label: Text inserted into the figure title.
    """
    # The input is a power spectrogram, so power_to_db is the matching
    # conversion; amplitude_to_db would treat the values as magnitudes and
    # report twice the true dB range.
    spectrogram_db = librosa.power_to_db(sp, ref=np.max)

    # Plot the Mel-spectrogram
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spectrogram_db, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title(f'Mel-Spectrogram of {label}')
    plt.tight_layout()
    plt.show()


In [6]:
class InstrumentDataset(Dataset):
    """Dataset of ``.wav`` files laid out as ``audio_dir/<label>/<file>.wav``.

    Each item is ``(transformation(path), class_index)``, where the class
    index comes from the fixed ``classes`` mapping below.

    Args:
        audio_dir: Root directory holding one sub-directory per label.
        transformation: Callable applied to a file path in ``__getitem__``
            to produce the sample's features.
    """

    def __init__(self, audio_dir, transformation):
        self.audio_labels = []
        self.audio_paths = []
        self.audio_dir = audio_dir
        self.classes = {'cel': 0, 'cla': 1, 'flu': 2, 'gac': 3, 'gel': 4, 'org': 5, 'pia': 6, 'sax': 7, 'tru': 8, 'vio': 9, 'voi': 10}
        self.transformation = transformation

        # os.listdir() returns entries in arbitrary order. Sort both levels so
        # that sample indices (and therefore seeded shuffles and the order of
        # unshuffled evaluation) are identical from run to run.
        for label in sorted(os.listdir(audio_dir)):
            label_dir = os.path.join(audio_dir, label)
            if not os.path.isdir(label_dir):
                continue
            for file in sorted(os.listdir(label_dir)):
                if file.endswith('.wav'):
                    self.audio_paths.append(os.path.join(label_dir, file))
                    self.audio_labels.append(label)

    def __len__(self):
        """Return the number of ``.wav`` files found under ``audio_dir``."""
        return len(self.audio_paths)

    def __getitem__(self, index):
        """Return ``(features, class_index)`` for the sample at ``index``.

        Raises:
            KeyError: If the sample's directory name is not a key of
                ``self.classes``.
        """
        audio_path = self.audio_paths[index]
        label = self.audio_labels[index]
        return self.transformation(audio_path), self.classes[label]

In [7]:
# Mini-batch size shared by both loaders.
batch_size = 32

# Training data: reshuffled on every pass.
dataset = InstrumentDataset('drive/MyDrive/audio_data', feature_extract)
dataloader = DataLoader(dataset, batch_size, shuffle=True)

# Test data: iterated in a fixed order.
test_dataset = InstrumentDataset('drive/MyDrive/audio_test_data', feature_extract)
test_dataloader = DataLoader(test_dataset, batch_size, shuffle=False)

In [48]:
class InstrumentCNN(nn.Module):
    """Three-stage 2-D CNN classifier over single-channel spectrograms.

    ``block_1`` stacks three Conv(3x3) -> BatchNorm -> ReLU -> MaxPool(2)
    stages with 16, 32 and 64 channels. ``classifier`` flattens the result
    and maps it through a 512-unit hidden layer to ``output_shape`` logits.

    Args:
        output_shape: Number of output classes (logits).
    """

    def __init__(self, output_shape: int):
        super().__init__()
        layers = []
        in_channels = 1
        for out_channels in (16, 32, 64):
            layers += [
                nn.Conv2d(in_channels=in_channels,
                          out_channels=out_channels,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                nn.BatchNorm2d(num_features=out_channels),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(kernel_size=2, stride=2),
            ]
            in_channels = out_channels
        # The first Linear layer is fixed at 64 * 16 * 16 = 16384 inputs.
        # Pooling to 16x16 is an exact identity when the feature map is
        # already 16x16 (e.g. a 128x130 input), and lets other spectrogram
        # widths pass through instead of raising a shape error. The layer is
        # parameter-free and appended last, so existing state_dict keys
        # (block_1.0 ... block_1.9) are unchanged and old checkpoints load.
        layers.append(nn.AdaptiveAvgPool2d((16, 16)))
        self.block_1 = nn.Sequential(*layers)

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=16384, out_features=512),
            nn.BatchNorm1d(num_features=512),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=512, out_features=output_shape)
        )

    def forward(self, x):
        """Return class logits for a batch shaped ``[batch, 1, height, width]``."""
        x = self.block_1(x)
        x = self.classifier(x)
        return x

In [49]:
class InstrumentLSTM(nn.Module):
    """Two-layer LSTM followed by an MLP head that emits class logits.

    Args:
        output_shape: Number of output classes (logits).
    """

    def __init__(self, output_shape: int):
        super().__init__()
        hidden_units = 256
        self.lstm = nn.LSTM(
            input_size=128,
            hidden_size=hidden_units,
            num_layers=2,
            batch_first=True,
        )
        head_layers = [
            nn.Linear(hidden_units, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, output_shape),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Classify a batch shaped ``[batch, seq_len, 128]``.

        Only the final hidden state of the top LSTM layer is passed to the
        classifier; the per-step outputs and the cell state are discarded.
        """
        _, (final_hidden, _) = self.lstm(x)
        top_layer_state = final_hidden[-1]  # [batch, hidden_size]
        return self.classifier(top_layer_state)


In [50]:
# Hyper-parameters shared by the CNN and LSTM experiments.
num_classes = 11  # one logit per instrument label
epochs = 30       # full passes over the training loader
lr = 1e-3         # Adam learning rate

In [None]:
# Train the CNN and write one checkpoint per epoch.

# Seed before the model is built so that weight initialisation, and not only
# the batch shuffling, is reproducible.
torch.manual_seed(42)

model = InstrumentCNN(num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# The CNN evaluation cell loads 'drive/MyDrive/checkpoints/cnn/epoch_29.pth',
# so checkpoints must be written to that directory (the previous
# 'checkpoints_lstm/' target was a copy-paste leftover).
cnn_checkpoint_dir = 'drive/MyDrive/checkpoints/cnn'
os.makedirs(cnn_checkpoint_dir, exist_ok=True)

for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n-------")
    ### Training
    model.train()
    train_loss = 0.0
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        X = X.unsqueeze(1)  # insert the single channel axis expected by Conv2d
        y_pred = model(X)
        loss = criterion(y_pred, y)
        # .item() yields a detached Python float; summing the loss tensor
        # itself would keep every batch's autograd graph alive for the epoch.
        train_loss += loss.item()

        optimizer.zero_grad()

        loss.backward()
        optimizer.step()
        if batch % 10 == 0:
            print(f"Looked at {batch * len(X)}/{len(dataloader.dataset)} samples")

    train_loss /= len(dataloader)
    model.eval()
    torch.save(model.state_dict(), os.path.join(cnn_checkpoint_dir, 'epoch_{}.pth'.format(epoch)))
    print(f"\nTrain loss: {train_loss:.5f}%\n")

In [36]:
# Train the LSTM and write one checkpoint per epoch.

# Seed before the model is built so that weight initialisation, and not only
# the batch shuffling, is reproducible.
torch.manual_seed(42)

model_lstm = InstrumentLSTM(num_classes).to(device)
criterion_lstm = nn.CrossEntropyLoss()
optimizer_lstm = optim.Adam(model_lstm.parameters(), lr=lr)

# torch.save does not create missing directories.
lstm_checkpoint_dir = 'drive/MyDrive/checkpoints/lstm'
os.makedirs(lstm_checkpoint_dir, exist_ok=True)

for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n-------")
    ### Training
    model_lstm.train()
    train_loss = 0.0
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        # Swap the last two axes so the 128 mel bands become the per-step
        # feature vector (the LSTM is batch_first with input_size=128).
        X = X.transpose(1, 2)
        y_pred = model_lstm(X)
        loss = criterion_lstm(y_pred, y)
        # .item() yields a detached Python float; summing the loss tensor
        # itself would keep every batch's autograd graph alive for the epoch.
        train_loss += loss.item()

        optimizer_lstm.zero_grad()

        loss.backward()
        optimizer_lstm.step()
        if batch % 10 == 0:
            print(f"Looked at {batch * len(X)}/{len(dataloader.dataset)} samples")

    train_loss /= len(dataloader)
    model_lstm.eval()
    torch.save(model_lstm.state_dict(), os.path.join(lstm_checkpoint_dir, 'epoch_{}.pth'.format(epoch)))
    print(f"\nTrain loss: {train_loss:.5f}%\n")

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 0
-------
Looked at 0/6705 samples
Looked at 320/6705 samples
Looked at 640/6705 samples
Looked at 960/6705 samples
Looked at 1280/6705 samples
Looked at 1600/6705 samples
Looked at 1920/6705 samples
Looked at 2240/6705 samples
Looked at 2560/6705 samples
Looked at 2880/6705 samples
Looked at 3200/6705 samples
Looked at 3520/6705 samples
Looked at 3840/6705 samples
Looked at 4160/6705 samples
Looked at 4480/6705 samples
Looked at 4800/6705 samples
Looked at 5120/6705 samples
Looked at 5440/6705 samples
Looked at 5760/6705 samples
Looked at 6080/6705 samples
Looked at 6400/6705 samples

Train loss: 2.20651%

Epoch: 1
-------
Looked at 0/6705 samples
Looked at 320/6705 samples
Looked at 640/6705 samples
Looked at 960/6705 samples
Looked at 1280/6705 samples
Looked at 1600/6705 samples
Looked at 1920/6705 samples
Looked at 2240/6705 samples
Looked at 2560/6705 samples
Looked at 2880/6705 samples
Looked at 3200/6705 samples
Looked at 3520/6705 samples
Looked at 3840/6705 samples
Loo

In [35]:
# NOTE(review): one-off manual save. It overwrites the LSTM epoch-0 checkpoint
# file with whatever weights model_lstm holds when this cell is run —
# presumably a leftover write test for the Drive folder; confirm and remove.
torch.save(model_lstm.state_dict(), 'drive/MyDrive/checkpoints/lstm/epoch_{}.pth'.format(0))

In [70]:
# Evaluate CNN

# map_location lets a checkpoint saved from a GPU run load on whatever
# `device` is in use now.
checkpoint = torch.load('drive/MyDrive/checkpoints/cnn/epoch_29.pth', map_location=device)
loaded_model = InstrumentCNN(num_classes)
loaded_model.load_state_dict(checkpoint)
loaded_model = loaded_model.to(device)

# Evaluate the model on the test set
loaded_model.eval()
y_true_cnn, y_pred_cnn = [], []
with torch.no_grad():
    for batch, (X, y) in enumerate(test_dataloader):
        X = X.to(device).unsqueeze(1)  # insert the single channel axis expected by Conv2d
        output = loaded_model(X)
        _, predicted = torch.max(output, 1)
        # Labels are only compared on the host, so they never need to visit
        # the GPU. Both lists hold 0-d CPU tensors, one per sample.
        y_true_cnn.extend(y.cpu())
        y_pred_cnn.extend(predicted.cpu())
        print(f"Batch: {batch}/{len(test_dataloader)}")


Batch: 0/42
Batch: 1/42
Batch: 2/42
Batch: 3/42
Batch: 4/42
Batch: 5/42
Batch: 6/42
Batch: 7/42
Batch: 8/42
Batch: 9/42
Batch: 10/42
Batch: 11/42
Batch: 12/42
Batch: 13/42
Batch: 14/42
Batch: 15/42
Batch: 16/42
Batch: 17/42
Batch: 18/42
Batch: 19/42
Batch: 20/42
Batch: 21/42
Batch: 22/42
Batch: 23/42
Batch: 24/42
Batch: 25/42
Batch: 26/42
Batch: 27/42
Batch: 28/42
Batch: 29/42
Batch: 30/42
Batch: 31/42
Batch: 32/42
Batch: 33/42
Batch: 34/42
Batch: 35/42
Batch: 36/42
Batch: 37/42
Batch: 38/42
Batch: 39/42
Batch: 40/42
Batch: 41/42


In [68]:
# Evaluate LSTM

# map_location lets a checkpoint saved from a GPU run load on whatever
# `device` is in use now.
checkpoint = torch.load('drive/MyDrive/checkpoints/lstm/epoch_29.pth', map_location=device)
loaded_model = InstrumentLSTM(num_classes)
loaded_model.load_state_dict(checkpoint)
loaded_model = loaded_model.to(device)

# Evaluate the model on the test set
loaded_model.eval()
y_true_lstm, y_pred_lstm = [], []
with torch.no_grad():
    for batch, (X, y) in enumerate(test_dataloader):
        # Swap the last two axes so the 128 mel bands become the per-step
        # feature vector (the LSTM is batch_first with input_size=128).
        X = X.to(device).transpose(1, 2)
        output = loaded_model(X)
        _, predicted = torch.max(output, 1)
        # Labels are only compared on the host, so they never need to visit
        # the GPU. Both lists hold 0-d CPU tensors, one per sample.
        y_true_lstm.extend(y.cpu())
        y_pred_lstm.extend(predicted.cpu())
        print(f"Batch: {batch}/{len(test_dataloader)}")


Batch: 0/42
Batch: 1/42
Batch: 2/42
Batch: 3/42
Batch: 4/42
Batch: 5/42
Batch: 6/42
Batch: 7/42
Batch: 8/42
Batch: 9/42
Batch: 10/42
Batch: 11/42
Batch: 12/42
Batch: 13/42
Batch: 14/42
Batch: 15/42
Batch: 16/42
Batch: 17/42
Batch: 18/42
Batch: 19/42
Batch: 20/42
Batch: 21/42
Batch: 22/42
Batch: 23/42
Batch: 24/42
Batch: 25/42
Batch: 26/42
Batch: 27/42
Batch: 28/42
Batch: 29/42
Batch: 30/42
Batch: 31/42
Batch: 32/42
Batch: 33/42
Batch: 34/42
Batch: 35/42
Batch: 36/42
Batch: 37/42
Batch: 38/42
Batch: 39/42
Batch: 40/42
Batch: 41/42


In [12]:
def to_cpu(data):
    """Convert an iterable of scalar values to a list of plain Python ints.

    Args:
        data: Iterable of single-element tensors (on any device), NumPy
            scalars or Python numbers.

    Returns:
        list[int]: One ``int`` per element of ``data``, in the same order.
    """
    # int() copies a single-element tensor to the host by itself and also
    # accepts values that are already plain numbers, which the previous
    # ``tensor.cpu().numpy()`` chain did not.
    return [int(value) for value in data]

In [66]:
def compute_accuracy_measures(y_true, y_pred):
    """Print the confusion matrix, accuracy and macro precision/recall/F1.

    Args:
        y_true: Iterable of ground-truth class indices (scalar tensors),
            converted with ``to_cpu``.
        y_pred: Iterable of predicted class indices, same form as ``y_true``.

    Returns:
        None. All results are printed.
    """
    # Position i is the class with index i, matching InstrumentDataset.classes.
    classes = ['cel', 'cla', 'flu', 'gac', 'gel', 'org', 'pia', 'sax', 'tru', 'vio', 'voi']
    y_true_cpu = to_cpu(y_true)
    y_pred_cpu = to_cpu(y_pred)
    # Pin the label set: without `labels`, a class that never occurs in either
    # list is dropped from the matrix, and building the 11x11 DataFrame below
    # then fails with a shape mismatch.
    cm = confusion_matrix(y_true_cpu, y_pred_cpu, labels=list(range(len(classes))))

    df_cm = pd.DataFrame(cm, columns=classes, index=classes)

    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    print('Confusion Matrix:')
    print(df_cm)
    # Compute the accuracy, precision, recall, and F1 score
    accuracy = accuracy_score(y_true_cpu, y_pred_cpu)
    precision = precision_score(y_true_cpu, y_pred_cpu, average='macro')
    recall = recall_score(y_true_cpu, y_pred_cpu, average='macro')
    f1 = f1_score(y_true_cpu, y_pred_cpu, average='macro')

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}")


In [71]:
# Print the confusion matrix and macro-averaged metrics for the CNN, using
# the labels and predictions collected from test_dataloader.
compute_accuracy_measures(y_true_cnn, y_pred_cnn)

Confusion Matrix:
Predicted  cel  cla  flu  gac  gel  org  pia  sax  tru  vio  voi
Actual                                                          
cel         76    0    0    0    0    0    2    0    0    0    0
cla          0   99    0    0    0    0    0    0    0    2    0
flu          0    0   85    0    0    0    0    0    0    5    0
gac         10    0    0   20   23    0   12    1    0   61    0
gel          0    0    0    0  152    0    0    0    0    0    0
org          2    0    0    0    2  114    2    0    0   16    0
pia          0    0    0    0    0    0  142    0    0    2    0
sax          0    0    0    0    0    0    0  118    0    7    0
tru          1    0    0    0    1    0    0    0  105    8    0
vio          0    0    0    0    0    0    0    0    0  116    0
voi          1    0    0    0    0    0    0    0    0    5  149
Accuracy: 0.878267363704257
Precision: 0.9192027606461166
Recall: 0.8817419963305728
F1 score: 0.8636655501520051


In [69]:
# Print the confusion matrix and macro-averaged metrics for the LSTM, using
# the labels and predictions collected from test_dataloader.
compute_accuracy_measures(y_true_lstm, y_pred_lstm)

Confusion Matrix:
Predicted  cel  cla  flu  gac  gel  org  pia  sax  tru  vio  voi
Actual                                                          
cel         74    1    0    0    0    0    0    0    0    3    0
cla          0   98    0    0    0    1    1    1    0    0    0
flu          0    0   90    0    0    0    0    0    0    0    0
gac          1    0    1  123    0    1    0    1    0    0    0
gel          0    0    0    1  144    0    1    6    0    0    0
org          0    0    0    0    1  132    2    0    0    0    1
pia          0    0    1    0    0    1  140    2    0    0    0
sax          0    0    0    0    2    1    0  121    0    0    1
tru          0    1    0    0    0    0    0    0  114    0    0
vio          0    0    1    0    0    0    0    0    1  114    0
voi          0    0    0    0    0    0    0    0    0    0  155
Accuracy: 0.9746079163554892
Precision: 0.9750304519174382
Recall: 0.9745237056830461
F1 score: 0.9746272183640734


In [16]:
def read_file(path):
    """Load an audio file and return the mel spectrogram of a 3-second excerpt.

    A random 3-second window is cut from recordings longer than 3 seconds.
    Shorter recordings are zero-padded at the end to exactly 3 seconds, so
    every call analyses the same number of samples for a given sampling rate.

    Args:
        path: Path of the audio file; passed straight to ``librosa.load``.

    Returns:
        torch.Tensor: Mel spectrogram (128 mel bands, fmax 11025 Hz) of the
        excerpt, wrapped with ``torch.from_numpy``.
    """
    segment_seconds = 3

    # load the waveform and sampling rate
    audio, sr = librosa.load(path)
    segment_len = int(segment_seconds * sr)

    if len(audio) > segment_len:
        # Pick the start by sample index so that the slice is always exactly
        # segment_len long and never runs past the end of the recording.
        start = int(random.uniform(0, len(audio) - segment_len))
        segment = audio[start:start + segment_len]
    else:
        # Too short for a random window: the old code produced a negative
        # upper bound here and sliced the wrong (or an empty) region.
        segment = np.pad(audio, (0, segment_len - len(audio)))

    sp = librosa.feature.melspectrogram(y=segment, sr=sr, n_mels=128, fmax=11025)
    return torch.from_numpy(sp)

In [53]:
def _load_eval_model(model_cls, checkpoint_path):
    """Build ``model_cls``, load its checkpoint and return it in eval mode on ``device``."""
    # map_location lets a checkpoint saved from a GPU run load on the current device.
    state = torch.load(checkpoint_path, map_location=device)
    model = model_cls(num_classes)
    model.load_state_dict(state)
    return model.to(device).eval()


def predict(path):
    """Print the instrument predicted for one audio file by the CNN and the LSTM.

    The spectrogram comes from ``read_file(path)``; both models are restored
    from their ``epoch_29`` checkpoints on every call.

    Args:
        path: Path of the audio file to classify.

    Returns:
        None. The two predictions are printed.
    """
    # Position i is the display name of class index i.
    classes = ['cello', 'clarinet', 'flute', 'acoustic guitar', 'electric guitar', 'organ', 'piano', 'saxophone', 'trumpet', 'violin', 'human singing voice']
    sp_torch = read_file(path).to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        # CNN: add batch and channel axes in front of the spectrogram.
        loaded_model_cnn = _load_eval_model(InstrumentCNN, 'drive/MyDrive/checkpoints/cnn/epoch_29.pth')
        output = loaded_model_cnn(sp_torch.unsqueeze(0).unsqueeze(0))
        _, predicted = torch.max(output, 1)
        # .item() turns the one-element tensor into a plain list index.
        print(f"The predicted instrument from CNN is: {classes[predicted.item()]}")

        # LSTM: add a batch axis, then swap the last two axes so the mel
        # bands become the per-step features.
        loaded_model_lstm = _load_eval_model(InstrumentLSTM, 'drive/MyDrive/checkpoints/lstm/epoch_29.pth')
        output = loaded_model_lstm(sp_torch.unsqueeze(0).transpose(1, 2))
        _, predicted = torch.max(output, 1)
        print(f"The predicted instrument from LSTM is: {classes[predicted.item()]}")

In [63]:
# Classify a single recording with both models. read_file() uses
# random.uniform to choose the excerpt, so the printed predictions can differ
# between runs on the same file.
predict('drive/MyDrive/audio_test_data/[1] - 03 - Alexandre Lagoya - Canarios (Sanz)-1.wav')

The predicted instrument from CNN is: piano
The predicted instrument from LSTM is: violin
