# 3. Distinguir géneros musicales utilizando modelos de aprendizaje

## Importaciones

In [27]:
import torch
import torchaudio

import numpy as np
import os
import csv

import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

import focal_loss

____

## 3.1. Modelos de aprendizaje basados en extracción de características (*bag of songs*)

In [5]:
BATCH_SIZE = 32
LEARNING_RATE = 0.001

### 3.1.2. Creación del corpus de datos tabulares de características

In [6]:
import sklearn.preprocessing


class TabularDataset(torch.utils.data.Dataset):

    def __init__(self, features_file, scaler=None):

        df = pd.read_csv(features_file)

        self.X = df.drop(['audio_file', 'label'], axis=1)
        self.y = df['label'].values.astype(np.int64)

        if scaler:
            self.X = scaler.transform(self.X)
        else:
            self.scaler = StandardScaler()
            self.X = self.scaler.fit_transform(self.X)

    def get_scaler(self):
        return self.scaler

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if isinstance(idx, torch.Tensor):
            idx = idx.tolist()
    
        return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.long)


In [9]:
ccmusic_train = TabularDataset('ccmusic/train/features.csv')
train_scaler = ccmusic_train.get_scaler()
ccmusic_train_dataloader = torch.utils.data.DataLoader(ccmusic_train, 
                                                       batch_size=BATCH_SIZE, 
                                                       shuffle=True)

ccmusic_validation = TabularDataset('ccmusic/validation/features.csv', train_scaler)
ccmusic_validation_dataloader = torch.utils.data.DataLoader(ccmusic_validation, 
                                                            batch_size=BATCH_SIZE, 
                                                            shuffle=False)

ccmusic_test = TabularDataset('ccmusic/test/features.csv', train_scaler)
ccmusic_test_dataloader = torch.utils.data.DataLoader(ccmusic_test, 
                                                      batch_size=BATCH_SIZE, 
                                                      shuffle=False)

In [16]:
ccmusic2_train = TabularDataset('ccmusic2/train/features.csv')
train_scaler2 = ccmusic2_train.get_scaler()
ccmusic2_train_dataloader = torch.utils.data.DataLoader(ccmusic2_train, 
                                                       batch_size=BATCH_SIZE, 
                                                       shuffle=True)

ccmusic2_validation = TabularDataset('ccmusic2/validation/features.csv', train_scaler2)
ccmusic2_validation_dataloader = torch.utils.data.DataLoader(ccmusic2_validation, 
                                                            batch_size=BATCH_SIZE, 
                                                            shuffle=False)

ccmusic2_test = TabularDataset('ccmusic2/test/features.csv', train_scaler2)
ccmusic2_test_dataloader = torch.utils.data.DataLoader(ccmusic2_test, 
                                                      batch_size=BATCH_SIZE, 
                                                      shuffle=False)


### 3.1.3. Definición del modelo para la clasificación de generos en base a las características

In [10]:
class MLPClassifier(torch.nn.Module):

    def __init__(self, input_dim, num_classes):
        super(MLPClassifier, self).__init__()

        self.linear_block = torch.nn.Sequential(
            torch.nn.Linear(input_dim, 128),
            torch.nn.ReLU(),
            # torch.nn.BatchNorm1d(128),
            # torch.nn.Dropout(0.5),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            # torch.nn.BatchNorm1d(64),
            torch.nn.Linear(64, num_classes if num_classes > 2 else 1)
        )

    def forward(self, x):
        return self.linear_block(x)

In [11]:
import torch
from sklearn.metrics import accuracy_score, f1_score

def train_single_epoch(model, train_dataloader, val_dataloader, loss_fn, optimizer, device):

    model.train()

    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        predictions = model(inputs)
        if isinstance(loss_fn, torch.nn.BCEWithLogitsLoss):
            loss = loss_fn(predictions, targets.float().unsqueeze(1))
        elif isinstance(loss_fn, torch.nn.CrossEntropyLoss):
            loss = loss_fn(predictions, targets)
        loss.backward()
        optimizer.step()  

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, targets in val_dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            predictions = model(inputs)
            if isinstance(loss_fn, torch.nn.BCEWithLogitsLoss):
                loss = loss_fn(predictions, targets.float().unsqueeze(1))
            elif isinstance(loss_fn, torch.nn.CrossEntropyLoss):
                loss = loss_fn(predictions, targets)
            val_loss += loss.item()
    val_loss /= len(val_dataloader)

    return loss.item(), val_loss

def train(model, train_dataloader, val_dataloader, loss_fn, optimizer, epochs, patience=5, device='cuda'):

    model.to(device)
    print("Inicio del entrenamiento")

    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        print(f"Época {epoch+1} ", end='')
        loss, val_loss = train_single_epoch(model, train_dataloader, val_dataloader, loss_fn, optimizer, device)
        print(f"Loss: {loss}")

        if loss < best_loss:
            best_loss = loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Deteniendo entrenamiento en la época {epoch+1}")
            break

    print("Fin del entrenamiento")

In [12]:
def evaluate_model(model, dataloader, num_classes, device='cuda'):
    
    model.eval()
    with torch.no_grad():
        predictions = []
        targets = []
        for inputs, target in dataloader:
            inputs, target = inputs.to(device), target.to(device)
            output = model(inputs)
            if num_classes > 2:
                output = torch.argmax(output, dim=1)
            else:
                output = torch.sigmoid(output)
                output = (output > 0.5)
            predictions.append(output)
            targets.append(target)
        predictions = torch.cat(predictions, dim=0)
        targets = torch.cat(targets, dim=0)
        return {
            'acc': accuracy_score(targets.cpu(), predictions.cpu()),
            'f1': f1_score(targets.cpu(), predictions.cpu()) if num_classes == 2 
            else f1_score(targets.cpu(), predictions.cpu(), average='micro')
        }

### 3.1.3. Entrenamiento e inferencia

#### CCMUSIC

In [13]:
EPOCHS = 50
modelo = MLPClassifier(input_dim=ccmusic_train.X.shape[1], num_classes=2)
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(modelo.parameters(), 
                             lr=LEARNING_RATE)
train(modelo, ccmusic_train_dataloader, ccmusic_validation_dataloader, loss_fn, optimizer, EPOCHS)

Inicio del entrenamiento
Época 1 Loss: 0.1144166886806488
Época 2 Loss: 0.03076895885169506
Época 3 Loss: 0.021174129098653793
Época 4 Loss: 0.01606043241918087
Época 5 Loss: 0.012176807038486004
Época 6 Loss: 0.010602782480418682
Época 7 Loss: 0.006224792916327715
Época 8 Loss: 0.006463612429797649
Época 9 Loss: 0.0033868453465402126
Época 10 Loss: 0.002547886222600937
Época 11 Loss: 0.005453024059534073
Época 12 Loss: 0.0018593418644741178
Época 13 Loss: 0.002986837876960635
Época 14 Loss: 0.003007487626746297
Época 15 Loss: 0.0013803510228171945
Época 16 Loss: 0.0014926667790859938
Época 17 Loss: 0.0010029763216152787
Época 18 Loss: 0.0008025760762393475
Época 19 Loss: 0.0005203281762078404
Época 20 Loss: 0.0005686854710802436
Época 21 Loss: 0.0003789956390392035
Época 22 Loss: 0.0005032019107602537
Época 23 Loss: 0.0003781775594688952
Época 24 Loss: 0.00039017703966237605
Época 25 Loss: 0.0002719030308071524
Época 26 Loss: 0.0001608995662536472
Época 27 Loss: 0.00010851704428205267

In [14]:
metrics_test = evaluate_model(modelo, ccmusic_test_dataloader, num_classes=2)
print(f"Accuracy en el conjunto de test: {metrics_test['acc']}")
print(f"F1 en el conjunto de test: {metrics_test['f1']}")

Accuracy en el conjunto de test: 0.9709302325581395
F1 en el conjunto de test: 0.981549815498155


#### CCMUSIC2

In [242]:
# # Calcular class_weight 
# import numpy as np
# import torch
# import sklearn.utils.class_weight

# # Calcula los pesos de clase usando scikit-learn
# class_weights = sklearn.utils.class_weight.compute_class_weight('balanced',
#                                                                 np.unique(ccmusic2_train.y),
#                                                                 ccmusic2_train.y)

# # Convierte los pesos de clase a tensores de PyTorch
# class_weights = torch.tensor(class_weights, dtype=torch.float32)

# print(class_weights)                    

In [17]:
EPOCHS = 100
modelo = MLPClassifier(input_dim=ccmusic2_train.X.shape[1], num_classes=9)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modelo.parameters(), 
                             lr=LEARNING_RATE )
train(modelo, ccmusic2_train_dataloader, ccmusic_validation_dataloader, loss_fn, optimizer, EPOCHS,
      patience=20)

Inicio del entrenamiento
Época 1 Loss: 3.721733808517456
Época 2 Loss: 5.6760640144348145
Época 3 Loss: 6.745384693145752
Época 4 Loss: 7.6535258293151855
Época 5 Loss: 8.097639083862305
Época 6 Loss: 8.481220245361328
Época 7 Loss: 8.62679672241211
Época 8 Loss: 9.123586654663086
Época 9 Loss: 9.292040824890137
Época 10 Loss: 9.637316703796387
Época 11 Loss: 9.899700164794922
Época 12 Loss: 10.337506294250488
Época 13 Loss: 10.493694305419922
Época 14 Loss: 10.646076202392578
Época 15 Loss: 10.925032615661621
Época 16 Loss: 10.982606887817383
Época 17 Loss: 11.610930442810059
Época 18 Loss: 11.708135604858398
Época 19 Loss: 11.9278564453125
Época 20 Loss: 11.982645034790039
Época 21 Loss: 12.389301300048828
Deteniendo entrenamiento en la época 21
Fin del entrenamiento


In [18]:
metrics_test = evaluate_model(modelo, ccmusic2_test_dataloader, num_classes=9)
print(f"Accuracy en el conjunto de test: {metrics_test['acc']}")
print(f"F1 en el conjunto de test: {metrics_test['f1']}")

Accuracy en el conjunto de test: 0.5523255813953488
F1 en el conjunto de test: 0.5523255813953488


___

## 3.2. Modelos de aprendizaje basados en espectograma

In [28]:
BATCH_SIZE = 32
LEARNING_RATE = 0.001
EPOCHS = 10

### 3.2.1. Creación del corpus de espectogramas

In [29]:
# Clase para la creación del corpus

class SpectrogramDataset(torch.utils.data.Dataset):

    def __init__(self, annotations_file, audios_dir, transformations=None):
        super().__init__
        self.annotations = self._leer_csv(annotations_file)
        self.audios_dir = audios_dir
        self.transformations = transformations
        
    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        signal, sr = self._recupera_signal(index)
        label = self._recupera_label(index)
        if self.transformations:
            signal = self.transformations(signal, sr)
        return signal, label
        
    def mapa_label_classes(self):
        pares=set([(id_label,label) for [_,id_label,label] in self.annotations])
        mapa={}
        for (a,b) in pares:
            mapa[a]=b
        return mapa
    
    def label_class(self, index):
        return self.annotations[index][2]
    
    def get_audio_file(self, index): 
        name = self.annotations[index][0]
        return os.path.join(name)

    def _leer_csv(self,annotations_file):
        with open(annotations_file, 'r', encoding='utf-8') as f:
            lector = csv.reader(f)
            next(lector) 
            data = [ (file_name, classID, classLabel)  
                for file_name, classID, classLabel in lector]
        return data

    def _recupera_signal(self,index=0):
        audio_file=self.get_audio_file(index)        
        signal, sr = torchaudio.load(audio_file)
        return signal, sr
    
    def _recupera_label(self, index):
        return torch.tensor(int(self.annotations[index][1]))

In [30]:
def get_spectrogram(signal, sr):
    spectrogram_transform = torchaudio.transforms.Spectrogram(power=2)
    spec_amplitud_to_db_transform = torchaudio.transforms.AmplitudeToDB()
    spect = spectrogram_transform(signal)
    spect=spec_amplitud_to_db_transform(spect)
    return spect

In [31]:
ccmusic_train = SpectrogramDataset(annotations_file='./ccmusic/train/annotations.csv', 
                                    audios_dir='./ccmusic/train/audios', 
                                    transformations=get_spectrogram)
ccmusic_train_dataloader = torch.utils.data.DataLoader(ccmusic_train, 
                                                       batch_size=BATCH_SIZE, 
                                                       shuffle=True)

ccmusic_validation = SpectrogramDataset(annotations_file='./ccmusic/validation/annotations.csv',
                                        audios_dir='./ccmusic/validation/audios',
                                        transformations=get_spectrogram)
ccmusic_validation_dataloader = torch.utils.data.DataLoader(ccmusic_validation, 
                                                            batch_size=BATCH_SIZE, 
                                                            shuffle=True)

ccmusic_test = SpectrogramDataset(annotations_file='./ccmusic/test/annotations.csv',
                                    audios_dir='./ccmusic/test/audios',
                                    transformations=get_spectrogram)
ccmusic_test_dataloader = torch.utils.data.DataLoader(ccmusic_test, 
                                                      batch_size=BATCH_SIZE, 
                                                      shuffle=True)

In [32]:
ccmusic2_train = SpectrogramDataset(annotations_file='./ccmusic2/train/annotations.csv',
                                    audios_dir='./ccmusic2/train/audios',
                                    transformations=get_spectrogram)
ccmusic2_train_dataloader = torch.utils.data.DataLoader(ccmusic2_train,
                                                            batch_size=BATCH_SIZE,
                                                            shuffle=True)

ccmusic2_validation = SpectrogramDataset(annotations_file='./ccmusic2/validation/annotations.csv',
                                        audios_dir='./ccmusic2/validation/audios',
                                        transformations=get_spectrogram)
ccmusic2_validation_dataloader = torch.utils.data.DataLoader(ccmusic2_validation,
                                                            batch_size=BATCH_SIZE,
                                                            shuffle=True)

ccmusic2_test = SpectrogramDataset(annotations_file='./ccmusic2/test/annotations.csv',
                                    audios_dir='./ccmusic2/test/audios',
                                    transformations=get_spectrogram)
ccmusic2_test_dataloader = torch.utils.data.DataLoader(ccmusic2_test,
                                                            batch_size=BATCH_SIZE,
                                                            shuffle=True)

In [33]:
espectograma, etiqueta = ccmusic_train[0]
print(espectograma.shape)
print(etiqueta)

SPECTOGRAM_H = 201
SPECTOGRAM_W = 3308

RuntimeError: Couldn't find appropriate backend to handle uri ccmusic/train/audios/audio_train_0.wav and format None.

### 3.2.2. Definición del modelo CNN para clasificación de espectogramas

In [8]:
def calcula_tam_capa_lineal1(spect_height, spect_width):

    def pool_size(size, kernel_size, stride):
        return (size - kernel_size) // stride + 1
    
    def conv_size(size, kernel_size, stride, padding):
        return (size + 2*padding - kernel_size) // stride + 1

    height_salida = spect_height
    width_salida = spect_width

    # Bloque convolucional: kernel_size=3, stride=1, padding=1

    for _ in range(1, 4):
        height_salida = pool_size(conv_size(height_salida, 3, 1, 1), 2, 2)
        width_salida = pool_size(conv_size(width_salida, 3, 1, 1), 2, 2)

    return height_salida * width_salida

In [9]:
class CNNModel(torch.nn.Module):

    def __init__(self, spect_height, spect_width, num_labels):
        super(CNNModel, self).__init__()

        # Definir bloque convolucional
        self.conv_block1 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=1, out_channels=32, 
                            kernel_size=3, 
                            stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.25),
            torch.nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.conv_block2 = torch.nn.Sequential( 
            torch.nn.Conv2d(in_channels=32, out_channels=64,
                            kernel_size=3, 
                            stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.conv_block3 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=64, out_channels=128,
                            kernel_size=3, 
                            stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2)
        )

        height_width_salida = calcula_tam_capa_lineal1(spect_height, spect_width)

        self.mlp_block = torch.nn.Sequential(
            torch.nn.Flatten(1, -1),
            torch.nn.Linear( 128 * height_width_salida, 512), 
            torch.nn.ReLU(),
            torch.nn.Dropout(0.5),  
            torch.nn.Linear(512, num_labels if num_labels > 2 else 1),    
            torch.nn.Softmax(dim=1) if num_labels > 2 else torch.nn.Sigmoid()
        )

    def forward(self, input):
        x = self.conv_block1(input)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        predictions = self.mlp_block(x)
        return predictions

In [12]:
def train_single_epoch(model, dataloader, loss_fn, optimizer, device):
    model.train()
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        predictions = model(inputs)
        loss = loss_fn(predictions, targets)
        loss.backward()
        optimizer.step()
    return loss.item()

def train(model, dataloader, loss_fn, optimizer, epochs, device="cuda"):
    print("Inicio del entrenamiento")
    for i in range(epochs):
        print(f"Época {i+1}", end='')
        loss = train_single_epoch(model, dataloader, loss_fn, optimizer, device)
        print(f"\nLoss: {loss:.4f}")
    print("Fin del entrenamiento")

### 3.3.3. Entrenamiento e inferencia

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
modelo = CNNModel(spect_height=SPECTOGRAM_H, spect_width=SPECTOGRAM_W, num_labels=2)
modelo.to(device)  # Mover el modelo al dispositivo apropiado
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(modelo.parameters(), lr=LEARNING_RATE)

train(modelo, ccmusic_train_dataloader, loss_fn, optimizer, EPOCHS, device)

In [None]:
def evaluate_model(model, dataloader, device, num_classes):
    model.eval()
    all_predictions = []
    all_targets = []
    
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predictions = torch.max(outputs, 1)
            all_predictions.extend(predictions.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
    
    accuracy = accuracy_score(all_targets, all_predictions)
    f1 = f1_score(all_targets, all_predictions, average='weighted')
    return {'acc': accuracy, 'f1': f1}

# Evaluar el modelo en el conjunto de test
metrics_test = evaluate_model(modelo, ccmusic2_test_dataloader, device, num_classes=9)
print(f"Accuracy en el conjunto de test: {metrics_test['acc']:.2f}")
print(f"F1 en el conjunto de test: {metrics_test['f1']:.2f}")

In [None]:
modelo = CNNModel(spect_height=SPECTOGRAM_H, spect_width=SPECTOGRAM_W, num_labels=9)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modelo.parameters(), 
                             lr=LEARNING_RATE)
train(modelo, ccmusic2_train_dataloader, loss_fn, optimizer, EPOCHS)