# 3. Distinguir géneros musicales utilizando modelos de aprendizaje

## Importaciones

In [235]:
import torch
import torchaudio

import numpy as np
import os
import csv

import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

import focal_loss

____

## 3.1. Modelos de aprendizaje basados en extracción de características (*bag of songs*)

In [203]:
BATCH_SIZE = 32
LEARNING_RATE = 0.001

### 3.1.2. Creación del corpus de datos tabulares de características

In [204]:
import sklearn.preprocessing


class TabularDataset(torch.utils.data.Dataset):

    def __init__(self, features_file, scaler=None):

        df = pd.read_csv(features_file)

        self.X = df.drop(['audio_file', 'label'], axis=1)
        self.y = df['label'].values.astype(np.int64)

        if scaler:
            self.X = scaler.transform(self.X)
        else:
            self.scaler = StandardScaler()
            self.X = self.scaler.fit_transform(self.X)

    def get_scaler(self):
        return self.scaler

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if isinstance(idx, torch.Tensor):
            idx = idx.tolist()
    
        return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.long)


In [205]:
ccmusic_train = TabularDataset('ccmusic/train/features.csv')
train_scaler = ccmusic_train.get_scaler()
ccmusic_train_dataloader = torch.utils.data.DataLoader(ccmusic_train, 
                                                       batch_size=BATCH_SIZE, 
                                                       shuffle=True)

ccmusic_validation = TabularDataset('ccmusic/validation/features.csv', train_scaler)
ccmusic_validation_dataloader = torch.utils.data.DataLoader(ccmusic_validation, 
                                                            batch_size=BATCH_SIZE, 
                                                            shuffle=False)

ccmusic_test = TabularDataset('ccmusic/test/features.csv', train_scaler)
ccmusic_test_dataloader = torch.utils.data.DataLoader(ccmusic_test, 
                                                      batch_size=BATCH_SIZE, 
                                                      shuffle=False)

In [206]:
ccmusic2_train = TabularDataset('ccmusic2/train/features.csv')
train_scaler2 = ccmusic2_train.get_scaler()
ccmusic2_train_dataloader = torch.utils.data.DataLoader(ccmusic2_train, 
                                                       batch_size=BATCH_SIZE, 
                                                       shuffle=True)

ccmusic2_validation = TabularDataset('ccmusic2/validation/features.csv', train_scaler2)
ccmusic2_validation_dataloader = torch.utils.data.DataLoader(ccmusic2_validation, 
                                                            batch_size=BATCH_SIZE, 
                                                            shuffle=False)

ccmusic2_test = TabularDataset('ccmusic2/test/features.csv', train_scaler2)
ccmusic2_test_dataloader = torch.utils.data.DataLoader(ccmusic2_test, 
                                                      batch_size=BATCH_SIZE, 
                                                      shuffle=False)


### 3.1.3. Definición del modelo para la clasificación de generos en base a las características

In [224]:
class MLPClassifier(torch.nn.Module):

    def __init__(self, input_dim, num_classes):
        super(MLPClassifier, self).__init__()

        self.linear_block = torch.nn.Sequential(
            torch.nn.Linear(input_dim, 128),
            torch.nn.ReLU(),
            # torch.nn.BatchNorm1d(128),
            # torch.nn.Dropout(0.5),
            torch.nn.Linear(128, 64),
            torch.nn.ReLU(),
            # torch.nn.BatchNorm1d(64),
            torch.nn.Linear(64, num_classes if num_classes > 2 else 1)
        )

    def forward(self, x):
        return self.linear_block(x)

In [225]:
def train_single_epoch(model, train_dataloader, val_dataloader, loss_fn, optimizer):

    model.train()

    for inputs, targets in train_dataloader:
        print('.', end='')
        optimizer.zero_grad()
        predictions = model(inputs)
        if isinstance(loss_fn, torch.nn.BCEWithLogitsLoss):
            loss = loss_fn(predictions, targets.float().unsqueeze(1))
        elif isinstance(loss_fn, torch.nn.CrossEntropyLoss):
            loss = loss_fn(predictions, targets)
        loss.backward()
        optimizer.step()  

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, targets in val_dataloader:
            predictions = model(inputs)
            if isinstance(loss_fn, torch.nn.BCEWithLogitsLoss):
                loss = loss_fn(predictions, targets.float().unsqueeze(1))
            elif isinstance(loss_fn, torch.nn.CrossEntropyLoss):
                loss = loss_fn(predictions, targets)
            val_loss += loss.item()
    val_loss /= len(val_dataloader)

    return loss.item(), val_loss



def train(model, train_dataloader, val_dataloader, loss_fn, optimizer, epochs, patience=5):

    print("Inicio del entrenamiento")

    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        print(f"Época {epoch+1} ", end='')
        loss, val_loss = train_single_epoch(model, train_dataloader, val_dataloader, loss_fn, optimizer)
        print(f"Loss: {loss}")

        if loss < best_loss:
            best_loss = loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Deteniendo entrenamiento en la época {epoch+1}")
            break

    print("Fin del entrenamiento")

In [226]:
def evaluate_model(model, dataloader, num_classes):
    
    model.eval()
    with torch.no_grad():
        predictions = []
        targets = []
        for inputs, target in dataloader:
            output = model(inputs)
            if num_classes > 2:
                output = torch.argmax(output, dim=1)
            else:
                output = torch.sigmoid(output)
                output = (output > 0.5)
            predictions.append(output)
            targets.append(target)
        predictions = torch.cat(predictions, dim=0)
        targets = torch.cat(targets, dim=0)
        return {
            'acc': accuracy_score(targets, predictions),
            'f1': f1_score(targets, predictions) if num_classes == 2 
            else f1_score(targets, predictions, average='micro')
        }

### 3.1.3. Entrenamiento e inferencia

#### CCMUSIC

In [231]:
EPOCHS = 50
modelo = MLPClassifier(input_dim=ccmusic_train.X.shape[1], num_classes=2)
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(modelo.parameters(), 
                             lr=LEARNING_RATE)
train(modelo, ccmusic_train_dataloader, ccmusic_validation_dataloader, loss_fn, optimizer, EPOCHS)

Inicio del entrenamiento
Época 1 ...........................................Loss: 0.12050341814756393
Época 2 ...........................................Loss: 0.03188486769795418
Época 3 ...........................................Loss: 0.021222978830337524
Época 4 ...........................................Loss: 0.02205251157283783
Época 5 ...........................................Loss: 0.008797035552561283
Época 6 ...........................................Loss: 0.014521676115691662
Época 7 ...........................................Loss: 0.008724812418222427
Época 8 ...........................................Loss: 0.006520235911011696
Época 9 ...........................................Loss: 0.006704649422317743
Época 10 ...........................................Loss: 0.00487254885956645
Época 11 ...........................................Loss: 0.005166085436940193
Época 12 ...........................................Loss: 0.003538284217938781
Época 13 ...............................

In [232]:
metrics_test = evaluate_model(modelo, ccmusic_test_dataloader, num_classes=2)
print(f"Accuracy en el conjunto de test: {metrics_test['acc']}")
print(f"F1 en el conjunto de test: {metrics_test['f1']}")

Accuracy en el conjunto de test: 0.9709302325581395
F1 en el conjunto de test: 0.9816849816849816


#### CCMUSIC2

In [242]:
# # Calcular class_weight 
# import numpy as np
# import torch
# import sklearn.utils.class_weight

# # Calcula los pesos de clase usando scikit-learn
# class_weights = sklearn.utils.class_weight.compute_class_weight('balanced',
#                                                                 np.unique(ccmusic2_train.y),
#                                                                 ccmusic2_train.y)

# # Convierte los pesos de clase a tensores de PyTorch
# class_weights = torch.tensor(class_weights, dtype=torch.float32)

# print(class_weights)                    

In [249]:
EPOCHS = 100
modelo = MLPClassifier(input_dim=ccmusic2_train.X.shape[1], num_classes=9)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modelo.parameters(), 
                             lr=LEARNING_RATE )
train(modelo, ccmusic2_train_dataloader, ccmusic_validation_dataloader, loss_fn, optimizer, EPOCHS,
      patience=20)

Inicio del entrenamiento
Época 1 ...........................................Loss: 3.6842195987701416
Época 2 ...........................................Loss: 5.746178150177002
Época 3 ...........................................Loss: 7.274298191070557
Época 4 .

..........................................Loss: 8.159215927124023
Época 5 ...........................................Loss: 8.996831893920898
Época 6 ...........................................Loss: 9.321578979492188
Época 7 ...........................................Loss: 9.929741859436035
Época 8 ...........................................Loss: 10.364989280700684
Época 9 ...........................................Loss: 10.702692985534668
Época 10 ...........................................Loss: 10.773736000061035
Época 11 ...........................................Loss: 11.30105972290039
Época 12 ...........................................Loss: 12.028177261352539
Época 13 ...........................................Loss: 12.070975303649902
Época 14 ...........................................Loss: 12.484526634216309
Época 15 ...........................................Loss: 12.801424980163574
Época 16 ...........................................Loss: 13.126769065856934
Época 17 ..........

In [250]:
metrics_test = evaluate_model(modelo, ccmusic2_test_dataloader, num_classes=9)
print(f"Accuracy en el conjunto de test: {metrics_test['acc']}")
print(f"F1 en el conjunto de test: {metrics_test['f1']}")

Accuracy en el conjunto de test: 0.5523255813953488
F1 en el conjunto de test: 0.5523255813953488


___

## 3.2. Modelos de aprendizaje basados en espectograma

In [2]:
BATCH_SIZE = 32
LEARNING_RATE = 0.001
EPOCHS = 10

### 3.2.1. Creación del corpus de espectogramas

In [3]:
# Clase para la creación del corpus

class SpectrogramDataset(torch.utils.data.Dataset):

    def __init__(self, annotations_file, audios_dir, transformations=None):
        super().__init__
        self.annotations = self._leer_csv(annotations_file)
        self.audios_dir = audios_dir
        self.transformations = transformations
        
    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        signal, sr = self._recupera_signal(index)
        label = self._recupera_label(index)
        if self.transformations:
            signal = self.transformations(signal, sr)
        return signal, label
        
    def mapa_label_classes(self):
        pares=set([(id_label,label) for [_,id_label,label] in self.annotations])
        mapa={}
        for (a,b) in pares:
            mapa[a]=b
        return mapa
    
    def label_class(self, index):
        return self.annotations[index][2]
    
    def get_audio_file(self, index): 
        name = self.annotations[index][0]
        return os.path.join(name)

    def _leer_csv(self,annotations_file):
        with open(annotations_file, 'r', encoding='utf-8') as f:
            lector = csv.reader(f)
            next(lector) 
            data = [ (file_name, classID, classLabel)  
                for file_name, classID, classLabel in lector]
        return data

    def _recupera_signal(self,index=0):
        audio_file=self.get_audio_file(index)        
        signal, sr = torchaudio.load(audio_file)
        return signal, sr
    
    def _recupera_label(self, index):
        return torch.tensor(int(self.annotations[index][1]))

In [4]:
def get_spectrogram(signal, sr):
    spectrogram_transform = torchaudio.transforms.Spectrogram(power=2)
    spec_amplitud_to_db_transform = torchaudio.transforms.AmplitudeToDB()
    spect = spectrogram_transform(signal)
    spect=spec_amplitud_to_db_transform(spect)
    return spect

In [5]:
ccmusic_train = SpectrogramDataset(annotations_file='./ccmusic/train/annotations.csv', 
                                    audios_dir='./ccmusic/train/audios', 
                                    transformations=get_spectrogram)
ccmusic_train_dataloader = torch.utils.data.DataLoader(ccmusic_train, 
                                                       batch_size=BATCH_SIZE, 
                                                       shuffle=True)

ccmusic_validation = SpectrogramDataset(annotations_file='./ccmusic/validation/annotations.csv',
                                        audios_dir='./ccmusic/validation/audios',
                                        transformations=get_spectrogram)
ccmusic_validation_dataloader = torch.utils.data.DataLoader(ccmusic_validation, 
                                                            batch_size=BATCH_SIZE, 
                                                            shuffle=True)

ccmusic_test = SpectrogramDataset(annotations_file='./ccmusic/test/annotations.csv',
                                    audios_dir='./ccmusic/test/audios',
                                    transformations=get_spectrogram)
ccmusic_test_dataloader = torch.utils.data.DataLoader(ccmusic_test, 
                                                      batch_size=BATCH_SIZE, 
                                                      shuffle=True)

In [6]:
ccmusic2_train = SpectrogramDataset(annotations_file='./ccmusic2/train/annotations.csv',
                                    audios_dir='./ccmusic2/train/audios',
                                    transformations=get_spectrogram)
ccmusic2_train_dataloader = torch.utils.data.DataLoader(ccmusic2_train,
                                                            batch_size=BATCH_SIZE,
                                                            shuffle=True)

ccmusic2_validation = SpectrogramDataset(annotations_file='./ccmusic2/validation/annotations.csv',
                                        audios_dir='./ccmusic2/validation/audios',
                                        transformations=get_spectrogram)
ccmusic2_validation_dataloader = torch.utils.data.DataLoader(ccmusic2_validation,
                                                            batch_size=BATCH_SIZE,
                                                            shuffle=True)

ccmusic2_test = SpectrogramDataset(annotations_file='./ccmusic2/test/annotations.csv',
                                    audios_dir='./ccmusic2/test/audios',
                                    transformations=get_spectrogram)
ccmusic2_test_dataloader = torch.utils.data.DataLoader(ccmusic2_test,
                                                            batch_size=BATCH_SIZE,
                                                            shuffle=True)

In [7]:
espectograma, etiqueta = ccmusic_train[0]
print(espectograma.shape)
print(etiqueta)

SPECTOGRAM_H = 201
SPECTOGRAM_W = 3308

torch.Size([1, 201, 3308])
tensor(1)


### 3.2.2. Definición del modelo CNN para clasificación de espectogramas

In [8]:
def calcula_tam_capa_lineal1(spect_height, spect_width):

    def pool_size(size, kernel_size, stride):
        return (size - kernel_size) // stride + 1
    
    def conv_size(size, kernel_size, stride, padding):
        return (size + 2*padding - kernel_size) // stride + 1

    height_salida = spect_height
    width_salida = spect_width

    # Bloque convolucional: kernel_size=3, stride=1, padding=1

    for _ in range(1, 4):
        height_salida = pool_size(conv_size(height_salida, 3, 1, 1), 2, 2)
        width_salida = pool_size(conv_size(width_salida, 3, 1, 1), 2, 2)

    return height_salida * width_salida

In [9]:
class CNNModel(torch.nn.Module):

    def __init__(self, spect_height, spect_width, num_labels):
        super(CNNModel, self).__init__()

        # Definir bloque convolucional
        self.conv_block1 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=1, out_channels=32, 
                            kernel_size=3, 
                            stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.25),
            torch.nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.conv_block2 = torch.nn.Sequential( 
            torch.nn.Conv2d(in_channels=32, out_channels=64,
                            kernel_size=3, 
                            stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.conv_block3 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=64, out_channels=128,
                            kernel_size=3, 
                            stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2)
        )

        height_width_salida = calcula_tam_capa_lineal1(spect_height, spect_width)

        self.mlp_block = torch.nn.Sequential(
            torch.nn.Flatten(1, -1),
            torch.nn.Linear( 128 * height_width_salida, 512), 
            torch.nn.ReLU(),
            torch.nn.Dropout(0.5),  
            torch.nn.Linear(512, num_labels if num_labels > 2 else 1),    
            torch.nn.Softmax(dim=1) if num_labels > 2 else torch.nn.Sigmoid()
        )

    def forward(self, input):
        x = self.conv_block1(input)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        predictions = self.mlp_block(x)
        return predictions

In [12]:
def train_single_epoch(model, dataloader, loss_fn, optimizer):
    model.train()
    for inputs, targets in dataloader:
        print('.', end='')
        predictions = model(inputs)
        loss = loss_fn(predictions, targets.unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return(loss.item())


def train(model, dataloader, loss_fn, optimizer, epochs):
    print("Inicio del entrenamiento")
    for i in range(epochs):
        print(f"Época {i+1} " ,end='') 
        print('.', end='')
        loss=train_single_epoch(model, dataloader, loss_fn, optimizer)
        print(f"\nLoss: {loss}")
    print("Fin del entrenamiento")

### 3.3.3. Entrenamiento e inferencia

In [None]:
modelo = CNNModel(spect_height=SPECTOGRAM_H, spect_width=SPECTOGRAM_W, num_labels=2)
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(modelo.parameters(), 
                             lr=LEARNING_RATE)
train(modelo, ccmusic_train_dataloader, loss_fn, optimizer, EPOCHS)

In [None]:
modelo = CNNModel(spect_height=SPECTOGRAM_H, spect_width=SPECTOGRAM_W, num_labels=9)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modelo.parameters(), 
                             lr=LEARNING_RATE)
train(modelo, ccmusic2_train_dataloader, loss_fn, optimizer, EPOCHS)