In [1]:
import librosa
import numpy as np
import pandas as pd
import utils

from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler

from sklearn.preprocessing import StandardScaler

def get_file_path(track_id, tidigits_path='/home/cong/Downloads/fma_small'):
    tidigits_path = tidigits_path + '/{:03d}/{:06d}.mp3'.format(track_id // 1000, track_id)
    return tidigits_path

def preprocess_audio_file(file_path, n_mfcc=13, max_pad_len=2048):
    # Load audio file
    y, sr = librosa.load(file_path, mono=True, duration=30)
    # Extract MFCCs
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # Pad MFCCs
    pad_width = max_pad_len - mfcc.shape[1]
    mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    return mfcc

# Example usage
# mfcc_features = preprocess_audio_file("/home/cong/Downloads/fma_small/000/000005.mp3")
# Note: You'll need to preprocess all audio files in your dataset


In [2]:
tracks = utils.load('data/fma_metadata/tracks.csv')
features = utils.load('data/fma_metadata/features.csv')
echonest = utils.load('data/fma_metadata/echonest.csv')

np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

tracks.shape, features.shape, echonest.shape

((106574, 52), (106574, 518), (13129, 249))

In [3]:
subset = tracks.index[tracks['set', 'subset'] <= 'small']

assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

tracks = tracks.loc[subset]
features_all = features.loc[subset]

tracks.shape, features_all.shape

Not enough Echonest features: (13129, 767)


((8000, 52), (8000, 518))

In [4]:
train = tracks.index[tracks['set', 'split'] == 'training']
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

print('{} training examples, {} validation examples, {} testing examples'.format(*map(len, [train, val, test])))

genres = list(LabelEncoder().fit(tracks['track', 'genre_top']).classes_)
#genres = list(tracks['track', 'genre_top'].unique())
print('Top genres ({}): {}'.format(len(genres), genres))
genres = list(MultiLabelBinarizer().fit(tracks['track', 'genres_all']).classes_)
print('All genres ({}): {}'.format(len(genres), genres))

6400 training examples, 800 validation examples, 800 testing examples
Top genres (8): ['Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Pop', 'Rock']
All genres (114): [1, 2, 6, 10, 12, 15, 16, 17, 18, 21, 22, 25, 26, 27, 30, 31, 32, 33, 36, 38, 41, 42, 45, 46, 47, 49, 53, 58, 64, 66, 70, 71, 76, 77, 79, 81, 83, 85, 86, 88, 89, 90, 92, 94, 98, 100, 101, 102, 103, 107, 109, 111, 113, 117, 118, 125, 130, 167, 171, 172, 174, 177, 180, 181, 182, 183, 184, 185, 186, 214, 224, 232, 236, 240, 247, 250, 267, 286, 296, 297, 314, 337, 359, 360, 361, 362, 400, 401, 404, 439, 440, 456, 468, 491, 495, 502, 504, 514, 524, 538, 539, 542, 580, 602, 619, 695, 741, 763, 808, 811, 1032, 1060, 1193, 1235]


In [5]:
labels_onehot = LabelBinarizer().fit_transform(tracks['track', 'genre_top'])
labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)
labels_onehot

Unnamed: 0_level_0,0,1,2,3,4,5,6,7
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,0,0,0,1,0,0,0,0
5,0,0,0,1,0,0,0,0
10,0,0,0,0,0,0,1,0
140,0,0,1,0,0,0,0,0
141,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
154308,0,0,0,1,0,0,0,0
154309,0,0,0,1,0,0,0,0
154413,0,0,0,0,0,0,1,0
154414,0,0,0,0,0,0,1,0


In [6]:
labels_onehot.iloc[2].values

array([0, 0, 0, 0, 0, 0, 1, 0])

In [7]:
def get_mfcc_features(block):
    result_mfcc = []
    result_label = []
    for idx, track_id in enumerate(block):
        mfcc = None
        label = None
        try:
            mfcc = preprocess_audio_file(get_file_path(track_id))
            label = labels_onehot.iloc[idx].values.tolist()
        except:
            pass
        
        if mfcc is not None:
            result_mfcc.append(mfcc)
            result_label.append(label)
    
    return np.array(result_mfcc), np.array(result_label)

In [10]:
train_mfcc, train_label = get_mfcc_features(train)
val_mfcc, val_label = get_mfcc_features(val)
test_mfcc, test_label = get_mfcc_features(test)

# train_mfcc = np.load('data/train_mfcc.npy')
# train_label = np.load('data/train_label.npy')
# val_mfcc = np.load('data/val_mfcc.npy')
# val_label = np.load('data/val_label.npy')
# test_mfcc = np.load('data/test_mfcc.npy')
# test_label = np.load('data/test_label.npy')

In [11]:
# np.save('data/train_mfcc.npy', train_mfcc)
# np.save('data/train_label.npy', train_label)
# np.save('data/val_mfcc.npy', val_mfcc)
# np.save('data/val_label.npy', val_label)
# np.save('data/test_mfcc.npy', test_mfcc)
# np.save('data/test_label.npy', test_label)

In [12]:
train_mfcc.shape, train_label.shape, val_mfcc.shape, val_label.shape, test_mfcc.shape, test_label.shape

((6394, 13, 2048),
 (6394, 8),
 (800, 13, 2048),
 (800, 8),
 (800, 13, 2048),
 (800, 8))

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim

class MusicGenreClassifier(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(MusicGenreClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 8)  # Output layer with 8 neurons

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)  # No softmax here as BCEWithLogitsLoss will be used
        return out

# Example instantiation of the model
model_MLP = MusicGenreClassifier(input_size=13*2048, hidden_size=128)


In [14]:
import torch.nn as nn
import torch.nn.functional as F

class CNNMusicGenreClassifier(nn.Module):
    def __init__(self, num_classes):
        super(CNNMusicGenreClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Calculate the size of the flattened features after conv and pooling layers
        self.fc1 = nn.Linear(64 * 256, 500)  # Adjusted input size
        self.fc2 = nn.Linear(500, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(-1, 64 * 256)  # Flatten and adjust size accordingly
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x



In [15]:
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=25):
    model.train()
    for epoch in range(num_epochs):
        for i, (mfccs, labels) in enumerate(train_loader):
            # Forward pass
            mfccs = mfccs.to(device)
            labels = labels.to(device)
            outputs = model(mfccs)
            loss = criterion(outputs, labels.float())  # Ensure labels are float

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (i + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')



In [17]:
def evaluate_model(model, test_loader, device):
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for mfccs, labels in test_loader:
            mfccs = mfccs.to(device)
            labels = labels.to(device)
            outputs = model(mfccs)
            predicted = (outputs > 0.0).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item() / labels.size(1)

    accuracy = 100 * correct / total
    print(f'Accuracy on test set: {accuracy}%')


In [18]:
from torch.utils.data import Dataset, DataLoader

class AudioDataset(Dataset):
    def __init__(self, mfccs, labels):
        """
        file_paths: List of paths to audio files
        labels: Corresponding one-hot encoded labels
        """
        self.mfccs = mfccs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        mfcc = self.mfccs[idx]
        label = self.labels[idx]

        return mfcc, label

In [19]:
model = CNNMusicGenreClassifier(num_classes=8)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

device = "cuda:0"

2023-12-20 10:40:11.709960: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-20 10:40:12.426679: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [20]:
train_mfcc_tensor = torch.from_numpy(train_mfcc).float()
train_label_tensor = torch.from_numpy(train_label).float()
test_mfcc_tensor = torch.from_numpy(test_mfcc).float()
test_label_tensor = torch.from_numpy(test_label).float()

# train_mfcc_tensor = train_mfcc_tensor.unsqueeze(1)
# test_mfcc_tensor = test_mfcc_tensor.unsqueeze(1)

In [21]:
# Create dataset instances
train_dataset = AudioDataset(train_mfcc_tensor, train_label_tensor)
test_dataset = AudioDataset(test_mfcc_tensor, test_label_tensor)

# Create DataLoader instances
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [22]:
model = model.to(device)

In [23]:
train_model(model, train_loader, criterion, optimizer, device, num_epochs=25)

Epoch [1/25], Step [100/200], Loss: 0.3741
Epoch [1/25], Step [200/200], Loss: 0.3785
Epoch [2/25], Step [100/200], Loss: 0.3789
Epoch [2/25], Step [200/200], Loss: 0.3741
Epoch [3/25], Step [100/200], Loss: 0.3893
Epoch [3/25], Step [200/200], Loss: 0.3733
Epoch [4/25], Step [100/200], Loss: 0.3753
Epoch [4/25], Step [200/200], Loss: 0.3845
Epoch [5/25], Step [100/200], Loss: 0.3789
Epoch [5/25], Step [200/200], Loss: 0.3753
Epoch [6/25], Step [100/200], Loss: 0.3719
Epoch [6/25], Step [200/200], Loss: 0.3797
Epoch [7/25], Step [100/200], Loss: 0.3838
Epoch [7/25], Step [200/200], Loss: 0.3744
Epoch [8/25], Step [100/200], Loss: 0.3829
Epoch [8/25], Step [200/200], Loss: 0.3795
Epoch [9/25], Step [100/200], Loss: 0.3733
Epoch [9/25], Step [200/200], Loss: 0.3844
Epoch [10/25], Step [100/200], Loss: 0.3783
Epoch [10/25], Step [200/200], Loss: 0.3813
Epoch [11/25], Step [100/200], Loss: 0.3831
Epoch [11/25], Step [200/200], Loss: 0.3762
Epoch [12/25], Step [100/200], Loss: 0.3821
Epoch 

In [24]:
evaluate_model(model, test_loader, device)

Accuracy on test set: 87.5%


In [25]:
model

CNNMusicGenreClassifier(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=16384, out_features=500, bias=True)
  (fc2): Linear(in_features=500, out_features=8, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)