In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import librosa
import numpy as np
import pandas as pd

# Locations of one MusicNet recording and its note-label table.
file_path = '../Data/MusicNet_Dataset/musicnet/musicnet/train_data/2478.wav'
csv_path = '../Data/MusicNet_Dataset/musicnet/musicnet/train_labels/2478.csv'

# Note annotations for the recording (one row per note event).
df = pd.read_csv(csv_path)

# Waveform `y` and its sample rate `sr`, loaded at 44.1 kHz.
y, sr = librosa.load(file_path, sr=44100)

# Mel-spectrogram with 128 mel bands, one frame every `hop_length` samples,
# then converted from power to decibels relative to the spectrogram peak.
hop_length = 512
mel_spec = librosa.feature.melspectrogram(
    y=y,
    sr=sr,
    n_mels=128,
    hop_length=hop_length,
)
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

# Chuyển thời gian thành frame
def time_to_frame(time, sr, hop_length, time_units_per_second=1000000):
    """Convert a timestamp into a spectrogram frame index.

    Parameters
    ----------
    time : int or float
        Timestamp, counted in ticks of which there are
        ``time_units_per_second`` in one second.
    sr : int or float
        Audio sample rate in Hz.
    hop_length : int
        Number of audio samples between consecutive spectrogram frames.
    time_units_per_second : int or float, optional
        Resolution of ``time``. Defaults to 1_000_000 (microseconds), which
        is the constant this function has always used. Pass ``sr`` when the
        timestamps are already sample indices.

    Returns
    -------
    int
        Frame index, i.e. the sample position floor-divided by ``hop_length``.
    """
    # NOTE(review): the MusicNet label CSVs are, as far as I know, documented
    # as storing start_time/end_time in *samples* at 44.1 kHz rather than in
    # microseconds. If that holds for this data, callers should pass
    # time_units_per_second=sr -- confirm against the dataset documentation.
    return int((time / time_units_per_second) * sr // hop_length)

# Map every note's start/end timestamp to a spectrogram frame index.
# NOTE(review): time_to_frame treats the timestamps as microseconds; verify
# that this matches the unit used by the label CSV.
df['start_frame'] = df['start_time'].apply(lambda x: time_to_frame(x, sr, hop_length))
df['end_frame'] = df['end_time'].apply(lambda x: time_to_frame(x, sr, hop_length))

# Assign a label to every frame; -1 is a temporary "no note" sentinel.
y_train = np.full(mel_spec.shape[1], -1)

# Iterate over the columns directly instead of df.iterrows(): iterrows()
# coerces each row to a single dtype, so the frame indices can come back as
# floats (which cannot be used as slice bounds). zip() keeps the integer
# dtype of each column and is also faster.
# Later rows overwrite earlier ones where notes overlap, so each frame keeps
# only the last note written to it.
for start, end, note in zip(df['start_frame'], df['end_frame'], df['note']):
    y_train[start:end] = note

# Replace the sentinel with 128, the "no note" class label.
y_train[y_train == -1] = 128

# Tạo các cửa sổ (windows)
def create_windows_torch(X, y, window_size=128, step=64):
    """Cut a spectrogram into fixed-width windows labelled by their centre frame.

    Parameters
    ----------
    X : array-like, shape (n_features, n_frames)
        Spectrogram; windows are taken along the second axis.
    y : array-like, shape (n_labels,)
        One label per frame.
    window_size : int, optional
        Number of frames in each window.
    step : int, optional
        Number of frames between the starts of consecutive windows.

    Returns
    -------
    tuple of torch.Tensor
        ``(X_windows, y_windows)`` where ``X_windows`` is float32 with shape
        ``(n_windows, n_features, window_size)`` and ``y_windows`` is int64
        with shape ``(n_windows,)``. Both always have the same length.

    Raises
    ------
    ValueError
        If ``window_size`` or ``step`` is not positive.
    """
    if window_size <= 0 or step <= 0:
        raise ValueError("window_size and step must be positive")

    X = np.asarray(X)
    y = np.asarray(y)
    n_frames = X.shape[1]
    half_window = window_size // 2

    # "+ 1" so the last full window (starting at n_frames - window_size) is
    # included. A window is kept only when its centre frame has a label, so
    # inputs and labels can never get out of sync.
    starts = [
        start
        for start in range(0, n_frames - window_size + 1, step)
        if start + half_window < len(y)
    ]

    if starts:
        # Merge the per-window pieces into single numpy arrays before the
        # tensor conversion (much faster than converting a list of arrays).
        X_windows = np.stack([X[:, start:start + window_size] for start in starts])
        y_windows = np.array([y[start + half_window] for start in starts])
    else:
        # Keep well-formed shapes even when no window fits.
        X_windows = np.empty((0, X.shape[0], window_size), dtype=np.float32)
        y_windows = np.empty((0,), dtype=np.int64)

    return torch.tensor(X_windows, dtype=torch.float32), torch.tensor(y_windows, dtype=torch.long)

# Slice the dB mel-spectrogram into windows; from here on `y_train` holds one
# label per window instead of one label per frame.
windowed_inputs, y_train = create_windows_torch(mel_spec_db, y_train)

# Insert a channel axis so the tensor matches what PyTorch's Conv2d expects:
# (batch, 1, 128, 128)
X_train = torch.unsqueeze(windowed_inputs, dim=1)

In [6]:
import torch.nn.functional as F
class NoteCNN(nn.Module):
    """Small CNN that classifies a single-channel spectrogram window.

    Three ``Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2x2)`` stages
    (32, 64 and 128 channels) are followed by a 256-unit hidden layer with
    dropout and a final linear layer producing one logit per class.

    Parameters
    ----------
    num_classes : int, optional
        Number of output logits.
    input_size : tuple of int, optional
        ``(height, width)`` of the input windows. Defaults to ``(128, 128)``,
        which reproduces the original fixed ``128 * 16 * 16`` flatten size.

    Raises
    ------
    ValueError
        If ``input_size`` is too small to survive the three pooling stages.
    """

    def __init__(self, num_classes=129, input_size=(128, 128)):
        super().__init__()

        height, width = input_size
        # Each of the three 2x2 max-pools floor-halves both spatial dims, and
        # the 3x3 convolutions use padding 1 so they preserve size.
        pooled_height, pooled_width = height // 8, width // 8
        if pooled_height < 1 or pooled_width < 1:
            raise ValueError(
                f"input_size must be at least 8x8, got {tuple(input_size)}"
            )

        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), padding=(1, 1))
        self.bn1 = nn.BatchNorm2d(32)
        self.pool = nn.MaxPool2d((2, 2))

        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), padding=(1, 1))
        self.bn2 = nn.BatchNorm2d(64)

        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), padding=(1, 1))
        self.bn3 = nn.BatchNorm2d(128)

        self.fc1 = nn.Linear(128 * pooled_height * pooled_width, 256)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        ``x`` must have shape ``(batch, 1, height, width)`` with the
        ``input_size`` given at construction time.
        """
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))

        # Flatten everything except the batch dimension for the dense layers.
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)

        return x

In [7]:
class MusicDataset(data.Dataset):
    """Map-style dataset pairing each input window with its label.

    Parameters
    ----------
    X : indexable
        Inputs; sample ``i`` is ``X[i]``.
    y : indexable
        Labels; the label of sample ``i`` is ``y[i]``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # Fail early: a length mismatch would otherwise only surface as an
        # IndexError (or silently ignored labels) in the middle of training.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]

# Tạo DataLoader
# Wrap the window tensors in a dataset and serve them in shuffled
# mini-batches of 64 samples.
dataset = MusicDataset(X=X_train, y=y_train)
train_loader = data.DataLoader(
    dataset=dataset,
    batch_size=64,
    shuffle=True,
)

In [9]:
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=20):
    """Train ``model`` in place and print loss/accuracy after every epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train; it is moved to ``device``.
    train_loader : iterable
        Yields ``(inputs, labels)`` batches. Only iteration is required, so
        loaders without ``__len__`` work too.
    criterion : callable
        Loss ``criterion(outputs, labels)``. The reported epoch loss assumes
        the criterion returns the mean over the batch (the PyTorch default).
    optimizer : torch.optim.Optimizer
        Optimizer already bound to ``model``'s parameters.
    device : torch.device
        Device on which the model and batches are placed.
    num_epochs : int, optional
        Number of passes over ``train_loader``.

    Raises
    ------
    ValueError
        If ``train_loader`` yields no samples.
    """
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        loss_sum = 0.0
        correct = 0
        total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight each batch's mean loss by its size so a short final
            # batch does not count as much as a full one.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += batch_size

        if total == 0:
            # Previously this surfaced as an opaque ZeroDivisionError.
            raise ValueError("train_loader yielded no samples; nothing to train on")

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss_sum / total:.4f}, "
              f"Accuracy: {100 * correct / total:.2f}%")

    print("Huấn luyện xong!")


# Pick the fastest available backend: CUDA, then Apple MPS, then CPU.
# getattr() guards against PyTorch builds that have no torch.backends.mps.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Seed PyTorch's global RNG so that weight initialisation and the training
# run are repeatable when this cell is re-executed.
SEED = 42
torch.manual_seed(SEED)

# Initialise the model, the loss function and the optimizer.
model = NoteCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_model(model, train_loader, criterion, optimizer, device, num_epochs=20)

Epoch 1/20, Loss: 3.9363, Accuracy: 81.54%
Epoch 2/20, Loss: 1.8020, Accuracy: 91.59%
Epoch 3/20, Loss: 1.2111, Accuracy: 90.89%
Epoch 4/20, Loss: 0.8175, Accuracy: 92.76%
Epoch 5/20, Loss: 0.5668, Accuracy: 92.29%
Epoch 6/20, Loss: 0.4276, Accuracy: 95.33%
Epoch 7/20, Loss: 0.3236, Accuracy: 95.56%
Epoch 8/20, Loss: 0.3003, Accuracy: 95.56%
Epoch 9/20, Loss: 0.2635, Accuracy: 95.33%
Epoch 10/20, Loss: 0.2279, Accuracy: 96.03%
Epoch 11/20, Loss: 0.3535, Accuracy: 94.86%
Epoch 12/20, Loss: 0.2861, Accuracy: 95.56%
Epoch 13/20, Loss: 0.1991, Accuracy: 96.03%
Epoch 14/20, Loss: 0.2043, Accuracy: 96.03%
Epoch 15/20, Loss: 0.1539, Accuracy: 96.26%
Epoch 16/20, Loss: 0.1551, Accuracy: 95.79%
Epoch 17/20, Loss: 0.1172, Accuracy: 96.73%
Epoch 18/20, Loss: 0.1427, Accuracy: 96.50%
Epoch 19/20, Loss: 0.2205, Accuracy: 95.79%
Epoch 20/20, Loss: 0.1186, Accuracy: 96.50%
Huấn luyện xong!
