In [1]:
import torch
import torch.nn as nn
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
import time


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:
num_classes = 3
batch_size = 64
epochs = 10

In [4]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], 
                         [0.229, 0.224, 0.225])
])
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomAffine(degrees=15, translate=(0.1, 0.1)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])
mel_transform = transforms.Compose([
    transforms.RandomAffine(degrees=10, translate=(0.05, 0.05), scale=(0.9, 1.1)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomApply([
        transforms.ColorJitter(brightness=0.2, contrast=0.2)
    ], p=0.5),
    transforms.Resize((224, 224)),  # 🔑 Resize lại sau augment
    transforms.ToTensor(),
    transforms.RandomErasing(p=0.3),
    transforms.Normalize([0.485, 0.456, 0.406], 
                         [0.229, 0.224, 0.225])
])



train_dataset = datasets.ImageFolder(r"D:\classified_music\data\dataset_mel\train", transform=mel_transform)
val_dataset = datasets.ImageFolder(r"D:\classified_music\data\dataset\train", transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


In [5]:
from collections import Counter
import os

# In class names
print("Classes:", train_dataset.classes)  

# Đếm số file mỗi class trong train
train_class_counts = Counter([train_dataset.targets[i] for i in range(len(train_dataset))])
print("\n[Train dataset]")
for i, class_name in enumerate(train_dataset.classes):
    print(f"{class_name}: {train_class_counts[i]} files")

# Đếm số file mỗi class trong val
val_class_counts = Counter([val_dataset.targets[i] for i in range(len(val_dataset))])
print("\n[Validation dataset]")
for i, class_name in enumerate(val_dataset.classes):
    print(f"{class_name}: {val_class_counts[i]} files")


Classes: ['cheo', 'remix', 'thieunhi']

[Train dataset]
cheo: 2057 files
remix: 1940 files
thieunhi: 1833 files

[Validation dataset]
cheo: 1249 files
remix: 1262 files
thieunhi: 1270 files


In [6]:
from torchvision.models import efficientnet_b0

model = efficientnet_b0(pretrained=True)
# Thay tầng cuối để phù hợp 6 class
# model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)
model.classifier = nn.Sequential(
    nn.Dropout(0.7),  # tăng từ mặc định 0.2
    nn.Linear(model.classifier[1].in_features, num_classes)
)

model = model.to(device)
for param in model.features.parameters():
    param.requires_grad = True 



In [7]:
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, weight=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, weight=self.weight, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss


In [8]:
criterion = FocalLoss(gamma=2.0)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0003, weight_decay=1e-3)


In [9]:
best_val_acc = 0.0  # Lưu val_acc tốt nhất

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    train_acc = correct / total
    print(f"Train Loss: {running_loss/len(train_loader):.4f}, Accuracy: {train_acc:.4f}")

    # ================= VALIDATION =================
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Predict with softmax
            probs = torch.softmax(outputs, dim=1)
            confidences, preds = torch.max(probs, 1)

            # In ra thông tin từng mẫu (tùy chọn)
            # for i in range(len(preds)):
            #     if confidences[i] < 0.7:
            #         print(f"[VAL] Predict: UNKNOWN (conf = {confidences[i]:.2f})")
            #     else:
            #         print(f"[VAL] Predict: {class_names[preds[i]]} (conf = {confidences[i]:.2f})")

            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

    val_acc = val_correct / val_total
    print(f"Val Loss: {val_loss/len(val_loader):.4f}, Val Accuracy: {val_acc:.4f}")

    


Epoch 1/10
Train Loss: 0.1177, Accuracy: 0.8777
Val Loss: 0.0695, Val Accuracy: 0.9212

Epoch 2/10
Train Loss: 0.0402, Accuracy: 0.9583
Val Loss: 0.0421, Val Accuracy: 0.9503

Epoch 3/10
Train Loss: 0.0250, Accuracy: 0.9720
Val Loss: 0.0784, Val Accuracy: 0.9146

Epoch 4/10
Train Loss: 0.0197, Accuracy: 0.9770
Val Loss: 0.1161, Val Accuracy: 0.8905

Epoch 5/10
Train Loss: 0.0118, Accuracy: 0.9877
Val Loss: 0.0697, Val Accuracy: 0.9259

Epoch 6/10


KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), "train_with_3class.pth")


In [None]:
print("Classes:", train_dataset.classes)


Classes: ['cheo', 'remix', 'thieunhi']


In [None]:
test_dataset = datasets.ImageFolder(r"D:\classified_music\data\dataset\train", transform=transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
class_names = test_dataset.classes


In [None]:
print(class_names)

['cheo', 'remix', 'thieunhi']


In [None]:
from torchvision.models import efficientnet_b0

model = efficientnet_b0(pretrained=False)
model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)
model.load_state_dict(torch.load("train_with_3class.pth", map_location=device))
model.to(device)
model.eval()




EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [None]:
all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())


In [None]:
import torch
import torch.nn as nn
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
import os


In [None]:
print("Classes:", class_names)
print(classification_report(all_labels, all_preds, target_names=class_names))


Classes: ['cheo', 'remix', 'thieunhi']
              precision    recall  f1-score   support

        cheo       1.00      1.00      1.00      1249
       remix       0.98      0.74      0.85      1262
    thieunhi       0.79      0.98      0.88      1270

    accuracy                           0.91      3781
   macro avg       0.92      0.91      0.91      3781
weighted avg       0.92      0.91      0.91      3781



test

In [None]:
import librosa
import numpy as np

def split_audio(file_path, segment_duration=3, sr=22050):
    y, sr = librosa.load(file_path, sr=sr)
    segment_samples = segment_duration * sr
    total_segments = len(y) // segment_samples
    segments = []
    for i in range(total_segments):
        start = i * segment_samples
        end = start + segment_samples
        segments.append(y[start:end])
    return segments, sr


In [None]:
import librosa.display
import matplotlib.pyplot as plt
import torch
from torchvision import transforms
from PIL import Image
import io

# Transform resize ảnh về 224x224
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

def audio_to_mel_image(audio_segment, sr):
    S = librosa.feature.melspectrogram(y=audio_segment, sr=sr, n_mels=128)
    S_db = librosa.power_to_db(S, ref=np.max)

    # Vẽ Mel thành ảnh
    fig, ax = plt.subplots(figsize=(3, 3), dpi=72)
    librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='mel', ax=ax)
    ax.axis('off')
    buf = io.BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
    plt.close(fig)
    buf.seek(0)
    image = Image.open(buf).convert("RGB")
    return image_transform(image)


In [None]:
def predict_long_audio(file_path, model, class_names, segment_duration=3):
    segments, sr = split_audio(file_path, segment_duration=segment_duration)
    model.eval()
    all_preds = []
    all_probs = []

    with torch.no_grad():
        for segment in segments:
            image_tensor = audio_to_mel_image(segment, sr).unsqueeze(0).to(device)
            outputs = model(image_tensor)
            probs = torch.softmax(outputs, dim=1)
            _, pred = torch.max(probs, 1)
            all_preds.append(pred.item())
            all_probs.append(probs.squeeze().cpu().numpy())

    # Kết luận theo majority voting
    final_pred = max(set(all_preds), key=all_preds.count)
    print(f"Final Prediction: {class_names[final_pred]}")
    return class_names[final_pred], all_preds, all_probs


In [None]:
final_label, segment_preds, probs = predict_long_audio(r"G:\Trung Tam SX/2025-04/16/01-NSNDMinhThu-6Album/8935349494352/8935349494352_1_9.wav", model, class_names)

Final Prediction: thieunhi
