In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import librosa
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torchvision import transforms
from PIL import Image
import os
import tempfile

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [4]:
class CustomResNet50(nn.Module):
    def __init__(self, num_classes=5):
        super(CustomResNet50, self).__init__()
        self.base_model = models.resnet50(pretrained=True)
        self.base_model.fc = nn.Identity()  # Bỏ lớp FC top
        self.fc1 = nn.Linear(2048, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.base_model(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

model = CustomResNet50(num_classes=7).to(device)



In [5]:
# Khởi tạo lại kiến trúc model giống lúc train
model = CustomResNet50(num_classes=7)
model.load_state_dict(torch.load(r'D:\classified_music\check_point_resnet\resnet_music.pth'))
model.to(device)
model.eval()  # Rất quan trọng để chuyển về chế độ inference


CustomResNet50(
  (base_model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(


In [6]:
from PIL import Image
from torchvision import transforms
import torch.nn.functional as F

# Ảnh đầu vào
img_path = r"D:\classified_music\dataset\danca\Câu_Chuyện_Đầu_Năm_Hoang_Oanh_part3.png"
img = Image.open(img_path).convert("RGB")

# Transform giống lúc train
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

# Chuẩn bị ảnh
input_tensor = transform(img).unsqueeze(0).to(device)  # Thêm batch dimension

# Dự đoán
with torch.no_grad():
    outputs = model(input_tensor)
    probs = F.softmax(outputs, dim=1)
    pred_idx = torch.argmax(probs, dim=1).item()

# Lấy tên class
class_names = ['CachMang', 'NhacTrinh', 'Rapviet', 'RockViet', 'TruTinh', 'bolero', 'danca']  
pred_label = class_names[pred_idx]

print(f"🔍 Dự đoán: {pred_label} ({probs[0][pred_idx]*100:.2f}%)")

🔍 Dự đoán: danca (97.66%)


In [7]:
class_names = ['CachMang', 'NhacTrinh', 'Rapviet', 'RockViet', 'TruTinh', 'bolero', 'danca']
segment_duration = 30  # giây
sr_target = 22050      # Sample rate mong muốn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
def audio_to_mel_image(y, sr, out_path):
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    plt.figure(figsize=(3, 3))  # size phù hợp input 224x224
    plt.axis('off')
    librosa.display.specshow(mel_db, sr=sr, hop_length=512, x_axis=None, y_axis=None, cmap='viridis')
    plt.savefig(out_path, bbox_inches='tight', pad_inches=0)
    plt.close()

In [10]:
# ==== HÀM CẮT FILE WAV VÀ DỰ ĐOÁN ====
def predict_from_wav(wav_path):
    print(f"📥 Đang xử lý: {os.path.basename(wav_path)}")
    y, sr = librosa.load(wav_path, sr=sr_target)
    total_duration = librosa.get_duration(y=y, sr=sr)
    segment_samples = segment_duration * sr

    results = []

    for i in range(0, len(y), segment_samples):
        y_seg = y[i:i+segment_samples]
        if len(y_seg) < segment_samples:
            break  # Bỏ đoạn cuối nếu không đủ 30s

        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_img:
            audio_to_mel_image(y_seg, sr, tmp_img.name)
            # pred, conf = predict_mel_image(tmp_img.name)
            # results.append((pred, conf))
            # print(f"🧠 Đoạn {(i//sr):>4}-{(i+segment_samples)//sr:>4}s → {pred} ({conf*100:.2f}%)")
            # plt.show(y_seg, sr, tmp_img.name)
            os.remove(tmp_img.name)

    return results

In [11]:
predict_from_wav(r"D:\classified_music\data_test\BaiCaDaoDauDoi-LuongGiaHuy-5288583.mp3")

📥 Đang xử lý: BaiCaDaoDauDoi-LuongGiaHuy-5288583.mp3


PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\BHM-KY~1\\AppData\\Local\\Temp\\2\\tmphqnd2y57.png'