In [7]:
import os
import librosa as lb
import soundfile as sf
import numpy as np
import matplotlib.pyplot as plt
import cv2


In [14]:
import time,random
import os
import json
import yt_dlp
from pydub import AudioSegment

def download_and_convert_youtube(link, save_path_wav):
    temp_file = save_path_wav.replace('.wav', '')

    ydl_opts = {
    'format': 'bestaudio/best',
    'outtmpl': temp_file,
    'quiet': True,
    'cookiefile': 'cookie.txt' # file content cookie youtube
    }
    ydl_opts_info = {
        'quiet': True,
        'skip_download': True,
        'cookiefile': 'cookie.txt'
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts_info) as ydl:
            info = ydl.extract_info(link, download=False)
            # duration = info.get("duration", 0)

            # if duration > 600:
            #     print(f"video dài hơn 10 phút: {link} ({duration // 60} phút)")
            #     return
        # dowload wav
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            print(f"🎵 Đang tải: {link}")
            ydl.download([link])
            time.sleep(random.uniform(5, 15))
        if not os.path.exists(temp_file):
            raise FileNotFoundError(f" Không tìm thấy file tạm: {temp_file}")

        audio = AudioSegment.from_file(temp_file)
        audio = audio.set_channels(1).set_frame_rate(16000)
        audio.export(save_path_wav, format="wav")
        os.remove(temp_file)
        print(f" Đã lưu: {save_path_wav}")
    except Exception as e:
        print(f" Lỗi với {link}: {e}")

In [3]:
import torch
import torch.nn as nn
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
import time
from torchvision.models import efficientnet_b0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = 3 
class_names = ['cheo', 'remix', 'thieunhi']
print("Using device:", device)
model = efficientnet_b0(pretrained=False)
model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)
model.load_state_dict(torch.load(r"D:\classified_music\vn_5_class.pth", map_location=device))
model.to(device)
model.eval()

Using device: cuda




EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [None]:



# Hàm xử lý STFT
def get_fft(samples, n_fft=2048, hop_length=512):
    for index, item in samples.items():
        D = np.abs(lb.stft(item["sampling"], n_fft=n_fft, hop_length=hop_length))
        samples[index]["stft"] = D
    return samples

# Hàm xử lý mel spectrogram
def get_mel_spectrogram(samples, sr=22050):
    for index, item in samples.items():
        S = lb.feature.melspectrogram(y=item["sampling"], sr=sr)
        S_db = lb.amplitude_to_db(S, ref=np.max)
        samples[index]["mel-spec-db"] = S_db
    return samples

# Hàm lưu ảnh mel spectrogram
def save_mel_spec(samples, root):
    image_paths = []
    for index, item in samples.items():
        S_db = item["mel-spec-db"]
        os.makedirs(root, exist_ok=True)

        file_name = os.path.splitext(os.path.basename(item["dir"]))[0]
        out_path = os.path.join(root, file_name + ".png")
        plt.imsave(out_path, S_db)
        image_paths.append(out_path)
    return image_paths

# Hàm chính: predict và sinh ảnh spectrogram
def predict_component(file_path, duration=30):
    try:
        y, sr = lb.load(file_path, sr=None)
    except Exception as e:
        print(f"❌ Lỗi khi load: {file_path} ({e})")
        return
    List_predictions = []
    segment_samples = duration * sr
    total_samples = len(y)
    num_segments = total_samples // segment_samples
    base_filename = os.path.splitext(os.path.basename(file_path))[0]

    if num_segments == 0:
        return

    # Tự động lấy class từ folder cha (ví dụ: 'pop')
    folder_path = os.path.dirname(file_path)

    # Folder lưu wav đã cắt
    output_folder = os.path.join(folder_path, "predict")
    os.makedirs(output_folder, exist_ok=True)

    samples = {}

    for i in range(num_segments):
        start = i * segment_samples
        end = start + segment_samples
        segment = y[start:end]

        new_filename = f"{base_filename}_part{i+1}.wav"
        new_path = os.path.join(output_folder, new_filename)

        sf.write(new_path, segment, sr)
        samples[i] = {
            "dir": new_path,
            "sampling": segment
        }

    samples = get_fft(samples)
    samples = get_mel_spectrogram(samples, sr)

    mel_root = os.path.join(output_folder, "mel-images")
    os.makedirs(mel_root, exist_ok=True)

    list_test = save_mel_spec(samples, mel_root)

    # === DỰ ĐOÁN TỪ ẢNH MEL ===
    print(f"\n Đang dự đoán : {base_filename}")
    for path in list_test:
      image=cv2.imread(str(path))
      image_resized= cv2.resize(image, (224,224))
      image=np.expand_dims(image_resized,axis=0)
      predictions = model.predict(image)
      images, labels = images.to(device), labels.to(device)
      output_class=class_names[np.argmax(predictions)]
      confidences = predictions[0]
      predicted_index = np.argmax(confidences)
      confidence_score = confidences[predicted_index]
      List_predictions.append((output_class, confidence_score))

    from collections import Counter
    class_counts = Counter([pred[0] for pred in List_predictions])
    most_common_class, _ = class_counts.most_common(1)[0]

    # Tính trung bình xác suất của class đó (còn lại = 0)
    confidence_sum = 0
    for class_id, conf in List_predictions:
        if class_id == most_common_class:
            confidence_sum += conf
        else:
            confidence_sum += 0

    average_confidence = confidence_sum / len(List_predictions)
    print(f"🎶 Thể loại dự đoán: {most_common_class}")
    print(f"📊 Xác suất trung bình: {round(average_confidence * 100, 2)}%")


In [10]:
predict_component(r'D:\classified_music\data_test\bolero_chill_test.wav')


 Đang dự đoán : bolero_chill_test


AttributeError: 'EfficientNet' object has no attribute 'predict'

In [18]:
predict_component(r"D:\classified_music\data_test\trutinh_test.wav")


 Đang dự đoán : trutinh_test
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━

In [19]:

predict_component(r'D:\classified_music\data_test\danca_test.wav')


 Đang dự đoán : danca_test
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━

In [20]:
def test(url):
    # download_and_convert_youtube("gan link nhac youtube can test", "duong dan luu file wav")
    link = r"D:\classified_music\data_test\test_audio_.wav"
    download_and_convert_youtube(url,link)
    predict_component(link)

### dan link youtube de test

In [21]:
test("https://youtu.be/lrpOGToQtYk?si=3XrmsQBQyKdQevrg")

🎵 Đang tải: https://youtu.be/lrpOGToQtYk?si=3XrmsQBQyKdQevrg
 Đã lưu: D:\classified_music\data_test\test_audio_.wav     

 Đang dự đoán : test_audio_
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [25]:
#  cheo
test("https://youtu.be/1vZC1CpD6QA?si=G65J8E4rjJrEfdAl")

🎵 Đang tải: https://youtu.be/1vZC1CpD6QA?si=G65J8E4rjJrEfdAl
 Đã lưu: D:\classified_music\data_test\test_audio_.wav     

 Đang dự đoán : test_audio_
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [27]:
# cai luong
test("https://youtu.be/Zyi0o-ql1h8?si=9QccY3ysfDwPyHU_")

🎵 Đang tải: https://youtu.be/Zyi0o-ql1h8?si=9QccY3ysfDwPyHU_
 Đã lưu: D:\classified_music\data_test\test_audio_.wav

 Đang dự đoán : test_audio_
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7