In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import os
import yt_dlp
import librosa
from pydub import AudioSegment

from concurrent.futures import ThreadPoolExecutor

## Data Preparation

In [2]:
df = pd.read_csv("data/features_30_sec.csv")
df["label"] = df["label"].str.capitalize().replace("Hiphop", "Hip-hop")
df.drop("length", axis=1, inplace=True)
df.head()

Unnamed: 0,filename,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.wav,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,3805.839606,...,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,Blues
1,blues.00001.wav,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,3550.522098,...,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282,Blues
2,blues.00002.wav,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,3042.260232,...,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025,Blues
3,blues.00003.wav,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,2184.745799,...,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339,Blues
4,blues.00004.wav,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,3579.757627,...,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516,Blues


## Tensors

In [3]:
features = df.iloc[:, 1:58]
scaler = StandardScaler()
X = scaler.fit_transform(features)

target = df["label"]
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(target)

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

## Dataset and DataLoader

In [4]:
class AudioDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, random_state=42, test_size=0.2)

train_dataset = AudioDataset(X_train, y_train)
test_dataset = AudioDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## Classification Model

In [6]:
class GenreClassifier(nn.Module):
    def __init__(self, input_features, output_features):
        super(GenreClassifier, self).__init__()
        self.fc1 = nn.Linear(input_features, 48)
        self.fc2 = nn.Linear(48, 24)
        self.fc3 = nn.Linear(24, output_features)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"

num_features = features.shape[1]
num_classes = len(label_encoder.classes_)
model = GenreClassifier(num_features, num_classes).to(device)

## Training Loop

In [8]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 50

for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch: {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")

torch.save(model.state_dict(), "model.pth")

Epoch: 1, Loss: 2.2046
Epoch: 2, Loss: 1.9416
Epoch: 3, Loss: 1.6865
Epoch: 4, Loss: 1.5237
Epoch: 5, Loss: 1.3835
Epoch: 6, Loss: 1.2603
Epoch: 7, Loss: 1.1539
Epoch: 8, Loss: 1.0659
Epoch: 9, Loss: 0.9924
Epoch: 10, Loss: 0.9256
Epoch: 11, Loss: 0.8756
Epoch: 12, Loss: 0.8272
Epoch: 13, Loss: 0.7826
Epoch: 14, Loss: 0.7447
Epoch: 15, Loss: 0.7066
Epoch: 16, Loss: 0.6724
Epoch: 17, Loss: 0.6430
Epoch: 18, Loss: 0.6140
Epoch: 19, Loss: 0.5881
Epoch: 20, Loss: 0.5672
Epoch: 21, Loss: 0.5444
Epoch: 22, Loss: 0.5220
Epoch: 23, Loss: 0.5037
Epoch: 24, Loss: 0.4788
Epoch: 25, Loss: 0.4616
Epoch: 26, Loss: 0.4419
Epoch: 27, Loss: 0.4286
Epoch: 28, Loss: 0.4135
Epoch: 29, Loss: 0.3949
Epoch: 30, Loss: 0.3801
Epoch: 31, Loss: 0.3644
Epoch: 32, Loss: 0.3495
Epoch: 33, Loss: 0.3351
Epoch: 34, Loss: 0.3273
Epoch: 35, Loss: 0.3138
Epoch: 36, Loss: 0.3021
Epoch: 37, Loss: 0.2907
Epoch: 38, Loss: 0.2787
Epoch: 39, Loss: 0.2710
Epoch: 40, Loss: 0.2606
Epoch: 41, Loss: 0.2530
Epoch: 42, Loss: 0.2451
E

## Model Evaluation

In [9]:
y_pred = []
y_true = []

model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        _, pred = torch.max(outputs, 1)

        y_pred.extend(pred.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.7150


In [10]:
def feature_ablation(model, test_loader, device):
    model.eval()
    baseline_acc = get_test_accuracy(model, test_loader, device)  # Get original accuracy
    feature_importance = {}

    for feature_idx in range(test_loader.dataset[0][0].shape[0]):  # Assuming feature dimension first
        y_pred, y_true = [], []

        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                modified_inputs = inputs.clone()
                modified_inputs[:, feature_idx] = 0  # Zero out one feature at a time

                outputs = model(modified_inputs)
                _, pred = torch.max(outputs, 1)
                y_pred.extend(pred.cpu().numpy())
                y_true.extend(labels.cpu().numpy())

        acc = accuracy_score(y_true, y_pred)
        feature_importance[feature_idx] = baseline_acc - acc  # Importance = drop in accuracy

    return feature_importance

def get_test_accuracy(model, test_loader, device):
    y_pred, y_true = [], []
    model.eval()
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, pred = torch.max(outputs, 1)
            y_pred.extend(pred.cpu().numpy())
            y_true.extend(labels.cpu().numpy())
    return accuracy_score(y_true, y_pred)

feature_importance = feature_ablation(model, test_loader, device)

for idx in range(0, len(feature_importance)):
    print(features.columns[idx], feature_importance[idx])

chroma_stft_mean 0.0
chroma_stft_var 0.020000000000000018
rms_mean 0.020000000000000018
rms_var 0.025000000000000022
spectral_centroid_mean 0.025000000000000022
spectral_centroid_var 0.020000000000000018
spectral_bandwidth_mean 0.0050000000000000044
spectral_bandwidth_var 0.015000000000000013
rolloff_mean 0.015000000000000013
rolloff_var 0.015000000000000013
zero_crossing_rate_mean 0.020000000000000018
zero_crossing_rate_var 0.010000000000000009
harmony_mean 0.010000000000000009
harmony_var 0.015000000000000013
perceptr_mean 0.0050000000000000044
perceptr_var 0.020000000000000018
tempo -0.015000000000000013
mfcc1_mean 0.03499999999999992
mfcc1_var 0.010000000000000009
mfcc2_mean 0.010000000000000009
mfcc2_var 0.025000000000000022
mfcc3_mean 0.020000000000000018
mfcc3_var 0.0
mfcc4_mean 0.06999999999999995
mfcc4_var 0.020000000000000018
mfcc5_mean 0.029999999999999916
mfcc5_var 0.020000000000000018
mfcc6_mean 0.03499999999999992
mfcc6_var 0.025000000000000022
mfcc7_mean 0.02000000000000

## Feature Extraction

In [11]:
def extract_segment_features(y, sr=22050):
    seg_features = []
    
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rms = librosa.feature.rms(y=y)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y=y)
    harmony = librosa.effects.harmonic(y=y)
    perceptr = librosa.effects.percussive(y=y)
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    
    seg_features.append(np.mean(chroma_stft))
    seg_features.append(np.var(chroma_stft))
    seg_features.append(np.mean(rms))
    seg_features.append(np.var(rms))
    seg_features.append(np.mean(spectral_centroid))
    seg_features.append(np.var(spectral_centroid))
    seg_features.append(np.mean(spectral_bandwidth))
    seg_features.append(np.var(spectral_bandwidth))
    seg_features.append(np.mean(rolloff))
    seg_features.append(np.var(rolloff))
    seg_features.append(np.mean(zero_crossing_rate))
    seg_features.append(np.var(zero_crossing_rate))
    seg_features.append(np.mean(harmony))
    seg_features.append(np.var(harmony))
    seg_features.append(np.mean(perceptr))
    seg_features.append(np.var(perceptr))
    seg_features.extend(tempo)

    for mfcc in mfccs:
        seg_features.append(np.mean(mfcc))
        seg_features.append(np.var(mfcc))

    return seg_features

In [40]:
def extract_full_features(filename):
    y, sr = librosa.load(filename, sr=22050)

    segment_length = 30 * sr
    segments = librosa.util.frame(y, frame_length=segment_length, hop_length=segment_length).T

    with ThreadPoolExecutor() as executor:
        full_features = list(executor.map(extract_segment_features, segments))
    
    return np.mean(full_features, axis=0).reshape(1, -1)

## Example Usage

In [41]:
def convert_to_wav(url):
    ydl_opts = {
        "outtmpl": "temp_file.%(ext)s"
    }
    
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    
    temp_file = "temp_file.webm"
    wav_file = AudioSegment.from_file(temp_file)
    wav_file.export(out_f="output_file.wav", format="wav")
    output_file = "output_file.wav"
    os.remove(temp_file)

    return output_file

In [46]:
url = input("Paste URL: ")
song_file = convert_to_wav(url)

extracted_features = extract_full_features(song_file)
features_df = pd.DataFrame(extracted_features, columns=features.columns)
scaled_features = scaler.transform(features_df)
features_tensor = torch.tensor(scaled_features, dtype=torch.float32).to(device)
os.remove(song_file)

model.eval()
with torch.no_grad():
    outputs = model(features_tensor)
    probabilities = torch.nn.functional.softmax(outputs, dim=1)
    
    pred_class = torch.argmax(probabilities, dim=1).item()
    pred_genre = label_encoder.inverse_transform([pred_class])[0]
    print(f"\nPredicted Genre: {pred_genre}\n")

    genre_probabilities = {}
    for idx, probability in enumerate(probabilities[0]):
        genre = label_encoder.inverse_transform([idx])[0]
        genre_probabilities[genre] = probability.item() * 100

    sorted_genre_probabilities = sorted(genre_probabilities.items(), key=lambda x: x[1], reverse=True)
    for genre, probability in sorted_genre_probabilities:
        print(f"{genre}: {probability:.2f}%")

Paste URL:  https://www.youtube.com/watch?v=KfEEm4Zx-EU


[youtube] Extracting URL: https://www.youtube.com/watch?v=KfEEm4Zx-EU
[youtube] KfEEm4Zx-EU: Downloading webpage
[youtube] KfEEm4Zx-EU: Downloading tv client config
[youtube] KfEEm4Zx-EU: Downloading player 56511309
[youtube] KfEEm4Zx-EU: Downloading tv player API JSON
[youtube] KfEEm4Zx-EU: Downloading ios player API JSON
[youtube] KfEEm4Zx-EU: Downloading m3u8 information
[info] KfEEm4Zx-EU: Downloading 1 format(s): 399+251
[download] Destination: temp_file.f399.mp4
[download] 100% of    4.39MiB in 00:00:00 at 5.75MiB/s   
[download] Destination: temp_file.f251.webm
[download] 100% of    4.11MiB in 00:00:00 at 8.02MiB/s   
[Merger] Merging formats into "temp_file.webm"
Deleting original file temp_file.f251.webm (pass -k to keep)
Deleting original file temp_file.f399.mp4 (pass -k to keep)

Predicted Genre: Rock

Rock: 69.47%
Pop: 8.59%
Disco: 8.58%
Reggae: 8.03%
Country: 4.27%
Hip-hop: 0.82%
Metal: 0.17%
Jazz: 0.07%
Classical: 0.01%
Blues: 0.00%
