In [136]:
import warnings
warnings.filterwarnings('ignore')

# Complex pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from complexPyTorch.complexLayers import *
from complexPyTorch.complexFunctions import *
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data

# Plot
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Load Data
import numpy as np
import json
import os
import math
import librosa
import pathlib
from scipy.spatial.distance import cdist
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import random
import pandas as pd

# MFCCS
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.signal import get_window
import librosa
import librosa.display
import IPython.display as ipd
import scipy as spp

# CV
from sklearn.model_selection import cross_val_score, KFold

In [139]:
def custom_cross_val(model, X, y, k=5):
    np.random.seed(42)
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    splits = np.array_split(indices, k)
    accuracies = []
    for i in range(k):
        test_indices = splits[i]
        train_indices = np.concatenate([splits[j] for j in range(k) if j != i])
        X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
        X_test, y_test = X.iloc[test_indices], y.iloc[test_indices] 
        model.fit(X_train.to_numpy(), y_train.to_numpy())
        y_pred = model.predict(X_test.to_numpy())
        accuracy = np.mean(y_pred == y_test.to_numpy())
        accuracies.append(accuracy)   
    return accuracies

# Create Data

In [None]:
DATASET_PATH = "Data/train"
SAMPLE_RATE = 22050
TRACK_DURATION = 30 # measured in seconds
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION
BATCH_SIZE = 32
NUM_EPOCHS = 50
genre_list = os.listdir(DATASET_PATH)
if '.DS_Store' in genre_list: genre_list.remove('.DS_Store')
genre_mappings = dict(zip(genre_list, range(len(genre_list))))
print(genre_mappings)

In [None]:
class TimeDomainFeatures:
    @staticmethod
    def amplitude_envelope(signal, frame_size, hop_length):
        res = []
        for i in range(0, len(signal), hop_length):
            cur_portion = signal[i:i + frame_size]  
            ae_val = max(cur_portion)  
            res.append(ae_val)  
        return np.array([np.nanmean(res), np.nanvar(res)])
    
    @staticmethod
    def RMS_energy(signal, frame_size, hop_length):
        res = []
        for i in range(0, len(signal), hop_length):
            cur_portion = signal[i:i + frame_size]  
            rmse_val = np.sqrt(1 / len(cur_portion) * sum(i**2 for i in cur_portion))  
            res.append(rmse_val)
        return np.array([np.nanmean(res), np.nanvar(res)])
    
    @staticmethod
    def crest_factor(signal, frame_size, hop_length):
        res = []
        for i in range(0, len(signal), hop_length):
            cur_portion = signal[i:i + frame_size]  
            rmse_val = np.sqrt(1 / len(cur_portion) * sum(i ** 2 for i in cur_portion))  
            crest_val = max(np.abs(cur_portion)) / rmse_val  
            res.append(crest_val)  
        return np.array([np.nanmean(res), np.nanvar(res)])
    
    @staticmethod
    def ZCR(signal, frame_size, hop_length):
        def num_sign_changes(signal):
            res = 0
            for i in range(0, len(signal) - 1):
                if (signal[i] * signal[i + 1] < 0): res += 1
            return res
        res = []
        for i in range(0, len(signal), hop_length):
            cur_portion = signal[i:i + frame_size]  
            zcr_val = num_sign_changes(cur_portion)
            res.append(zcr_val)  
        return np.array([np.nanmean(res), np.nanvar(res)])

## 0.1 Real Frequncy Domain

In [None]:
class FreqDomainFeatures:

    @staticmethod
    def normalize_audio(audio):
        audio = audio / np.max(np.abs(audio))
        return audio

    @staticmethod
    def compute_spectrogram(signal, frame_size, hop_length):
        signal = FreqDomainFeatures.normalize_audio(signal)
        spec = librosa.stft(signal, n_fft=frame_size, hop_length=hop_length)
        return np.abs(spec).T

    @staticmethod
    def band_energy_ratio(spec, split_freq = 2048):
        def find_split_freq_bin(spec, split_freq):
            range_of_freq = SAMPLE_RATE / 2
            change_per_bin = range_of_freq / spec.shape[0]
            split_freq_bin = split_freq / change_per_bin
            return int(np.floor(split_freq_bin))
        split_freq_bin = find_split_freq_bin(spec.T, split_freq)
        res = []
        for sub_arr in spec:
            low_freq_density = sum(i ** 2 for i in sub_arr[:split_freq_bin])
            high_freq_density = sum(i ** 2 for i in sub_arr[split_freq_bin:])
            ber_val = low_freq_density / high_freq_density
            res.append(ber_val)
        return np.array([np.nanmean(res), np.nanvar(res)])

    @staticmethod
    def spectral_centroid(spec):
        def sc(arr):
            res = 0
            for i in range(0, len(arr)):
                res += i*arr[i]
            return res/sum(arr)
        res = []
        for sub_arr in spec:
            sc_val = sc(sub_arr)
            res.append(sc_val)
        return np.array([np.nanmean(res), np.nanvar(res)])

    @staticmethod
    def spectral_bandwidth(spec):
        def sc(arr):
            res = 0
            for i in range(0, len(arr)):
                res += i*arr[i]
            return res/sum(arr)
        def sb(arr):
            res = 0
            sc_val = sc(arr)
            for i in range(0, len(arr)):
                res += (abs(i - sc_val))*arr[i]
            return res/sum(arr)
        res = []
        for sub_arr in spec:
            sb_val = sb(sub_arr)
            res.append(sb_val)
        return np.array([np.nanmean(res), np.nanvar(res)])

    @staticmethod
    def spectral_flatness(spec):
        res = []
        for sub_arr in spec:
            geom_mean = np.exp(np.log(sub_arr).mean())
            ar_mean = np.mean(sub_arr)
            sl_val = geom_mean/ar_mean
            res.append(sl_val)
        return np.array([np.nanmean(res), np.nanvar(res)])

In [182]:
class MusicFeatureExtractor:
    @staticmethod
    def normalize_audio(audio):
        audio = audio / np.max(np.abs(audio))
        return audio

    @staticmethod
    def frame_audio(audio, FFT_size=1024, HOP_SIZE=512):
        frame_num = int((len(audio) - FFT_size) / HOP_SIZE) + 1
        frames = np.zeros((frame_num, FFT_size))
        for n in range(frame_num):
            frames[n] = audio[n * HOP_SIZE: n * HOP_SIZE + FFT_size]
        return frames

    @staticmethod
    def freq_to_mel(freq):
        return 2595.0 * np.log10(1.0 + freq / 700.0)

    @staticmethod
    def met_to_freq(mels):
        return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)

    @staticmethod
    def get_filter_points(fmin, fmax, sample_rate, mel_filter_num=13, FFT_size=1024):
        fmin_mel = MusicFeatureExtractor.freq_to_mel(fmin)
        fmax_mel = MusicFeatureExtractor.freq_to_mel(fmax)
        mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num + 2)
        freqs = MusicFeatureExtractor.met_to_freq(mels)
        return np.floor((FFT_size + 1) / sample_rate * freqs).astype(int), freqs

    @staticmethod
    def get_filters(filter_points, FFT_size=1024):
        filters = np.zeros((len(filter_points) - 2, int(FFT_size / 2 + 1)))
        for n in range(len(filter_points) - 2):
            filters[n, filter_points[n]: filter_points[n + 1]] = np.linspace(0, 1, filter_points[n + 1] - filter_points[n])
            filters[n, filter_points[n + 1]: filter_points[n + 2]] = np.linspace(1, 0, filter_points[n + 2] - filter_points[n + 1])
        return filters

    @staticmethod
    def dct(dct_filter_num=40, mel_filter_num=13):
        basis = np.empty((dct_filter_num, mel_filter_num))
        basis[0, :] = 1.0 / np.sqrt(mel_filter_num)
        samples = np.arange(1, 2 * mel_filter_num, 2) * np.pi / (2.0 * mel_filter_num)
        for i in range(1, dct_filter_num):
            basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / mel_filter_num)
        return basis

    @staticmethod
    def get_mfcc_features(audio, sample_rate, FFT_size=1024, mel_filter_num=13, dct_filter_num=16, HOP_SIZE = 512):
        epsilon = 1e-10
        audio = MusicFeatureExtractor.normalize_audio(audio)
        audio_framed = MusicFeatureExtractor.frame_audio(audio, FFT_size, HOP_SIZE)
        window = get_window("hann", FFT_size, fftbins=True)
        audio_win = audio_framed * window
        audio_winT = np.transpose(audio_win)
        audio_fft = np.empty((int(1 + FFT_size // 2), audio_winT.shape[1]), dtype=np.complex64, order='F')
        for n in range(audio_fft.shape[1]):
            audio_fft[:, n] = fft.fft(audio_winT[:, n], axis=0)[:audio_fft.shape[0]]
        audio_fft = np.transpose(audio_fft)
        audio_fft = np.square(np.abs(audio_fft))
        freq_min = 0
        freq_high = sample_rate / 2
        filter_points, mel_freqs = MusicFeatureExtractor.get_filter_points(
            freq_min, freq_high, sample_rate, mel_filter_num, FFT_size)
        filters = MusicFeatureExtractor.get_filters(filter_points, FFT_size)
        audio_filtered = np.dot(filters, np.transpose(audio_fft))
        audio_filtered = np.maximum(audio_filtered, epsilon)  # Replace zero values with epsilon
        audio_log = 10.0 * np.log10(audio_filtered)
        dct_filters = MusicFeatureExtractor.dct(dct_filter_num, mel_filter_num)
        cepstral_coefficents = np.dot(dct_filters, audio_log)
        mean = np.nanmean(cepstral_coefficents, axis=1)
        var = np.nanvar(cepstral_coefficents, axis=1)
        return mean, var


In [198]:
class GenreTimeFreqDomain(Dataset):

    def __init__(self, train_path, frame_size=1024, hop_length=512, num_segments = 10, training = True):
        cur_path = pathlib.Path(train_path)
        self.files = []
        for i in list(cur_path.rglob("*.wav")):
            for j in range(num_segments):
                self.files.append([j, i])
        self.frame_size = frame_size
        self.hop_length = hop_length
        self.training = training
        self.samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
        self.num_segments = num_segments

    def apply_augmentations(self, signal):
        # Apply augmentations to the audio signal
        if random.random() < 0.5:
            signal = librosa.effects.pitch_shift(signal, sr=SAMPLE_RATE, n_steps=random.uniform(-2, 2))
        if random.random() < 0.5:
            signal = librosa.effects.time_stretch(signal, rate=random.uniform(0.8, 1.2))
        return signal
        
    def __len__(self):
        return len(self.files)

    def adj_shape(self, features):
        if features.shape[0] < 130:
            features = np.pad(features, (0, 130 - features.shape[0]), mode='constant')
        else:
            features = features[:130]
        return features

    def get_time_domain(self, cur_signal):
        ae = TimeDomainFeatures.amplitude_envelope(cur_signal, self.frame_size, self.hop_length)
        rmse = TimeDomainFeatures.RMS_energy(cur_signal, self.frame_size, self.hop_length)
        cf = TimeDomainFeatures.crest_factor(cur_signal, self.frame_size, self.hop_length)
        zcr = TimeDomainFeatures.ZCR(cur_signal, self.frame_size, self.hop_length)
        return np.concatenate([ae, rmse, cf, zcr])

    def get_freq_domain(self, cur_signal):
        spec = FreqDomainFeatures.compute_spectrogram(cur_signal, self.frame_size, self.hop_length)
        ber = FreqDomainFeatures.band_energy_ratio(spec)
        sc = FreqDomainFeatures.spectral_centroid(spec)
        sb = FreqDomainFeatures.spectral_bandwidth(spec)
        sf = FreqDomainFeatures.spectral_flatness(spec)
        return np.concatenate([ber, sc, sb, sf])

    def get_mfcc_features(self, cur_signal, sample_rate):
        mean, var = MusicFeatureExtractor.get_mfcc_features(cur_signal, sample_rate)
        return np.concatenate([mean, var])

    def __getitem__(self, idx):
        cur_file = self.files[idx]
        d = cur_file[0]
        file_path = cur_file[1]
        target = genre_mappings[str(file_path).split("/")[2]]
        signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)
        start = self.samples_per_segment * d
        finish = start + self.samples_per_segment
        cur_signal = signal[start:finish]
        # if self.training: cur_signal = self.apply_augmentations(cur_signal)
        td_features = self.get_time_domain(cur_signal)
        fd_features = self.get_freq_domain(cur_signal)
        mfcc_features = self.get_mfcc_features(cur_signal, sample_rate)
        return torch.tensor(np.concatenate([td_features.flatten(), fd_features.flatten(), mfcc_features.flatten()]),
                            dtype=torch.float32), target

In [206]:
train_dataset = GenreTimeFreqDomain("Data/train/", training = True)
test_dataset = GenreTimeFreqDomain("Data/test/", training = False)

column_names = []
for j in ["ae", "rmse", "cf", "zcr", "ber", "sc", "sb", "sf"]:
    for i in ["mean", "var"]:
        column_names.extend([f"{j}_{i}"])
for i in ["mean", "var"]:
    for j in range(1, 17):
        column_names.extend([f"mfcc_{j}_{i}"])
# Add label and set columns
column_names.extend(['label'])
print(column_names)


def create_dataframe(dataset):
    features_list = []
    labels_list = []
    for i in range(len(dataset)):
        features, label = dataset[i]
        features_list.append(features)
        labels_list.append(label)
        if i%10 == 0: print(i)
    df = pd.DataFrame(features_list) # Flatten the features
    df['label'] = labels_list
    return df

train_df = create_dataframe(train_dataset)
train_df.columns = column_names
print("-"*75)
test_df = create_dataframe(test_dataset)
test_df.columns = column_names

tr_df = train_df.applymap(lambda x: x.numpy() if hasattr(x, 'numpy') else x)
te_df = test_df.applymap(lambda x: x.numpy() if hasattr(x, 'numpy') else x)
tr_df.to_csv("Data/exp4_data/train_tff_mfcc.csv", index = False)
te_df.to_csv("Data/exp4_data/test_tff_mfcc.csv", index = False)

['ae_mean', 'ae_var', 'rmse_mean', 'rmse_var', 'cf_mean', 'cf_var', 'zcr_mean', 'zcr_var', 'ber_mean', 'ber_var', 'sc_mean', 'sc_var', 'sb_mean', 'sb_var', 'sf_mean', 'sf_var', 'mfcc_1_mean', 'mfcc_2_mean', 'mfcc_3_mean', 'mfcc_4_mean', 'mfcc_5_mean', 'mfcc_6_mean', 'mfcc_7_mean', 'mfcc_8_mean', 'mfcc_9_mean', 'mfcc_10_mean', 'mfcc_11_mean', 'mfcc_12_mean', 'mfcc_13_mean', 'mfcc_14_mean', 'mfcc_15_mean', 'mfcc_16_mean', 'mfcc_1_var', 'mfcc_2_var', 'mfcc_3_var', 'mfcc_4_var', 'mfcc_5_var', 'mfcc_6_var', 'mfcc_7_var', 'mfcc_8_var', 'mfcc_9_var', 'mfcc_10_var', 'mfcc_11_var', 'mfcc_12_var', 'mfcc_13_var', 'mfcc_14_var', 'mfcc_15_var', 'mfcc_16_var', 'label']
0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860

## 0.2 Complex Frequency Domain

In [216]:
class FreqDomainFeatures:

    @staticmethod
    def normalize_audio(audio):
        audio = audio / np.max(np.abs(audio))
        return audio

    @staticmethod
    def compute_spectrogram(signal, frame_size, hop_length):
        signal = FreqDomainFeatures.normalize_audio(signal)
        spec = librosa.stft(signal, n_fft=frame_size, hop_length=hop_length)
        return (spec).T

    @staticmethod
    def band_energy_ratio(spec, split_freq = 2048):
        def find_split_freq_bin(spec, split_freq):
            range_of_freq = SAMPLE_RATE / 2
            change_per_bin = range_of_freq / spec.shape[0]
            split_freq_bin = split_freq / change_per_bin
            return int(np.floor(split_freq_bin))
        split_freq_bin = find_split_freq_bin(spec.T, split_freq)
        res = []
        for sub_arr in spec:
            low_freq_density = sum(i ** 2 for i in sub_arr[:split_freq_bin])
            high_freq_density = sum(i ** 2 for i in sub_arr[split_freq_bin:])
            ber_val = low_freq_density / high_freq_density
            res.append(ber_val)
        return np.array([np.nanmean(res), np.nanvar(res)])

    @staticmethod
    def spectral_centroid(spec):
        def sc(arr):
            res = 0
            for i in range(0, len(arr)):
                res += i*arr[i]
            return res/sum(arr)
        res = []
        for sub_arr in spec:
            sc_val = sc(sub_arr)
            res.append(sc_val)
        return np.array([np.nanmean(res), np.nanvar(res)])

    @staticmethod
    def spectral_bandwidth(spec):
        def sc(arr):
            res = 0
            for i in range(0, len(arr)):
                res += i*arr[i]
            return res/sum(arr)
        def sb(arr):
            res = 0
            sc_val = sc(arr)
            for i in range(0, len(arr)):
                res += (np.abs(i - sc_val))*arr[i]
            return res/sum(arr)
        res = []
        for sub_arr in spec:
            sb_val = sb(sub_arr)
            res.append(sb_val)
        return np.array([np.nanmean(res), np.nanvar(res)])

    @staticmethod
    def spectral_flatness(spec):
        res = []
        for sub_arr in spec:
            geom_mean = np.exp(np.log(sub_arr).mean())
            ar_mean = np.mean(sub_arr)
            sl_val = geom_mean/ar_mean
            res.append(sl_val)
        return np.array([np.nanmean(res), np.nanvar(res)])


In [217]:
class MusicFeatureExtractor:
    @staticmethod
    def normalize_audio(audio):
        audio = audio / np.max(np.abs(audio))
        return audio

    @staticmethod
    def frame_audio(audio, FFT_size=1024, HOP_SIZE=512):
        frame_num = int((len(audio) - FFT_size) / HOP_SIZE) + 1
        frames = np.zeros((frame_num, FFT_size))
        for n in range(frame_num):
            frames[n] = audio[n * HOP_SIZE: n * HOP_SIZE + FFT_size]
        return frames

    @staticmethod
    def freq_to_mel(freq):
        return 2595.0 * np.log10(1.0 + freq / 700.0)

    @staticmethod
    def met_to_freq(mels):
        return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)

    @staticmethod
    def get_filter_points(fmin, fmax, sample_rate, mel_filter_num=13, FFT_size=1024):
        fmin_mel = MusicFeatureExtractor.freq_to_mel(fmin)
        fmax_mel = MusicFeatureExtractor.freq_to_mel(fmax)
        mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num + 2)
        freqs = MusicFeatureExtractor.met_to_freq(mels)
        return np.floor((FFT_size + 1) / sample_rate * freqs).astype(int), freqs

    @staticmethod
    def get_filters(filter_points, FFT_size=1024):
        filters = np.zeros((len(filter_points) - 2, int(FFT_size / 2 + 1)))
        for n in range(len(filter_points) - 2):
            filters[n, filter_points[n]: filter_points[n + 1]] = np.linspace(0, 1, filter_points[n + 1] - filter_points[n])
            filters[n, filter_points[n + 1]: filter_points[n + 2]] = np.linspace(1, 0, filter_points[n + 2] - filter_points[n + 1])
        return filters

    @staticmethod
    def dct(dct_filter_num=40, mel_filter_num=13):
        basis = np.empty((dct_filter_num, mel_filter_num))
        basis[0, :] = 1.0 / np.sqrt(mel_filter_num)
        samples = np.arange(1, 2 * mel_filter_num, 2) * np.pi / (2.0 * mel_filter_num)
        for i in range(1, dct_filter_num):
            basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / mel_filter_num)
        return basis

    @staticmethod
    def get_mfcc_features(audio, sample_rate, FFT_size=1024, mel_filter_num=13, dct_filter_num=16, HOP_SIZE = 512):
        epsilon = 1e-10
        audio = MusicFeatureExtractor.normalize_audio(audio)
        audio_framed = MusicFeatureExtractor.frame_audio(audio, FFT_size, HOP_SIZE)
        window = get_window("hann", FFT_size, fftbins=True)
        audio_win = audio_framed * window
        audio_winT = np.transpose(audio_win)
        audio_fft = np.empty((int(1 + FFT_size // 2), audio_winT.shape[1]), dtype=np.complex64, order='F')
        for n in range(audio_fft.shape[1]):
            audio_fft[:, n] = fft.fft(audio_winT[:, n], axis=0)[:audio_fft.shape[0]]
        audio_fft = np.transpose(audio_fft)
        freq_min = 0
        freq_high = sample_rate / 2
        filter_points, mel_freqs = MusicFeatureExtractor.get_filter_points(
            freq_min, freq_high, sample_rate, mel_filter_num, FFT_size)
        filters = MusicFeatureExtractor.get_filters(filter_points, FFT_size)
        audio_filtered = np.dot(filters, np.transpose(audio_fft))
        audio_filtered = np.maximum(audio_filtered, epsilon)  # Replace zero values with epsilon
        audio_log = 10.0 * np.log10(audio_filtered)
        dct_filters = MusicFeatureExtractor.dct(dct_filter_num, mel_filter_num)
        cepstral_coefficents = np.dot(dct_filters, audio_log)
        mean = np.nanmean(cepstral_coefficents, axis=1)
        var = np.nanvar(cepstral_coefficents, axis=1)
        return mean, var


In [221]:
class GenreTimeFreqDomain(Dataset):

    def __init__(self, train_path, frame_size=1024, hop_length=512, num_segments = 10, training = True):
        cur_path = pathlib.Path(train_path)
        self.files = []
        for i in list(cur_path.rglob("*.wav")):
            for j in range(num_segments):
                self.files.append([j, i])
        self.frame_size = frame_size
        self.hop_length = hop_length
        self.training = training
        self.samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
        self.num_segments = num_segments

    def apply_augmentations(self, signal):
        # Apply augmentations to the audio signal
        if random.random() < 0.5:
            signal = librosa.effects.pitch_shift(signal, sr=SAMPLE_RATE, n_steps=random.uniform(-2, 2))
        if random.random() < 0.5:
            signal = librosa.effects.time_stretch(signal, rate=random.uniform(0.8, 1.2))
        return signal
        
    def __len__(self):
        return len(self.files)

    def adj_shape(self, features):
        if features.shape[0] < 130:
            features = np.pad(features, (0, 130 - features.shape[0]), mode='constant')
        else:
            features = features[:130]
        return features

    def get_time_domain(self, cur_signal):
        ae = TimeDomainFeatures.amplitude_envelope(cur_signal, self.frame_size, self.hop_length)
        rmse = TimeDomainFeatures.RMS_energy(cur_signal, self.frame_size, self.hop_length)
        cf = TimeDomainFeatures.crest_factor(cur_signal, self.frame_size, self.hop_length)
        zcr = TimeDomainFeatures.ZCR(cur_signal, self.frame_size, self.hop_length)
        return np.concatenate([ae, rmse, cf, zcr])

    def get_freq_domain(self, cur_signal):
        spec = FreqDomainFeatures.compute_spectrogram(cur_signal, self.frame_size, self.hop_length)
        ber = FreqDomainFeatures.band_energy_ratio(spec)
        sc = FreqDomainFeatures.spectral_centroid(spec)
        sb = FreqDomainFeatures.spectral_bandwidth(spec)
        sf = FreqDomainFeatures.spectral_flatness(spec)
        return np.concatenate([ber, sc, sb, sf])

    def get_mfcc_features(self, cur_signal, sample_rate):
        mean, var = MusicFeatureExtractor.get_mfcc_features(cur_signal, sample_rate)
        return np.concatenate([mean, var])


    def __getitem__(self, idx):
        cur_file = self.files[idx]
        d = cur_file[0]
        file_path = cur_file[1]
        target = genre_mappings[str(file_path).split("/")[2]]
        signal, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)
        start = self.samples_per_segment * d
        finish = start + self.samples_per_segment
        cur_signal = signal[start:finish]
        # if self.training: cur_signal = self.apply_augmentations(cur_signal)
        td_features = self.get_time_domain(cur_signal)
        fd_features = self.get_freq_domain(cur_signal)
        mfcc_features = self.get_mfcc_features(cur_signal, sample_rate)
        return torch.tensor(np.concatenate([td_features.flatten(), fd_features.flatten(), mfcc_features.flatten()]), 
                            dtype=torch.complex64), target

In [None]:
train_dataset = GenreTimeFreqDomain("Data/train/", training = True)
test_dataset = GenreTimeFreqDomain("Data/test/", training = False)

column_names = []
for j in ["ae", "rmse", "cf", "zcr", "ber", "sc", "sb", "sf"]:
    for i in ["mean", "var"]:
        column_names.extend([f"{j}_{i}"])
for i in ["mean", "var"]:
    for j in range(1, 17):
        column_names.extend([f"mfcc_{j}_{i}"])
# Add label and set columns
column_names.extend(['label'])
print(column_names)

def create_dataframe(dataset):
    features_list = []
    labels_list = []
    for i in range(len(dataset)):
        features, label = dataset[i]
        features_list.append(features)
        labels_list.append(label)
        if i%10 == 0: print(i)
    df = pd.DataFrame(features_list) # Flatten the features
    df['label'] = labels_list
    return df

train_df = create_dataframe(train_dataset)
train_df.columns = column_names
print("-"*75)
test_df = create_dataframe(test_dataset)
test_df.columns = column_names

tr_df = train_df.applymap(lambda x: x.numpy() if hasattr(x, 'numpy') else x)
te_df = test_df.applymap(lambda x: x.numpy() if hasattr(x, 'numpy') else x)
tr_df.to_csv("Data/exp4_data/train_tff_mfcc_comp.csv", index = False)
te_df.to_csv("Data/exp4_data/test_tff_mfcc_comp.csv", index = False)

['ae_mean', 'ae_var', 'rmse_mean', 'rmse_var', 'cf_mean', 'cf_var', 'zcr_mean', 'zcr_var', 'ber_mean', 'ber_var', 'sc_mean', 'sc_var', 'sb_mean', 'sb_var', 'sf_mean', 'sf_var', 'mfcc_1_mean', 'mfcc_2_mean', 'mfcc_3_mean', 'mfcc_4_mean', 'mfcc_5_mean', 'mfcc_6_mean', 'mfcc_7_mean', 'mfcc_8_mean', 'mfcc_9_mean', 'mfcc_10_mean', 'mfcc_11_mean', 'mfcc_12_mean', 'mfcc_13_mean', 'mfcc_14_mean', 'mfcc_15_mean', 'mfcc_16_mean', 'mfcc_1_var', 'mfcc_2_var', 'mfcc_3_var', 'mfcc_4_var', 'mfcc_5_var', 'mfcc_6_var', 'mfcc_7_var', 'mfcc_8_var', 'mfcc_9_var', 'mfcc_10_var', 'mfcc_11_var', 'mfcc_12_var', 'mfcc_13_var', 'mfcc_14_var', 'mfcc_15_var', 'mfcc_16_var', 'label']
0
10
20
30
40
50
60
70
80
90
100
110
120


# 1. Simple Decsion Tree with Real Valued Frequency domain Features

In [207]:
tr_df = pd.read_csv("Data/exp4_data/train_tff_mfcc.csv")
te_df = pd.read_csv("Data/exp4_data/test_tff_mfcc.csv")

In [208]:
# Separate features and labels
X_train = tr_df.drop('label', axis=1)
y_train = tr_df['label']
X_test = te_df.drop('label', axis=1)
y_test = te_df['label']

In [212]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# List of models to try
models = [
    ('Logistic Regression', LogisticRegression()),
    ('SGD Classifier', SGDClassifier()),
    ('Ridge Classifier', RidgeClassifier()),
    ('Passive Aggressive Classifier', PassiveAggressiveClassifier()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Gaussian Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('Support Vector Machine', SVC()),
    ('Nu-Support Vector Machine', NuSVC()),
    ('Linear Support Vector Machine', LinearSVC()),
    ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
    ('Quadratic Discriminant Analysis', QuadraticDiscriminantAnalysis()),
    ('Voting Classifier', VotingClassifier(estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('svc', SVC())
    ]))
    # Add more models as needed
]

# Loop through models
for model_name, model in models:
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Print results
    print(f'Model: {model_name}')
    print(f'Accuracy: {accuracy}\n')


Model: Logistic Regression
Accuracy: 0.10875

Model: SGD Classifier
Accuracy: 0.0975

Model: Ridge Classifier
Accuracy: 0.49625

Model: Passive Aggressive Classifier
Accuracy: 0.156875

Model: K-Nearest Neighbors
Accuracy: 0.23375

Model: Gaussian Naive Bayes
Accuracy: 0.11125

Model: Decision Tree
Accuracy: 0.476875

Model: Random Forest
Accuracy: 0.659375

Model: AdaBoost
Accuracy: 0.389375

Model: Gradient Boosting
Accuracy: 0.63875

Model: Support Vector Machine
Accuracy: 0.100625

Model: Nu-Support Vector Machine
Accuracy: 0.105625

Model: Linear Support Vector Machine
Accuracy: 0.16625

Model: Linear Discriminant Analysis
Accuracy: 0.550625

Model: Quadratic Discriminant Analysis
Accuracy: 0.255

Model: Voting Classifier
Accuracy: 0.235625



In [213]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
    
    def is_leaf(self):
        return self.value is not None

class DecisionTree:
    def __init__(self, max_depth=100, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def _is_finished(self, depth):
        if (depth >= self.max_depth
            or self.n_class_labels == 1
            or self.n_samples < self.min_samples_split):
            return True
        return False
    
    def _entropy(self, y):
        proportions = np.bincount(y) / len(y)
        entropy = -np.sum([p * np.log2(p) for p in proportions if p > 0])
        return entropy

    def _create_split(self, X, thresh):
        left_idx = np.argwhere(X <= thresh).flatten()
        right_idx = np.argwhere(X > thresh).flatten()
        return left_idx, right_idx

    def _information_gain(self, X, y, thresh):
        parent_loss = self._entropy(y)
        left_idx, right_idx = self._create_split(X, thresh)
        n, n_left, n_right = len(y), len(left_idx), len(right_idx)

        if n_left == 0 or n_right == 0: 
            return 0
        
        child_loss = (n_left / n) * self._entropy(y[left_idx]) + (n_right / n) * self._entropy(y[right_idx])
        return parent_loss - child_loss

    def _best_split(self, X, y, features):
        split = {'score':- 1, 'feat': None, 'thresh': None}

        for feat in features:
            X_feat = X[:, feat]
            thresholds = np.unique(X_feat)
            for thresh in thresholds:
                score = self._information_gain(X_feat, y, thresh)

                if score > split['score']:
                    split['score'] = score
                    split['feat'] = feat
                    split['thresh'] = thresh

        return split['feat'], split['thresh']
    
    def _build_tree(self, X, y, depth=0):
        self.n_samples, self.n_features = X.shape
        self.n_class_labels = len(np.unique(y))

        # stopping criteria
        if self._is_finished(depth):
            most_common_Label = np.argmax(np.bincount(y))
            return Node(value=most_common_Label)

        # get best split
        rnd_feats = np.random.choice(self.n_features, self.n_features, replace=False)
        best_feat, best_thresh = self._best_split(X, y, rnd_feats)

        # grow children recursively
        left_idx, right_idx = self._create_split(X[:, best_feat], best_thresh)
        left_child = self._build_tree(X[left_idx, :], y[left_idx], depth + 1)
        right_child = self._build_tree(X[right_idx, :], y[right_idx], depth + 1)
        return Node(best_feat, best_thresh, left_child, right_child)
    
    def _traverse_tree(self, x, node):
        if node.is_leaf():
            return node.value
        
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def predict(self, X):
        predictions = [self._traverse_tree(x, self.root) for x in X]
        return np.array(predictions)

In [214]:
np.random.seed(42)
model = DecisionTree(max_depth=10)
model.fit(X_train.to_numpy(), y_train.to_numpy())
y_pred = model.predict(X_test.to_numpy())
accuracy = accuracy_score(y_test.to_numpy(), y_pred)
print(f'Accuracy: {accuracy}\n')

Accuracy: 0.4525



In [215]:
# CV:
merged_df = pd.concat([tr_df, te_df], axis=0)
X = merged_df.drop('label', axis=1)
y = merged_df['label']
np.random.seed(42)
model = DecisionTree(max_depth=10)
cv_results = custom_cross_val(model, X, y, k=5)
for i, acc in enumerate(cv_results):
    print(f'Fold {i+1} Accuracy: {acc}')
print(f'Mean Accuracy: {np.mean(cv_results)}')

Fold 1 Accuracy: 0.6111111111111112
Fold 2 Accuracy: 0.6176176176176176
Fold 3 Accuracy: 0.6101101101101101
Fold 4 Accuracy: 0.6196196196196196
Fold 5 Accuracy: 0.6071071071071071
Mean Accuracy: 0.6131131131131131


# 2. Simple Decision Tree with Complex Valued Frequency Domain Features

In [172]:
tr_df = pd.read_csv("Data/exp4_data/train_tff_mfcc_comp.csv")
te_df = pd.read_csv("Data/exp4_data/test_tff_mfcc_comp.csv")

def df_csv_complex(df):
    result_df = df.copy()  # Make a copy to avoid modifying the original DataFrame
    result_df.iloc[:, :-1] = df.iloc[:, :-1].apply(lambda col: col.apply(
        lambda val: torch.tensor((complex(val.strip('()'))), dtype=torch.complex64) ))
    return result_df

tr_df = df_csv_complex(tr_df)
te_df = df_csv_complex(te_df)
tr_df = train_df.applymap(lambda x: x.numpy() if hasattr(x, 'numpy') else x)
te_df = test_df.applymap(lambda x: x.numpy() if hasattr(x, 'numpy') else x)

In [141]:
# Separate features and labels
X_train = tr_df.drop('label', axis=1)
y_train = tr_df['label']
X_test = te_df.drop('label', axis=1)
y_test = te_df['label']

## 2.1 Compare only real

In [145]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
    
    def is_leaf(self):
        return self.value is not None

class DecisionTree:
    def __init__(self, max_depth=100, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def _is_finished(self, depth):
        if (depth >= self.max_depth
            or self.n_class_labels == 1
            or self.n_samples < self.min_samples_split):
            return True
        return False
    
    def _entropy(self, y):
        proportions = np.bincount(y) / len(y)
        entropy = -np.sum([p * np.log2(p) for p in proportions if p > 0])
        return entropy

    def _create_split(self, X, thresh):
        left_idx = np.argwhere(X <= thresh).flatten()
        right_idx = np.argwhere(X > thresh).flatten()
        return left_idx, right_idx

    def _information_gain(self, X, y, thresh):
        parent_loss = self._entropy(y)
        left_idx, right_idx = self._create_split(X, thresh)
        n, n_left, n_right = len(y), len(left_idx), len(right_idx)

        if n_left == 0 or n_right == 0: 
            return 0
        
        child_loss = (n_left / n) * self._entropy(y[left_idx]) + (n_right / n) * self._entropy(y[right_idx])
        return parent_loss - child_loss

    def _best_split(self, X, y, features):
        split = {'score':- 1, 'feat': None, 'thresh': None}

        for feat in features:
            X_feat = X[:, feat]
            thresholds = np.unique(X_feat)
            for thresh in thresholds:
                score = self._information_gain(X_feat, y, thresh)

                if score > split['score']:
                    split['score'] = score
                    split['feat'] = feat
                    split['thresh'] = thresh

        return split['feat'], split['thresh']
    
    def _build_tree(self, X, y, depth=0):
        self.n_samples, self.n_features = X.shape
        self.n_class_labels = len(np.unique(y))

        # stopping criteria
        if self._is_finished(depth):
            most_common_Label = np.argmax(np.bincount(y))
            return Node(value=most_common_Label)

        # get best split
        rnd_feats = np.random.choice(self.n_features, self.n_features, replace=False)
        best_feat, best_thresh = self._best_split(X, y, rnd_feats)

        # grow children recursively
        left_idx, right_idx = self._create_split(X[:, best_feat], best_thresh)
        left_child = self._build_tree(X[left_idx, :], y[left_idx], depth + 1)
        right_child = self._build_tree(X[right_idx, :], y[right_idx], depth + 1)
        return Node(best_feat, best_thresh, left_child, right_child)
    
    def _traverse_tree(self, x, node):
        if node.is_leaf():
            return node.value
        
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def predict(self, X):
        predictions = [self._traverse_tree(x, self.root) for x in X]
        return np.array(predictions)

In [68]:
np.random.seed(42)
model = DecisionTree(max_depth=10)
model.fit(X_train.to_numpy(), y_train.to_numpy())
y_pred = model.predict(X_test.to_numpy())
accuracy = accuracy_score(y_test.to_numpy(), y_pred)
print(f'Accuracy: {accuracy}\n')

Accuracy: 0.4575



In [175]:
# CV:
merged_df = pd.concat([tr_df, te_df], axis=0)
X = merged_df.drop('label', axis=1)
y = merged_df['label']
np.random.seed(42)
model = DecisionTree(max_depth=10)
cv_results = custom_cross_val(model, X, y, k=5)
for i, acc in enumerate(cv_results):
    print(f'Fold {i+1} Accuracy: {acc}')
print(f'Mean Accuracy: {np.mean(cv_results)}')

Fold 1 Accuracy: 0.5680680680680681
Fold 2 Accuracy: 0.5640640640640641
Fold 3 Accuracy: 0.5570570570570571
Fold 4 Accuracy: 0.561061061061061
Fold 5 Accuracy: 0.561061061061061
Mean Accuracy: 0.5622622622622624


## 2.2 Compare only magnitude

In [176]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
    
    def is_leaf(self):
        return self.value is not None

class DecisionTree:
    def __init__(self, max_depth=100, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def _is_finished(self, depth):
        if (depth >= self.max_depth
            or self.n_class_labels == 1
            or self.n_samples < self.min_samples_split):
            return True
        return False
    
    def _entropy(self, y):
        proportions = np.bincount(y) / len(y)
        entropy = -np.sum([p * np.log2(p) for p in proportions if p > 0])
        return entropy

    def _create_split(self, X, thresh):
        left_idx = np.argwhere(np.abs(X) <= np.abs(thresh)).flatten()
        right_idx = np.argwhere(np.abs(X) > np.abs(thresh)).flatten()
        return left_idx, right_idx

    def _information_gain(self, X, y, thresh):
        parent_loss = self._entropy(y)
        left_idx, right_idx = self._create_split(X, thresh)
        n, n_left, n_right = len(y), len(left_idx), len(right_idx)

        if n_left == 0 or n_right == 0: 
            return 0
        
        child_loss = (n_left / n) * self._entropy(y[left_idx]) + (n_right / n) * self._entropy(y[right_idx])
        return parent_loss - child_loss

    def _best_split(self, X, y, features):
        split = {'score':- 1, 'feat': None, 'thresh': None}

        for feat in features:
            X_feat = X[:, feat]
            thresholds = np.unique(X_feat)
            for thresh in thresholds:
                score = self._information_gain(X_feat, y, thresh)

                if np.abs(score) > np.abs(split['score']) if split['score'] != -1 else -1:
                    split['score'] = score
                    split['feat'] = feat
                    split['thresh'] = thresh

        return split['feat'], split['thresh']
    
    def _build_tree(self, X, y, depth=0):
        self.n_samples, self.n_features = X.shape
        self.n_class_labels = len(np.unique(y))

        # stopping criteria
        if self._is_finished(depth):
            most_common_Label = np.argmax(np.bincount(y))
            return Node(value=most_common_Label)

        # get best split
        rnd_feats = np.random.choice(self.n_features, self.n_features, replace=False)
        best_feat, best_thresh = self._best_split(X, y, rnd_feats)

        # grow children recursively
        left_idx, right_idx = self._create_split(X[:, best_feat], best_thresh)
        left_child = self._build_tree(X[left_idx, :], y[left_idx], depth + 1)
        right_child = self._build_tree(X[right_idx, :], y[right_idx], depth + 1)
        return Node(best_feat, best_thresh, left_child, right_child)
    
    def _traverse_tree(self, x, node):
        if node.is_leaf():
            return node.value
        
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def predict(self, X):
        predictions = [self._traverse_tree(x, self.root) for x in X]
        return np.array(predictions)

In [94]:
np.random.seed(42)
model = DecisionTree(max_depth=10)
model.fit(X_train.to_numpy(), y_train.to_numpy())
y_pred = model.predict(X_test.to_numpy())
accuracy = accuracy_score(y_test.to_numpy(), y_pred)
print(f'Accuracy: {accuracy}\n')

Accuracy: 0.3975



In [177]:
# CV:
merged_df = pd.concat([tr_df, te_df], axis=0)
X = merged_df.drop('label', axis=1)
y = merged_df['label']
np.random.seed(42)
model = DecisionTree(max_depth=10)
cv_results = custom_cross_val(model, X, y, k=5)
for i, acc in enumerate(cv_results):
    print(f'Fold {i+1} Accuracy: {acc}')
print(f'Mean Accuracy: {np.mean(cv_results)}')

Fold 1 Accuracy: 0.47147147147147145
Fold 2 Accuracy: 0.45245245245245247
Fold 3 Accuracy: 0.47097097097097096
Fold 4 Accuracy: 0.476976976976977
Fold 5 Accuracy: 0.4954954954954955
Mean Accuracy: 0.47347347347347346
