In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import os

import librosa
import librosa.display

import seaborn as sns
sns.set_style('whitegrid')

import math

import warnings
warnings.filterwarnings("ignore")

import IPython

import sklearn
from sklearn import preprocessing
from scipy.signal import lfilter
from scipy.stats import kurtosis
from scipy.stats import skew

from tqdm import tqdm

In [2]:
# Define the relative path to the data
path = "../"

# Define the path to the audio files stored according to genres
path_audio_files = path + "Data/genres_original/"

# Parameters related to Fast Fourier Transform (FFT) and hop length for audio processing
hop_length = 512 # Number of samples between successive frames
n_fft = 2048  # Number of FFTs (Fast Fourier Transforms) per frame

# Dictionary to map the genre to an integer for encoding
genre_dict = {"blues": 0, "classical": 1, "country": 2, "disco": 3,
              "hiphop": 4, "jazz": 5, "metal": 6, "pop": 7, "reggae": 8, "rock": 9}

In [3]:
def extract_features(y,sr=22050,n_fft=2048,hop_length=512):
    features = {
        'stft': np.abs(librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length)).ravel(), # Audio signal
        'chroma_stft' : librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length).ravel(), # Frequency domain
        'rmse': librosa.feature.rms(y=y, frame_length=n_fft, hop_length=hop_length).ravel(), # Time domain
        'centroid': librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length).ravel(), # Frequency domain
        'bandwidth': librosa.feature.spectral_bandwidth(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length).ravel(), # Frequency domain
        'rolloff': librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length).ravel(), # Frequency domain
        'zcr': librosa.feature.zero_crossing_rate(y=y, frame_length=n_fft, hop_length=hop_length).ravel(), # Time domain
        'flux': librosa.onset.onset_strength(y=y, sr=sr).ravel(),
        'contrast': librosa.feature.spectral_contrast(y=y, sr=sr).ravel(), # Frequency domain        
        'flatness': librosa.feature.spectral_flatness(y=y, n_fft=n_fft, hop_length=hop_length).ravel()
    } 

    # MFCC treatment
    mfcc = librosa.feature.mfcc(y=y, n_fft=n_fft, hop_length=hop_length, n_mfcc=20) # Frequency domain                                       
    for idx, v_mfcc in enumerate(mfcc):
        features['mfcc_{}'.format(idx)] = v_mfcc.ravel()

    # Get statistics from the vectors
    def get_feature_stats(features):
        result = {}
        for k, v in features.items():
            result['{}_max'.format(k)] = np.max(v)
            result['{}_min'.format(k)] = np.min(v)
            result['{}_mean'.format(k)] = np.mean(v)
            result['{}_std'.format(k)] = np.std(v)
            result['{}_kurtosis'.format(k)] = kurtosis(v)
            result['{}_skew'.format(k)] = skew(v)
        return result

    dict_agg_features = get_feature_stats(features)
    dict_agg_features['tempo'] = librosa.beat.tempo(y=y,sr=sr,hop_length=hop_length)[0] # Time domain

    return dict_agg_features


def make_train_data(path):
    arr_features = []
    genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
    for idx, genre in tqdm(enumerate(genres), total=len(genres)):
        for fname in os.listdir(path + genre):
            if fname == 'jazz.00054.wav':
                continue
            y, sr = librosa.load(path + genre + '/' + fname, duration=30)
            dict_features = {'filename': fname, 'length': len(y)}
            dict_features.update(extract_features(y=y, sr=sr))
            dict_features['label'] = genre
            arr_features.append(dict_features)

    df = pd.DataFrame(data=arr_features)
    print(df.head())
    print(df.shape)
    df.to_csv('train_data.csv',index=False)


def make_test_data(path):
    arr_features = []
    genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
    for fname in tqdm(os.listdir(path), total=10*len(genres)):
        y, sr = librosa.load(path + fname, duration=30)
        dict_features = extract_features(y=y, sr=sr)
        dict_features['label'] = genres.index(fname.split('.')[0])
        arr_features.append(dict_features)

    df = pd.DataFrame(data=arr_features)
    print(df.head())
    print(df.shape)
    df.to_csv('test_data.csv',index=False)

In [4]:
make_train_data(path_audio_files)
# make_test_data(path_test_audio_files)

100%|██████████| 10/10 [26:26<00:00, 158.64s/it]


          filename  length    stft_max      stft_min  stft_mean  stft_std  \
0  blues.00060.wav  661500  146.837738  2.000563e-08   1.851225  5.361300   
1  blues.00071.wav  661500   87.857414  7.256205e-07   1.264366  3.129830   
2  blues.00059.wav  661500  214.587540  3.878479e-08   1.594532  4.784221   
3  blues.00004.wav  661500  174.341003  1.085686e-08   0.871003  2.727628   
4  blues.00085.wav  661500  245.389069  6.348434e-08   0.916177  4.383680   

   stft_kurtosis  stft_skew  chroma_stft_max  chroma_stft_min  ...  \
0     124.340251   9.279234              1.0         0.004526  ...   
1      99.487005   8.098578              1.0         0.002236  ...   
2     231.757143  11.826159              1.0         0.003591  ...   
3     189.011689  10.310731              1.0         0.001347  ...   
4     267.913628  13.212771              1.0         0.001842  ...   

   mfcc_18_kurtosis  mfcc_18_skew  mfcc_19_max  mfcc_19_min  mfcc_19_mean  \
0         -0.264349      0.307678    32