# Database creation

Read the audio files and create a database.

In [None]:
import os
import re
import numpy as np
import pandas as pd

# Root folder of the IRMAS training set -- set your own.
path = 'IRMAS-TrainingData'

# A run of bracketed tags inside a file name, e.g. "[abc][xyz]".
label_pattern = re.compile(r'(\[([a-zA-Z_]+)\])+')

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
records = []

with os.scandir(path) as class_dirs:
    # scandir yields entries in arbitrary order; sort so the row order (and
    # therefore the index) is the same on every run.
    for class_dir in sorted(class_dirs, key=lambda entry: entry.name):
        if not class_dir.is_dir():
            continue

        with os.scandir(class_dir.path) as pieces:
            for piece in sorted(pieces, key=lambda entry: entry.name):
                m = label_pattern.search(piece.name)
                if m is None:
                    # No bracketed tags in the name (e.g. a stray non-audio
                    # file): skip it instead of failing on m.group().
                    continue

                # "[a][b][c]" -> ['a', 'b', 'c']
                labels = m.group()[1:-1].split('][')

                records.append({
                    # First tag is the predominant label, last tag the genre.
                    'predominant': labels[0],
                    'genre': labels[-1],
                    # The middle tag is only present when there are three tags.
                    'drum': labels[1] if len(labels) > 2 else None,
                    'file': piece.path,
                })

df = pd.DataFrame(records, columns=['predominant', 'genre', 'drum', 'file'])

df

Functions for getting features from audio files

In [None]:
from pymir import AudioFile

def getPYMIR(file):
    """Compute pymir descriptors for the first frame of an audio file.

    Parameters
    ----------
    file : path passed to ``AudioFile.open``.

    Returns
    -------
    dict
        The keys "ZCR", "SCrest", "SCentroid", "SKurtosis", "SMean",
        "SRolloff", "SVariance" and "SSkewness", followed by one
        "MFC<i>" entry per value yielded by ``spectrum.mfcc2()``.
    """
    audio = AudioFile.open(file)

    # Only the first 132300-sample frame is analysed.
    # NOTE(review): 132300 presumably means 3 s at 44.1 kHz -- confirm.
    first_frame = audio.frames(132300)[0]
    spec = first_frame.spectrum()

    # Cepstral values keyed by their position in the mfcc2() output.
    cepstral = {f'MFC{idx}': coeff for idx, coeff in enumerate(spec.mfcc2())}

    descriptors = {
        "ZCR": first_frame.zcr(),
        "SCrest": spec.crest(),
        "SCentroid": spec.centroid(),
        "SKurtosis": spec.kurtosis(),
        "SMean": spec.mean(),
        "SRolloff": spec.rolloff(),
        "SVariance": spec.variance(),
        "SSkewness": spec.skewness(),
    }
    # Cepstral entries come last.
    descriptors.update(cepstral)
    return descriptors

In [None]:
import pyACA
import timbral_models
import soundfile as sf
import pyloudnorm as pyln

def getPYACA(file):
    """Compute pyACA, loudness and timbral descriptors for one audio file.

    Parameters
    ----------
    file : path of the audio file; it is handed to ``sf.read``,
        ``pyACA.ToolReadAudio`` and ``timbral_models.timbral_extractor``.

    Returns
    -------
    dict
        "SSlope" (first value of the SpectralSlope feature), "TPE00",
        "TPE01", "TPE10", "TPE11" (the [0][0], [0][1], [1][0], [1][1]
        entries of the TimePeakEnvelope feature), "Loudness" (integrated
        loudness from the pyloudnorm meter), followed by every entry
        returned by ``timbral_models.timbral_extractor``.
    """
    data, rate = sf.read(file)
    meter = pyln.Meter(rate)
    loudness = meter.integrated_loudness(data)

    # Decode the file once for pyACA; both features below share the samples
    # instead of re-reading the file for every feature.
    [f_s, afAudioData] = pyACA.ToolReadAudio(file)

    def computeFeatureC2(cFeatureName):
        # Block length and hop length are both 132300 samples; no window given.
        # (The former optional plotting branch referenced an undefined `plt`
        # and was never enabled, so it has been dropped.)
        [v, t] = pyACA.computeFeature(cFeatureName, afAudioData, f_s, None, 132300, 132300)
        return v

    tpe = computeFeatureC2("TimePeakEnvelope")
    # NOTE(review): indexing tpe[.][1] assumes the feature yields at least two
    # columns for this block/hop size -- confirm against the pyACA version in use.

    features = {
        "SSlope": computeFeatureC2("SpectralSlope")[0],
        "TPE00": tpe[0][0],
        "TPE01": tpe[0][1],
        "TPE10": tpe[1][0],
        "TPE11": tpe[1][1],
        "Loudness": loudness,
    }
    # Timbral entries are merged last, so they win on any key clash.
    features.update(timbral_models.timbral_extractor(file, verbose=False))
    return features

In [None]:
def getFeature(file):
    """Return the combined feature dict for one audio file.

    The getPYMIR entries come first; getPYACA entries are merged afterwards
    and take precedence if both produce the same key.
    """
    combined = dict(getPYMIR(file))
    combined.update(getPYACA(file))
    return combined

Create a new CSV. This will take some time.

In [None]:
# Keep rows whose genre is 'cla' and whose predominant label is not 'gel'.
classical = df[(df.genre=='cla') & (df.predominant!='gel')]

# One feature record per selected file. The records are gathered in a list and
# converted in a single step: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole frame on every iteration.
records = [
    {'instrument': row.predominant, **getFeature(row.file)}
    for row in classical.itertuples(index=False)
]
newdf = pd.DataFrame(records)

In [None]:
# Persist the feature table; index=True also writes the row index to the CSV.
# NOTE(review): the later NN cell writes to this same 'data.csv' path -- confirm
# that overwriting this file is intended.
newdf.to_csv('data.csv', index=True)

# For NN
Neural networks can handle more data. The idea is to split each audio file into many short frames and compute every feature (spectral flux, centroid, etc.) per frame, so that each feature becomes a 1D time series that can be used as network input.

In [None]:
from collections import defaultdict
from pymir import SpectralFlux

def getPYMIRbetter(file, samples):
    """Compute per-frame pymir descriptors for one audio file.

    Parameters
    ----------
    file : path passed to ``AudioFile.open``.
    samples : divisor applied to 132300 to obtain the frame length,
        i.e. each frame has ``int(132300/samples)`` samples.

    Returns
    -------
    dict of numpy arrays with one entry per frame:
        "ZCR", "SCrest", "SCentroid", "SKurtosis", "SMean", "SRolloff",
        "SVariance", "SSkewness", "MFCD" (first value of ``mfcc2()`` for
        each frame) and "SFlux" (rectified spectral flux over the frames'
        spectra).
    """
    audio = AudioFile.open(file)

    frame_length = int(132300/samples)
    chunks = audio.frames(frame_length)
    spectra = [chunk.spectrum() for chunk in chunks]

    features = {"ZCR": np.asarray([chunk.zcr() for chunk in chunks])}

    # Scalar spectral descriptors: output key -> spectrum method name.
    scalar_descriptors = (
        ("SCrest", "crest"),
        ("SCentroid", "centroid"),
        ("SKurtosis", "kurtosis"),
        ("SMean", "mean"),
        ("SRolloff", "rolloff"),
        ("SVariance", "variance"),
        ("SSkewness", "skewness"),
    )
    for key, method_name in scalar_descriptors:
        values = []
        for one_spectrum in spectra:
            values.append(float(getattr(one_spectrum, method_name)()))
        features[key] = np.asarray(values)

    # Only the first mfcc2() value of every frame is kept.
    features["MFCD"] = np.asarray([one_spectrum.mfcc2()[0] for one_spectrum in spectra])
    features["SFlux"] = np.asarray(SpectralFlux.spectralFlux(spectra, rectify = True))
    return features

Create a new CSV with the per-frame features. This will take some time.

In [None]:
# Keep rows whose genre is 'cla' and whose predominant label is not 'gel'.
classical = df[(df.genre=='cla') & (df.predominant!='gel')]

# The audio file is split into 30 samples
smpls = 30

# Gather one record per file and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
records = []
for index, row in classical.iterrows():
    records.append({'instrument': row.predominant, **getPYMIRbetter(row.file, smpls)})
    # Progress indicator: index label of the row just processed.
    print(index)

newdf = pd.DataFrame(records)

In [None]:
# Persist the per-frame feature table; index=True also writes the row index.
# NOTE(review): this reuses the 'data.csv' path written by the earlier feature
# cell, and the feature columns hold NumPy arrays (see getPYMIRbetter), which
# are presumably written as their text form -- verify the file can be parsed
# back as needed.
newdf.to_csv('data.csv', index=True)