In [26]:
import os

import audioread
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Directory holding the wav files; the trailing slash lets it be used as a plain string prefix
ROOT_DIR = 'cats_dogs/'
# CSV listing the wav file names, read with pandas; the loader expects the columns
# 'train_cat', 'train_dog', 'test_cat' and 'test_dog'
CSV_PATH = 'train_test_split.csv'

In [3]:
def load_audio_data(csv_path=None, root_dir=None, sr=16000):
    """
    Read every audio file listed in the split CSV.

    Parameters
    ----------
    csv_path : str, optional
        CSV with the columns 'train_cat', 'train_dog', 'test_cat' and 'test_dog',
        each holding wav file names. Defaults to the module-level CSV_PATH.
    root_dir : str, optional
        Directory containing the wav files. Defaults to the module-level ROOT_DIR.
    sr : int or None, optional
        Sampling rate passed to librosa.load (16000 by default, as before).

    Returns
    -------
    tuple of five lists, aligned by position
        (file_names, time_series, sampling_rates, channels, durations), where
        durations are in seconds, computed as len(time_series) / sampling_rate.

    Raises
    ------
    KeyError
        If one of the four expected columns is missing from the CSV.
    """
    # Resolve defaults at call time so re-running the config cell takes effect
    csv_path = CSV_PATH if csv_path is None else csv_path
    root_dir = ROOT_DIR if root_dir is None else root_dir

    df = pd.read_csv(csv_path)
    file_names, time_series, sampling_rates, channels, durations = [], [], [], [], []

    for column in ('train_cat', 'train_dog', 'test_cat', 'test_dog'):
        # Columns can have different lengths, so the shorter ones are NaN-padded
        for file_name in df[column].dropna():
            # os.path.join works whether or not root_dir ends with a separator
            path = os.path.join(root_dir, file_name)
            file_names.append(file_name)

            # Read the samples and the sampling rate librosa actually used
            samples, loaded_sr = librosa.load(path, sr=sr)
            time_series.append(samples)
            sampling_rates.append(loaded_sr)

            # Duration of the clip in seconds
            durations.append(len(samples) / loaded_sr)

            # Count number of channels within audio, as reported by audioread
            with audioread.audio_open(path) as input_file:
                channels.append(input_file.channels)

    return file_names, time_series, sampling_rates, channels, durations

def extract_mel_features(time_series, sr=16000):
    """
    Calculate the mel-frequency features of the dataset.

    Parameters
    ----------
    time_series : iterable
        Audio signals, one per clip, as returned by load_audio_data.
    sr : int, optional
        Sampling rate of the signals. Defaults to 16000, the rate the clips
        are loaded at elsewhere in this notebook.

    Returns
    -------
    tuple of three lists, aligned with `time_series`
        (freq, mfccs, delta_mfcc): the mel spectrogram of each clip, the MFCCs
        derived from it, and the output of librosa.feature.delta on those MFCCs.
    """
    freq, mfccs, delta_mfcc = [], [], []

    for ts in time_series:
        # Mel spectrogram of the clip
        mel = librosa.feature.melspectrogram(y=ts, sr=sr)
        freq.append(mel)

        # To find MFCCs, the spectrogram is converted to dB first. The same `sr`
        # is passed as above (the original had the typo sr=1600 here).
        mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel), sr=sr)
        mfccs.append(mfcc)

        # Delta (time-difference) features of the MFCCs, librosa default settings
        delta_mfcc.append(librosa.feature.delta(mfcc))

    return freq, mfccs, delta_mfcc

# Load every clip listed in the split CSV, then derive its mel-based features
names, ts, sr, channels, durations = load_audio_data()
fr, mfccs, mfcc_deltas = extract_mel_features(ts)

# One row per wav file; the key order below fixes the column order of the frame
audio = pd.DataFrame({
    'file': names,
    'data': ts,
    'sample rate': sr,
    'channel count': channels,
    'duration': durations,
    'frequency': fr,
    'mfccs': mfccs,
    'delta mfcc': mfcc_deltas,
})
audio.head()

Unnamed: 0,file,data,sample rate,channel count,duration,frequency,mfccs,delta mfcc
0,cat_99.wav,"[-0.07397461, -0.05130005, -0.0053710938, 0.04...",16000,1,12.0,"[[0.2109778, 0.052674465, 0.00015328123, 0.000...","[[-95.159546, -93.86674, -102.64034, -89.75827...","[[-1.6581222, -1.6581222, -1.6581222, -1.65812..."
1,cat_54.wav,"[0.014190674, 0.016448975, 0.015899658, 0.0152...",16000,1,1.0845,"[[0.0013635816, 0.0035894401, 0.0044356594, 0....","[[-243.6483, -132.19197, -60.69829, -31.834412...","[[22.241482, 22.241482, 22.241482, 22.241482, ..."
2,cat_34.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",16000,1,8.748,"[[6.644849e-07, 4.310075e-05, 0.0051410776, 0....","[[-497.22916, -430.5966, -296.09723, -273.6029...","[[17.203953, 17.203953, 17.203953, 17.203953, ..."
3,cat_132.wav,"[0.0077819824, 0.009796143, 0.0093688965, 0.00...",16000,1,15.192,"[[0.00078292354, 0.0033641942, 0.006507087, 0....","[[-240.26288, -196.23723, -154.42952, -116.919...","[[14.193284, 14.193284, 14.193284, 14.193284, ..."
4,cat_124.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",16000,1,1.08,"[[1.0567337e-06, 0.022469442, 0.41236275, 0.83...","[[-586.92194, -506.6404, -310.1259, -185.72336...","[[51.336784, 51.336784, 51.336784, 51.336784, ..."


In [25]:
audio['mfccs'][0].shape[0]  # Number of MFCC coefficient rows computed for the first clip

20

In [None]:
# TODO: onset detection is not implemented yet; it requires more research


In [24]:
#Adding the class label to the dataframe
def add_class(row):
    """
    Derive the class label from a wav file name.

    Parameters
    ----------
    row : str
        File name of a clip, e.g. 'cat_99.wav'. Despite the parameter name this
        is the value of the 'file' column, not a whole DataFrame row.

    Returns
    -------
    str or None
        'cat' if the name contains 'cat', 'dog' if it contains 'dog'
        ('cat' takes precedence when both occur), otherwise None.
    """
    if 'cat' in row:
        return 'cat'
    if 'dog' in row:
        # The original returned the typo 'row' here, mislabelling every dog clip
        return 'dog'
    return None
# Label each clip from its file name; mapping over the single 'file' column
# avoids building a row object per clip and yields an empty Series on an empty frame
audio['label'] = audio['file'].map(add_class)
audio.head()

Unnamed: 0,file,data,sample rate,channel count,duration,frequency,mfccs,delta mfcc,label
0,cat_99.wav,"[-0.07397461, -0.05130005, -0.0053710938, 0.04...",16000,1,12.0,"[[0.2109778, 0.052674465, 0.00015328123, 0.000...","[[-95.159546, -93.86674, -102.64034, -89.75827...","[[-1.6581222, -1.6581222, -1.6581222, -1.65812...",cat
1,cat_54.wav,"[0.014190674, 0.016448975, 0.015899658, 0.0152...",16000,1,1.0845,"[[0.0013635816, 0.0035894401, 0.0044356594, 0....","[[-243.6483, -132.19197, -60.69829, -31.834412...","[[22.241482, 22.241482, 22.241482, 22.241482, ...",cat
2,cat_34.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",16000,1,8.748,"[[6.644849e-07, 4.310075e-05, 0.0051410776, 0....","[[-497.22916, -430.5966, -296.09723, -273.6029...","[[17.203953, 17.203953, 17.203953, 17.203953, ...",cat
3,cat_132.wav,"[0.0077819824, 0.009796143, 0.0093688965, 0.00...",16000,1,15.192,"[[0.00078292354, 0.0033641942, 0.006507087, 0....","[[-240.26288, -196.23723, -154.42952, -116.919...","[[14.193284, 14.193284, 14.193284, 14.193284, ...",cat
4,cat_124.wav,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",16000,1,1.08,"[[1.0567337e-06, 0.022469442, 0.41236275, 0.83...","[[-586.92194, -506.6404, -310.1259, -185.72336...","[[51.336784, 51.336784, 51.336784, 51.336784, ...",cat
