# <u> Detecting Kermit the Frog (Audio) </u>

# Convert and extract raw data

In [1]:
import subprocess
import librosa

def load(path, target):
    command = "ffmpeg -i" + path + "-ab 160k -ac 2 -ar 44100 -vn" + target
    subprocess.call(command, shell=True) 
    return librosa.load(target)
    
raw, sample_rate = load("../data/Muppets-02-01-01.avi", "../data/ep1_audio.wav")
raw.shape, sample_rate

((34107999,), 22050)

In [2]:
import pandas as pd

# load ground truth
truth_csv = pd.read_csv('../data/gt/gt_02_01_01.csv')
truth = truth_csv.kermit_audio
truth.shape

(1560,)

In [3]:
# get second or timeframe from raw waveform
def sec(sec, raw_wave=raw, sr=22050):
    if type(sec) == int:
        return raw_wave[sec*sr:(sec+1)*sr]
    elif type(sec) == list and len(sec) == 1:
        return raw_wave[sec[0]*sr:(sec[0]+1)*sr]
    elif type(sec) == list and len(sec) == 2:
        return raw_wave[sec[0]*sr:(sec[1]+1)*sr]

# Create Feature Vectors

In [4]:
import statistics as stat
import numpy as np

def max_mfcc(wave_form, sr=22050):
    features = []
    for i in range(round(len(wave_form)/sr)):
        mfcc = librosa.feature.mfcc(sec(i, wave_form))
        # pick maximal value for each DCT dimension
        features.append([max(dim) for dim in mfcc])
    return features

def all_mfcc(wave_form, sr=22050, flat=False):
    features = []
    for i in range(round(len(wave_form)/sr)):
        mfcc = librosa.feature.mfcc(sec(i, wave_form))
        d_mfcc = librosa.feature.delta(mfcc)
        d2_mfcc = librosa.feature.delta(d_mfcc, order=2)

        features.append(np.concatenate((mfcc, d_mfcc, d2_mfcc), axis=0))
    
    # complete last entry
    fulld = features[0].shape[1]
    fill = fulld - features[-1].shape[1]
    features[-1] = np.concatenate((features[-1],np.zeros((features[0].shape[0],fill))), axis=1)
    
    # stack all of them on top of each other
    features = np.stack(features)
    
    if flat:
        nsamples, nx, ny = features.shape
        return features.reshape((nsamples,nx*ny))
    else:
        return features
    
features = all_mfcc(raw, flat=True)
features.shape

(1547, 2640)

# Prepare Test Data

In [7]:
# extract features
test_raw, test_sr = load('../data/Muppets-02-04-04.avi', '../data/ep2_audio.wav')
test_features = all_mfcc(test_raw, flat=True)
test_features.shape

(1548, 2640)

In [8]:
# load ground truth
test_truth_csv = pd.read_csv('../data/gt/gt_02_04_04.csv.csv')
test_truth = truth_csv.kermit_audio[:len(test_features)]
test_truth.shape

(1548,)

# Multilayer Perceptron

In [17]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.metrics import precision_score as precision

clf = MLP(random_state=1, hidden_layer_sizes=(1000, 200, 50)).fit(features, truth[:len(features)])

# train accuracy
print('training precision', precision(truth[:len(features)], clf.predict(features)))
# test accuracy
print('testing precision',precision(test_truth, clf.predict(test_features)))

training precision 1.0
testing precision 0.10416666666666667


# TPOT

In [21]:
from tpot import TPOTClassifier

pipeline_optimizer = TPOTClassifier(
    generations=5, 
    population_size=20, 
    cv=5, 
    random_state=42, 
    verbosity=2)

pipeline_optimizer.fit(features, truth[:len(features)])
pipeline_optimizer.score(test_features, test_truth)



Widget Javascript not detected.  It may not be installed or enabled properly.



Generation 1 - Current best internal CV score: 0.9360058461217247

Generation 2 - Current best internal CV score: 0.9385990186867106

Generation 3 - Current best internal CV score: 0.9385990186867106

Generation 4 - Current best internal CV score: 0.9385990186867106

Generation 5 - Current best internal CV score: 0.9385990186867106

Best pipeline: RandomForestClassifier(BernoulliNB(input_matrix, alpha=0.001, fit_prior=False), bootstrap=True, criterion=gini, max_features=0.2, min_samples_leaf=8, min_samples_split=4, n_estimators=100)


0.9231266149870802