In [1]:
import torch
from sklearn import metrics

import data_load
import model
import os
import h5py_cache
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
import python_speech_features

from pydub import AudioSegment
import webrtcvad
import sklearn

In [2]:
SAMPLE_RATE = 16000
FRAME_SIZE_MS = 10
FRAME_SIZE = int(SAMPLE_RATE * (FRAME_SIZE_MS / 1000.0))

In [3]:
def net_path(epoch, title):
    part = os.getcwd() + '/models/' + title
    if epoch >= 0:
        return part + '_epoch' + str(epoch).zfill(3) + '.net'
    else:
        return part + '.net'
    
def load_net(epoch = 14, title = 'net'):
    if torch.cuda.is_available():
        return torch.load(net_path(epoch, title))
    else:
        return torch.load(net_path(epoch, title), map_location='cpu')

In [4]:
net = load_net(title='net', epoch = 10)

In [5]:
class wav_file:
    def __init__(self, path_to_file):
        self.name = path_to_file
        vad = webrtcvad.Vad(0)
        self.test_data = {'mfcc': [], 'delta': [], 'predictions': []}
        
        track = (AudioSegment.from_file(self.name).set_frame_rate(16000).set_sample_width(2).set_channels(1))
        self.sound = track
        track = np.array(track.get_array_of_samples(), dtype=np.int16)
        padded_track = np.concatenate((track, np.zeros(FRAME_SIZE - (len(track) % FRAME_SIZE))))

        self.frames = np.array(np.split(padded_track, len(padded_track) / FRAME_SIZE), dtype=np.int16)
        self.labels = [1 if vad.is_speech(f.tobytes(), sample_rate=16000) else 0 for f in self.frames]
        
        self.test_data['labels'] = self.labels
        self.predictions = self.get_predictions()[0]
        self.probabilities_true = self.get_predictions()[1]
        self.score = self.roc_auc_score()
    
    def get_features(self):
        for i in range(len(self.test_data['labels'])):
            mfcc = python_speech_features.mfcc(self.frames[i], 16000, winstep=(10 / 1000),
                                           winlen=4 * (10 / 1000), nfft=2048)
            mfcc = mfcc[:, 1:]
            delta = python_speech_features.delta(mfcc, 2)

            self.test_data['mfcc'].append(mfcc)
            self.test_data['delta'].append(delta)

            self.test_data['mfcc_padded'] = [np.zeros((1, 12))] + self.test_data['mfcc'] + [np.zeros((1, 12))] 
            self.test_data['delta_padded'] = [np.zeros((1, 12))] + self.test_data['delta'] + [np.zeros((1, 12))] 
    
    def get_predictions(self):
        self.get_features()
        predictions = []
        true_prob = []

        for i in range(1, len(self.test_data['labels']) + 1):    
            X = np.hstack((self.test_data['mfcc_padded'][i-1: i+2], self.test_data['delta_padded'][i-1: i+2])).reshape(1,3,24).repeat(2048, 0)
            X = Variable(torch.from_numpy(np.array(X).reshape((2048, 3, 24))).float())
            out = net(X)
            true_prob.append(float(out[0][1]))
            predictions.append(int(torch.argmax(out[0]))) 
        
        return predictions, true_prob
    
    
    def roc_auc_score(self):
        return sklearn.metrics.roc_auc_score(self.labels, self.probabilities_true)

# Get predictions

In [7]:
example = wav_file('for_devs/ZH1KC7KCWHLQYR4G1ROYWJC37DE18A.wav')

In [8]:
example.score

0.47930147448817223

In [11]:
example.predictions[:20]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]

# Get ROC AUC score for the some files 

In [15]:
import glob
ff = glob.glob('for_devs/' + '/*.wav', recursive=True)

In [18]:
for f in ff[:10]:
    try:
        example = wav_file(f)
        print(example.score)
    except:
        pass

0.3646549014324846
0.898657498362803
0.4554263565891473
0.8924434784913461
0.7019180948585093
0.811660777385159
0.6311899038461538
