In [1]:
import torch
import json
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
from scipy.signal import resample

In [2]:
import sys
sys.path.append('../')
from config import PATH, LIBRISPEECH_SAMPLING_RATE
from data import LibriSpeechDataset, label_to_sex
from models import DilatedNet, ConvNet
from utils import whiten

### Load model

In [3]:
# Checkpoint to evaluate. The filename is parsed for the model type and its
# hyperparameters by the cell that builds the model.
model_filename = 'max_pooling__n_layers=7__n_filters=64__downsampling=1__n_seconds=3.torch'
model_path = PATH + '/models/' + model_filename

In [4]:
# Decode the architecture name and the integer hyperparameters from the checkpoint
# filename, which has the form '<model_type>__<key>=<int>__...__<key>=<int>.torch'.
model_type = model_path.split('/')[-1].split('__')[0]
model_name = model_path.split('/')[-1].split('.')[0]
model_params = {i.split('=')[0]: int(i.split('=')[1]) for i in model_name.split('__')[1:]}

# Here we assume that the model was trained on the LibriSpeech dataset
model_sampling_rate = LIBRISPEECH_SAMPLING_RATE/model_params['downsampling']
model_num_samples = model_params['n_seconds']*model_sampling_rate

print('Model parameters determined from filename:')
print(json.dumps(model_params, indent=4))

# Build the architecture that matches the filename prefix.
if model_type == 'max_pooling':
    model = ConvNet(model_params['n_filters'], model_params['n_layers'])
elif model_type == 'dilated':
    model = DilatedNet(model_params['n_filters'], model_params['n_depth'], model_params['n_stacks'])
else:
    # Raise an exception *instance*. The previous `raise(ValueError, '...')` raised a
    # tuple, which is a TypeError on Python 3 and silently drops the message on Python 2.
    raise ValueError('Model type not recognised.')

# Restore the trained weights, cast parameters to float64, move the model to the GPU
# and switch to inference mode. `model.eval()` returns the model, so leaving it as the
# last expression also displays the architecture as the cell output.
model.load_state_dict(torch.load(model_path))
model.double()
model.cuda()
model.eval()

Model parameters determined from filename:
{
    "n_layers": 7, 
    "n_filters": 64, 
    "n_seconds": 3, 
    "downsampling": 1
}


ConvNet(
  (initialconv): Conv1d(1, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (initialbn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_0): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn_0): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn_1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn_2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_3): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn_3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_4): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn_4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_5)

### Generate predictions

In [5]:
# Length (in samples at the native LibriSpeech rate) of each audio fragment served by
# the dataset; it is passed as the dataset's second positional argument.
fragment_length = LIBRISPEECH_SAMPLING_RATE * model_params['n_seconds']

# NOTE(review): stochastic=False presumably makes fragment selection deterministic and
# cache=False presumably skips a cached index - confirm against data.LibriSpeechDataset.
testset = LibriSpeechDataset(
    'dev-clean',
    fragment_length,
    stochastic=False,
    cache=False
)

# Batched loader over the same dataset (the per-fragment loop in the cells shown here
# indexes `testset` directly instead).
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=16,
    num_workers=4
)

  3%|▎         | 86/2703 [00:00<00:03, 857.59it/s]

Initialising LibriSpeechDataset with length = 48000 and subsets = dev-clean
Indexing dev-clean...


100%|██████████| 2703/2703 [00:04<00:00, 648.07it/s]


Finished indexing data. 2303 usable files found.


In [6]:
# Score every fragment of the test set individually and collect one record per
# fragment (prediction, label and a few summary statistics of the whitened signal).

# Number of samples fed to the model per fragment after downsampling. This does not
# change between iterations, so it is computed once instead of twice per fragment.
n_model_samples = int(LIBRISPEECH_SAMPLING_RATE*model_params['n_seconds']/model_params['downsampling'])

# `torch.from_numpy` always yields a CPU tensor, while the model was moved with
# `.cuda()` when it was loaded; inputs are therefore placed on the model's own device.
model_device = next(model.parameters()).device

records = []
for i in tqdm(range(len(testset))):
    instance, label = testset[i]
    instance = whiten(torch.from_numpy(instance[np.newaxis,:]))
    # Plain ndarray of the whitened signal for the scipy/numpy calls below.
    signal = np.asarray(instance)

    # New resampling
    resampled = resample(signal, n_model_samples, axis=1)
    instance_cuda = torch.from_numpy(resampled).reshape((1, 1, n_model_samples)).to(model_device)

    with torch.no_grad():
        pred = model(instance_cuda)[0][0].cpu().numpy()

    records.append({
        'i': i,
        'name': testset.datasetid_to_name[i],
        'sex': label_to_sex[label],
        # Root-mean-square: the mean is taken *before* the square root. The previous
        # sqrt(square(x)).mean() was the mean absolute value, not the RMS.
        'rms': np.sqrt(np.square(signal).mean()),
        'rmedians': np.median(np.sqrt(np.square(signal))),
        'mean': signal.mean(),
        'pred':pred,
        'label': label
    })
df = pd.DataFrame(records)

HBox(children=(IntProgress(value=0, max=2303), HTML(value=u'')))




In [7]:
# Numeric views of the raw prediction / label columns, computed once and reused.
pred_as_float = df['pred'].astype(float)
label_as_int = df['label'].astype(int)

# Absolute prediction error per fragment.
abs_error = (pred_as_float - label_as_int).abs()
# A fragment counts as correct when thresholding the prediction at 0.5 reproduces the
# label; this uses the columns as they were before the casts above.
is_correct = (df['pred'] > 0.5) == df['label']

df = df.assign(
    error=abs_error,
    label=label_as_int,
    correct=is_correct,
    pred=pred_as_float
)

In [8]:
# Per-speaker summary: mean and worst-case error, plus mean prediction and mean label.
per_speaker_aggs = {'error': ['mean','max'], 'pred': 'mean', 'label': 'mean'}
gb = df.groupby('name').agg(per_speaker_aggs)

# Flatten the two-level column index into dotted names such as 'error.mean'.
flat_names = []
for col in gb.columns.values:
    flat_names.append('.'.join(col).strip())
gb.columns = flat_names

# Hardest speakers first; left as the last expression so the table is displayed.
gb.sort_values(by='error.mean', ascending=False)

Unnamed: 0_level_0,pred.mean,label.mean,error.mean,error.max
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Kathy Caver,0.68646,1,0.31354,0.941445
dexter,0.234752,0,0.234752,0.780647
President Lethe,0.144391,0,0.144391,0.948249
Jennifer Wiginton,0.888373,1,0.111627,0.381209
Nicodemus,0.108616,0,0.108616,0.703835
Stephen Kinford,0.106456,0,0.106456,0.882708
Peter Eastman,0.079562,0,0.079562,0.785736
Mark Nelson,0.056028,0,0.056028,0.967619
VOICEGUY,0.036067,0,0.036067,0.500753
badey,0.035517,0,0.035517,0.218884


In [11]:
# A speaker is "never misclassified" when even their worst fragment has an error
# below 0.5, i.e. every thresholded prediction matched the label.
n_never_misclassified = len(gb[gb['error.max']<0.5])

# Call form of print: valid on both Python 2 and 3 (the print statement is a
# SyntaxError on Python 3) and consistent with the print(...) calls used when the
# model is loaded.
print('{} out of {} ({}%) of speakers in the validation set are never misclassified.'.format(
    n_never_misclassified,
    len(gb),
    n_never_misclassified*100./len(gb)
))

32 out of 40 (80.0%) of speakers in the validation set are never misclassified.
