In [None]:
import numpy as np
import pandas as pd
import os
import pickle
import librosa
import scipy
import tensorflow as tf
from tqdm import tqdm
import multiprocessing
from sklearn.utils import shuffle
from sklearn.metrics import log_loss

import keras
from keras.models import Sequential
from keras.models import Model, load_model
from keras.layers import Dense, Dropout, Flatten, BatchNormalization, Input, Embedding, LSTM, GRU
from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D, Bidirectional, Lambda
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import backend as K
from keras.utils import to_categorical
from torch.utils.data import Dataset, DataLoader


from sklearn.utils import shuffle
from sklearn.preprocessing import normalize


In [None]:
import IPython.display as ipd
import matplotlib.pyplot as plt

def display_waveform(waveform, title="", sr=8000):
    """Plot a waveform and render an inline audio player for it.

    Args:
        waveform: Sequence of audio samples; used both for the plot and
            for the audio widget.
        title: Text shown above the plot.
        sr: Sample rate handed to the audio widget as its ``rate``.
    """
    # A fresh figure per call so successive waveforms do not share axes.
    _, axes = plt.subplots()
    axes.set_title(title)
    axes.plot(waveform)

    player = ipd.Audio(waveform, rate=sr)
    ipd.display(player)

def stft3_quick(file_path):
    """Compute magnitude spectrograms for overlapping windows of an audio file.

    The file is decoded at 22050 Hz and cut into 19 windows of 66150 samples
    (3 s) whose start offsets are 33075 samples apart, i.e. consecutive
    windows overlap by half. A window that extends past the end of the signal
    (or starts beyond it) is zero-padded on the right to the full length.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.

    Returns:
        A tuple ``(spectrograms, file_path)``. ``spectrograms`` stacks one
        STFT magnitude per window, each truncated to its first 128 frames;
        with ``n_fft=1024`` and librosa's default centring this should be
        an array of shape (19, 513, 128). ``file_path`` is returned unchanged.
    """
    # `scipy.signal.hanning` was a deprecated alias of this function and is
    # not available in current SciPy releases. Passing the callable (rather
    # than the string 'hann') keeps the window values identical to what the
    # alias produced. Imported locally because the file only does
    # `import scipy`.
    from scipy.signal.windows import hann

    window_len = 66150           # samples per window
    stride = window_len // 2     # 33075 samples between window starts
    n_windows = 19

    signal, _ = librosa.load(file_path, sr=22050, res_type='kaiser_fast')

    spectrograms = []
    for start in range(0, n_windows * stride, stride):
        # Slicing clamps at the end of the signal, so short/empty tails are fine.
        segment = signal[start:start + window_len]
        if segment.shape[0] < window_len:
            segment = np.pad(segment, (0, window_len - segment.shape[0]), 'constant')
        magnitude = np.abs(librosa.stft(segment, n_fft=1024, window=hann, hop_length=512))
        spectrograms.append(magnitude[:, :128])

    return np.array(spectrograms), file_path

In [None]:
class MultispeakerDataset(Dataset):
    """Map-style dataset yielding (one-hot speaker label, magnitude spectrogram).

    ``index`` is a sequence with one entry per speaker, each entry being a
    sequence of clip names. The clip for speaker number ``i`` and name ``n``
    is read from ``{path}/{i}/{n}.npy``.

    Args:
        index: Per-speaker sequences of clip names. A speaker's id is its
            position in this sequence.
        path: Directory containing one sub-directory per speaker id.
    """

    # Every clip is tiled (if short) and cropped to this many samples.
    CLIP_SAMPLES = 66150
    # STFT parameters.
    N_FFT = 1024
    HOP_LENGTH = 512
    # Number of leading STFT frames kept per clip.
    N_FRAMES = 128

    def __init__(self, index, path):
        self.path = path
        self.index = index
        # Flat list of (speaker_id, clip_name) so items can be addressed by one integer.
        self.all_files = [(i, name) for (i, speaker) in enumerate(index) for name in speaker]

    @staticmethod
    def _one_hot(speaker_id, num_speakers):
        """Return an int64 vector of length num_speakers with a 1 at speaker_id."""
        # np.long was removed from NumPy 1.24; int64 is the explicit equivalent.
        return (np.arange(num_speakers) == speaker_id).astype(np.int64)

    @staticmethod
    def _peak_normalize(audio):
        """Scale audio so its largest absolute sample is 1 and cast to float32.

        The peak is the maximum of the absolute values, so a clip whose
        loudest sample is negative is scaled correctly. An all-zero clip is
        returned as zeros instead of being divided by zero.
        """
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak
        return audio.astype(np.float32)

    def __getitem__(self, index):
        """Load clip ``index`` and return ``(speaker_onehot, spectrogram)``.

        Raises:
            IndexError: If ``index`` is outside the flat file list.
            ValueError: If the stored clip contains no samples.
        """
        # `scipy.signal.hanning` was a deprecated alias of this function and
        # is not available in current SciPy releases; passing the callable
        # keeps the window values identical. Local import because the file
        # only does `import scipy`.
        from scipy.signal.windows import hann

        speaker_id, name = self.all_files[index]
        speaker_onehot = self._one_hot(speaker_id, len(self.index))

        clip_path = f'{self.path}/{speaker_id}/{name}.npy'
        audio = np.load(clip_path)
        if audio.shape[0] == 0:
            raise ValueError(f'empty audio clip: {clip_path}')

        audio = self._peak_normalize(audio)
        if audio.shape[0] < self.CLIP_SAMPLES:
            # Repeat short clips until they cover the full clip length.
            audio = np.tile(audio, int(np.ceil(self.CLIP_SAMPLES / audio.shape[0])))

        spectrogram = np.abs(librosa.stft(audio[:self.CLIP_SAMPLES],
                                          n_fft=self.N_FFT,
                                          window=hann,
                                          hop_length=self.HOP_LENGTH))[:, :self.N_FRAMES]

        return speaker_onehot, spectrogram

    def __len__(self):
        """Total number of clips across all speakers."""
        return len(self.all_files)

    def num_speakers(self):
        """Number of speakers in the index (length of the one-hot label)."""
        return len(self.index)


In [None]:
# Directory holding index.pkl and the per-speaker clip folders.
data_path = ''

# NOTE: pickle.load can execute arbitrary code while deserialising;
# only point this at an index file you trust.
with open(f'{data_path}/index.pkl', 'rb') as f:
    index = pickle.load(f)

# Use only the first 16 speakers. Within each speaker, clips at positions
# 0, 10, 20, ... go to validation and all remaining clips go to training.
train_index = [
    [clip for position, clip in enumerate(speaker_clips) if position % 10 != 0]
    for speaker_no, speaker_clips in enumerate(index)
    if speaker_no < 16
]
val_index = [
    speaker_clips[::10]
    for speaker_no, speaker_clips in enumerate(index)
    if speaker_no < 16
]

dataset = MultispeakerDataset(train_index, data_path)
valset = MultispeakerDataset(val_index, data_path)

In [None]:
# Materialise the training split as arrays for model.fit.
# These were previously named/saved as x_test / y_test even though they come
# from the training `dataset`, leaving the x_train / y_train used by
# model.fit undefined.
# Each dataset item is a (one-hot label, spectrogram) pair; fetch every item
# exactly once, since each access loads the clip and computes its STFT.
_train_items = [dataset[i] for i in range(len(dataset))]
y_train = np.array([label for label, _ in _train_items])
x_train = np.array([spectrogram for _, spectrogram in _train_items])
del _train_items

# NOTE(review): reshape reinterprets the buffer without swapping axes; if the
# spectrograms are (513, 128) and the model expects time-major input, a
# transpose may have been intended — confirm against how the pretrained
# model's inputs were prepared before changing this.
x_train = x_train.reshape(x_train.shape[0], 128, 513, 1)
np.save('x_train', x_train)
np.save('y_train', y_train)

In [None]:
### Training

In [None]:
def define_model(model_name='best_rescnnqstft3_b128.h5', num_classes=16):
    """Build a fine-tuning classifier on top of a saved Keras model.

    The saved model is loaded and all of its layers are frozen. The output of
    its second-to-last layer is flattened and fed to a new softmax ``Dense``
    layer, which is the only trainable part of the returned model.

    Transfer-learning pattern adapted from
    https://machinelearningmastery.com/how-to-develop-a-convolutional-neural-network-to-classify-photos-of-dogs-and-cats/

    Args:
        model_name: Path of the saved model to load with ``load_model``.
        num_classes: Number of units in the new softmax output layer.

    Returns:
        The compiled model (Adam optimiser, categorical cross-entropy loss,
        accuracy metric), sharing the inputs of the loaded model.
    """
    base_model = load_model(model_name)

    # Freeze everything that was loaded; only the new head below is trained.
    for layer in base_model.layers:
        layer.trainable = False

    # Replace the original final layer with a new classification head.
    features = Flatten()(base_model.layers[-2].output)
    predictions = Dense(num_classes, activation='softmax')(features)

    model = Model(inputs=base_model.inputs, outputs=predictions)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Build the fine-tuning model (frozen pretrained layers plus a new softmax
# head) and print its layer-by-layer summary.
model = define_model()
model.summary()

In [None]:
# Checkpoint to 'test_model' whenever the monitored 'acc' value improves, and
# stop training once it has not improved for 5 consecutive epochs.
# NOTE(review): the model is compiled with metrics=['accuracy']; depending on
# the Keras version the logged key may be 'accuracy' rather than 'acc', in
# which case neither callback would see the metric — confirm against the
# training log output.
# NOTE(review): no validation data is passed to fit(), so 'acc' here is the
# training accuracy, not a held-out score.
checkpoint = ModelCheckpoint('test_model', monitor='acc', verbose=2, save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='acc', patience=5, mode='max') 
callbacks_list = [checkpoint, early_stop]

# x_train / y_train must hold the training spectrograms and one-hot labels
# built from `dataset`.
model.fit(x_train, y_train,
          batch_size=128,
          epochs=20,
          verbose=2,
          shuffle=True,
          callbacks=callbacks_list)

In [None]:
# Save the fine-tuned model to disk.
# NOTE(review): this is the model as it stands when fit() returns; the
# best-'acc' checkpoint is written separately to 'test_model' by the
# ModelCheckpoint callback — confirm which one evaluation should use.
model.save('curr_best_vctk.h5')

### Evaluation

In [None]:
# Materialise the held-out split as arrays for model.evaluate.
# Each valset item is a (one-hot label, spectrogram) pair; fetch every item
# exactly once, since each access loads the clip and computes its STFT.
_val_items = [valset[i] for i in range(len(valset))]
y_test = np.array([label for label, _ in _val_items])
x_test = np.array([spectrogram for _, spectrogram in _val_items])
del _val_items

# NOTE(review): reshape reinterprets the buffer without swapping axes; if the
# spectrograms are (513, 128) and the model expects time-major input, a
# transpose may have been intended — confirm against how the pretrained
# model's inputs were prepared before changing this.
x_test = x_test.reshape(x_test.shape[0], 128, 513, 1)
np.save('x_test', x_test)
np.save('y_test', y_test)

In [None]:
# Reload the model from the file written by model.save('curr_best_vctk.h5'),
# so evaluation runs on the persisted model rather than in-memory state.
model = load_model('curr_best_vctk.h5')

In [None]:
# Score the reloaded model on the held-out arrays built from `valset`.
# `results` presumably holds the loss followed by the compiled accuracy
# metric — verify against the Keras version in use.
results = model.evaluate(x_test, y_test, batch_size=128)