In [1]:
import os
import shutil
import random
import librosa
import numpy as np
from glob import glob
from scipy import signal
from sklearn.svm import SVC
from IPython import display
from IPython.display import SVG
import matplotlib.pyplot as plt
import keras
from keras import backend as K
from keras.models import load_model
#from keras.utils import model_to_dot
from keras.utils import to_categorical
from keras.models import Model, load_model
from keras.layers import Input, Conv2D, Dropout, MaxPooling2D, AveragePooling2D, Flatten, Dense, Activation, BatchNormalization, Lambda
from python_speech_features import mfcc

#For fft spectrum.
import sigproc
import constants as c
from scipy.signal import lfilter, butter


# Set CUDA_VISIBLE_DEVICES so that CUDA-based libraries in this process only see GPU index 1.
# NOTE(review): this assignment runs after `import keras` above - confirm the
# backend has not already initialised its GPU devices by that point.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# Disabled: optional pinning of this process to a fixed set of CPU cores via psutil.
#import psutil
#p = psutil.Process()
#p.cpu_affinity([0,1,2,5,9,13,17,18,19,20,23,27,28,29,30,31])

Using TensorFlow backend.


In [2]:
def euclidean_distance(vects):
    """Row-wise Euclidean distance between two batches of vectors.

    Args:
        vects: a pair ``(x, y)`` of backend tensors with matching shapes.

    Returns:
        A tensor holding, for every row, ``sqrt(sum((x - y) ** 2))`` taken
        along axis 1; the reduced axis is kept with size 1.
    """
    left, right = vects
    squared_diff = K.square(left - right)
    squared_norm = K.sum(squared_diff, axis=1, keepdims=True)
    # Clamp below at epsilon before the square root so it is never taken at exactly 0.
    clamped = K.maximum(squared_norm, K.epsilon())
    return K.sqrt(clamped)


def eucl_dist_output_shape(shapes):
    """Output shape for the distance Lambda layer.

    Args:
        shapes: a pair of input shapes; only the first one is read.

    Returns:
        ``(batch, 1)`` where ``batch`` is the leading entry of the first shape.
    """
    first_shape, _ = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)


def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    y_pred is treated as a distance. Where y_true is 1 the squared distance is
    penalised; where y_true is 0 the squared shortfall below the margin
    (fixed at 1) is penalised. The mean over all elements is returned.

    NOTE(review): create_batch() in this notebook labels same-speaker pairs 0
    and different-speaker pairs 1, the opposite polarity - confirm before
    training with this loss.
    '''
    margin = 1
    shortfall = K.maximum(margin - y_pred, 0)
    similar_term = y_true * K.square(y_pred)
    dissimilar_term = (1 - y_true) * K.square(shortfall)
    return K.mean(similar_term + dissimilar_term)

In [3]:
#Functions for fft spectrum. Copied from 'https://github.com/linhdvu14/vggvox-speaker-identification/blob/master'
def remove_dc_and_dither(sin, sample_rate):
    """Remove the DC component of a signal and add a tiny amount of dither.

    Args:
        sin: 1-D sequence of audio samples.
        sample_rate: sampling rate in Hz; only 16000 and 8000 are supported.

    Returns:
        Array of the same length as ``sin``: the input filtered by
        ``(1 - z^-1) / (1 - alpha * z^-1)`` plus random dither scaled by 1e-6.

    Raises:
        ValueError: if ``sample_rate`` is neither 16 kHz nor 8 kHz.
    """
    if sample_rate == 16e3:
        alpha = 0.99
    elif sample_rate == 8e3:
        alpha = 0.999
    else:
        # Raise instead of print + exit(): exit() would shut down the notebook
        # kernel (or raise SystemExit) from inside library code.
        raise ValueError("Sample rate must be 16kHz or 8kHz only, got %r" % (sample_rate,))
    # The numerator [1, -1] puts a zero at DC; alpha sets the pole of the filter.
    sin = lfilter([1, -1], [1, -alpha], sin)
    # Sum of two uniform [0, 1) draws minus 1 gives zero-centred noise in (-1, 1).
    dither = np.random.random_sample(len(sin)) + np.random.random_sample(len(sin)) - 1
    spow = np.std(dither)
    sout = sin + 1e-6 * spow * dither
    return sout
        

def normalize_frames(m,epsilon=1e-12):
    """Standardise every row of ``m`` independently.

    Each row has its own mean subtracted and is divided by its own standard
    deviation; the divisor is floored at ``epsilon`` so a constant row does
    not divide by zero.

    Returns:
        A new array built from the standardised rows.
    """
    standardised = []
    for row in m:
        centred = row - np.mean(row)
        scale = max(np.std(row), epsilon)
        standardised.append(centred / scale)
    return np.array(standardised)


def get_fft_spectrum(signal, buckets=None):
    """Compute the normalised FFT magnitude spectrum of an audio signal.

    Args:
        signal: 1-D array-like of audio samples. The parameter name shadows
            the ``scipy.signal`` module imported at the top of the notebook,
            but only inside this function.
        buckets: Unused. The bucket-based truncation of the time axis is
            disabled, so the full spectrum is always returned.

    Returns:
        2-D array: the FFT magnitudes of the windowed frames, transposed, with
        every row standardised by ``normalize_frames``.
        (Assumes ``sigproc.framesig`` returns one frame per row, making the
        result one row per FFT bin - TODO confirm.)
    """
    # Scale a copy instead of using `signal *= ...`: the in-place form mutated
    # the caller's array (callers pass slice views of the loaded audio) and
    # would repeat a plain Python list 32768 times.
    # Presumably maps [-1, 1] floats to the 16-bit PCM range - confirm.
    scaled = np.asarray(signal) * float(2 ** 15)

    # DC removal + dither; pre-emphasis is intentionally not applied here.
    filtered = remove_dc_and_dither(scaled, c.SAMPLE_RATE)
    frames = sigproc.framesig(
        filtered,
        frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
        frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
        winfunc=np.hamming,
    )
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    fft_norm = normalize_frames(fft.T)

    return fft_norm

In [4]:
def create_base_network():
    """Build the convolutional feature extractor shared by both siamese branches.

    The network takes a (512, 299, 1) input, applies five Conv/BatchNorm/ReLU
    stages with max pooling, a (11, 1) convolutional "fc6" stage followed by
    (1, 5) average pooling, and a 1024-unit ReLU Dense layer with dropout.

    Returns:
        A Keras ``Model`` mapping the input to the 1024-d output of the
        dropout that follows the fc7 Dense layer.
    """
    def conv_bn_relu(tensor, n_filters, kernel, stride):
        # Conv2D -> BatchNormalization (no scale, channel axis 3) -> ReLU.
        tensor = Conv2D(filters=n_filters, kernel_size=kernel, strides=stride)(tensor)
        tensor = BatchNormalization(scale=False, axis=3)(tensor)
        return Activation('relu')(tensor)

    def max_pool(tensor):
        return MaxPooling2D(pool_size=(3, 3), strides=(2,2))(tensor)

    model_input = Input(shape=(512,299,1))

    # Layers are created in the same per-type order as before, so Keras'
    # auto-generated layer names (used by load_weights(by_name=True)) stay the same.
    features = max_pool(conv_bn_relu(model_input, 96, (7, 7), (2, 2)))
    features = max_pool(conv_bn_relu(features, 256, (5, 5), (2, 2)))
    features = conv_bn_relu(features, 384, (3, 3), (1, 1))
    features = conv_bn_relu(features, 256, (3, 3), (1, 1))
    features = conv_bn_relu(features, 256, (3, 3), (1, 1))
    features = Dropout(0.25)(features)
    features = max_pool(features)

    # "fc6": a fully-connected-style convolution followed by average pooling.
    features = conv_bn_relu(features, 4096, (11, 1), (1, 1))
    features = Dropout(0.35)(features)
    features = AveragePooling2D(pool_size=(1, 5), strides=(1,1))(features)

    embedding = Dense(1024, activation='relu')(Flatten()(features))
    embedding = Dropout(0.35)(embedding)

    # Softmax head: still instantiated (as in the original) but not part of
    # the returned model, which stops at the embedding.
    Dense(1024, activation='softmax')(embedding)
    #fc8 = Dense(1211, activation='softmax', name='classifcation')(fc7)

    return Model(model_input, embedding)

In [5]:
def create_siamese_network(weights_path='/data/techresearch/Murtaza/vox2/dev/weights4/weights_4_26.h5'):
    """Build the siamese verification model around a shared base network.

    Two (512, 299, 1) inputs go through the same base network; the Euclidean
    distance between the two embeddings feeds a single sigmoid unit.

    Args:
        weights_path: HDF5 weights file loaded into the base network by layer
            name. Defaults to the path previously hard-coded here. Pass
            ``None`` to keep the freshly initialised weights.

    Returns:
        A Keras ``Model`` with inputs ``[input_a, input_b]`` and one sigmoid output.
    """
    base_network = create_base_network()

    if weights_path is not None:
        # by_name=True only fills layers whose names match those in the file.
        # NOTE(review): the base network relies on auto-generated layer names,
        # which presumably shift if models were already built in this session -
        # confirm the weights are actually picked up when re-running this cell.
        base_network.load_weights(weights_path, by_name=True)

    input_a = Input(shape=(512,299,1))
    input_b = Input(shape=(512,299,1))

    # The same base_network instance is applied twice, so both branches share weights.
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)

    distance = Lambda(euclidean_distance,
                      output_shape=eucl_dist_output_shape)([processed_a, processed_b])

    out = Dense(1, activation='sigmoid')(distance)

    siamese = Model([input_a, input_b], out)
    return siamese

In [6]:
siamese = create_siamese_network()
siamese.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 512, 299, 1)  0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 512, 299, 1)  0                                            
__________________________________________________________________________________________________
model_1 (Model)                 (None, 1024)         18729440    input_2[0][0]                    
                                                                 input_3[0][0]                    
_____________________

In [None]:
# Disabled: flat listing of every wav file; batches are sampled per speaker instead.
#all_audios = glob("/data/techresearch/Murtaza/vox2/dev/wav/*/*.wav")
#random.shuffle(all_audios)
# Sampling rate (Hz) that create_batch() assumes for every loaded file.
audio_rate = 16000
# Length in seconds of each randomly cropped clip.
no_seconds = 3

# One entry per path matching wav/*; create_batch() treats each entry as a
# speaker directory and globs '<entry>/*.wav' inside it.
speakers = glob("/data/techresearch/Murtaza/vox2/dev/wav/*")
# Shuffled in place; `random` is not seeded in the visible cells.
random.shuffle(speakers)
    
def _random_clip_image(audio_paths):
    """Return a (512, 299, 1) image cut from one randomly chosen file, or None.

    Files are tried in random order until one holds at least
    ``audio_rate * no_seconds`` samples. A random window of exactly that length
    is passed to ``audio_to_image`` and the result is reshaped. ``None`` is
    returned when no file in ``audio_paths`` is long enough, so callers cannot
    spin forever on a speaker that only has short recordings.
    """
    clip_len = audio_rate * no_seconds
    for path in random.sample(audio_paths, len(audio_paths)):
        # Load at audio_rate so the sample arithmetic below matches the data;
        # files already stored at that rate are not resampled.
        audio_data, _ = librosa.load(path, sr=audio_rate, mono=True)
        if len(audio_data) < clip_len:
            continue
        start = random.randint(0, len(audio_data) - clip_len)
        audio_clip = audio_data[start:start + clip_len]
        image = audio_to_image(audio_clip, audio_rate)
        return np.reshape(image, (512, 299, 1))
    return None


def _random_speaker_clip(candidates):
    """Pick a random speaker that has a usable clip.

    Returns ``(speaker_dir, audio_paths, image)`` for the first speaker, in
    random order, for which ``_random_clip_image`` succeeds.

    Raises:
        ValueError: if none of ``candidates`` has a long-enough recording.
    """
    for speaker in random.sample(candidates, len(candidates)):
        audio_paths = glob(speaker + '/*.wav')
        image = _random_clip_image(audio_paths)
        if image is not None:
            return speaker, audio_paths, image
    raise ValueError(
        "None of the %d candidate speakers has a recording of at least %d seconds"
        % (len(candidates), no_seconds))


def create_batch():
    """Assemble six image pairs around one randomly chosen anchor speaker.

    Two anchor clips are drawn from the anchor speaker. Each anchor is paired
    with one clip of the same speaker (label 0) and two clips of other
    speakers (label 1), giving the label order 0, 1, 1, 0, 1, 1.

    The previous version could not be compiled (``break`` outside a loop in
    the negative-pair sections) and returned nothing.

    NOTE(review): the previous code appended only the two anchors to
    ``input_A``; the anchor is now repeated for its negative pairs so the
    three lists line up - confirm this is the intended pairing.

    Returns:
        ``(input_A, input_B, labels)``: three lists of length 6. The first two
        hold (512, 299, 1) arrays; ``labels`` holds ints.

    Raises:
        ValueError: if no speaker (or no other speaker) has a usable recording.
    """
    labels = []
    input_A = []
    input_B = []

    current_speaker, current_speaker_audios, anchor = _random_speaker_clip(speakers)
    other_speakers = [s for s in speakers if s != current_speaker]

    for group in range(2):
        if group > 0:
            # A fresh anchor for the second group. This cannot be None because
            # the first anchor already proved the speaker has a long-enough file.
            anchor = _random_clip_image(current_speaker_audios)

        # Positive pair: another random clip of the same speaker.
        positive = _random_clip_image(current_speaker_audios)
        input_A.append(anchor)
        input_B.append(positive)
        labels.append(0)

        # Two negative pairs, each drawn from a random different speaker.
        for _ in range(2):
            _, _, negative = _random_speaker_clip(other_speakers)
            input_A.append(anchor)
            input_B.append(negative)
            labels.append(1)

    return input_A, input_B, labels

In [None]:
# Build one batch by calling create_batch(); the return value is not stored.
create_batch()