# ITI110 Project - Speaker Recognition

Team 9

## Part 2 - Building Tensors

In [0]:
import sys
import numpy as np
import pandas as pd
import librosa
import pickle
import os
from shutil import copyfile
import matplotlib.pyplot as plt
import imageio
%matplotlib inline

import cv2
import time

import tensorflow as tf
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Conv2D, ZeroPadding2D, Activation, Input, concatenate
from keras.models import Model

from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import MaxPooling2D
from keras.layers.merge import Concatenate
from keras.layers.core import Lambda, Flatten, Dense
from keras.initializers import glorot_uniform

from keras.engine.topology import Layer
from keras.regularizers import l2
from keras import backend as K
from keras import initializers

import numpy.random as rng

Using TensorFlow backend.


### Folder paths

In [0]:
base_folder = "/Users/MacBookPro/AIandMLNYP/AIProject/SpeakerRecognition/TIMIT/"
data_folder = base_folder + "/data"
train_audio_folder = base_folder + "/Audio/TRAIN/"
test_audio_folder = base_folder + "/Audio/TEST/"
save_path = '/Users/MacBookPro/AIandMLNYP/AIProject/SpeakerRecognition/data/'
output_folder = base_folder + "/npydata/"

### Converting to MFCC

In [0]:
# Each of our sample (16khz) lasts exactly from 3 - 5  seconds. We will truncate at 3 secs with 16000 * 3 samples.
#
mfcc_hop_length = 256
mfcc_max_frames = int(16000 * 3 / mfcc_hop_length) + 1

print ("MFCC Frames (for 3 sec audio):     %d" % (mfcc_max_frames))

num_classes = 10
max_samples = 16000 * 3  # 5 seconds
max_mfcc_features = 40

# Scale the values to be between 
def scale(arr):
    #arr = arr - arr.mean()
    safe_max = np.abs(arr).max()
    if safe_max == 0:
        safe_max = 1
    arr = arr / safe_max
    return arr


# Load a file and convert its audio signal into a series of MFCC
# This will return a 2D numpy array.
#
def convert_mfcc(file_name):
    signal, sample_rate = librosa.load(file_name) 
    signal = librosa.util.normalize(signal)
    signal_trimmed, index = librosa.effects.trim(signal, top_db=60)
    signal_trimmed = librosa.util.fix_length(signal_trimmed, max_samples)
    
    feature = (librosa.feature.mfcc(y=signal_trimmed, sr=sample_rate, n_mfcc=max_mfcc_features).T)

    if (feature.shape[0] > mfcc_max_frames):
        feature = feature[0:mfcc_max_frames, :]
    if (feature.shape[0] < mfcc_max_frames):
        feature = np.pad(feature, pad_width=((0, mfcc_max_frames - feature.shape[0]), (0,0)), mode='constant')
    
    # This removes the average component from the MFCC as it may not be meaningful.
    #
    feature[:,0] = 0
        
    feature = scale(feature)

    return feature


MFCC Frames (for 3 sec audio):     188


In [0]:
def convertAudio(path,n = 0):
    '''
    path => Path of train directory or test directory
    '''
    X = []
    y = []
    speaker_dict = {}
    mfcc_dict = {}
    speaker_dict = {}
    curr_y = n
    # convert and load the mfcc features for all audio files for each speaker
    for speaker in os.listdir(path):
        print("loading speaker: " + speaker)
        speaker_dict[speaker] = [curr_y, None]
        
        mfcc_list = []
        speaker_path = os.path.join(path,speaker)
        # read all the images in the current category
        for filename in os.listdir(speaker_path):
            file_path = os.path.join(speaker_path, filename)
            # convert the audio file to spectral
            mfcc = convert_mfcc(file_path)
            mfcc_list.append(mfcc)
            y.append(curr_y)
            curr_y += 1
        try:
            X.append(np.stack(mfcc_list))
            # edge case - last one
        except ValueError as e:
            print(e)
            print("error - category_images:", category_images)

        speaker_dict[speaker][1] = curr_y - 1

    y = np.vstack(y)
    X = np.stack(X)

    return  X, y,speaker_dict

### Loading the training audio files into tensors

In [0]:
X, y,c=convertAudio(train_audio_folder)
print(X.shape)
print(c)

loading speaker: MDNS0
loading speaker: FCRZ0
loading speaker: MMDM0
loading speaker: MKXL0
loading speaker: MESJ0
loading speaker: MRAV0
loading speaker: FECD0
loading speaker: FJKL0
loading speaker: MCSS0
loading speaker: MWRE0
loading speaker: FDNC0
loading speaker: MPRD0
loading speaker: MCDR0
loading speaker: MDAC0
loading speaker: FSJG0
loading speaker: MRLJ1
loading speaker: MKDT0
loading speaker: MLEL0
loading speaker: MAKR0
loading speaker: MAFM0
loading speaker: FEEH0
loading speaker: MGAK0
loading speaker: MPSW0
loading speaker: MMDG0
loading speaker: MVRW0
loading speaker: MWCH0
loading speaker: MFWK0
loading speaker: MILB0
loading speaker: FNTB0
loading speaker: FNKL0
loading speaker: FKLH0
loading speaker: MTJU0
loading speaker: MHBS0
loading speaker: MRLJ0
loading speaker: FPMY0
loading speaker: MJHI0
loading speaker: MRMS0
loading speaker: MJWG0
loading speaker: MEFG0
loading speaker: FBLV0
loading speaker: MRBC0
loading speaker: FMMH0
loading speaker: FPJF0
loading spe

loading speaker: FEAR0
loading speaker: FDTD0
loading speaker: MDHL0
loading speaker: FBAS0
loading speaker: MGRP0
loading speaker: FSAG0
loading speaker: MJAE0
loading speaker: FCMM0
loading speaker: MEDR0
loading speaker: MWGR0
loading speaker: MGXP0
loading speaker: FSMA0
loading speaker: MSFH0
loading speaker: FCLT0
loading speaker: MSAT1
loading speaker: FDML0
loading speaker: MTCS0
loading speaker: MAEB0
loading speaker: MJRG0
loading speaker: FKDW0
loading speaker: MARC0
loading speaker: FCMG0
loading speaker: MDLC2
loading speaker: MDWH0
loading speaker: MTRT0
loading speaker: MGAR0
loading speaker: MTLC0
loading speaker: FCDR1
loading speaker: MAKB0
loading speaker: MSAT0
loading speaker: MSMR0
loading speaker: FPAF0
loading speaker: MMAG0
loading speaker: MREE0
loading speaker: MTPR0
loading speaker: MTBC0
loading speaker: MJLS0
loading speaker: MRVG0
loading speaker: MJWT0
loading speaker: MJEE0
loading speaker: FPLS0
loading speaker: FLMC0
loading speaker: MDKS0
loading spe

ValueError: all input arrays must have the same shape

In [0]:
print(X.shape)
print(c)

(462, 10, 188, 40)
{'MDNS0': [0, 9], 'FCRZ0': [10, 19], 'MMDM0': [20, 29], 'MKXL0': [30, 39], 'MESJ0': [40, 49], 'MRAV0': [50, 59], 'FECD0': [60, 69], 'FJKL0': [70, 79], 'MCSS0': [80, 89], 'MWRE0': [90, 99], 'FDNC0': [100, 109], 'MPRD0': [110, 119], 'MCDR0': [120, 129], 'MDAC0': [130, 139], 'FSJG0': [140, 149], 'MRLJ1': [150, 159], 'MKDT0': [160, 169], 'MLEL0': [170, 179], 'MAKR0': [180, 189], 'MAFM0': [190, 199], 'FEEH0': [200, 209], 'MGAK0': [210, 219], 'MPSW0': [220, 229], 'MMDG0': [230, 239], 'MVRW0': [240, 249], 'MWCH0': [250, 259], 'MFWK0': [260, 269], 'MILB0': [270, 279], 'FNTB0': [280, 289], 'FNKL0': [290, 299], 'FKLH0': [300, 309], 'MTJU0': [310, 319], 'MHBS0': [320, 329], 'MRLJ0': [330, 339], 'FPMY0': [340, 349], 'MJHI0': [350, 359], 'MRMS0': [360, 369], 'MJWG0': [370, 379], 'MEFG0': [380, 389], 'FBLV0': [390, 399], 'MRBC0': [400, 409], 'FMMH0': [410, 419], 'FPJF0': [420, 429], 'MRGS0': [430, 439], 'MKAG0': [440, 449], 'FDAS1': [450, 459], 'MMDM1': [460, 469], 'FSBK0': [470, 

### Saving the train tensors to disk

In [0]:
with open(os.path.join(save_path,"train.pickle"), "wb") as f:
    pickle.dump((X,c),f)

In [0]:
with open(os.path.join(save_path,"validation.pickle"), "wb") as f:
    pickle.dump((X_val,c_val),f)

### Loading the validation audio  into tensors

In [0]:
X_test, y_test,c_test=convertAudio(test_audio_folder)
print(X_test.shape)
print(c_test)

loading speaker: MERS0
loading speaker: FSLB1
loading speaker: MRMS1
loading speaker: MJFC0
loading speaker: MRJO0
loading speaker: FNLP0
loading speaker: MGMM0
loading speaker: MRCS0
loading speaker: MJVW0
loading speaker: MJMP0
loading speaker: MKCH0
loading speaker: MRCZ0
loading speaker: MTLS0
loading speaker: MDRB0
loading speaker: MCRC0
loading speaker: MTMR0
loading speaker: MCSH0
loading speaker: MWVW0
loading speaker: FNMR0
loading speaker: FGWR0
loading speaker: FAKS0
loading speaker: MCTW0
loading speaker: MKJL0
loading speaker: FRNG0
loading speaker: MPLB0
loading speaker: MJTC0
loading speaker: FLKD0
loading speaker: MDSC0
loading speaker: FREW0
loading speaker: MJBR0
loading speaker: MDLD0
loading speaker: MCTT0
loading speaker: MJRF0
loading speaker: FCRH0
loading speaker: MRPC0
loading speaker: FCMH0
loading speaker: FJSA0
loading speaker: FDRD1
loading speaker: MRJS0
loading speaker: MCHH0
loading speaker: MGLB0
loading speaker: FAWF0
loading speaker: FMLD0
loading spe

In [0]:
#Xval,yval,cval=loadimgs(val_folder)

### Saving the validation tensors on disk

In [0]:
with open(os.path.join(save_path,"val.pickle"), "wb") as f:
    pickle.dump((X_test, c_test),f)