In [1]:
import numpy as np
from matplotlib import pyplot as pl
import os
import scipy.io.wavfile as wav

%matplotlib inline

# Folder holding the recordings; the notebook globs it for '*.wav' files.
PATH = './vowels/'

In [91]:
import enum


def getGenderCount(directory=None):
    """Count the ``.wav`` recordings of each speaker class found in a folder.

    The class is read from the third character of the file stem: ``'f'`` is
    counted as female, ``'m'`` as male and any other character as kid.

    Parameters
    ----------
    directory : str or path-like, optional
        Folder to scan. When omitted, the notebook-level ``PATH`` constant is
        used (looked up at call time).

    Returns
    -------
    tuple of int
        ``(male_count, female_count, kid_count)`` -- male comes first, which
        differs from the female/male/kid order of the one-hot labels.

    Raises
    ------
    IndexError
        If a file stem has fewer than three characters.
    """
    # Imported here so the function does not rely on a later notebook cell
    # having already brought ``Path`` into scope.
    from pathlib import Path

    if directory is None:
        directory = PATH

    counts = {'m': 0, 'f': 0, 'k': 0}
    for path in Path(directory).glob('*.wav'):
        marker = path.stem[2]
        # Anything that is neither 'f' nor 'm' is counted as a kid recording.
        counts[marker if marker in ('f', 'm') else 'k'] += 1
    return counts['m'], counts['f'], counts['k']

def getGenderFromFilename(filename):
    """Return the one-hot speaker class encoded in a file name.

    The third character of ``filename`` selects the class: ``'f'`` gives
    ``[1, 0, 0]`` (female), ``'m'`` gives ``[0, 1, 0]`` (male) and any other
    character gives ``[0, 0, 1]`` (kid). A new list is built on every call.
    """
    marker = filename[2]
    # Slot 0 = female, slot 1 = male, slot 2 = kid (the fallback).
    slot = {'f': 0, 'm': 1}.get(marker, 2)
    onehot = [0, 0, 0]
    onehot[slot] = 1
    return onehot
    
def getVoiceTypeFromFilename(filename):
    """Return the voice type encoded by the first character of a file name.

    ``0`` means synthetic voice (name starts with ``'s'``); ``1`` means real
    voice (any other first character).
    """
    is_synthetic = filename[0] == 's'
    return 0 if is_synthetic else 1
    
def isAdult(filename):
    """Tell whether a file name belongs to an adult speaker.

    Returns ``False`` when the second character of ``filename`` is ``'k'``
    and ``True`` for any other second character.
    """
    kid_marker = 'k'
    return not (filename[1] == kid_marker)


In [92]:
from pathlib import Path
from python_speech_features import mfcc

# Read every .wav file of PATH once and keep, per recording, a tuple of
# (one-hot speaker class, voice type, MFCC features of the signal).
mfccRawDatas = []
for path in Path(PATH).glob('*.wav'):
    sample_rate, X = wav.read(str(path))
    stem = path.stem
    # Both labels are decoded from the file name (without its extension).
    category, voiceType = getGenderFromFilename(stem), getVoiceTypeFromFilename(stem)
    features = mfcc(X, samplerate=sample_rate, nfft=1024)
    mfccRawDatas.append((category, voiceType, features))

In [127]:
import random


def mfccDatas(mfccRawDatas):
    """Build the (unbalanced) feature matrix, one row per recording.

    Each row is the mean of the recording's MFCC matrix over axis 0, followed
    by its one-hot speaker class. Rows come out in random order.

    Parameters
    ----------
    mfccRawDatas : sequence of tuple
        Items of the form ``(one_hot_class, voice_type, mfcc_matrix)``; only
        elements 0 and 2 are read. The sequence itself is left untouched.

    Returns
    -------
    numpy.ndarray
        One row per input item; an empty array when the input is empty.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(mfccRawDatas)
    random.shuffle(shuffled)

    # Reduce each MFCC matrix to its column means and append the label.
    rows = [np.append(sample[2].mean(axis=0), sample[0]) for sample in shuffled]
    return np.asarray(rows)

def mfccDatasOverSampling(mfccRawDatas):
    """Build a class-balanced feature matrix by duplicating rows.

    Every recording contributes one row (mean of its MFCC matrix over axis 0,
    followed by its one-hot speaker class). Rows of the smaller classes are
    then repeated, cycling through the shuffled recordings, until every class
    that is present has as many rows as the largest one.

    Class sizes are counted from ``mfccRawDatas`` itself, so the function also
    works on a subset of the recordings. A class with no recording at all
    cannot be duplicated and is simply left empty.

    Parameters
    ----------
    mfccRawDatas : sequence of tuple
        Items of the form ``(one_hot_class, voice_type, mfcc_matrix)``; only
        elements 0 and 2 are read. The sequence itself is left untouched.

    Returns
    -------
    numpy.ndarray
        The original rows in random order, followed by the duplicated rows;
        an empty array when the input is empty.
    """
    n_classes = 3

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(mfccRawDatas)
    random.shuffle(shuffled)

    # Reduce every recording once; duplicates reuse the same row.
    rows = [np.append(sample[2].mean(axis=0), sample[0]) for sample in shuffled]

    # Number of extra rows each class needs to reach the largest class.
    counts = [sum(1 for sample in shuffled if sample[0][cls] == 1)
              for cls in range(n_classes)]
    target = max(counts)
    # A class without any sample is skipped, otherwise the loop below could
    # never satisfy its quota and would run forever.
    missing = [target - count if count else 0 for count in counts]

    balanced = list(rows)
    while any(missing):
        for sample, row in zip(shuffled, rows):
            for cls in range(n_classes):
                if sample[0][cls] == 1 and missing[cls] > 0:
                    balanced.append(row)
                    missing[cls] -= 1

    return np.asarray(balanced)
    
def mfccDatasUnderSampling(mfccRawDatas):
    """Build a class-balanced feature matrix by dropping surplus rows.

    Recordings are visited in random order and a row (mean of the MFCC matrix
    over axis 0, followed by the one-hot speaker class) is kept only while its
    class has fewer rows than the smallest of the three classes.

    Class sizes are counted from ``mfccRawDatas`` itself, so the function also
    works on a subset of the recordings. If one of the three classes has no
    recording, the smallest class size is 0 and the result is empty.

    Parameters
    ----------
    mfccRawDatas : sequence of tuple
        Items of the form ``(one_hot_class, voice_type, mfcc_matrix)``; only
        elements 0 and 2 are read. The sequence itself is left untouched.

    Returns
    -------
    numpy.ndarray
        At most ``3 * smallest_class_size`` rows in random order; an empty
        array when nothing is kept.
    """
    n_classes = 3

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(mfccRawDatas)
    random.shuffle(shuffled)

    # Every class is capped at the size of the smallest one.
    counts = [sum(1 for sample in shuffled if sample[0][cls] == 1)
              for cls in range(n_classes)]
    quota = min(counts)

    kept = [0] * n_classes
    rows = []
    for sample in shuffled:
        gender = sample[0]
        for cls in range(n_classes):
            if gender[cls] == 1 and kept[cls] < quota:
                rows.append(np.append(sample[2].mean(axis=0), gender))
                kept[cls] += 1

    return np.asarray(rows)


# Build three feature matrices from the same recordings: classes balanced by
# duplicating rows, classes balanced by dropping rows, and the unbalanced set.
overSamplingDatas = mfccDatasOverSampling(mfccRawDatas)
underSamplingDatas = mfccDatasUnderSampling(mfccRawDatas)
datas = mfccDatas(mfccRawDatas)

[ 21.90338687  -8.88053649   0.83702144  10.07505979 -23.89380195
 -35.85249852 -19.32257479  -3.83643192   4.72586426  -1.1128845
 -18.80962313  16.93730234 -22.02681128   0.           0.
   1.        ]


In [128]:
# Show (rows, columns) of each variant to compare their sizes.
print(underSamplingDatas.shape)
print(overSamplingDatas.shape)
print(datas.shape)


(216, 16)
(648, 16)
(360, 16)


# Training part

In [79]:
import mlp_backprop_momentum as mlp
import k_fold_cross_validation as cv
%matplotlib inline

In [None]:
# Settings of the exploration run.
EPOCHS = 300
LEARNING_RATE = 0.001
MOMENTUM = 0.85

K = 5                    # passed as k to the cross-validation helper
N_TESTS = 5              # repetitions per network configuration
N_NEURONS = [3, 5, 7]    # values substituted for h in the MLP layer list

# One error curve (EPOCHS values) per configuration and per repetition.
curve_shape = (len(N_NEURONS), EPOCHS, N_TESTS)
MSE_train = np.zeros(curve_shape)
MSE_test = np.zeros(curve_shape)

# Keyword arguments shared by every cross-validation call.
cv_options = dict(k=K, learning_rate=LEARNING_RATE, momentum=MOMENTUM, epochs=EPOCHS)

for i_h, h in enumerate(N_NEURONS):
    print('Testing', h, 'neurons...')
    # NOTE(review): presumably 13 inputs and 3 outputs, matching the 16
    # columns of `datas` -- confirm against the mlp / cv helper modules.
    nn = mlp.MLP([13, h,3, 3], 'tanh')

    for i in np.arange(N_TESTS):
        # The network has to be reinitialised before each repetition.
        nn.init_weights()
        # The *_per_epoch variant is used instead of cv.k_fold_cross_validation
        # because it returns one error value per epoch.
        temp1, temp2 = cv.k_fold_cross_validation_per_epoch(nn, datas, **cv_options)
        # temp1 / temp2 are the training and test errors, one value per epoch.
        MSE_train[i_h, :, i] = temp1
        MSE_test[i_h, :, i] = temp2

print("Done !")

In [None]:
# Aggregate the repetitions (last axis): mean and standard deviation per epoch.
MSE_train_mean = MSE_train.mean(axis=2)
MSE_test_mean = MSE_test.mean(axis=2)
MSE_train_sd = MSE_train.std(axis=2)
MSE_test_sd = MSE_test.std(axis=2)

# Common y-range so the panels can be compared with each other.
v_min = min(MSE_train_mean.min(), MSE_test_mean.min())
v_max = max(MSE_train_mean.max(), MSE_test_mean.max())

# Grid of panels, at most three per row.
n_rows = int(np.ceil(len(N_NEURONS)/3.0))
n_cols = min(3, len(N_NEURONS))
epoch_axis = np.arange(EPOCHS)

pl.figure(figsize=(12,3*n_rows))
for i_n, n in enumerate(N_NEURONS):
    pl.subplot(n_rows, n_cols, i_n+1)
    bands = (
        (MSE_train_mean[i_n,:], MSE_train_sd[i_n,:], 'blue', 'Train'),
        (MSE_test_mean[i_n,:], MSE_test_sd[i_n,:], 'red', 'Test'),
    )
    for band_mean, band_sd, band_colour, band_label in bands:
        # Two half-bands (mean..mean+sd and mean..mean-sd); only the upper one
        # carries the legend label so each curve appears once in the legend.
        pl.fill_between(epoch_axis, band_mean, band_mean+band_sd, facecolor=band_colour, alpha=0.5, label=band_label)
        pl.fill_between(epoch_axis, band_mean, band_mean-band_sd, facecolor=band_colour, alpha=0.5)
    pl.ylim(v_min,0.8*v_max)
    pl.ylabel('MSE')
    pl.xlabel('Number of epochs')
    pl.title(str(K)+'-fold CV with '+str(n)+' hidden neurons')
    pl.legend()
    pl.grid()
pl.tight_layout()

In [None]:
# Heat maps of the mean error (rows: entries of N_NEURONS, columns: epochs),
# training on top and test below. The colour scale is clipped at the 90th
# percentile of each matrix.
pl.figure(figsize=(15,8))
panels = ((MSE_train_mean, 'Training'), (MSE_test_mean, 'Test'))
for panel_index, (heat, heading) in enumerate(panels, start=1):
    pl.subplot(2,1,panel_index)
    pl.imshow(heat, vmin=np.min(heat), vmax=np.percentile(heat, 90), aspect=3, interpolation='nearest')
    pl.yticks(np.arange(len(N_NEURONS)), N_NEURONS)
    pl.xlabel('Epochs')
    pl.ylabel('Number of hidden Neurons')
    pl.title(heading)
    pl.colorbar()
pl.tight_layout()

# The final model

In [139]:
# Feature matrices to evaluate, keyed by the label printed with their results.
datasets = {'Over sampling data' : overSamplingDatas, 
            'Under sampling data2' : underSamplingDatas, 
            'Natural datas' : datas}

(648, 16)
(216, 16)
(360, 16)


In [140]:
# Settings of the final model. They are the same for every dataset, so they
# are set once instead of on every loop iteration.
# NOTE(review): EPOCHS, MSE_train and MSE_test reuse names assigned by the
# exploration cells above; re-running those plotting cells after this one
# would pick up the values assigned here.
LEARNING_RATE = 0.001
MOMENTUM = 0.85
THRESHOLD = 0.5
EPOCHS = 80
K = 5

# Train and evaluate one network per dataset and report its per-class scores.
# The loop variable is named `dataset` so that the notebook-level `datas`
# array is not rebound by the loop.
for key, dataset in datasets.items():
    nn = mlp.MLP([13,5,3,3], 'tanh')

    MSE_train, MSE_test, conf_mat, target_test, output_test = cv.k_fold_cross_validation(nn,
                                                              dataset,
                                                              k=K,
                                                              learning_rate=LEARNING_RATE,
                                                              momentum=MOMENTUM,
                                                              epochs=EPOCHS,
                                                              threshold=THRESHOLD)

    print('################')
    print(key)
    print('################')
    print('MSE training: ', MSE_train)
    print('MSE test: ', MSE_test)
    print('Confusion matrix:')
    print(conf_mat)

    # NOTE(review): the formulas below assume rows of conf_mat are the true
    # classes and columns the predicted ones -- confirm against the cv module.
    tp = np.diag(conf_mat)
    fp = np.sum(conf_mat, axis=0) - tp
    fn = np.sum(conf_mat, axis=1) - tp

    # Guarded divisions: a class that is never predicted (or never present)
    # scores 0 instead of producing NaN and a runtime warning.
    predicted = tp + fp
    actual = tp + fn
    precision = np.divide(tp, predicted, out=np.zeros(len(tp)), where=predicted != 0)
    recall = np.divide(tp, actual, out=np.zeros(len(tp)), where=actual != 0)
    pr_sum = recall + precision
    fscore = np.divide(2 * recall * precision, pr_sum, out=np.zeros(len(tp)), where=pr_sum != 0)

    print("Classes: woman - man - kid")
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F-score: ", fscore)


################
Over sampling data
################
MSE training:  0.043883148373848456
MSE test:  0.06748296266030415
Confusion matrix:
[[187.   0.  40.]
 [  0. 213.   3.]
 [ 54.   2. 175.]]
Classes: woman - man - kid
Precision:  [0.77593361 0.99069767 0.80275229]
Recall:  [0.82378855 0.98611111 0.75757576]
F-score:  [0.7991453  0.98839907 0.77951002]
################
Under sampling data2
################
MSE training:  0.048010633610088636
MSE test:  0.09824812961049248
Confusion matrix:
[[60.  1. 16.]
 [ 4. 69.  3.]
 [24.  2. 55.]]
Classes: woman - man - kid
Precision:  [0.68181818 0.95833333 0.74324324]
Recall:  [0.77922078 0.90789474 0.67901235]
F-score:  [0.72727273 0.93243243 0.70967742]
################
Natural datas
################
MSE training:  0.05100534787344356
MSE test:  0.09173539176376391
Confusion matrix:
[[ 35.   1.  38.]
 [  2.  69.   4.]
 [ 26.   3. 189.]]
Classes: woman - man - kid
Precision:  [0.55555556 0.94520548 0.81818182]
Recall:  [0.47297297 0.92       0.