In [1]:
import numpy as np
from matplotlib import pyplot as pl
import os
import scipy.io.wavfile as wav

%matplotlib inline

# Directory that is searched for the '*.wav' recordings when the data is loaded
# (relative to the notebook's working directory).
PATH = './vowels/'

In [24]:
import enum

# Module-level tallies, incremented by getGenderFromFilename() on every call.
# The trailing numbers are the author's original notes (presumably the number
# of files expected in each category — TODO confirm against the dataset).
counterF = 0 # female, 72
counterM = 0 # male, 72
counterK = 0 # kid, 216

def getGenderFromFilename(filename):
    """Return the one-hot speaker category encoded in a recording's file name.

    The category is read from the third character of ``filename``:
    ``'f'`` -> female, ``'m'`` -> male, anything else -> kid.

    Side effect: increments the matching module-level counter
    (``counterF``, ``counterM`` or ``counterK``).

    Parameters
    ----------
    filename : str
        File name without extension; must contain at least three characters.

    Returns
    -------
    list of int
        ``[1, 0, 0]`` (female), ``[0, 1, 0]`` (male) or ``[0, 0, 1]`` (kid).

    Raises
    ------
    IndexError
        If ``filename`` has fewer than three characters.
    """
    # Without this declaration the augmented assignments below make the
    # counters function-local and every call fails with UnboundLocalError.
    global counterF, counterM, counterK

    genderCode = filename[2]
    if genderCode == 'f':
        counterF += 1
        return [1, 0, 0] # female
    elif genderCode == 'm':
        counterM += 1
        return [0, 1, 0] # male
    else:
        counterK += 1
        return [0, 0, 1] # kid
    
def getVoiceTypeFromFilename(filename):
    """Return the voice type encoded by the first character of ``filename``.

    A leading ``'s'`` marks a synthetic voice (returns 0); any other first
    character is treated as a real voice (returns 1).
    """
    isSynthetic = filename[0] == 's'
    return 0 if isSynthetic else 1
    
def isAdult(filename):
    """Return True unless the second character of ``filename`` is ``'k'``."""
    ageMarker = filename[1]
    return ageMarker != 'k'


In [25]:
from pathlib import Path
from python_speech_features import mfcc

# One entry per recording, as a tuple of:
#   (category one-hot (female, male or kid), voice type (real or synthetic), MFCC data)
mfccRawDatas = []

# Read all the wav files. glob() gives no ordering guarantee, so the paths are
# sorted to make the load order (and everything derived from it) deterministic.
pathlist = sorted(Path(PATH).glob('*.wav'))
for path in pathlist:
    sample_rate, X = wav.read(str(path))
    # path.stem is the file name without the '.wav' extension; both helpers
    # decode their label from fixed character positions in it.
    category = getGenderFromFilename(path.stem)
    voiceType = getVoiceTypeFromFilename(path.stem)
    mfccRawDatas.append((category, voiceType, mfcc(X, samplerate = sample_rate, nfft = 1024)))

In [76]:
import random
# Fixed seed so the random choice of retained kid recordings is reproducible.
SHUFFLE_SEED = 0

mfccDatas = []
counterK = 0
# Maximum number of kid recordings to keep, so that kids are not
# over-represented compared with the other categories.
maxValueOfGender = 72

# Shuffle a copy with a dedicated, seeded generator: the raw list is left
# untouched and re-running this cell always selects the same recordings.
shuffledRawDatas = random.Random(SHUFFLE_SEED).sample(mfccRawDatas, len(mfccRawDatas))

# Reduce each recording's MFCC data to the mean over all of its windows.
for gender, _voiceType, mfccWindows in shuffledRawDatas:
    # There is too much kid data, so kid recordings beyond the limit are skipped.
    if gender[2] == 1: # kid
        counterK += 1
        if counterK > maxValueOfGender:
            continue
    # Mean over axis 0 (the windows), then append the gender one-hot as the
    # last three columns of the row.
    values = np.append(mfccWindows.mean(axis=0), gender)
    mfccDatas.append(values)

datas = np.asarray(mfccDatas)

In [78]:
# Sanity check of the dataset size: one row per retained recording; each row is
# the per-coefficient MFCC mean followed by the 3-value gender one-hot.
datas.shape

(216, 16)

# Training part

In [79]:
import mlp_backprop_momentum as mlp
import k_fold_cross_validation as cv
%matplotlib inline

In [None]:
# Hyper-parameters of the exploration run.
EPOCHS = 300
LEARNING_RATE = 0.001
MOMENTUM = 0.85

K = 5                    # number of cross-validation folds
N_TESTS = 5              # independent repetitions per configuration
N_NEURONS = [3, 5, 7]    # hidden-layer sizes to compare

N_INPUTS = 13            # feature columns of `datas` fed to the network
N_OUTPUTS = 3            # one output per class (female, male, kid)

# Error curves indexed as [hidden-size index, epoch, repetition].
MSE_train = np.zeros((len(N_NEURONS), EPOCHS, N_TESTS))
MSE_test = np.zeros((len(N_NEURONS), EPOCHS, N_TESTS))

for i_h, h in enumerate(N_NEURONS):                                     # looping the number of hidden neurons
    print('Testing', h, 'neurons...')
    # A single hidden layer of h units. The previous layer list [13, h, 3, 3]
    # contained an extra 3-unit layer, which did not match the
    # "h hidden neurons" labels of the plots nor the final [13, 8, 3] model.
    nn = mlp.MLP([N_INPUTS, h, N_OUTPUTS], 'tanh')

    for i in np.arange(N_TESTS):                                        # looping the tests
        nn.init_weights()                                               # the network has to be reinitialized before each test
        # k_fold_cross_validation_per_epoch (rather than
        # k_fold_cross_validation) is used because it returns one error
        # value per epoch.
        temp1, temp2 = cv.k_fold_cross_validation_per_epoch(nn,
                                                            datas,
                                                            k=K,
                                                            learning_rate=LEARNING_RATE,
                                                            momentum=MOMENTUM,
                                                            epochs=EPOCHS)
        # temp1 and temp2 are the training and test error. One value per epoch
        MSE_train[i_h, :, i] = temp1
        MSE_test[i_h, :, i] = temp2

print("Done !")

In [None]:
# Collapse the repetition axis (last axis) into a mean curve and a standard
# deviation per hidden-layer size and epoch.
MSE_train_mean, MSE_train_sd = np.mean(MSE_train, axis=2), np.std(MSE_train, axis=2)
MSE_test_mean, MSE_test_sd = np.mean(MSE_test, axis=2), np.std(MSE_test, axis=2)

# Common y-range so that all subplots can be compared directly.
v_min = min(np.min(MSE_train_mean), np.min(MSE_test_mean))
v_max = max(np.max(MSE_train_mean), np.max(MSE_test_mean))

# At most three subplots per row.
n_rows = int(np.ceil(len(N_NEURONS)/3.0))
n_cols = min(3, len(N_NEURONS))
epoch_axis = np.arange(EPOCHS)

pl.figure(figsize=(12,3*n_rows))
for i_n, n in enumerate(N_NEURONS):
    pl.subplot(n_rows, n_cols, i_n+1)
    # Train first, then test; each is drawn as a band above and a band below
    # its mean curve, and only the upper band carries the legend label.
    for mean_curve, sd_curve, colour, name in (
            (MSE_train_mean[i_n,:], MSE_train_sd[i_n,:], 'blue', 'Train'),
            (MSE_test_mean[i_n,:], MSE_test_sd[i_n,:], 'red', 'Test')):
        pl.fill_between(epoch_axis, mean_curve, mean_curve+sd_curve, facecolor=colour, alpha=0.5, label=name)
        pl.fill_between(epoch_axis, mean_curve, mean_curve-sd_curve, facecolor=colour, alpha=0.5)
    pl.ylim(v_min,0.8*v_max)
    pl.ylabel('MSE')
    pl.xlabel('Number of epochs')
    pl.title(str(K)+'-fold CV with '+str(n)+' hidden neurons')
    pl.legend()
    pl.grid()
pl.tight_layout()

In [None]:
# Two stacked heat maps (training on top, test below): rows are the hidden-layer
# sizes, columns the epochs. The colour scale of each map is capped at its own
# 90th percentile.
pl.figure(figsize=(15,8))
for panel, (mse_mean, heading) in enumerate(((MSE_train_mean, 'Training'),
                                             (MSE_test_mean, 'Test')), start=1):
    pl.subplot(2,1,panel)
    pl.imshow(mse_mean, vmin=np.min(mse_mean), vmax=np.percentile(mse_mean, 90), aspect=3, interpolation='nearest')
    pl.yticks(np.arange(len(N_NEURONS)), N_NEURONS)
    pl.xlabel('Epochs')
    pl.ylabel('Number of hidden Neurons')
    pl.title(heading)
    pl.colorbar()
pl.tight_layout()

# The final model

In [80]:
# Final network with 'tanh' activation. The list is presumably the layer sizes
# (13 inputs / 8 hidden / 3 outputs) — TODO confirm against mlp_backprop_momentum.
# NOTE(review): 8 is not one of the N_NEURONS values explored in the training
# part ([3, 5, 7]) — confirm how this size was chosen.
nn = mlp.MLP([13,8,3], 'tanh')

In [81]:
# Hyper-parameters of the final cross-validated run.
LEARNING_RATE, MOMENTUM = 0.001, 0.85
THRESHOLD = 0.55
EPOCHS, K = 200, 5

# Run the k-fold cross-validation on the final network, then unpack the five
# results it returns.
cv_results = cv.k_fold_cross_validation(
    nn,
    datas,
    k=K,
    learning_rate=LEARNING_RATE,
    momentum=MOMENTUM,
    epochs=EPOCHS,
    threshold=THRESHOLD,
)
MSE_train, MSE_test, conf_mat, target_test, output_test = cv_results


In [73]:
# Count the test targets per class and show every sample for which more than
# one network output exceeds THRESHOLD.
totalm = totalk = totalf = 0
for i, target in enumerate(target_test):
    output = output_test[i]
    # More than one output above the threshold: print target and raw output.
    if int((output > THRESHOLD).sum()) > 1:
        print(target)
        print(output)
    # Target layout is [female, male, kid]; kid is checked first, then male,
    # and everything else is counted as female.
    if target[2] == 1:
        totalk += 1
    elif target[1] == 1:
        totalm += 1
    else:
        totalf += 1

print(totalm, " ", totalf, " ", totalk)


[0. 0. 1.]
[-0.50075948  0.7040485   0.94049172]
[1. 0. 0.]
[0.62663037 0.0012015  0.61264481]
[0. 1. 0.]
[-0.65841371  0.97545256  0.62126545]
[0. 0. 1.]
[0.59232051 0.00161832 0.56468739]
[0. 1. 0.]
[ 0.6168057   0.96711199 -0.576632  ]
[0. 0. 1.]
[ 5.95534383e-01 -6.30458665e-06  7.37578401e-01]
[1. 0. 0.]
[ 0.64927799 -0.00287468  0.67965334]
[0. 1. 0.]
[-0.67588005  0.9672657   0.72656587]
[0. 0. 1.]
[5.62928977e-01 1.48347986e-04 7.54335053e-01]
[1. 0. 0.]
[0.57212632 0.02914645 0.83852239]
[0. 1. 0.]
[ 0.6134084   0.97613277 -0.71630973]
[1. 0. 0.]
[0.6749109  0.00842074 0.78469935]
[1. 0. 0.]
[-0.43325181  0.90804498  0.7294387 ]
[0. 1. 0.]
[-0.60205932  0.97033009  0.8668079 ]
[0. 0. 1.]
[0.57940308 0.00299608 0.89569312]
[0. 0. 1.]
[0.55709622 0.00519073 0.88287605]
[0. 0. 1.]
[0.57959217 0.00296891 0.89565845]
72   72   216
[0. 1. 0.]
[False  True False]


In [82]:

# Report the errors and the confusion matrix produced by the final
# cross-validation run.
for caption, value in (('MSE training: ', MSE_train), ('MSE test: ', MSE_test)):
    print(caption, value)
print('Confusion matrix:')
print(conf_mat)

MSE training:  0.035653289967389167
MSE test:  0.12650729756400103
Confusion matrix:
[[49.  1. 23.]
 [ 7. 66.  4.]
 [16.  1. 57.]]


In [83]:
# Per-class metrics derived from the confusion matrix.
# NOTE(review): precision is computed from column sums and recall from row
# sums, which assumes rows are the true classes and columns the predicted
# ones — confirm against k_fold_cross_validation.
tp = np.diag(conf_mat)
column_totals = np.sum(conf_mat, axis=0)
row_totals = np.sum(conf_mat, axis=1)
fp = column_totals - tp
fn = row_totals - tp

precision = tp / (tp + fp)
recall = tp / (tp + fn)
fscore = 2 * recall * precision / (recall + precision)

print("Classes: woman - man - kid")
for caption, metric in (("Precision: ", precision),
                        ("Recall: ", recall),
                        ("F-score: ", fscore)):
    print(caption, metric)

Classes: woman - man - kid
Precision:  [0.68055556 0.97058824 0.67857143]
Recall:  [0.67123288 0.85714286 0.77027027]
F-score:  [0.67586207 0.91034483 0.72151899]
