# Importing necessary libraries

In [1]:
import numpy as np
import scipy.io
from python_speech_features import mfcc
import matplotlib.pyplot as plt
import IPython.display as ipd
import os
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import soundfile as sf
import random
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import joblib
import math
import statistics




# Defining the function that reads the audio files from a given path using soundfile and returns four lists: audios, freqs, filepaths, and problematic files (kept for potential debugging)

In [2]:

def read_audios(directory):
    """Recursively load every ``.wav`` file found under ``directory``.

    Parameters
    ----------
    directory : str
        Root folder that is walked with ``os.walk``.

    Returns
    -------
    audios : list
        Sample data returned by ``sf.read`` for each file that loaded.
    freqs : list
        Sampling rate returned by ``sf.read`` for each file that loaded.
    filepaths : list of str
        Path of each file that loaded, index-aligned with ``audios`` and
        ``freqs`` so the three lists can be zipped together safely.
    problematic_files : list of str
        Paths of ``.wav`` files whose read raised an exception.
    """
    audios = []
    freqs = []
    filepaths = []
    problematic_files = []

    # Walk the dataset folder and try to read each file that has the .wav extension
    for root, dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith('.wav'):
                continue
            filepath = os.path.join(root, file)
            try:
                # Read the audio file using soundfile
                data, freq = sf.read(filepath)
            except Exception as e:
                # Best effort: report the failure, remember the file, keep going
                print(f"Encountered an error with file: {filepath}. Error: {e}")
                problematic_files.append(filepath)
                continue
            # Record the path only after a successful read; appending it before
            # the read would leave filepaths longer than audios/freqs and shift
            # every later recording onto the wrong path.
            audios.append(data)
            freqs.append(freq)
            filepaths.append(filepath)

    return audios, freqs, filepaths, problematic_files

# Example usage: load every .wav file found under the relative folder 'Spanish_wav'.
# NOTE(review): the same call is repeated in a later cell, so a full run reads the dataset twice.
audios, freqs, filepaths, problematic_files = read_audios(r'Spanish_wav')


# Defining the function that extracts the MFCC features, removes the silence frames, and finally saves the MFCC features to a comma-delimited text file (`.mfcc`) in a per-gender directory

In [3]:
def extractMfccs_RemoveSilence_saveMfccs(audios,freqs,filepaths, directory):
    """Compute MFCCs per recording, drop low-energy frames, and save them by gender.

    Parameters
    ----------
    audios, freqs, filepaths : sequences
        Index-aligned signals, sampling rates and source paths; they are
        consumed together with ``zip``.
    directory : str
        Output root. Files are written to ``<directory>/Hommes`` or
        ``<directory>/Femmes`` depending on which lower-case marker
        ('hommes' / 'femmes') appears in the source path.

    Returns
    -------
    list
        One array of kept MFCC frames per recording, in input order.
        Recordings whose path contains neither marker are still returned
        here but are not written to disk.
    """
    voiced_per_file = []
    # (marker searched for in the source path, output sub-folder); checked in this order
    gender_markers = (('hommes', 'Hommes'), ('femmes', 'Femmes'))

    for signal, rate, wav_path in zip(audios, freqs, filepaths):
        # 13 cepstral coefficients per frame, winlen=0.025 and winstep=0.01
        all_frames = mfcc(signal, rate, winlen=0.025, winstep=0.01, numcep=13, nfilt=26, nfft=2048,
                          lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=False)

        # The "energy" used here is the sum of squared MFCC coefficients of each frame
        frame_energy = np.sum(all_frames**2, axis=1)
        # A frame is kept when its energy exceeds 40% of the recording's mean energy
        silence_cutoff = np.mean(frame_energy) * 0.4
        is_voiced = frame_energy > silence_cutoff
        kept_frames = all_frames[is_voiced, :]
        voiced_per_file.append(kept_frames)

        # Report the frame count before and after the silence filter
        print(f"MFCCs before removing silence: {all_frames.shape}")
        print(f"MFCCs after removing silence: {kept_frames.shape}")

        # Derive the gender sub-folder from the source path; skip saving when unknown
        out_folder_name = next(
            (folder for marker, folder in gender_markers if marker in wav_path),
            None,
        )
        if out_folder_name is None:
            continue

        out_dir = os.path.join(directory, out_folder_name)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        # <original base name>.mfcc, stored as comma-delimited text
        stem = os.path.splitext(os.path.basename(wav_path))[0]
        np.savetxt(os.path.join(out_dir, stem + ".mfcc"), kept_frames, delimiter=',')

    return voiced_per_file

# Defining the function that splits the extracted MFCCs into training and testing sets: 2/3 of the male files and 2/3 of the female files go to training, and the remaining 1/3 of each goes to testing

In [4]:
def train_test_split(mfcc_dir, seed=None):
    """Split the saved ``.mfcc`` files into train and test sets, per gender.

    Two thirds of the male files and two thirds of the female files go to the
    training set; the rest go to the test set.

    NOTE: this definition replaces the ``train_test_split`` imported from
    ``sklearn.model_selection`` at the top of the notebook.

    Parameters
    ----------
    mfcc_dir : str
        Root folder walked with ``os.walk``. A file counts as male when its
        containing path includes 'Hommes' and as female when it includes
        'Femmes'.
    seed : int or None, optional
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        split is reproducible. ``None`` (default) shuffles with the global
        ``random`` module, as before.

    Returns
    -------
    train_mfccs, test_mfccs : list of numpy.ndarray
        One 2-D array per file (male files first, then female files).
    """
    # create separate lists for male and female file paths
    male_files = []
    female_files = []
    for root, dirs, files in os.walk(mfcc_dir):
        for file in files:
            if file.endswith('.mfcc'):
                if 'Hommes' in root:
                    male_files.append(os.path.join(root, file))
                elif 'Femmes' in root:
                    female_files.append(os.path.join(root, file))

    # os.walk order depends on the filesystem; sort first so that a given seed
    # always produces the same split
    male_files.sort()
    female_files.sort()

    # shuffle the male and female lists independently
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(male_files)
    shuffler.shuffle(female_files)

    # split each list 2/3 train, 1/3 test (integer arithmetic, rounds down)
    male_cut = 2 * len(male_files) // 3
    female_cut = 2 * len(female_files) // 3
    male_train, male_test = male_files[:male_cut], male_files[male_cut:]
    female_train, female_test = female_files[:female_cut], female_files[female_cut:]

    # merge the train and test lists for both male and female
    train_files = male_train + female_train
    test_files = male_test + female_test

    # load the MFCC features; ndmin=2 keeps a single-frame file as one row
    # instead of collapsing it to a 1-D vector
    train_mfccs = [np.loadtxt(path, delimiter=',', ndmin=2) for path in train_files]
    test_mfccs = [np.loadtxt(path, delimiter=',', ndmin=2) for path in test_files]

    # Report the number of files in each subset. The per-file arrays have
    # different frame counts, so wrapping the lists in np.array() warns on
    # older NumPy and raises ValueError on NumPy >= 1.24; the counts are
    # printed in the same "(n,)" form instead.
    print(f"Train male MFCCs shape: {(len(male_train),)}")
    print(f"Test male MFCCs shape: {(len(male_test),)}")
    print(f"Train female MFCCs shape: {(len(female_train),)}")
    print(f"Test female MFCCs shape: {(len(female_test),)}")
    print(f"Train MFCCs shape: {(len(train_mfccs),)}")
    print(f"Test MFCCs shape: {(len(test_mfccs),)}")

    return train_mfccs, test_mfccs


# Defining the functions that train the different GMM models and then save each of them as a pkl file

In [5]:
def gmm16(train_mfccs):
    """Fit a 16-component, diagonal-covariance GMM and persist it.

    ``train_mfccs`` is handed directly to ``GaussianMixture.fit``. The fitted
    model is written with joblib to 'gmm_model16_spanish.pkl' (relative to the
    working directory) and also returned.
    """
    n_gaussians = 16
    mixture = GaussianMixture(
        n_components=n_gaussians,
        covariance_type='diag',
        random_state=0,  # fixed seed so the initialisation is repeatable
    )
    mixture.fit(train_mfccs)

    # Persist the trained mixture
    joblib.dump(mixture, r'gmm_model16_spanish.pkl')
    return mixture


In [6]:
def gmm32(train_mfccs):
    """Fit a 32-component, diagonal-covariance GMM and persist it.

    ``train_mfccs`` is handed directly to ``GaussianMixture.fit``. The fitted
    model is written with joblib to 'gmm_model32_spanish.pkl' (relative to the
    working directory) and also returned.
    """
    n_gaussians = 32
    mixture = GaussianMixture(
        n_components=n_gaussians,
        covariance_type='diag',
        random_state=0,  # fixed seed so the initialisation is repeatable
    )
    mixture.fit(train_mfccs)

    # Persist the trained mixture
    joblib.dump(mixture, r'gmm_model32_spanish.pkl')
    return mixture

In [7]:
def gmm64(train_mfccs):
    """Fit a 64-component, diagonal-covariance GMM and persist it.

    ``train_mfccs`` is handed directly to ``GaussianMixture.fit``. The fitted
    model is written with joblib to 'gmm_model64_spanish.pkl' (relative to the
    working directory) and also returned.
    """
    n_gaussians = 64
    mixture = GaussianMixture(
        n_components=n_gaussians,
        covariance_type='diag',
        random_state=0,  # fixed seed so the initialisation is repeatable
    )
    mixture.fit(train_mfccs)

    # Persist the trained mixture
    joblib.dump(mixture, r'gmm_model64_spanish.pkl')
    return mixture

In [8]:
def gmm128(train_mfccs):
    """Fit a 128-component, diagonal-covariance GMM and persist it.

    ``train_mfccs`` is handed directly to ``GaussianMixture.fit``. The fitted
    model is written with joblib to 'gmm_model128_spanish.pkl' (relative to the
    working directory) and also returned.
    """
    n_gaussians = 128
    mixture = GaussianMixture(
        n_components=n_gaussians,
        covariance_type='diag',
        random_state=0,  # fixed seed so the initialisation is repeatable
    )
    mixture.fit(train_mfccs)

    # Persist the trained mixture
    joblib.dump(mixture, r'gmm_model128_spanish.pkl')
    return mixture

In [9]:
def gmm256(train_mfccs):
    """Fit a 256-component, diagonal-covariance GMM and persist it.

    ``train_mfccs`` is handed directly to ``GaussianMixture.fit``. The fitted
    model is written with joblib to 'gmm_model256_spanish.pkl' (relative to the
    working directory) and also returned.
    """
    n_gaussians = 256
    mixture = GaussianMixture(
        n_components=n_gaussians,
        covariance_type='diag',
        random_state=0,  # fixed seed so the initialisation is repeatable
    )
    mixture.fit(train_mfccs)

    # Persist the trained mixture
    joblib.dump(mixture, r'gmm_model256_spanish.pkl')
    return mixture

# Getting the audios, frequencies and filepaths from their directory using the function defined above

In [10]:
# Load the .wav recordings found under 'Spanish_wav'; files that could not be read are listed in problematic_files.
audios, freqs, filepaths, problematic_files = read_audios(r'Spanish_wav')


# Extracting the mfcc features - Removing silence and saving the mfccs

In [11]:
# Writes one comma-delimited .mfcc file per recording under mfcc_spanish/Hommes or mfcc_spanish/Femmes
# and prints each recording's MFCC shape before and after silence removal.
mfccs = extractMfccs_RemoveSilence_saveMfccs(audios, freqs, filepaths,r'mfcc_spanish')

MFCCs before removing silence: (497, 13)
MFCCs after removing silence: (497, 13)
MFCCs before removing silence: (534, 13)
MFCCs after removing silence: (534, 13)
MFCCs before removing silence: (482, 13)
MFCCs after removing silence: (482, 13)
MFCCs before removing silence: (482, 13)
MFCCs after removing silence: (482, 13)
MFCCs before removing silence: (519, 13)
MFCCs after removing silence: (519, 13)
MFCCs before removing silence: (527, 13)
MFCCs after removing silence: (527, 13)
MFCCs before removing silence: (460, 13)
MFCCs after removing silence: (460, 13)
MFCCs before removing silence: (497, 13)
MFCCs after removing silence: (497, 13)
MFCCs before removing silence: (491, 13)
MFCCs after removing silence: (491, 13)
MFCCs before removing silence: (533, 13)
MFCCs after removing silence: (533, 13)
MFCCs before removing silence: (468, 13)
MFCCs after removing silence: (468, 13)
MFCCs before removing silence: (460, 13)
MFCCs after removing silence: (460, 13)
MFCCs before removing silenc

MFCCs before removing silence: (491, 13)
MFCCs after removing silence: (491, 13)
MFCCs before removing silence: (531, 13)
MFCCs after removing silence: (531, 13)
MFCCs before removing silence: (497, 13)
MFCCs after removing silence: (497, 13)
MFCCs before removing silence: (449, 13)
MFCCs after removing silence: (449, 13)
MFCCs before removing silence: (472, 13)
MFCCs after removing silence: (472, 13)
MFCCs before removing silence: (526, 13)
MFCCs after removing silence: (526, 13)
MFCCs before removing silence: (479, 13)
MFCCs after removing silence: (479, 13)
MFCCs before removing silence: (531, 13)
MFCCs after removing silence: (531, 13)
MFCCs before removing silence: (473, 13)
MFCCs after removing silence: (473, 13)
MFCCs before removing silence: (491, 13)
MFCCs after removing silence: (491, 13)
MFCCs before removing silence: (496, 13)
MFCCs after removing silence: (496, 13)
MFCCs before removing silence: (514, 13)
MFCCs after removing silence: (514, 13)
MFCCs before removing silenc

MFCCs before removing silence: (494, 13)
MFCCs after removing silence: (494, 13)
MFCCs before removing silence: (498, 13)
MFCCs after removing silence: (498, 13)
MFCCs before removing silence: (511, 13)
MFCCs after removing silence: (511, 13)
MFCCs before removing silence: (506, 13)
MFCCs after removing silence: (506, 13)
MFCCs before removing silence: (444, 13)
MFCCs after removing silence: (444, 13)
MFCCs before removing silence: (523, 13)
MFCCs after removing silence: (523, 13)
MFCCs before removing silence: (515, 13)
MFCCs after removing silence: (515, 13)
MFCCs before removing silence: (494, 13)
MFCCs after removing silence: (494, 13)
MFCCs before removing silence: (516, 13)
MFCCs after removing silence: (516, 13)
MFCCs before removing silence: (460, 13)
MFCCs after removing silence: (460, 13)
MFCCs before removing silence: (520, 13)
MFCCs after removing silence: (520, 13)
MFCCs before removing silence: (477, 13)
MFCCs after removing silence: (477, 13)
MFCCs before removing silenc

MFCCs before removing silence: (467, 13)
MFCCs after removing silence: (467, 13)
MFCCs before removing silence: (467, 13)
MFCCs after removing silence: (467, 13)
MFCCs before removing silence: (445, 13)
MFCCs after removing silence: (445, 13)
MFCCs before removing silence: (611, 13)
MFCCs after removing silence: (611, 13)
MFCCs before removing silence: (492, 13)
MFCCs after removing silence: (492, 13)
MFCCs before removing silence: (665, 13)
MFCCs after removing silence: (664, 13)
MFCCs before removing silence: (467, 13)
MFCCs after removing silence: (467, 13)
MFCCs before removing silence: (579, 13)
MFCCs after removing silence: (579, 13)
MFCCs before removing silence: (244, 13)
MFCCs after removing silence: (244, 13)
MFCCs before removing silence: (589, 13)
MFCCs after removing silence: (589, 13)
MFCCs before removing silence: (539, 13)
MFCCs after removing silence: (537, 13)
MFCCs before removing silence: (618, 13)
MFCCs after removing silence: (618, 13)
MFCCs before removing silenc

MFCCs before removing silence: (661, 13)
MFCCs after removing silence: (661, 13)
MFCCs before removing silence: (561, 13)
MFCCs after removing silence: (560, 13)
MFCCs before removing silence: (579, 13)
MFCCs after removing silence: (579, 13)
MFCCs before removing silence: (762, 13)
MFCCs after removing silence: (762, 13)
MFCCs before removing silence: (521, 13)
MFCCs after removing silence: (521, 13)
MFCCs before removing silence: (597, 13)
MFCCs after removing silence: (597, 13)
MFCCs before removing silence: (744, 13)
MFCCs after removing silence: (744, 13)
MFCCs before removing silence: (514, 13)
MFCCs after removing silence: (514, 13)
MFCCs before removing silence: (600, 13)
MFCCs after removing silence: (600, 13)
MFCCs before removing silence: (834, 13)
MFCCs after removing silence: (834, 13)
MFCCs before removing silence: (669, 13)
MFCCs after removing silence: (667, 13)
MFCCs before removing silence: (370, 13)
MFCCs after removing silence: (370, 13)
MFCCs before removing silenc

MFCCs before removing silence: (532, 13)
MFCCs after removing silence: (532, 13)
MFCCs before removing silence: (287, 13)
MFCCs after removing silence: (287, 13)
MFCCs before removing silence: (427, 13)
MFCCs after removing silence: (426, 13)
MFCCs before removing silence: (532, 13)
MFCCs after removing silence: (532, 13)
MFCCs before removing silence: (535, 13)
MFCCs after removing silence: (534, 13)
MFCCs before removing silence: (755, 13)
MFCCs after removing silence: (754, 13)
MFCCs before removing silence: (330, 13)
MFCCs after removing silence: (330, 13)
MFCCs before removing silence: (413, 13)
MFCCs after removing silence: (413, 13)
MFCCs before removing silence: (568, 13)
MFCCs after removing silence: (568, 13)
MFCCs before removing silence: (510, 13)
MFCCs after removing silence: (509, 13)
MFCCs before removing silence: (640, 13)
MFCCs after removing silence: (640, 13)
MFCCs before removing silence: (543, 13)
MFCCs after removing silence: (543, 13)
MFCCs before removing silenc

MFCCs before removing silence: (478, 13)
MFCCs after removing silence: (477, 13)
MFCCs before removing silence: (334, 13)
MFCCs after removing silence: (333, 13)
MFCCs before removing silence: (363, 13)
MFCCs after removing silence: (363, 13)
MFCCs before removing silence: (413, 13)
MFCCs after removing silence: (413, 13)
MFCCs before removing silence: (316, 13)
MFCCs after removing silence: (315, 13)
MFCCs before removing silence: (395, 13)
MFCCs after removing silence: (392, 13)
MFCCs before removing silence: (777, 13)
MFCCs after removing silence: (777, 13)
MFCCs before removing silence: (593, 13)
MFCCs after removing silence: (593, 13)
MFCCs before removing silence: (478, 13)
MFCCs after removing silence: (475, 13)
MFCCs before removing silence: (273, 13)
MFCCs after removing silence: (273, 13)
MFCCs before removing silence: (309, 13)
MFCCs after removing silence: (305, 13)
MFCCs before removing silence: (291, 13)
MFCCs after removing silence: (289, 13)
MFCCs before removing silenc

## -----> We can see here that the number of MFCC frames decreases for some recordings after removing the frames of silence (only by a few frames in the output shown; most recordings are unchanged)

# Splitting into test and train sets according to gender

In [12]:
# Calls the train_test_split defined in this notebook (it shadows the scikit-learn import of the same name).
# NOTE(review): no seed is passed for the shuffle, so the file-to-set assignment changes from run to run.
train_mfccs, test_mfccs = train_test_split(r'mfcc_spanish')

Train male MFCCs shape: (239,)
Test male MFCCs shape: (120,)
Train female MFCCs shape: (243,)
Test female MFCCs shape: (122,)
Train MFCCs shape: (482,)
Test MFCCs shape: (242,)


  print(f"Train MFCCs shape: {np.array(train_mfccs).shape}")
  print(f"Test MFCCs shape: {np.array(test_mfccs).shape}")


# Stacking vertically the train and test MFCC features so that we can fit the gmm models

In [13]:
# Stack the per-recording train MFCC matrices vertically into one array
# (np.concatenate takes the list directly, so no copy loop is needed)
mfcc_train = np.concatenate(train_mfccs, axis=0)

# Stack the per-recording test MFCC matrices vertically the same way
mfcc_test = np.concatenate(test_mfccs, axis=0)


# Saving the test set into a txt file 

In [14]:
# Collapse the per-recording test matrices into a single float array and write it
# as comma-delimited text to 'spanishTest'.
# Note: this rebinds test_mfccs from a list of arrays to one stacked array.
test_mfccs = np.array(np.vstack(test_mfccs), dtype=float)
np.savetxt(r'spanishTest', test_mfccs, delimiter=',')



In [15]:
# Display the dimensions of the stacked training matrix
mfcc_train.shape

(273016, 13)

# Training the different Gmm Models 

In [16]:
# NOTE(review): this rebinds the name gmm16 from the training function to the fitted model it returns;
# re-running this cell without first re-running the definition cell would call the model object instead of the function.
gmm16 = gmm16(mfcc_train)

In [17]:
# NOTE(review): this rebinds the name gmm32 from the training function to the fitted model it returns;
# re-running this cell without first re-running the definition cell would call the model object instead of the function.
gmm32 = gmm32(mfcc_train)

In [18]:
# NOTE(review): this rebinds the name gmm64 from the training function to the fitted model it returns;
# re-running this cell without first re-running the definition cell would call the model object instead of the function.
gmm64 = gmm64(mfcc_train)

In [19]:
# NOTE(review): this rebinds the name gmm128 from the training function to the fitted model it returns;
# re-running this cell without first re-running the definition cell would call the model object instead of the function.
gmm128= gmm128(mfcc_train)

In [20]:
# NOTE(review): this rebinds the name gmm256 from the training function to the fitted model it returns;
# re-running this cell without first re-running the definition cell would call the model object instead of the function.
gmm256= gmm256(mfcc_train)

# Evaluate the performance of each GMM model on the test set using the score_samples() function that returns an array containing the log-likelihood of each frame of the mfcc features

In [21]:
# Per-frame log-likelihood of the test set under each fitted model, in order of model size
scores = [
    fitted.score_samples(mfcc_test)
    for fitted in (gmm16, gmm32, gmm64, gmm128, gmm256)
]

# Print the scores
score_labels = ['GMM16 score:', 'GMM32 score:', 'GMM64 score:', 'GMM128 score:', 'GMM256 score:']
for label, frame_scores in zip(score_labels, scores):
    print(label, frame_scores)



GMM16 score: [-49.05870833 -53.82938823 -54.62287386 ... -55.33671129 -54.93640764
 -56.76824437]
GMM32 score: [-49.83049892 -55.05438122 -54.63209593 ... -50.62301822 -51.85412279
 -54.43271046]
GMM64 score: [-49.73172227 -54.93804853 -54.90668681 ... -50.52303528 -51.50889269
 -54.14906249]
GMM128 score: [-51.05292796 -54.20700153 -54.10273388 ... -48.42882178 -49.66743458
 -51.65727999]
GMM256 score: [-51.45463984 -52.62560254 -52.26864691 ... -48.51905157 -48.50956296
 -51.43966293]


# Comparing the size of our mfcc_test set with the size of the scores array 

In [22]:
# Display the dimensions of the stacked test matrix
mfcc_test.shape

(126588, 13)

# Indeed the size of the scores array is the same 

In [23]:
 # Display the dimensions of the first model's score array, for comparison with mfcc_test
 scores[0].shape

(126588,)

# In order to compare the different GMM models we need a single score for the whole test set, which we obtain by taking the mean of the individual per-frame scores

In [24]:
# Calculating the score of the whole test set: the mean of each model's per-frame scores
mean_labels = ['GMM16 score:', 'GMM32 score:', 'GMM64 score:', 'GMM128 score:', 'GMM256 score:']
for position, label in enumerate(mean_labels):
    print(label, scores[position].mean())


GMM16 score: -50.398661978238515
GMM32 score: -49.97678880120494
GMM64 score: -49.52923493099654
GMM128 score: -49.15012612764365
GMM256 score: -48.810046543206504


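### From the results above we can see that the best score (the closest one to 0) is given by the model using 256 gaussians; the mean log-likelihood on the test set improves steadily as the number of gaussians grows from 16 to 256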