**<center><h1>Speaker Classification with Deep Learning</h1></center>**
**<center><h2>Speech Technology Assignment 2023-24</h2></center>**
**<center><h3>Matthias Bartolo</h3></center>**

**<h3>Package Installation</h3>**

In [1]:
# !pip install librosa
# !pip install tensorflow
# !pip install keras
# !pip install matplotlib
# !pip install numpy

**<h3>Package Imports</h3>**

In [2]:
import os
import random
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import warnings

warnings.filterwarnings('ignore')

# Declaring constants
SAMPLE_RATE = 16000
N_MELS = 128
MEL_SPEC_FRAME_SIZE = 1024
NUM_CLASSES = 285

**<h3>Loading and Filtering Dataset</h3>**

In [3]:
def get_speaker_roots_in_data_path(datapath=os.path.join(os.getcwd(), 'ABI-1 Corpus\\accents')):
    """Function to get the list of speaker roots in the data path.
    
    Args:
        datapath (str): Path to the data folder.

    Returns:
        speaker_list (list): List of speaker roots in the data path.
    """
    # Declaring the list of speakers
    speaker_list = []

    # Retrieving the list of accent subfolders
    accent_subfolders = [f.path for f in os.scandir(datapath) if f.is_dir()]
    
    # Iterating through the accent subfolders
    for accent in accent_subfolders:
        # Iterating through the gender
        for gender in ['female', 'male']:
            # Retrieving the list of speaker folders
            speaker_folders = os.listdir(os.path.join(accent, gender))
            
            # Iterating through the speaker folders
            for speaker in speaker_folders:
                # Checking if the speaker folder is not a hidden folder
                if not speaker.startswith('.'):
                    speaker_list.append(os.path.join(accent, gender, speaker))
    
    # Returning the list of speakers
    return speaker_list

In [4]:
def get_wav_files_in_path(datapath):
    """Function to get the list of wav files in the data path.

    Args:
        datapath (str): Path to the data folder.
    
    Returns:
        wav_files (list): List of wav files in the data path.
    """
    # Retrieving the list of files in the data path
    files = os.listdir(datapath)

    # Filtering the list of files to get only the wav files
    wav_files = [f for f in files if f.endswith('.wav')]

    # Appending the path to the wav files
    wav_files = [os.path.join(datapath, f) for f in wav_files]
    
    # Returning the list of wav files
    return wav_files

In [5]:
# Retrieving the list of speaker roots in the data path
speaker_roots = get_speaker_roots_in_data_path()
print('\033[1m' + 'Number of speakers found: ' + '\033[0m' + str(len(speaker_roots)))

# Retrieving the list of wav files in the data path
wav_files = []

# Iterating through the speaker roots
for speaker_root in speaker_roots:
    # Retrieving the list of wav files in the speaker root
    wav_files.extend(get_wav_files_in_path(speaker_root))

print('\033[1m' + 'Number of wav files found: ' + '\033[0m' + str(len(wav_files)))

# Declaring constant to hold the number of classes
NUM_CLASSES = len(speaker_roots)

[1mNumber of speakers found: [0m285
[1mNumber of wav files found: [0m2850


**<h3>Preprocessing Data, Chunking and Dataset Splitting</h3>**

In [6]:
def display_spectrogram(spectrogram, sampling_rate=SAMPLE_RATE, hop_length=160, y_axis='linear', title='Linear Spectrogram'):
    """Function to display the spectrogram.
    
    Args:
        spectrogram (numpy.ndarray): Spectrogram to be displayed.
        sampling_rate (int): Sampling rate of the audio (default is 16000).
        hop_length (int): Hop length of the spectrogram (default is 160).
        y_axis (str): Type of y-axis to be displayed (default is linear).
        title (str): Title of the plot (default is Linear Spectrogram).
    """
    # Setting the figure size
    plt.figure(figsize=(20, 8))

    # Setting the title
    plt.xlabel('Time')

    # Setting the y-axis
    plt.ylabel('Mel-Frequency')
    
    # Displaying the spectrogram
    librosa.display.specshow(spectrogram,
                            y_axis=y_axis,
                            fmax=sampling_rate / 2,
                            sr=sampling_rate,
                            hop_length=hop_length,
                            x_axis='time')
    
    # Displaying the colorbar
    plt.colorbar(format='%+2.0f dB')
    
    # Displaying the title
    plt.title(title)
    
    # Displaying the plot
    plt.show()

In [7]:
def chunk_audio(audio_path, chunk_size=3, plot=False):
    """Function to chunk the audio file into specified-second segments.
    
    Args:
        audio_path (str): Path to the audio file.
        chunk_size (int): Duration of each audio chunk in seconds (default is 3 seconds).
        plot (bool): Flag to plot the audio chunks (default is False).
    
    Returns:
        audio_chunks (list): List of audio chunks.
    """
    # Reading the audio file, whilst ensuring the sampling rate is 16kHz
    audio, sampling_rate = librosa.load(audio_path, sr=SAMPLE_RATE)

    # Preprocessing the audio by normalizing the audio
    audio /= np.max(np.abs(audio), axis=0)
    
    # Calculating the number of samples per chunk
    samples_per_chunk = int(sampling_rate * chunk_size)
    
    # Calculating the number of chunks
    num_chunks = int(np.floor(len(audio) / samples_per_chunk))
    
    # Initializing the list of audio chunks
    audio_chunks = []
    
    # Iterating through the audio chunks
    for i in range(num_chunks):
        # Calculating the start and end sample
        start_sample = i * samples_per_chunk
        end_sample = (i + 1) * samples_per_chunk

        # Calculating the audio chunk
        audio_chunk = audio[start_sample:end_sample]

        # Extracting the mel spectrogram using librosa
        mel_spectrogram = librosa.feature.melspectrogram(y=audio_chunk, 
                                                         sr=sampling_rate,
                                                         center=True,
                                                         n_fft=MEL_SPEC_FRAME_SIZE,
                                                         hop_length=int(MEL_SPEC_FRAME_SIZE / 2),
                                                         n_mels=N_MELS)

        # Converting the raw amplitude results to decibels (log scale)
        mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=1.0)

        # Plotting the spectrogram
        if plot:
            display_spectrogram(mel_spectrogram, sampling_rate=sampling_rate, title='Mel Spectrogram of Audio Chunk ' + str(i + 1))

        # Appending the audio chunk to the list of audio chunks
        audio_chunks.append(mel_spectrogram)
    
    # Returning the list of audio chunks
    return audio_chunks

In [8]:
def preprocess_data(speaker_roots, training_set_ratio, validation_set_ratio, testing_set_ratio, do_display=False, plot=False):
    """"Function to preprocess, chunk and split the data.

    Args:
        speaker_roots (list): List of speaker roots in the data path.
        training_set_ratio (float): Ratio of the training set.
        validation_set_ratio (float): Ratio of the validation set.
        testing_set_ratio (float): Ratio of the testing set.
        do_display (bool): Boolean to display the audio chunks (default is False).
        plot (bool): Boolean to plot the spectrogram (default is False).

    Returns:
        training_set (list): List of training examples.
        validation_set (list): List of validation examples.
        testing_set (list): List of testing examples.
    """
    # Error checking for the ratios
    if training_set_ratio + validation_set_ratio + testing_set_ratio != 1:
        raise ValueError('The sum of the ratios must be equal to 1.')
    
    # Creating dictionary to store the speak to utterances mapping
    speaker_to_utterances = {}

    # Retrieving the list of speakers through the speaker roots, and giving each speaker a unique ID, since one of the speakers has the same name
    speakers ={speaker_root.split('\\')[-1]+str(unique_id): speaker_root for unique_id, speaker_root in enumerate(speaker_roots)}
  
    # Iterating through the speakers
    for speaker, speaker_root in speakers.items():
        if do_display:
            # Printing the speaker being processed
            print_message = '\033[32m' + 'Executing Speaker: ' + '\033[0m' + speaker + '\t {} / {}'.format(speaker_roots.index(speaker_root) + 1, len(speaker_roots))
            print(print_message)
            print('-' * len(print_message))

        # Retrieving the list of wav files in the speaker root
        speaker_wav_files = get_wav_files_in_path(speaker_root)
        
        # Initializing the list of utterances
        utterances = []
        
        # Iterating through the wav files
        for wav_file in speaker_wav_files:
            # Chunking the audio file into 3 seconds segments
            utterances.extend(chunk_audio(wav_file, plot=plot))
        
        # Appending the list of utterances to the dictionary
        speaker_to_utterances[speaker] = utterances

    # Calculating the total number of utterances
    total_num_utterances = sum([len(utterances) for utterances in speaker_to_utterances.values()])

    # Calculating the number of utterances for each set
    num_training_utterances = int(total_num_utterances * training_set_ratio)
    num_validation_utterances = int(total_num_utterances * validation_set_ratio)

    # Sorting the speakers by the number of utterances
    sorted_speakers = sorted(speaker_to_utterances.items(), key=lambda item: len(item[1]), reverse=True)

    # Initializing the relevant sets
    training_set = []
    validation_set = []
    testing_set = []

    # Declaring counter to keep track of the number of utterances
    train_num_utterances = 0
    val_num_utterances = 0
    test_num_utterances = 0

    # Assigning speakers to the training set
    while len(training_set) < num_training_utterances:
        # Retrieving the speaker and utterances
        speaker, utterances = sorted_speakers.pop(0)

        # Appending the utterances to the training set
        training_set.extend([(utterance, speaker) for utterance in utterances])

        # Updating the number of utterances in the training set
        train_num_utterances += len(utterances)
    
    # Assigning speakers to the validation set
    while len(validation_set) < num_validation_utterances:
        # Retrieving the speaker and utterances
        speaker, utterances = sorted_speakers.pop(0)

        # Appending the utterances to the validation set
        validation_set.extend([(utterance, speaker) for utterance in utterances])

        # Updating the number of utterances in the validation set
        val_num_utterances += len(utterances)

    # Assigning speakers to the testing set
    for speaker, utterances in sorted_speakers:
        # Appending all the remaining utterances to the testing set
        testing_set.extend([(utterance, speaker) for utterance in utterances])

        # Updating the number of utterances in the testing set
        test_num_utterances += len(utterances)

    # Shuffling the relevant sets
    random.shuffle(training_set)
    random.shuffle(validation_set)
    random.shuffle(testing_set)

    # Displaying the number of utterances in each set
    if do_display:
        print('\033[35m' + 'Percentage of utterances in each set:' + '\033[0m')
        print('\033[35m' + 'Training Set: ' + '\033[0m' + '{:.2%}'.format(train_num_utterances / total_num_utterances))
        print('\033[35m' + 'Validation Set: ' + '\033[0m' + '{:.2%}'.format(val_num_utterances / total_num_utterances))
        print('\033[35m' + 'Testing Set: ' + '\033[0m' + '{:.2%}'.format(test_num_utterances / total_num_utterances))
    
    # Returning the relevant sets
    return training_set, validation_set, testing_set

In [9]:
# Calling the preprocess_data function
training_set, validation_set, testing_set = preprocess_data(speaker_roots, 0.6, 0.2, 0.2, do_display=True, plot=False)

[32mExecuting Speaker: [0malw0010	 1 / 285
--------------------------------------------
[32mExecuting Speaker: [0mcxb0011	 2 / 285
--------------------------------------------
[32mExecuting Speaker: [0mjah0012	 3 / 285
--------------------------------------------
[32mExecuting Speaker: [0mjep0013	 4 / 285
--------------------------------------------
[32mExecuting Speaker: [0mknb0014	 5 / 285
--------------------------------------------
[32mExecuting Speaker: [0mlcg0015	 6 / 285
--------------------------------------------
[32mExecuting Speaker: [0mlst0016	 7 / 285
--------------------------------------------
[32mExecuting Speaker: [0mmjd0017	 8 / 285
--------------------------------------------
[32mExecuting Speaker: [0mmpt0018	 9 / 285
--------------------------------------------
[32mExecuting Speaker: [0mrkk0019	 10 / 285
---------------------------------------------
[32mExecuting Speaker: [0majh00110	 11 / 285
----------------------------------------------
[32

**<h3>Speaker Identification (SID) Model Design and Implementation</h3>**

In [10]:
# Defining the model
model = keras.Sequential()

# Input layer
model.add(keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 47, 1)))

# Hidden layers
model.add(keras.layers.Conv2D(64, (3, 3), activation='relu'))
model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))

model.add(keras.layers.Conv2D(64, (3, 3), activation='relu'))
model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))

# Adding an LSTM layer
model.add(keras.layers.Reshape((64, 90*3)))
model.add(keras.layers.LSTM(64, return_sequences=True))  # You can adjust the number of LSTM units as needed

# Flattening the output
model.add(keras.layers.Flatten())

model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dropout(0.5))  # Dropout layer for regularization
model.add(keras.layers.Dense(10, activation='softmax'))  # Output layer with 10 classes

# Compiling the model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Printing the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 126, 45, 32)       320       
                                                                 
 conv2d_1 (Conv2D)           (None, 124, 43, 64)       18496     
                                                                 
 max_pooling2d (MaxPooling2  (None, 62, 21, 64)        0         
 D)                                                              
                                                                 
 conv2d_2 (Conv2D)           (None, 60, 19, 64)        36928     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 30, 9, 64)         0         
 g2D)                                                            
                                                                 
 reshape (Reshape)           (None, 64, 270)           0

In [11]:
# Extracting all unique y_trian labels
y_train_labels = set(label for _, label in training_set)
print('\033[1m' + 'Number of unique labels in the training set: ' + '\033[0m' + str(len(y_train_labels)))

# Extracting all unique y_val labels
y_val_labels = set(label for _, label in validation_set)
print('\033[1m' + 'Number of unique labels in the validation set: ' + '\033[0m' + str(len(y_val_labels)))

# Extracting all unique y_test labels
y_test_labels = set(label for _, label in testing_set)
print('\033[1m' + 'Number of unique labels in the testing set: ' + '\033[0m' + str(len(y_test_labels)))

# Printing total number of unique labels
print('\033[1m' + 'Total number of unique labels: ' + '\033[0m' + str(len(y_train_labels | y_val_labels | y_test_labels)))

[1mNumber of unique labels in the training set: [0m143
[1mNumber of unique labels in the validation set: [0m66
[1mNumber of unique labels in the testing set: [0m76
[1mTotal number of unique labels: [0m285


In [14]:
# Preparing the training, validation and testing sets
x_train = np.array([utterance for utterance, speaker in training_set])
y_train = np.array([speaker for utterance, speaker in training_set])

x_val = np.array([utterance for utterance, speaker in validation_set])
y_val = np.array([speaker for utterance, speaker in validation_set])

x_test = np.array([utterance for utterance, speaker in testing_set])
y_test = np.array([speaker for utterance, speaker in testing_set])


In [15]:
print(x_train)
print(x_train.shape)
print(y_train)
print(y_train.shape)

[[[-14.032067    -8.648956    -9.814945   ... -18.148458   -18.78042
   -20.902922  ]
  [-13.273803   -14.622496   -22.493608   ... -23.861475   -18.785421
   -22.297794  ]
  [-13.301507   -29.018728   -36.8356     ... -30.761219   -12.55027
   -19.129429  ]
  ...
  [-30.591118   -24.632444   -27.777576   ... -39.230064   -28.173628
   -33.921585  ]
  [-30.857285   -28.413536   -37.362095   ... -50.90469    -41.79158
   -46.69903   ]
  [-39.880646   -52.014168   -50.204197   ... -66.41766    -63.736988
   -60.426186  ]]

 [[-26.324398   -16.26928    -16.446526   ... -17.862951    -4.651102
    -5.9604015 ]
  [-25.9967     -23.202648   -27.995464   ... -23.575256    -8.9940815
   -14.820309  ]
  [-30.850307   -13.072872    -9.059519   ... -25.829401   -12.41645
   -32.17312   ]
  ...
  [-59.134087   -56.37001    -54.937584   ... -57.61374    -52.293274
   -59.372677  ]
  [-61.98456    -60.104782   -54.905487   ... -58.8386     -58.78263
   -56.735397  ]
  [-64.14887    -65.33355    -64.

In [None]:
# Training the model
history = model.fit(x_train, y_train, batch_size=32, epochs=100, validation_data=(x_val, y_val))

# Plotting the training and validation accuracy
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


# Plotting the training and validation loss
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()