**<center><h1>Speaker Classification with Deep Learning</h1></center>**
**<center><h2>Speech Technology Assignment 2023-24</h2></center>**
**<center><h3>Matthias Bartolo</h3></center>**

**<h3>Package Installation</h3>**

In [1]:
# !pip install librosa

**<h3>Package Imports</h3>**

In [2]:
import os
import random
import librosa
import librosa.display
import numpy as np
import warnings

# Suppressing every warning category for the rest of the session; note that this
# also hides warnings raised by third-party libraries that may point at real problems
warnings.filterwarnings('ignore')

**<h3>Loading and Filtering Dataset</h3>**

In [3]:
def get_speaker_roots_in_data_path(datapath=os.path.join(os.getcwd(), 'ABI-1 Corpus', 'accents')):
    """Function to get the list of speaker roots in the data path.

    The expected layout is ``<datapath>/<accent>/<female|male>/<speaker>``.
    Accent and speaker folders are visited in sorted order so that the result
    (and any speaker-based split derived from it) is reproducible across runs
    and platforms.

    Args:
        datapath (str): Path to the folder holding one sub-folder per accent.
            The default is built with ``os.path.join`` so it works on any
            operating system; it is resolved against the working directory
            at the time this function is defined.

    Returns:
        speaker_list (list): Sorted list of speaker root directories.
    """
    # Declaring the list of speakers
    speaker_list = []

    # Retrieving the accent subfolders; the context manager closes the directory handle promptly
    with os.scandir(datapath) as entries:
        accent_subfolders = sorted(entry.path for entry in entries if entry.is_dir())

    # Iterating through the accent subfolders
    for accent in accent_subfolders:
        # Iterating through the gender
        for gender in ['female', 'male']:
            gender_path = os.path.join(accent, gender)

            # Skipping accents which do not contain this gender folder
            if not os.path.isdir(gender_path):
                continue

            # Iterating through the speaker folders in a deterministic order
            for speaker in sorted(os.listdir(gender_path)):
                speaker_path = os.path.join(gender_path, speaker)

                # Keeping only visible directories (hidden entries and stray files are not speakers)
                if not speaker.startswith('.') and os.path.isdir(speaker_path):
                    speaker_list.append(speaker_path)

    # Returning the list of speakers
    return speaker_list

In [4]:
def get_wav_files_in_path(datapath):
    """Function to get the list of wav files in the data path.

    Only regular, non-hidden files directly inside ``datapath`` are returned.
    The extension check is case-insensitive (``.wav`` and ``.WAV`` both match)
    and the result is sorted so the order does not depend on the file system.

    Args:
        datapath (str): Path to the data folder.

    Returns:
        wav_files (list): Sorted list of full paths to the wav files in the data path.
    """
    # Declaring the list of wav files
    wav_files = []

    # Iterating through the directory entries in a deterministic order
    for file_name in sorted(os.listdir(datapath)):
        # Skipping hidden entries and anything that is not a wav file
        if file_name.startswith('.') or not file_name.lower().endswith('.wav'):
            continue

        # Appending the path to the wav file, ignoring directories which merely end in '.wav'
        file_path = os.path.join(datapath, file_name)
        if os.path.isfile(file_path):
            wav_files.append(file_path)

    # Returning the list of wav files
    return wav_files

In [5]:
# Locating every speaker directory under the default corpus path
speaker_roots = get_speaker_roots_in_data_path()
speaker_count = len(speaker_roots)
print('\033[1m' + 'Number of speakers found: ' + '\033[0m' + str(speaker_count))

# Gathering the wav files of all speakers into a single flat list
wav_files = []
for speaker_root in speaker_roots:
    wav_files += get_wav_files_in_path(speaker_root)

wav_count = len(wav_files)
print('\033[1m' + 'Number of wav files found: ' + '\033[0m' + str(wav_count))

[1mNumber of speakers found: [0m285
[1mNumber of wav files found: [0m2850


**<h3>Preprocessing Data, Chunking and Dataset Splitting</h3>**

In [6]:
def chunk_audio(audio_path, chunk_size=3, sampling_rate=16000, n_fft=512, hop_length=160, win_length=400):
    """Function to chunk the audio file into specified-second segments and convert each to a spectrogram.

    The audio is loaded at ``sampling_rate``, peak-normalised, and cut into
    non-overlapping chunks of ``chunk_size`` seconds. Any trailing samples which
    do not fill a whole chunk are discarded. Each chunk is turned into a
    magnitude STFT expressed in decibels relative to that chunk's own maximum.

    Args:
        audio_path (str): Path to the audio file.
        chunk_size (int or float): Duration of each audio chunk in seconds (default is 3 seconds).
        sampling_rate (int): Sampling rate the audio is resampled to when loading (default is 16kHz).
        n_fft (int): FFT size passed to the STFT (default is 512).
        hop_length (int): Hop length in samples passed to the STFT (default is 160).
        win_length (int): Window length in samples passed to the STFT (default is 400).

    Returns:
        audio_chunks (list): List of decibel-scaled spectrograms, one per audio chunk.

    Raises:
        ValueError: If ``chunk_size`` does not correspond to at least one sample.
    """
    # Reading the audio file, whilst ensuring the requested sampling rate
    audio, sampling_rate = librosa.load(audio_path, sr=sampling_rate)

    # Peak-normalising the audio; empty or completely silent recordings are left
    # untouched, as dividing by a zero peak would fill the signal with NaNs
    peak = np.max(np.abs(audio)) if len(audio) else 0.0
    if peak > 0:
        audio = audio / peak

    # Calculating the number of samples per chunk
    samples_per_chunk = int(sampling_rate * chunk_size)
    if samples_per_chunk <= 0:
        raise ValueError('chunk_size must correspond to at least one sample.')

    # Calculating the number of whole chunks which fit in the audio
    num_chunks = len(audio) // samples_per_chunk

    # Initializing the list of audio chunks
    audio_chunks = []

    # Iterating through the audio chunks
    for i in range(num_chunks):
        # Calculating the start and end sample
        start_sample = i * samples_per_chunk
        end_sample = start_sample + samples_per_chunk

        # Calculating the audio chunk
        audio_chunk = audio[start_sample:end_sample]

        # Extracting the magnitude spectrogram through short-time Fourier transform (STFT) using librosa;
        # the STFT is complex-valued, so the magnitude is taken explicitly before the decibel conversion
        magnitude = np.abs(librosa.stft(audio_chunk, n_fft=n_fft, hop_length=hop_length, win_length=win_length))

        # Converting the raw amplitude results to decibels (log scale)
        spectrogram = librosa.amplitude_to_db(magnitude, ref=np.max)

        # Appending the spectrogram of the audio chunk to the list of audio chunks
        audio_chunks.append(spectrogram)

    # Returning the list of audio chunks
    return audio_chunks

In [7]:
def _speaker_name(speaker_root):
    """Return the last component of a speaker root, accepting both backslash and forward-slash separators."""
    return speaker_root.replace('\\', '/').rstrip('/').split('/')[-1]


def _take_speakers(per_speaker_utterances, start, target):
    """Collect whole speakers from position ``start`` until at least ``target`` utterances are gathered.

    Stops early when the speakers run out. Returns the collected utterances and
    the position of the first speaker which was not consumed.
    """
    collected = []
    position = start
    while len(collected) < target and position < len(per_speaker_utterances):
        collected.extend(per_speaker_utterances[position])
        position += 1
    return collected, position


def preprocess_data(speaker_roots, wav_files, training_set_ratio, validation_set_ratio, testing_set_ratio, do_display=False, seed=None):
    """Function to preprocess, chunk and split the data.

    Every speaker's wav files are chunked into spectrograms, and whole speakers
    are then assigned, in the order given by ``speaker_roots``, to the training,
    validation and testing sets. A speaker never appears in more than one set.
    Because whole speakers are assigned, the actual set sizes can exceed the
    requested ratios, and later sets may end up smaller (or empty).

    Args:
        speaker_roots (list): List of speaker roots in the data path.
        wav_files (list): List of wav files in the data path. Not used by this
            function (the files are re-listed per speaker root); kept so that
            existing callers continue to work.
        training_set_ratio (float): Ratio of the training set.
        validation_set_ratio (float): Ratio of the validation set.
        testing_set_ratio (float): Ratio of the testing set.
        do_display (bool): Boolean to display progress and the resulting percentages (default is False).
        seed (int): Optional seed used to shuffle the sets reproducibly (default is None,
            which uses the global ``random`` state).

    Returns:
        training_set (dict): Dictionary with keys 'data' (list of spectrograms),
            'number of utterances' and 'expected number of utterances'.
        validation_set (dict): Same structure, for the validation set.
        testing_set (dict): Same structure, for the testing set.

    Raises:
        ValueError: If the ratios do not sum to 1, or if the partitioned sets
            do not account for every utterance.
    """
    # Error checking for the ratios, using a tolerance since sums such as 0.7 + 0.2 + 0.1 are not exactly 1 in floating point
    if abs(training_set_ratio + validation_set_ratio + testing_set_ratio - 1) > 1e-9:
        raise ValueError('The sum of the ratios must be equal to 1.')

    # Removing repeated roots whilst keeping the original order; speakers are tracked by their
    # full root so that two speakers sharing an identifier in different folders do not overwrite each other
    unique_roots = list(dict.fromkeys(speaker_roots))

    # Declaring the list which holds the utterances of each speaker, in speaker order
    per_speaker_utterances = []

    # Iterating through the speakers
    for position, speaker_root in enumerate(unique_roots, start=1):
        if do_display:
            # Printing the speaker being processed
            print_message = '\033[32m' + 'Executing Speaker: ' + '\033[0m' + _speaker_name(speaker_root) + '\t {} / {}'.format(position, len(unique_roots))
            print(print_message)
            print('-' * len(print_message))

        # Initializing the list of utterances
        utterances = []

        # Iterating through the wav files in the speaker root
        for wav_file in get_wav_files_in_path(speaker_root):
            # Chunking the audio file into segments
            utterances.extend(chunk_audio(wav_file))

        # Appending the list of utterances of this speaker
        per_speaker_utterances.append(utterances)

    # Calculating the total number of utterances
    total_num_utterances = sum(len(utterances) for utterances in per_speaker_utterances)

    # Declaring the training, validation and testing sets
    training_set = {'data': [], 'number of utterances': 0, 'expected number of utterances': int(total_num_utterances * training_set_ratio)}
    validation_set = {'data': [], 'number of utterances': 0, 'expected number of utterances': int(total_num_utterances * validation_set_ratio)}
    testing_set = {'data': [], 'number of utterances': 0, 'expected number of utterances': int(total_num_utterances * testing_set_ratio)}

    # Partitioning the speakers into training, validation and testing sets based on the ratios
    # and the number of utterances, to ensure that no speaker is shared between sets

    # Adding speakers to the training set until its expected number of utterances is reached
    training_set['data'], current_speaker = _take_speakers(per_speaker_utterances, 0, training_set['expected number of utterances'])

    # Adding speakers to the validation set until its expected number of utterances is reached
    # (or the speakers run out, which can happen when the training set overshoots)
    validation_set['data'], current_speaker = _take_speakers(per_speaker_utterances, current_speaker, validation_set['expected number of utterances'])

    # Adding all the remaining speakers to the testing set
    for utterances in per_speaker_utterances[current_speaker:]:
        testing_set['data'].extend(utterances)

    # Updating the number of utterances in each set
    for partition in (training_set, validation_set, testing_set):
        partition['number of utterances'] = len(partition['data'])

    # Shuffling the different sets, reproducibly when a seed is given
    shuffler = random.Random(seed) if seed is not None else random
    shuffler.shuffle(training_set['data'])
    shuffler.shuffle(validation_set['data'])
    shuffler.shuffle(testing_set['data'])

    # Error checking for the number of utterances
    if training_set['number of utterances'] + validation_set['number of utterances'] + testing_set['number of utterances'] != total_num_utterances:
        raise ValueError('The number of utterances in the partitioned sets does not match the total number of utterances.')

    # Printing out the percentage of utterances in each set
    if do_display:
        # Guarding against a division by zero when no utterances were found
        denominator = total_num_utterances if total_num_utterances else 1
        print('\033[35m' + 'Percentage of utterances in each set:' + '\033[0m')
        print('\033[35m' + 'Training Set: ' + '\033[0m' + '{:.2%}'.format(training_set['number of utterances'] / denominator))
        print('\033[35m' + 'Validation Set: ' + '\033[0m' + '{:.2%}'.format(validation_set['number of utterances'] / denominator))
        print('\033[35m' + 'Testing Set: ' + '\033[0m' + '{:.2%}'.format(testing_set['number of utterances'] / denominator))

    # Returning the training, validation and testing sets
    return training_set, validation_set, testing_set

In [8]:
# Building the training, validation and testing sets with a 60% / 20% / 20% split
training_set, validation_set, testing_set = preprocess_data(
    speaker_roots,
    wav_files,
    training_set_ratio=0.6,
    validation_set_ratio=0.2,
    testing_set_ratio=0.2,
    do_display=True,
)

[32mExecuting Speaker: [0malw001	 1 / 285
-------------------------------------------
[32mExecuting Speaker: [0mcxb001	 2 / 285
-------------------------------------------
[32mExecuting Speaker: [0mjah001	 3 / 285
-------------------------------------------
[32mExecuting Speaker: [0mjep001	 4 / 285
-------------------------------------------
[32mExecuting Speaker: [0mknb001	 5 / 285
-------------------------------------------
[32mExecuting Speaker: [0mlcg001	 6 / 285
-------------------------------------------
[32mExecuting Speaker: [0mlst001	 7 / 285
-------------------------------------------
[32mExecuting Speaker: [0mmjd001	 8 / 285
-------------------------------------------
[32mExecuting Speaker: [0mmpt001	 9 / 285
-------------------------------------------
[32mExecuting Speaker: [0mrkk001	 10 / 285
--------------------------------------------
[32mExecuting Speaker: [0majh001	 11 / 285
--------------------------------------------
[32mExecuting Speaker: [0m

**<h3>Speaker Identification (SID) Model Design and Implementation</h3>**