# Arabic Spoken Digit Recognition

## Load the Datasets

In [337]:
import numpy as np


def read_data(f:str, n_speakers:int, num_times:int=10) -> list[np.ndarray]:
    """
    Read data from the dataset.
    Data is formatted in blocks. Each block has 13 rows and ~30 columns.
    Blocks are separated by an empty row.
    Each column represents one of the 13 cepstral coefficients.
    Each block is a numpy array of shape (13, ~30).

    Parameters
    ----------
    f : str
        Path to the dataset.

    n_speakers : int
        Number of speakers whose voice is recorded in the dataset.

    num_times : int
        Number of times each speaker speaks a given number in the dataset.

    Returns
    -------
    data : list
        List of numpy arrays, where each element is a numpy array.
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] # 10 digits
        |\
        | \
        [0, 1, 3, 4, 5, ..., 660]  # 660 = n_speakers * num_times
        |\
        | \
        np.array([[0, 1, 2, 3, 4, 5, ..., 13],
                  [0, 1, 2, 3, 4, 5, ..., 13],
                  ...
                  [0, 1, 2, 3, 4, 5, ..., 13]])  # 13 = number of cepstral coefficients, as many rows as there are frames (~30)
    """

    data = []  # Initialize an empty list to store the data
    current_digit_data = []  # Initialize a list to store data for the current digit
    current_block = []  # Initialize a list to store data for the current block

    with open(f, 'r') as file:
        for line in file:
            line = line.strip()
            if line:  # Non-empty line
                values = [float(value) for value in line.split()]  # Parse values from the line
                current_block.append(np.array(values).reshape(-1, 13))
            else:  # Empty line (block separator)
                to_add = np.array(current_block).reshape(-1, 13)
                current_digit_data.append(to_add)
                current_block = []  # Reset the current block
                if len(current_digit_data) == n_speakers * num_times:
                    data.append(current_digit_data)
                    current_digit_data = []  # Reset the current_digit_data, move on to the next digit

        # Add last digit's data
        # Edge case: last digit's data is not added in the loop above
        current_digit_data.append(np.array(current_block).reshape(-1, 13))
        data.append(current_digit_data)

    return data

In [338]:
TEST = read_data("data/Test_Arabic_Digit.txt", n_speakers=22)
print(f'Number of digitis in the test dataset: {len(TEST)}')
print(f'Total number of blocks in the test dataset: {sum([len(digit) for digit in TEST])}')
print(f'Number of blocks for each digit: {[len(digit) for digit in TEST]}')
print(f'Sample block shapes for each digit: {[digit[0].shape for digit in TEST]}')

Number of digitis in the test dataset: 10
Total number of blocks in the test dataset: 2200
Number of blocks for each digit: [220, 220, 220, 220, 220, 220, 220, 220, 220, 220]
Sample block shapes for each digit: [(28, 13), (31, 13), (36, 13), (41, 13), (37, 13), (34, 13), (50, 13), (39, 13), (46, 13), (36, 13)]


In [339]:
TRAIN = read_data("data/Train_Arabic_Digit.txt", n_speakers=66)
print(f'Number of digitis in the train dataset: {len(TRAIN)}')
print(f'Total number of blocks in the train dataset: {sum([len(digit) for digit in TRAIN])}')
print(f'Number of blocks for each digit: {[len(digit) for digit in TRAIN]}')
print(f'Sample block shapes for each digit: {[digit[0].shape for digit in TRAIN]}')

Number of digitis in the train dataset: 10
Total number of blocks in the train dataset: 6600
Number of blocks for each digit: [660, 660, 660, 660, 660, 660, 660, 660, 660, 660]
Sample block shapes for each digit: [(38, 13), (32, 13), (35, 13), (38, 13), (31, 13), (29, 13), (42, 13), (31, 13), (44, 13), (34, 13)]


In [340]:
# Supplementary information about how many frames are male for each digit
# This is the sum of the lengths for the first half of the utterances for each digit

def male_frames(data):
    male_frames = []
    for digit in data:
        ld = len(digit)
        half_ld = ld // 2
        s = sum([len(digit[i]) for i in range(half_ld)])
        male_frames.append(s)
    return male_frames

train_male_frames = male_frames(TRAIN)
test_male_frames = male_frames(TEST)

print('Train male frames')
print(train_male_frames)

print('Test male frames')
print(test_male_frames)

Train male frames
[11588, 10803, 12987, 14020, 13018, 10403, 14014, 12253, 15546, 11431]
Test male frames
[3817, 3964, 4786, 4794, 4647, 3508, 5073, 4405, 5736, 3837]


In [341]:
# We need to gather the points from all digits, not just the first one
# Make a list of all the digits -- i.e. append all utterances within each digit
# Each element in the list X is of the form:

# [-----------------13-----------------]
# |
# |
# | n_speakers * ~30 * 10
# |
# |
# |

import pandas as pd


def concatenate(x: list[np.ndarray]) -> list[np.ndarray]:
    '''
    Concatenate all utterances for each digit.
    '''
    to_append = []
    for digit in x:
        digit_utterances = []
        for utterance in digit:
            if utterance.shape[1] == 13:
                digit_utterances.append(utterance)
        to_append.append(digit_utterances)

    return [np.concatenate(to_append_digit, axis=0) for to_append_digit in to_append]

# Train data is all the utterances for each digit concatenated
X_TRAIN = concatenate(TRAIN)

X_TEST = concatenate(TEST)

# Display the shapes of the data as a dataframe
df = pd.DataFrame({'Train': [x.shape for x in X_TRAIN]})
df.style.set_caption("Shape of the datasets for each digit")


Unnamed: 0,Train
0,"(23344, 13)"
1,"(22652, 13)"
2,"(27938, 13)"
3,"(29406, 13)"
4,"(27555, 13)"
5,"(21631, 13)"
6,"(28991, 13)"
7,"(24924, 13)"
8,"(32860, 13)"
9,"(23955, 13)"


## Create the Model

In [342]:
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KernelDensity
from tqdm import tqdm


class DigitClassifier:
    '''
    Class for a digit classifier using Gaussian Mixture Models and Kernel Density Estimation based on the log-likelihoods.

    Parameters
    ----------
    num_clusters : int
        Number of clusters for each digit.

    bandwidth : float
        Bandwidth for the kernel density estimation.

    cepstral_mask : list
        List of booleans to mask the cepstral coefficients. Defaults to [True] * 13.

    covariance_type : str
        Covariance type for the Gaussian Mixture Model. Defaults to 'full'.
        Other options are 'spherical', 'diag', and 'tied'.
    '''

    def __init__(self, num_clusters: int = 8, bandwidth: float = 0.5, cepstral_mask = [True] * 13, covariance_type: str = 'full'):
        self.num_clusters = num_clusters
        self.covariance_type = covariance_type
        self.GMMs = []
        self.weights = []
        self.cepstral_mask = cepstral_mask
        self.kde = KernelDensity(bandwidth=bandwidth)

    def model_info(self):
        '''
        Print the model information.

        Returns
        -------
        info : str
        '''
        return f'GMM with {self.num_clusters} clusters, cepstral mask {self.cepstral_mask}, and covariance type {self.covariance_type}'
    
    def fit(self, train_data: list[np.ndarray]):
        '''
        Fit the model to the training data.

        Parameters
        ----------
        train_data : list
            List of numpy arrays, where each element is the concatenation of all utterances for a given digit.
        '''

        self.gmms = []

        # Mask the cepstral coefficients by copying and slicing the data
        processed_train_data = []
        for digit in train_data:
            processed_train_data.append(digit[:, self.cepstral_mask])

        # for i in tqdm(range(10), leave=False):
        for i in range(10):
            x = processed_train_data[i]
            gmm = GaussianMixture(n_components=self.num_clusters, random_state=42, covariance_type=self.covariance_type).fit(x)
            self.GMMs.append(gmm)
            self.weights = gmm.weights_

    def predictions(self, x: list[np.ndarray]) -> list[float]:
        '''
        Predict the digit for each utterance in x.

        Parameters
        ----------
        x : list
            List of numpy arrays, where each element is a numpy array of shape (13, ~30).

        Returns
        -------
        predicted_means : list
            List of predicted means for each utterance in x.
        '''

        predicted_means = []

        for gmm in self.GMMs:
            # Compute the log-likelihood of each point in x_digit
            log_likelihood = gmm.score_samples(x)

            # Estimate the PDF of the likelihoods using KernelDensity
            self.kde.fit(log_likelihood.reshape(-1, 1))

            # Print the mean values
            predicted_mean = np.mean(log_likelihood)
            predicted_means.append(predicted_mean)

        return predicted_means
    
    def predict(self, x: list[np.ndarray]) -> int:
        '''
        Predict the digit for the utterance x.

        Parameters
        ----------
        x : list
            List of numpy arrays, where each element is a numpy array of shape (13, ~30).

        Returns
        -------
        predicted_digit : int
            Predicted digit.
        '''
        # If the mask hasn't been applied, apply it
        if x.shape[1]  != sum([1 for i in self.cepstral_mask if i]):
            x = x[:, self.cepstral_mask]
            
        predicted_means = self.predictions(x)
        return np.argmax(predicted_means)
    
    def test_accuracy(self, test_data: list[np.ndarray], verbose: bool = False) -> float:
        '''
        Compute the accuracy of the model on the test data.

        Parameters
        ----------
        test_data : list
            List of numpy arrays, where each element is a numpy array of shape (13, ~30).

        verbose : bool
            Whether to print the accuracy for each digit.

        Returns
        -------
        accuracies : tuple
            Tuple of (overall_accuracy, digit_accuracies).
        '''

        # Again mask the cepstral coefficients by copying and slicing the data
        processed_test_data = []
        for digit in test_data:
            digit_utterances = []
            for utterance in digit:
                # Apply the mask
                digit_utterances.append(utterance[:, self.cepstral_mask])
            processed_test_data.append(digit_utterances)

        total_tested = 0
        correct = 0

        digit_accuracies = []

        # Loop through all different digits
        for digit_num, digit in enumerate(processed_test_data):
            digit_utterances_correct = 0
            total_digit_utterances = len(digit)

            # Loop through all utterances for the current digit
            for utterance in digit:
                predicted_digit = self.predict(utterance)
                if predicted_digit == digit_num:
                    digit_utterances_correct += 1

            if verbose:
                print(f'Accuracy for digit {digit_num}: {digit_utterances_correct / total_digit_utterances}')

            digit_accuracies.append(digit_utterances_correct / total_digit_utterances)

            total_tested += total_digit_utterances
            correct += digit_utterances_correct

        if verbose:
            print(f'Overall accuracy: {correct / total_tested}')

        return correct / total_tested, [digit_accuracy for digit_accuracy in digit_accuracies]
        

In [343]:
class GenderDigitClassifier(DigitClassifier):

    def __init__(self, num_clusters: int = 8, bandwidth: float = 0.5, cepstral_mask = [True] * 13, covariance_type: str = 'full'):
        super().__init__(num_clusters, bandwidth, cepstral_mask, covariance_type)

    def fit(self, train_data: list[np.ndarray]):
        '''
        Fit the model to the training data.
        Assumes that train_data is actually the train data, as it uses the sizes from before to understand male/female distribution. 

        Parameters
        ----------
        train_data : list
            List of numpy arrays, where each element is the concatenation of all utterances for a given digit.
        '''

        self.gmms = []

        # Mask the cepstral coefficients by copying and slicing the data
        # Also split for gender
        # That is, processed_train_data is of the form [0M, 0F, 1M, 1F, ..., 9M, 9F]
        processed_train_data = []
        for i, digit in enumerate(train_data):
            sliced = digit[:, self.cepstral_mask]

            # Now split by train_male_frames, as in the first train_male_frames rows will be added independently
            processed_train_data.append(sliced[:train_male_frames[i], :])
            processed_train_data.append(sliced[train_male_frames[i]:, :])

        # for i in tqdm(range(20), leave=False):
        for i in range(20):
            x = processed_train_data[i]
            gmm = GaussianMixture(n_components=self.num_clusters, random_state=42, covariance_type=self.covariance_type).fit(x)
            self.GMMs.append(gmm)
            self.weights = gmm.weights_
    
    def predict(self, x: list[np.ndarray]) -> int:
        '''
        Predict the digit for the utterance x.

        Parameters
        ----------
        x : list
            List of numpy arrays, where each element is a numpy array of shape (13, ~30).

        Returns
        -------
        predicted_digit : int
            Predicted digit.
        '''

        # If the mask hasn't been applied, apply it
        if x.shape[1]  != sum([1 for i in self.cepstral_mask if i]):
            x = x[:, self.cepstral_mask]

        predicted_means = self.predictions(x)
        # Divide by two as to get the digit rather than the GMM
        # The decimal precision is used to determine the calculated gender
        return np.argmax(predicted_means) / 2
    

    def test_accuracy(self, test_data: list[np.ndarray], verbose: bool = False) -> float:
        '''
        Compute the accuracy of the model on the test data.
        Assumes that test_data is actually the test data, as it uses the sizes from before to understand male/female distribution.

        Parameters
        ----------
        test_data : list
            List of numpy arrays, where each element is a numpy array of shape (13, ~30).

        verbose : bool
            Whether to print the accuracy for each digit.

        Returns
        -------
        accuracies : tuple
            Tuple of (overall_accuracy, digit_accuracies).
        '''

        # Again mask the cepstral coefficients by copying and slicing the data
        processed_test_data = []
        for i, digit in enumerate(test_data):
            digit_utterances = []
            for utterance in digit:
                # Apply the mask
                sliced = utterance[:, self.cepstral_mask]
                # Slice by male and female speakers
                m = sliced[:test_male_frames[i], :]
                f = sliced[test_male_frames[i]:, :]
                # Append both only if both have nonzero shape
                if m.shape[0] > 0:
                    digit_utterances.append(m)
                if f.shape[0] > 0:
                    digit_utterances.append(f)
            processed_test_data.append(digit_utterances)

        print('here')

        print(processed_test_data[-1][-2].shape)


        total_tested = 0
        correct = 0

        digit_accuracies = []

        # Loop through all different digits
        for digit_num, digit in enumerate(processed_test_data):
            digit_utterances_correct = 0
            total_digit_utterances = len(digit)

            # Loop through all utterances for the current digit
            for utterance in digit:
                predicted_digit = self.predict(utterance)
                # Int as to ignore gender
                if int(predicted_digit) == digit_num:
                    digit_utterances_correct += 1

            if verbose:
                print(f'Accuracy for digit {digit_num}: {digit_utterances_correct / total_digit_utterances}')

            digit_accuracies.append(digit_utterances_correct / total_digit_utterances)

            total_tested += total_digit_utterances
            correct += digit_utterances_correct

        if verbose:
            print(f'Overall accuracy: {correct / total_tested}')

        return correct / total_tested, [digit_accuracy for digit_accuracy in digit_accuracies]



6.0

In [349]:
num = 4
gdc = GenderDigitClassifier(num_clusters=3, bandwidth=0.5, cepstral_mask=[*([True] * num), *([False] * (13-num))], covariance_type='full')

gdc.fit(X_TRAIN)

# _ = gdc.test_accuracy(TEST, verbose=True)

# predict the first test utterance
# gdc.predict(TEST[6][200])

6.5

In [350]:
num = 6
dc = DigitClassifier(num_clusters=6, bandwidth=0.5, cepstral_mask=[*([True] * num), *([False] * (13-num))], covariance_type='spherical')

dc.fit(X_TEST)


In [351]:
_ = dc.test_accuracy(TEST, verbose=True)

Accuracy for digit 0: 0.8227272727272728
Accuracy for digit 1: 0.9318181818181818
Accuracy for digit 2: 0.8590909090909091
Accuracy for digit 3: 0.7818181818181819
Accuracy for digit 4: 0.7363636363636363
Accuracy for digit 5: 0.8636363636363636
Accuracy for digit 6: 0.9136363636363637
Accuracy for digit 7: 0.5318181818181819
Accuracy for digit 8: 0.8363636363636363
Accuracy for digit 9: 0.7772727272727272
Overall accuracy: 0.8054545454545454
