In [3]:
import numpy as np
import librosa
import tensorflow as tf

2024-04-02 10:41:22.794977: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-02 10:41:23.199618: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-02 10:41:23.419606: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
class PhonemeRecognizer(tf.keras.Model):
    """Small convolutional classifier that maps an input sequence to phoneme probabilities.

    The pipeline is Conv1D -> MaxPooling1D -> Flatten -> Dense(softmax).

    Args:
        num_phonemes: Number of units in the final softmax layer, i.e. the
            number of phoneme classes the model can emit.
    """

    def __init__(self, num_phonemes):
        super().__init__()
        self.conv1d_layer = tf.keras.layers.Conv1D(
            filters=64,
            kernel_size=3,
            activation='relu',
        )
        self.pooling_layer = tf.keras.layers.MaxPooling1D(pool_size=2)
        self.flatten_layer = tf.keras.layers.Flatten()
        self.dense_layer = tf.keras.layers.Dense(
            units=num_phonemes,
            activation='softmax',
        )

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Tensor accepted by ``Conv1D``
                (presumably ``(batch, steps, channels)`` -- TODO confirm against caller).

        Returns:
            The softmax output of the final dense layer, with ``num_phonemes``
            values along the last axis.
        """
        pooled = self.pooling_layer(self.conv1d_layer(inputs))
        # Flatten collapses everything but the batch axis before classification.
        return self.dense_layer(self.flatten_layer(pooled))

In [5]:
# Define the Pronunciation Model (PM) architecture
class PronunciationModel(tf.keras.Model):
    """Sequence model: Embedding -> concat with auxiliary embeddings -> LSTM -> softmax.

    Args:
        num_phonemes: Size of the embedding vocabulary and number of units in
            the output softmax layer.
        embedding_dim: Dimensionality of the vectors produced by the embedding layer.
    """

    def __init__(self, num_phonemes, embedding_dim):
        super().__init__()
        self.embedding_layer = tf.keras.layers.Embedding(
            input_dim=num_phonemes,
            output_dim=embedding_dim,
        )
        self.lstm_layer = tf.keras.layers.LSTM(units=128, return_sequences=True)
        self.dense_layer = tf.keras.layers.Dense(
            units=num_phonemes,
            activation='softmax',
        )

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: A pair ``(acoustic_features, auxiliary_embeddings)``.
                ``acoustic_features`` is fed to the embedding layer
                (NOTE(review): an Embedding layer expects integer indices in
                ``[0, num_phonemes)`` -- confirm that is what callers pass).
                ``auxiliary_embeddings`` is concatenated to the embedded
                features along the last axis, so all leading axes must match.

        Returns:
            The softmax output of the dense layer applied to every LSTM time
            step (the LSTM is built with ``return_sequences=True``).
        """
        acoustic_features, auxiliary_embeddings = inputs

        embedded = self.embedding_layer(acoustic_features)
        # Fuse the two feature streams along the feature (last) axis.
        fused = tf.concat([embedded, auxiliary_embeddings], axis=-1)

        return self.dense_layer(self.lstm_layer(fused))

In [6]:
# Define the Pronunciation Error Detector (PED) architecture
class PronunciationErrorDetector(tf.keras.Model):
    """Scores each phoneme position with a probability of mispronunciation.

    The detector has no trainable weights; it compares the recognized phoneme
    sequence against the canonical one and converts the pronunciation
    likelihoods into error probabilities.
    """

    def __init__(self):
        super(PronunciationErrorDetector, self).__init__()

    def call(self, inputs):
        """Compute per-position mispronunciation probabilities.

        Args:
            inputs: A triple ``(phonemes, pronunciation_likelihoods,
                canonical_phonemes)`` where ``phonemes`` is the recognized
                phoneme sequence, ``pronunciation_likelihoods`` holds the
                likelihood assigned to each position, and
                ``canonical_phonemes`` is the reference sequence. All three
                must be broadcast-compatible for ``tf.where``.

        Returns:
            A tensor that is ``0.0`` wherever the recognized phoneme equals
            the canonical phoneme and ``1.0 - pronunciation_likelihoods``
            everywhere else.
        """
        recognized_phonemes, pronunciation_likelihoods, canonical_phonemes = inputs

        # TODO: align canonical and recognized sequences with a dynamic
        # programming algorithm (e.g. edit-distance alignment). Until that is
        # implemented the sequences are assumed to be already aligned
        # position-by-position, so the canonical sequence is used directly.
        # Previously `aligned_phonemes` / `recognized_phonemes` were never
        # defined and every call raised NameError.
        aligned_phonemes = canonical_phonemes

        # Matching positions are considered correct (probability 0); others
        # get the complement of the pronunciation likelihood.
        probabilities = tf.where(
            tf.equal(aligned_phonemes, recognized_phonemes),
            0.0,
            1.0 - pronunciation_likelihoods,
        )

        return probabilities

In [7]:
# Load audio file using librosa
def load_audio(audio_path, sr=16000):
    """Read an audio file with librosa and return only the waveform.

    Args:
        audio_path: Path of the audio file, passed straight to ``librosa.load``.
        sr: Sampling rate requested from ``librosa.load`` (default 16000).

    Returns:
        The first element of the ``librosa.load`` result (the audio samples);
        the accompanying sampling rate is discarded.
    """
    waveform, _sample_rate = librosa.load(audio_path, sr=sr)
    return waveform

In [8]:
# Extract mel spectrogram features from audio
def extract_mel_spectrogram(audio, sr=16000, n_fft=1024, hop_length=512, n_mels=80):
    """Compute a dB-scaled mel spectrogram of an audio signal.

    Args:
        audio: Audio samples, passed to ``librosa.feature.melspectrogram`` as ``y``.
        sr: Sampling rate of ``audio`` (default 16000).
        n_fft: FFT window size (default 1024).
        hop_length: Hop between successive frames, in samples (default 512).
        n_mels: Number of mel bands (default 80).

    Returns:
        The mel spectrogram converted with ``librosa.power_to_db`` using
        ``ref=np.max``, i.e. expressed in dB relative to its own maximum.
    """
    mel_kwargs = dict(n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    mel_power = librosa.feature.melspectrogram(y=audio, sr=sr, **mel_kwargs)
    return librosa.power_to_db(mel_power, ref=np.max)