Import all libraries

In [38]:
import os
import re
import pandas as pd
import numpy as np 
import tensorflow as tf 

from transformers import TFWav2Vec2Model, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer, TFWav2Vec2ForCTC
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras import backend as K

import librosa as lb
from librosa.effects import trim
import librosa.display

from sklearn.model_selection import train_test_split

Setting Up the datasets

In [2]:
dataset_path = "dataset/"
metadata = "Datasets.csv"

audio_directory = "dataset/"

# Create a dataframe for the transcript
dataframe = pd.read_csv(metadata)

# Preprocess transcript
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove non-alphanumeric characters (except spaces)
    text = text.replace(" ", "")  # Remove all whitespace
    return text

dataframe['clean_transcript'] = dataframe['Transcription'].apply(preprocess_text)

print(dataframe.head())

  File_Path Speaker         Transcription Session clean_transcript
0     03M_1     03M  1 2 3 4 5 6 7 8 9 10       1      12345678910
1     03M_2     03M                   ata       2              ata
2     03M_3     03M                   ana       3              ana
3     03M_4     03M                   ara       4              ara
4     03M_5     03M                  atha       5             atha


Data Pre-Processing

In [3]:
# Function to preprocess audio and connect to transcripts
def combine_audio_with_transcript(directory, dataframe):
    audio_data = []    
     # Iterate over each row in the DataFrame
    for index, row in dataframe.iterrows():
        file_name = row['File_Path']  # Get the file name from the CSV (without .wav)
        transcript = row['clean_transcript']  # Get the transcript
        
        # Construct the full file path by combining directory and file name with .wav extension
        file_path = os.path.join(directory, f"{file_name}.wav")
        
        # Check if the file exists in the audio directory
        if os.path.exists(file_path):
            try:
                # Append the processed data along with the transcript
                audio_data.append({
                    "file_path": file_path,
                    "transcript": transcript,
                })
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
    
    return audio_data

# Preprocess audio files and connect to transcripts
audio_data_with_transcripts = combine_audio_with_transcript(audio_directory, dataframe)
print(audio_data_with_transcripts)

[{'file_path': 'dataset/01F_1.wav', 'transcript': '12345678910'}, {'file_path': 'dataset/01F_3.wav', 'transcript': 'ana'}, {'file_path': 'dataset/01F_4.wav', 'transcript': 'ara'}]


Prepare dataset

In [4]:
# # Get Dataset from the folder

# prepared_audio_data = []

# # Dataset file path
# dataset_file_path = [item['file_path'] for item in audio_data_with_transcripts]

# # Transcript
# dataset_transcript = [item['transcript'] for item in audio_data_with_transcripts]

# max_length = 0
# sample_rate = 16000
# num_mels = 128

# try:
#     for file_path in dataset_file_path:
#         y, sr = librosa.load(file_path, sr=sample_rate, mono=True)
#         temp_length = len(y)
        
#         if temp_length > max_length:
#             max_length = temp_length
# except Exception as e:
#     print(f"Error loading {file_path}: {e}")
    
# try:
#     for file_path in dataset_file_path:
#         # Load the audio file
#         y, sr = lb.load(file_path, sr=sample_rate, mono=True)
        
#         # Trim silent edges of the audio
#         y, _ = lb.effects.trim(y)
        
#         # Normalize the audio
#         y = lb.util.normalize(y)
        
#         # Pad the audio to the maximum length
#         if len(y) < max_length:
#             y = np.pad(y, (0, max_length - len(y)))
#         else:
#             y = y[:max_length]  # This line can be skipped if no truncation is desired
        
#         # Convert to Mel spectrogram
#         mel_spec = lb.feature.melspectrogram(y=y, sr=sr, n_mels=num_mels)
        
#         # Convert to dB scale (log scale)
#         mel_spec_db = lb.power_to_db(mel_spec, ref=np.max)
        
#         # Normalize the spectrogram between 0 and 1
#         mel_spec_db = (mel_spec_db - np.min(mel_spec_db)) / (np.max(mel_spec_db) - np.min(mel_spec_db))
        
#         # Append the processed Mel spectrogram to the list
#         prepared_audio_data.append(mel_spec_db.T)  # Transpose to match the expected input shape

#         print(f"Loaded {file_path} with shape {y.shape}")

# except Exception as e:
#     print(f"Error loading {file_path}: {e}")

Set up the model

In [39]:
extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "./wav2vec2-large-xlsr-53",
    from_pt=True
)

model = TFWav2Vec2ForCTC.from_pretrained(
    "./wav2vec2-large-xlsr-53",
    from_pt=True
)

tokenizer = tf.keras.preprocessing.text.Tokenizer()


TFWav2Vec2ForCTC has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tune this model, you need a GPU or a TPU
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFWav2Vec2ForCTC: ['project_q.bias', 'project_hid.weight', 'project_hid.bias', 'project_q.weight', 'quantizer.weight_proj.bias', 'quantizer.codevectors', 'quantizer.weight_proj.weight']
- This IS expected if you are initializing TFWav2Vec2ForCTC from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFWav2Vec2ForCTC from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFWav2Vec2ForCTC were not initialized from the PyTorch model and are newly initializ

In [40]:

tokenizer.fit_on_texts(labels['transcript'] for labels in audio_data_with_transcripts)

max_audio_length = 0
for sample in audio_data_with_transcripts:
    speech, _ = librosa.load(sample['file_path'], sr=16000)
    audio_length = len(speech)  # Length of the audio file (in samples)
    if audio_length > max_audio_length:
        max_audio_length = audio_length


def preprocess(audio_data_with_transcripts, sampling_rate=16000):
    speech, _ = librosa.load(audio_data_with_transcripts['file_path'], sr=sampling_rate)
    input_values = extractor(speech, sampling_rate=sampling_rate, return_tensors="tf").input_values

    labels = tokenizer.texts_to_sequences([audio_data_with_transcripts['transcript']])
    label_len = len(labels[0])

    padded_labels = tf.keras.preprocessing.sequence.pad_sequences(
        labels, maxlen=max_audio_length, padding='post'
    )

    return {
        "input_values": input_values[0],
        "labels": padded_labels[0],
        "input_length": tf.shape(input_values[0])[0], # model_downsample_rate,  # adjust this
        "label_length": label_len
    }

processed_data = [preprocess(data) for data in audio_data_with_transcripts]

for data in processed_data:
    print(data)

{'input_values': <tf.Tensor: shape=(113960,), dtype=float32, numpy=
array([-0.10316028, -0.1667561 , -0.13397591, ..., -0.01674847,
       -0.00446125, -0.01840178], dtype=float32)>, 'labels': array([1, 0, 0, ..., 0, 0, 0]), 'input_length': <tf.Tensor: shape=(), dtype=int32, numpy=113960>, 'label_length': 1}
{'input_values': <tf.Tensor: shape=(148146,), dtype=float32, numpy=
array([0.13108213, 0.21958746, 0.18175448, ..., 0.08491193, 0.09992137,
       0.09167296], dtype=float32)>, 'labels': array([2, 0, 0, ..., 0, 0, 0]), 'input_length': <tf.Tensor: shape=(), dtype=int32, numpy=148146>, 'label_length': 1}
{'input_values': <tf.Tensor: shape=(148857,), dtype=float32, numpy=
array([-0.00935751, -0.01834278, -0.03925886, ...,  0.12503275,
        0.166711  ,  0.10804716], dtype=float32)>, 'labels': array([3, 0, 0, ..., 0, 0, 0]), 'input_length': <tf.Tensor: shape=(), dtype=int32, numpy=148857>, 'label_length': 1}


In [None]:
def data_generator():
    for item in processed_data:
        yield {
            'input_values': item['input_values'],
            'labels': item['labels'],
            'input_length': item['input_length'],
            'label_length': item['label_length']
        }

output_signature = {
    'input_values': tf.TensorSpec(shape=(None,), dtype=tf.float32),
    'labels': tf.TensorSpec(shape=(max_audio_length,), dtype=tf.int32),
    'input_length': tf.TensorSpec(shape=(), dtype=tf.int32),
    'label_length': tf.TensorSpec(shape=(), dtype=tf.int32),
}

dataset = tf.data.Dataset.from_generator(data_generator, output_signature=output_signature)

dataset = dataset.padded_batch(
    batch_size=4,
    padded_shapes={
        'input_values': [None],  # Variable length
        'labels': [max_audio_length],  # Fixed length for labels (adjust this as needed)
        'input_length': [],  # Single scalar for the length of input sequence
        'label_length': []  # Single scalar for the length of label sequence
    },
    padding_values={
        'input_values': 0.0,  # Padding value for input
        'labels': -100,  # Padding value for labels (if using CTC loss)
        'input_length': 0,  # Padding value for input_length
        'label_length': 0  # Padding value for label_length
    },
    drop_remainder=True  # This ensures no partial batches are included
).prefetch(tf.data.AUTOTUNE)



In [54]:
class CTCLossModel(tf.keras.Model):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model

    def train_step(self, data):
        # Extract the data
        input_values = data['input_values']
        y_true = data['labels']
        input_lengths = data['input_length']
        label_lengths = data['label_length']

        # Reshape input_lengths and label_lengths to match the expected shape
        input_lengths = tf.reshape(input_lengths, (-1, 1))
        label_lengths = tf.reshape(label_lengths, (-1, 1))

        with tf.GradientTape() as tape:
            # Forward pass through the base model
            model_output = self.base_model(input_values, training=True)
            logits = model_output.logits

            # Calculate CTC loss
            loss = tf.keras.backend.ctc_batch_cost(y_true, logits, input_lengths, label_lengths)

        # Calculate gradients and apply them
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

        return {"loss": loss}

# Compile the model
CTCModel = CTCLossModel(model)

for layer in model.layers:
    layer.trainable = True

CTCModel.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4))


Train Model

In [55]:
history = CTCModel.fit(dataset, epochs=50)

Epoch 1/50


ResourceExhaustedError: in user code:

    File "c:\Users\Marwin Jay\Desktop\ASR Client\tensorflow-asr\venv\lib\site-packages\keras\src\engine\training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Marwin Jay\Desktop\ASR Client\tensorflow-asr\venv\lib\site-packages\keras\src\engine\training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Marwin Jay\Desktop\ASR Client\tensorflow-asr\venv\lib\site-packages\keras\src\engine\training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Marwin Jay\AppData\Local\Temp\ipykernel_20904\1761028653.py", line 27, in train_step
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
    File "c:\Users\Marwin Jay\Desktop\ASR Client\tensorflow-asr\venv\lib\site-packages\keras\src\optimizers\optimizer.py", line 1223, in apply_gradients
        return super().apply_gradients(grads_and_vars, name=name)
    File "c:\Users\Marwin Jay\Desktop\ASR Client\tensorflow-asr\venv\lib\site-packages\keras\src\optimizers\optimizer.py", line 638, in apply_gradients
        self.build(trainable_variables)
    File "c:\Users\Marwin Jay\Desktop\ASR Client\tensorflow-asr\venv\lib\site-packages\keras\src\optimizers\adam.py", line 145, in build
        self.add_variable_from_reference(
    File "c:\Users\Marwin Jay\Desktop\ASR Client\tensorflow-asr\venv\lib\site-packages\keras\src\optimizers\optimizer.py", line 1125, in add_variable_from_reference
        return super().add_variable_from_reference(
    File "c:\Users\Marwin Jay\Desktop\ASR Client\tensorflow-asr\venv\lib\site-packages\keras\src\optimizers\optimizer.py", line 508, in add_variable_from_reference
        initial_value = tf.zeros(
    File "c:\Users\Marwin Jay\Desktop\ASR Client\tensorflow-asr\venv\lib\site-packages\tensorflow\dtensor\python\api.py", line 64, in call_with_layout
        return fn(*args, **kwargs)

    ResourceExhaustedError: {{function_node __wrapped__Fill_device_/job:localhost/replica:0/task:0/device:CPU:0}} OOM when allocating tensor with shape[4096,1024] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu [Op:Fill] name: 
