Import all packages here

In [53]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from datasets import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from torch.nn import CTCLoss
from tqdm import tqdm

from dataclasses import dataclass
from typing import Dict, List, Union

from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Trainer, TrainingArguments, Wav2Vec2CTCTokenizer

import librosa
from librosa.effects import trim
import librosa.display

from IPython.display import Audio

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import pydub as pyd

import jiwer as jw

import pandas as pd 
import numpy as np 

import os

import matplotlib.pyplot as plt

import re


Mounting Gdrive for the datasets

Setting up the Dataset

In [54]:
dataset_path = "dataset/"
metadata = "dataset.csv"

audio_directory = "dataset/"

# Create a dataframe for the transcript
dataframe = pd.read_csv(metadata)

# Preprocess transcript
def preprocess_text(text):
    text = text.upper()  # Convert text to uppercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove non-alphanumeric characters (except spaces)
    return text

dataframe['clean_transcript'] = dataframe['Transcription'].apply(preprocess_text)

print(dataframe.head())

  File_Path Speaker         Transcription Session      clean_transcript
0     03M_1     03M  1 2 3 4 5 6 7 8 9 10       1  1 2 3 4 5 6 7 8 9 10
1     03M_2     03M                   ata       2                   ATA
2     03M_3     03M                   ana       3                   ANA
3     03M_4     03M                   ara       4                   ARA
4     03M_5     03M                  atha       5                  ATHA


Data Pre-Processing

In [55]:


# Function to preprocess audio and connect to transcripts
def combine_audio_with_transcript(directory, dataframe):
    audio_data = []    
     # Iterate over each row in the DataFrame
    for index, row in dataframe.iterrows():
        file_name = row['File_Path']  # Get the file name from the CSV (without .wav)
        transcript = row['clean_transcript']  # Get the transcript
        
        # Construct the full file path by combining directory and file name with .wav extension
        file_path = os.path.join(directory, f"{file_name}.wav")
        
        # Check if the file exists in the audio directory
        if os.path.exists(file_path):
            try:
                # Append the processed data along with the transcript
                audio_data.append({
                    "file_path": file_path,
                    "transcript": transcript,
                })
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
    
    return audio_data

# Preprocess audio files and connect to transcripts
audio_data_with_transcripts = combine_audio_with_transcript(audio_directory, dataframe)


Convert Dataset into Pytorch Dataset

In [56]:
audio_with_transcript_dataframe = pd.DataFrame(audio_data_with_transcripts)

for index, row in audio_with_transcript_dataframe.iterrows():
    file_path = row['file_path']
    try:
        # Load the audio file
        audio, sr = librosa.load(file_path, sr=None)  # sr=None to preserve the original sample rate
        
        # Display basic information about the audio
        print(f"Audio file {file_path} loaded successfully")
        print(f"Audio length: {len(audio)} samples")
        print(f"Sample rate: {sr} Hz\n")
    
    except Exception as e:
        print(f"Error loading {file_path}: {e}")

Audio file dataset/03M_1.wav loaded successfully
Audio length: 717953 samples
Sample rate: 48000 Hz

Audio file dataset/03M_2.wav loaded successfully
Audio length: 561018 samples
Sample rate: 48000 Hz

Audio file dataset/03M_3.wav loaded successfully
Audio length: 697276 samples
Sample rate: 48000 Hz

Audio file dataset/03M_4.wav loaded successfully
Audio length: 643847 samples
Sample rate: 48000 Hz

Audio file dataset/03M_5.wav loaded successfully
Audio length: 864068 samples
Sample rate: 48000 Hz

Audio file dataset/03M_6.wav loaded successfully
Audio length: 728750 samples
Sample rate: 48000 Hz

Audio file dataset/03M_7.wav loaded successfully
Audio length: 715758 samples
Sample rate: 48000 Hz

Audio file dataset/03M_8.wav loaded successfully
Audio length: 484327 samples
Sample rate: 48000 Hz

Audio file dataset/03M_9.wav loaded successfully
Audio length: 453342 samples
Sample rate: 48000 Hz

Audio file dataset/03M_10.wav loaded successfully
Audio length: 483563 samples
Sample rate:

Build The Model

In [57]:
# Initialize the processor
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Initialize the model
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
asr_tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-large-960h")

# Send the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
asr_model.to(device)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder)

In [58]:
sampling_rate=16000

def prepare_dataset(batch):
    """
    Prepare dataset with correct data types.
    """
    # Load the audio file
    audio, sr = librosa.load(batch['file_path'], sr=sampling_rate, mono=True)
    
    # Trim silence from the audio
    trimmed_audio, _ = librosa.effects.trim(audio, top_db=20)
    
    # Process audio
    audio_features = asr_processor(
        trimmed_audio, 
        sampling_rate=sampling_rate, 
        padding=False,
        return_tensors=None
    ).input_values[0]
    
    # Ensure audio features are float32
    audio_features = audio_features.astype(np.float32)
    
    # Process the transcript
    with asr_tokenizer.as_target_tokenizer():
        labels = asr_tokenizer(
            batch['transcript'],
            padding=False,
            return_tensors=None
        ).input_ids
    
    return {
        "input_values": audio_features,
        "labels": labels
    }

def custom_data_collator(batch):
    """
    Custom collator with correct data types.
    """
    # Get max length in the batch
    max_audio_length = max(len(x["input_values"]) for x in batch)
    max_label_length = max(len(x["labels"]) for x in batch)
    
    batch_audio = []
    batch_labels = []
    
    for sample in batch:
        # Pad audio
        audio_length = len(sample["input_values"])
        padded_audio = np.pad(
            sample["input_values"],
            (0, max_audio_length - audio_length),
            mode='constant',
            constant_values=0
        )
        batch_audio.append(padded_audio)
        
        # Pad labels
        label_length = len(sample["labels"])
        padded_labels = np.pad(
            sample["labels"],
            (0, max_label_length - label_length),
            mode='constant',
            constant_values=-100
        )
        batch_labels.append(padded_labels)
    
    # Convert to tensors with correct dtypes
    batch_audio = torch.tensor(batch_audio, dtype=torch.float32)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long)
    
    return {
        "input_values": batch_audio,
        "labels": batch_labels
    }

In [None]:
dataset = Dataset.from_pandas(audio_with_transcript_dataframe)
processed_dataset = dataset.map(prepare_dataset, remove_columns=["file_path", "transcript"])

# Check if dataset is good
processed_dataframe = processed_dataset.to_pandas()
print(processed_dataframe.head())



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

In [None]:
split_dataset = processed_dataset.train_test_split(test_size=0.3, seed=42)

train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

# Print dataset details
print("Training dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))

In [51]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./wav2vec2-finetuned",
    eval_strategy="steps",
    save_strategy="steps",
    learning_rate=1e-6,
    per_device_train_batch_size=32,
    num_train_epochs=30,
    fp16=True,
    logging_dir="./logs",
)

trainer = Trainer(
    model=asr_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=asr_processor,
    data_collator=custom_data_collator
)

Training the Model

In [None]:
trainer.train()

Training Results

Visualization of Results using MatPlotLib

Save the model

More evaluation 