Import all packages here

In [None]:
import tensorflow as tf
from transformers import Wav2Vec2Processor, TFWav2Vec2Model

import librosa
from librosa.effects import trim
import librosa.display

import soundfile

import scipy as sy
import pydub as pyd

import jiwer as jw

import pandas as pd 
import numpy as np 

import os

import matplotlib.pyplot as plt


Mounting Gdrive for the datasets

In [None]:
# Still finding alternatives to access dataset on gdrive without downloading the datasets locally

Setting up the Dataset

In [5]:
dataset_path = "dataset/"
metatadata_path = "metadata"

audio_directory = "dataset/"

audio_data = []

# Create a dataframe for the metadata
# dataframe = pd.read_csv(metatadata_path)

# Display dataframe to check if ready

# print(dataframe.head())

Data Pre-Processing

In [7]:
# Load the audio files and preprocess

sampling_rate = 16000

for filename in os.listdir(audio_directory):
    if(filename.endswith(".wav")):
        file_path = os.path.join(audio_directory, filename)
        
        # Load file with Sampling Rate of 16000 Hz
        signal, sr = librosa.load(file_path, sr=sampling_rate)
        
        # Normalizing the audio data
        signal = signal / max(abs(signal))
        
        # Trim silence
        signal, _ = trim(signal)
        
        # Appending the data to a variable
        audio_data.append({"file_path": file_path, "signal": signal, "Sampling Rate": sr })


# Check if audio_data is ready

print(f"File: {audio_data[0]['file_path']}, Signal Length: {audio_data[0]['signal']}")

File: dataset/01F_1.wav, Signal Length: [-0.01692389 -0.02734679 -0.02197435 ... -0.00276161 -0.00074782
 -0.00303257]


Extract Mel Spectrograms


In [22]:
test_audio = "01F_1.wav"
sample_audio = os.path.join(dataset_path, test_audio)

y, sr = librosa.load(sample_audio, sr=sampling_rate)

mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)

# Plot the Mel spectrogram
plt.figure(figsize=(10, 4))
librosa.display.specshow(mel_spectrogram_db, sr=sr, x_axis='time', y_axis='mel', fmax=8000, cmap='viridis')
plt.colorbar(format='%+2.0f dB')  # Add a color bar for amplitude
plt.title('Mel Spectrogram')
plt.xlabel('Time (s)')
plt.ylabel('Mel Frequency')
plt.tight_layout()
plt.show()


AttributeError: module 'matplotlib' has no attribute 'figure'

Build The Model

In [None]:
# Load pre-trained Wav2Vec model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
wav2vec_model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# Define a custom model
class ASRModel(tf.keras.Model):
    def __init__(self, wav2vec_model, vocab_size):
        super(ASRModel, self).__init__()
        self.wav2vec = wav2vec_model
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs):
        wav2vec_outputs = self.wav2vec(inputs).last_hidden_state
        logits = self.dense(wav2vec_outputs)
        return logits

# Define the vocabulary size (number of tokens)
vocab_size = len(processor.tokenizer)
model = ASRModel(wav2vec_model, vocab_size)

Training the Model

In [None]:

# Function to preprocess audio
def preprocess_audio(file_path):
    audio, _ = librosa.load(file_path, sr=16000)
    return audio

# Function to tokenize transcriptions
def tokenize_transcription(transcription):
    return processor.tokenizer(transcription).input_ids

# Create a TensorFlow dataset
def prepare_dataset(file_paths, transcriptions):
    audio_features = [preprocess_audio(fp) for fp in file_paths]
    labels = [tokenize_transcription(t) for t in transcriptions]
    return tf.data.Dataset.from_tensor_slices((audio_features, labels))

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

# Train the model
model.fit(train_dataset, validation_data=val_dataset, epochs=10)

Training Results

Visualization of Results using MatPlotLib

Save the model

More evaluation 