In [None]:
import os
import torchaudio
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import jiwer

def load_audio(file_path):
    """Load an audio file as a mono, 16 kHz, one-dimensional NumPy array.

    Args:
        file_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        A 1-D ``numpy.ndarray`` holding the samples resampled to 16 kHz.
    """
    target_rate = 16000
    waveform, sample_rate = torchaudio.load(file_path)

    # torchaudio.load yields a (channels, frames) tensor by default. The ASR
    # processor expects one 1-D signal per utterance, so multi-channel files
    # are averaged down to a single channel instead of being passed on as a
    # 2-D array (which would be interpreted as a batch of utterances).
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != target_rate:
        resample = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resample(waveform)

    # Unity amplitude gain: this does not rescale or loudness-normalize the
    # signal. NOTE(review): per the torchaudio docs Vol also clamps samples to
    # [-1, 1]; the call is kept so that behaviour is unchanged.
    waveform = torchaudio.transforms.Vol(1.0, gain_type='amplitude')(waveform)

    # reshape(-1) instead of squeeze(): always 1-D, even for a one-frame clip.
    return waveform.reshape(-1).numpy()

def read_text(file_path):
    """Return the contents of a text file with surrounding whitespace removed.

    The file is decoded as UTF-8 regardless of the platform's default locale.
    ``utf-8-sig`` is used so that a leading byte-order mark, if present, is
    dropped rather than ending up as a stray character in the transcript;
    files without a BOM decode exactly as plain UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file's text as a ``str``, stripped of leading/trailing whitespace.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return file.read().strip()

def evaluate_asr_and_calculate_cer(audio_dir, text_dir, model_name='jonatasgrosman/wav2vec2-large-xlsr-53-english'):
    """Transcribe every ``.wav`` file in a directory and report its CER.

    For each ``<name>.wav`` in ``audio_dir`` the reference transcript is read
    from ``<name>.txt`` in ``text_dir``. Both the model output and the
    reference are upper-cased before the character error rate is computed
    with ``jiwer.cer``. Per-file results and the final average are printed.

    The reported average is the unweighted mean of the per-file CER values
    (every file counts equally, whatever its length).

    Args:
        audio_dir: Directory containing the ``.wav`` files to transcribe.
        text_dir: Directory containing the matching ``.txt`` references.
        model_name: Identifier passed to ``from_pretrained`` for both the
            processor and the CTC model.

    Returns:
        The average CER as a float, or ``None`` when no file could be scored.
    """
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    model.eval()

    cer_scores = []

    # os.listdir returns entries in arbitrary order; sorting makes the log
    # reproducible from run to run.
    for audio_file in sorted(os.listdir(audio_dir)):
        if not audio_file.endswith('.wav'):
            continue

        audio_path = os.path.join(audio_dir, audio_file)
        # Swap only the extension: str.replace would also rewrite any other
        # '.wav' occurrence inside the file name.
        text_file = os.path.splitext(audio_file)[0] + '.txt'
        text_path = os.path.join(text_dir, text_file)

        if not os.path.isfile(text_path):
            # Report the gap instead of silently dropping the recording.
            print(f"Skipping file {audio_file}: No reference file found at {text_path}.")
            print("---------")
            continue

        # Load and preprocess the audio
        input_audio = load_audio(audio_path)
        input_values = processor(input_audio, sampling_rate=16000, return_tensors="pt").input_values

        # Perform ASR (greedy CTC decoding: arg-max token per frame)
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0].upper()

        # Load reference transcription, case-folded the same way
        reference = read_text(text_path).upper()

        # An empty reference or hypothesis cannot be scored meaningfully.
        if not (reference and transcription):
            print(f"Skipping file {audio_file}: Reference or transcription is empty.")
            print("---------")
            continue

        cer = jiwer.cer(reference, transcription)
        cer_scores.append(cer)

        print(f"File: {audio_file}")
        print(f"Reference: {reference}")
        print(f"Transcription: {transcription}")
        print(f"CER: {cer:.4f}")
        print("---------")

    if not cer_scores:
        print("No valid samples found to compute CER.")
        return None

    average_cer = sum(cer_scores) / len(cer_scores)
    print(f"Average CER: {average_cer:.4f}")
    return average_cer

# Example usage: score every recording in the dataset's wav folder against
# the transcript of the same name in the corrected_txt folder.
audio_directory = '/content/drive/MyDrive/DataSet/wav'
text_directory = '/content/drive/MyDrive/DataSet/corrected_txt'
evaluate_asr_and_calculate_cer(
    audio_dir=audio_directory,
    text_dir=text_directory,
    model_name='jonatasgrosman/wav2vec2-large-xlsr-53-english',
)

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
Y

File: 000351e792c6e0e90a93cd73d2d8fca99950db6de0122db253aa1336.wav
Reference: WE HAVE PROVIDED IS ACTUALLY WE CAN OPTIMIZE THE REINFORCEMENT LAYERS BECAUSE AS WE SEE
Transcription: WE HAVE PROVIDED IS ACTUALLY WE CAN OPTIMIZE THE REINFORCEMENT PLAYERS BECAUSE AS WE SE
CER: 0.0230
---------
File: 000104538b2f194bfbe315e09483c0d345f1549ba72a7dd4571ad984.wav
Reference: ARE PUT INTO AN INSTRUMENT CALLED STRAIN MEASURING BRIDGE WERE THIS EMF AND CONNECTION MAGNIFICATION
Transcription: INGER PUT INTO AN INSTRUMENTAL STAIN MEASURING BRIDGE WHERE THIS EMEF AND A CONNECTION MAGNIFICATION
CER: 0.1400
---------
File: 00012fd31a06366368f7a824e76ea8f1c83311be593242717e9daea5.wav
Reference: STRAND IT IS GOING TO LINK WITH THE T ON THE OTHER STRAND NOW IF YOU LOOK VERY CAREFULLY THE
Transcription: IS STRAND IT IS GOING TO LINK WITH THE T ON THE OTHER STRANDNOW IF YOU LOOK VERY CAREFULLY
CER: 0.0870
---------
File: 0002927d79779cca83dc81d5e2f8ca026e16997e7f5840079edcb26b.wav
Reference: WITH PARAMETERS