
# Part I: Download dataset, select subset and preprocess data


In [None]:
# Clean previous data. Uncomment if data become messed up.
# !rm -rf LibriSpeech minilibri dev-clean-2*

In [None]:
!pip install datasets transformers jiwer



In [None]:
!wget https://www.openslr.org/resources/31/dev-clean-2.tar.gz

--2025-04-11 11:36:58--  https://www.openslr.org/resources/31/dev-clean-2.tar.gz
Resolving www.openslr.org (www.openslr.org)... 46.101.158.64
Connecting to www.openslr.org (www.openslr.org)|46.101.158.64|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://openslr.elda.org/resources/31/dev-clean-2.tar.gz [following]
--2025-04-11 11:36:59--  https://openslr.elda.org/resources/31/dev-clean-2.tar.gz
Resolving openslr.elda.org (openslr.elda.org)... 141.94.109.138, 2001:41d0:203:ad8a::
Connecting to openslr.elda.org (openslr.elda.org)|141.94.109.138|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 126046265 (120M) [application/x-gzip]
Saving to: ‘dev-clean-2.tar.gz’


2025-04-11 11:37:10 (12.1 MB/s) - ‘dev-clean-2.tar.gz’ saved [126046265/126046265]



In [None]:
%%capture

!tar xvf dev-clean-2.tar.gz

In [None]:
import jiwer
import numpy as np
import os
import random
import shutil
import subprocess
import torch

from datasets import Dataset, Audio, load_dataset
from pathlib import Path
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [None]:
# DONE: Set the seed to your date of birth (DDMM) e.g. 0101 for 1st of January
# The value is written as a string so a leading zero (e.g. "0101") stays legal.
SEED = int("2406") # 24th of June

# Only Python's `random` module is seeded in this cell.
random.seed(SEED)  # For more reproducible results

In [None]:
# Directory scanned for speakers, FLAC files and transcriptions.
LIBRISPEECH_DIR = "LibriSpeech/dev-clean-2"
# Directory that receives the converted WAV subset and its transcript file.
SUBSET_DIR = "minilibri"

In [None]:
def find_speakers(librispeech_dir):
  """Return the sorted list of speaker ids found under ``librispeech_dir``.

  Only sub-directories count as speakers, so stray files in the corpus
  root are ignored. The result is sorted because ``os.listdir`` returns
  entries in arbitrary order, which would otherwise make the seeded random
  selection downstream differ between machines.
  """
  return sorted(
    entry
    for entry in os.listdir(librispeech_dir)
    if os.path.isdir(os.path.join(librispeech_dir, entry))
  )


def find_files(librispeech_dir, speaker, extension):
  """Find a speaker's files with a given extension, searching recursively.

  Args:
      librispeech_dir: Root directory that contains one folder per speaker.
      speaker: Name of the speaker folder to search.
      extension: File name suffix without the leading dot (e.g. "flac").
  Returns:
      Sorted list of POSIX-style path strings. Sorting removes the
      filesystem-dependent ordering of ``glob`` so results are stable.
  """
  speaker_dir = Path(librispeech_dir) / speaker
  return sorted(f.as_posix() for f in speaker_dir.glob(f"**/*.{extension}"))


def select_random_subset(utterance_ids, num=5):
  """Shuffle a copy of ``utterance_ids`` and return its first ``num`` items.

  The input is not modified; the module-level ``random`` generator is used.
  """
  shuffled = [*utterance_ids]
  random.shuffle(shuffled)
  picked = shuffled[:num]
  return picked


def merge_transcriptions(transcription_dicts):
  """Combine several transcription dictionaries into one.

  When the same utterance id appears more than once, the value from the
  later dictionary wins.
  """
  return {
    utt_id: text
    for single in transcription_dicts
    for utt_id, text in single.items()
  }


def get_utterance_id_from_path(path):
  """Return the file name of ``path`` without its final extension."""
  audio_file = Path(path)
  return audio_file.stem


def get_speaker_from_utterance_id(utt_id):
  """Return the part of ``utt_id`` that precedes the first "-"."""
  speaker, _, _ = utt_id.partition("-")
  return speaker


def convert_flac_to_wav(flac_path, output_dir):
    """
    Converts a FLAC audio file to WAV format using ffmpeg, with a sample rate of 16000 Hz and mono audio.

    Parameters:
    flac_path (str): The path to the input FLAC file.
    output_dir (str): The directory where the output WAV file will be saved.

    The output WAV file is written directly to `{output_dir}/{utterance_id}.wav`, where the
    utterance id is the input file name without its extension. An existing output file is
    overwritten so the cell can be re-run safely. A failing ffmpeg run is reported with a
    printed message instead of being raised.
    """
    # Extract the filename without extension and build the output path
    utt_id = get_utterance_id_from_path(flac_path)
    output_file = f"{output_dir}/{utt_id}.wav"

    # Construct the ffmpeg command
    command = [
        'ffmpeg',
        '-nostdin',      # Never wait for interactive input (notebooks have no usable stdin)
        '-y',            # Overwrite an existing output file instead of failing on re-runs
        '-i', flac_path, # Input file
        '-ar', '16000',  # Set audio sample rate to 16000 Hz
        '-ac', '1',      # Set audio channels to 1 (mono)
        output_file
    ]

    # Execute the ffmpeg command; only the subprocess call can raise CalledProcessError
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
    else:
        print(f"Conversion successful, output file saved to {output_file}")

In [None]:
def find_flac_files(librispeech_dir, speaker):
  """Return the paths of every ``*.flac`` file belonging to ``speaker``."""
  flac_paths = find_files(librispeech_dir, speaker, extension="flac")
  return flac_paths


def find_transcriptions(librispeech_dir, speaker):
  """Return the paths of every ``*.trans.txt`` file belonging to ``speaker``."""
  transcript_paths = find_files(librispeech_dir, speaker, extension="trans.txt")
  return transcript_paths


def convert_flac_list_to_dict(flac_files):
  """Index a list of flac file paths by utterance id.

  The utterance id is the file name without its extension, so the result
  has the shape ``{utterance_id: flac_filepath}``.
  """
  by_utterance = {}
  for flac_path in flac_files:
    by_utterance[Path(flac_path).stem] = flac_path
  return by_utterance


def read_transcription(transcription_file, lowercase=False):
  """Read a transcription file.

  Each non-empty line has the form ``utterance_id word word ...``. Runs of
  whitespace inside the transcription are collapsed to single spaces.

  Args:
      transcription_file: Path of the ``*.trans.txt`` file to read (UTF-8).
      lowercase: When True, the transcription text is converted to lower
        case. The utterance id is never altered.
  Returns:
      A dictionary with all transcriptions in the format:
      {
        utterance_id: transcription_text
      }
  """
  transcription_dict = {}
  with open(transcription_file, 'r', encoding='utf-8') as file:
    for line in file:
      fields = line.split()
      if not fields:
        # Blank lines (e.g. a trailing newline) carry no utterance; skip them
        # instead of failing on the missing id.
        continue
      utterance_id, *words = fields
      transcription_text = " ".join(words)
      if lowercase:
        transcription_text = transcription_text.lower()
      transcription_dict[utterance_id] = transcription_text

  return transcription_dict

def create_librispeech_subset(librispeech_dir):
  """Selects a subset of 5 utterances for each speaker and returns 2 dictionaries
  1. flac_dict: maps the selected utterance ids to the corresponding flac files
  2. transcription_dict: maps the selected utterance ids to the corresponding
     transcriptions
  How:
    a. Get all speakers (visited in sorted order)
    b. For each speaker
      i. get all flac files for that speaker
      ii. read all transcriptions for that speaker
      iii. select a random subset of the utterance ids that have both a flac
        file and a transcription
      iv. populate the selected flacs dict and the selected transcriptions
        dict for the chosen utterance ids
    c. Return selected_flacs, selected_transcriptions

  Speakers and candidate ids are sorted before sampling. Directory listing
  order and set iteration order of strings are not stable between runs, so
  without sorting the seeded ``random`` generator would pick a different
  subset on every run.
  """
  selected_flac_files = {}
  selected_transcriptions = {}

  for speaker in sorted(find_speakers(librispeech_dir)):
    flac_dict = convert_flac_list_to_dict(find_flac_files(librispeech_dir, speaker))
    transcription_dict = merge_transcriptions(
      [read_transcription(file) for file in find_transcriptions(librispeech_dir, speaker)]
    )

    # Keep only ids present in both sources, in a deterministic order.
    ids = sorted(flac_dict.keys() & transcription_dict.keys())

    for utterance_id in select_random_subset(ids, num=5):
      selected_flac_files[utterance_id] = flac_dict[utterance_id]
      selected_transcriptions[utterance_id] = transcription_dict[utterance_id]

  return selected_flac_files, selected_transcriptions

def write_trans_txt(transcription_dict, output_file):
  """Dump a transcription dictionary to ``output_file`` (UTF-8).

  One line is written per entry, in dictionary order, in the format:
    utterance_id this is the transcript
  """
  lines = (
    f"{utt_id} {text}\n"
    for utt_id, text in transcription_dict.items()
  )
  with open(output_file, 'w', encoding='utf-8') as out:
    out.writelines(lines)


def write_subset_to_disk(flac_dict, transcription_dict, output_dir):
  """Materialise the selected subset inside ``output_dir``.

  1. Create output_dir (no error if it already exists)
  2. Convert every flac file in flac_dict to wav inside {output_dir}
  3. Write transcription_dict to {output_dir}/transcript.trans.txt
  """
  os.makedirs(output_dir, exist_ok=True)

  # Only the paths are needed for the conversion step.
  for source_flac in flac_dict.values():
    convert_flac_to_wav(source_flac, output_dir)

  write_trans_txt(transcription_dict, f"{output_dir}/transcript.trans.txt")

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): no later cell visible here reads from the mount - confirm it is needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Call implemented functions:
# pick the per-speaker utterances from LIBRISPEECH_DIR, then convert them to
# WAV and write the transcript file into SUBSET_DIR.
selected_flacs, selected_transcripts = create_librispeech_subset(LIBRISPEECH_DIR)
write_subset_to_disk(selected_flacs, selected_transcripts, SUBSET_DIR)

Conversion successful, output file saved to minilibri/6319-57405-0003.wav
Conversion successful, output file saved to minilibri/6319-57405-0007.wav
Conversion successful, output file saved to minilibri/6319-57405-0006.wav
Conversion successful, output file saved to minilibri/6319-57405-0010.wav
Conversion successful, output file saved to minilibri/6319-57405-0009.wav
Conversion successful, output file saved to minilibri/2803-154320-0002.wav
Conversion successful, output file saved to minilibri/2803-161169-0008.wav
Conversion successful, output file saved to minilibri/2803-154320-0006.wav
Conversion successful, output file saved to minilibri/2803-154320-0014.wav
Conversion successful, output file saved to minilibri/2803-161169-0013.wav
Conversion successful, output file saved to minilibri/1988-147956-0025.wav
Conversion successful, output file saved to minilibri/1988-24833-0015.wav
Conversion successful, output file saved to minilibri/1988-147956-0026.wav
Conversion successful, output f

# Part II: Convert data to huggingface dataset

In [None]:
def make_wav_dataset(subset_dir):
  """Create a huggingface dataset based on a list of wav files
  1. Use glob to get a list of wav files from the subset_dir
  2. Use Dataset.from_dict to create a dataset from the list of wav files
  3. Use cast_column to cast the "audio" column to Audio()

  The paths are sorted so that row order (and therefore indexing such as
  ``dataset[20]``) does not depend on the filesystem's listing order.
  """
  # Path.glob() returns an iterator of Path objects; Dataset.from_dict needs a
  # list of strings, so the paths are materialised and converted here.
  list_of_wavs = sorted(str(p) for p in Path(subset_dir).glob("*.wav"))
  dataset = Dataset.from_dict({"audio": list_of_wavs})  # one row per wav path
  dataset = dataset.cast_column("audio", Audio())       # cast the column to the Audio type
  return dataset

def augment_dataset_with_ids(dataset):
  """Add an "id" column holding each sample's utterance id.

  The id is derived from the sample's audio file path with
  get_utterance_id_from_path and attached through dataset.map.
  """
  return dataset.map(
    lambda sample: {"id": get_utterance_id_from_path(sample["audio"]["path"])}
  )


def augment_dataset_with_transcriptions(dataset, subset_dir):
  """Add a "text" column holding each sample's transcription.

  The transcriptions are read once from {subset_dir}/transcript.trans.txt
  and looked up per sample through its "id" column.
  """
  id_to_text = read_transcription(Path(subset_dir) / "transcript.trans.txt")

  def lookup_text(sample):
    # A missing id raises KeyError, exactly like a plain dictionary lookup.
    return {"text": id_to_text[sample["id"]]}

  return dataset.map(lookup_text)

def augment_dataset_with_speakers(dataset):
  """Add a "speaker" column derived from each sample's "id" column.

  Uses dataset.map together with get_speaker_from_utterance_id.
  """
  return dataset.map(
    lambda sample: {"speaker": get_speaker_from_utterance_id(sample["id"])}
  )

In [None]:
def make_dataset(subset_dir):
  """Build the minilibrispeech dataset from the wav files in ``subset_dir``.

  Columns are added in order: "audio", then "id", "text" and "speaker".
  """
  wavs = make_wav_dataset(subset_dir)
  with_ids = augment_dataset_with_ids(wavs)
  with_text = augment_dataset_with_transcriptions(with_ids, subset_dir)
  return augment_dataset_with_speakers(with_text)

# Part III: Augment dataset with noise

In [None]:
def download_noise_dataset():
  """Load the "train" split of the Nexdata/Scene_Noise_Data noise recordings
  from the huggingface hub, with its "audio" column cast to 16000 Hz.
  """
  raw_noise = load_dataset("Nexdata/Scene_Noise_Data", split="train")
  target_audio = Audio(sampling_rate=16000)  # resample to 16 kHz
  return raw_noise.cast_column("audio", target_audio)


def adjust_noise_length(noise, target_length):
  """Adjust the length of the noise to match the target length.

  Longer noise is truncated; shorter noise is repeated end-to-end and then
  truncated. Noise that already has the target length is returned as is.

  Args:
      noise (numpy.ndarray): Noise signal.
      target_length (int): Desired length of the noise signal.
  Returns:
      numpy.ndarray: Adjusted noise signal.
  Raises:
      ValueError: If ``noise`` is empty and ``target_length`` is positive,
        because an empty signal cannot be repeated to fill any length.
  """
  noise_length = len(noise)
  if noise_length == target_length:
    return noise
  if noise_length > target_length:
    return noise[:target_length]
  if noise_length == 0:
    raise ValueError("Cannot extend an empty noise signal to a non-zero length.")
  # Ceiling division: the smallest number of copies that covers target_length.
  repeat_count = -(-target_length // noise_length)
  return np.tile(noise, repeat_count)[:target_length]


def snr_db_to_linear(snr_db):
  """
  Turn an SNR expressed in decibels (dB) into a plain power ratio.
  Args:
      snr_db (float): SNR value in decibels.
  Returns:
      float: The SNR as a linear ratio, i.e. 10 raised to snr_db / 10.
  """
  exponent = snr_db / 10
  return pow(10, exponent)


def snr_linear_to_db(snr):
  """
  Turn an SNR expressed as a linear ratio into decibels (dB).
  Args:
      snr (float): SNR value as a ratio.
  Returns:
      float: SNR in decibels (dB), i.e. 10 times the base-10 logarithm.
  """
  log_ratio = np.log10(snr)
  return log_ratio * 10

In [None]:
def calculate_power(signal):
  """
  Return the power of a signal, defined here as the sum of its squared
  samples (no normalisation by the signal length).
  """
  squared = signal * signal
  return squared.sum()


def calculate_snr(signal, noise):
  """
  Return the linear signal-to-noise ratio of the given signal and noise
  arrays: the power of the signal divided by the power of the noise.
  """
  powers = [calculate_power(samples) for samples in (signal, noise)]
  signal_power, noise_power = powers
  return signal_power / noise_power

def calculate_noise_coefficient(signal, noise, desired_snr_db):
  """
  Calculate the coefficient by which the noise must be multiplied so that
  signal and scaled noise have the desired SNR in dB.

  1. Calculate the signal and the noise power
  2. Convert the desired SNR from dB to linear scale
  3. Calculate the target noise power as signal_power / desired_snr
  4. Calculate the coefficient as the square root of
     target_noise_power / noise_power

  If the noise has zero power (all samples are 0) no coefficient can reach
  the target SNR. In that case 0.0 is returned: scaling silent noise by any
  finite value adds nothing, whereas the unguarded division would produce
  an infinite coefficient and turn the mixed audio into NaN.
  """
  signal_power = calculate_power(signal)
  noise_power = calculate_power(noise)

  if noise_power == 0:
    return 0.0

  linear_snr = snr_db_to_linear(desired_snr_db)
  target_noise_power = signal_power / linear_snr

  return np.sqrt(target_noise_power / noise_power)

def add_noise_to_signal(signal, noise, desired_snr_db):
  """
  Mix noise into a signal at a specified SNR and return the noisy signal.
  1. Fit the noise to the length of the signal
  2. Compute the scaling coefficient for the target snr_db
  3. Return the signal plus the scaled noise
  """
  fitted_noise = adjust_noise_length(noise, len(signal))
  gain = calculate_noise_coefficient(signal, fitted_noise, desired_snr_db)
  return signal + fitted_noise * gain

def make_noisy_dataset(audio_dataset, noise_dataset, desired_snr_db):
  """
  Return a copy of audio_dataset in which every audio signal has noise
  mixed in at the specified SNR, applied through dataset.map.
  """
  def process_sample(sample):
    """
    Pick a random recording from noise_dataset, mix it into the sample's
    audio array with add_noise_to_signal, and store the result back in
    sample["audio"]["array"].
    """
    picked_noise = random.choice (noise_dataset)
    sample["audio"]["array"] = add_noise_to_signal(
      sample["audio"]["array"],
      picked_noise ["audio"]["array"],
      desired_snr_db,
    )
    return sample

  return audio_dataset.map(process_sample)

In [None]:
# Build the clean dataset from the WAV subset, download the noise recordings,
# then derive three noisy copies at 3, 6 and 9 dB SNR (lower = noisier).
dataset_clean = make_dataset(SUBSET_DIR)
noise_dataset = download_noise_dataset()
dataset_snr_3 = make_noisy_dataset(dataset_clean, noise_dataset, 3)
dataset_snr_6 = make_noisy_dataset(dataset_clean, noise_dataset, 6)
dataset_snr_9 = make_noisy_dataset(dataset_clean, noise_dataset, 9)

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.12k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/30 [00:00<?, ?files/s]

G0001_P0002_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0002_P0001_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0001_P0005_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0002_P0006_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0002_P0004_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0002_P0002_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0001_P0006_01.wav:   0%|          | 0.00/1.88M [00:00<?, ?B/s]

G0002_P0005_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0001_P0004_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0001_P0001_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0002_P0008_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0002_P0007_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0002_P0003_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0005_P0003_01.wav:   0%|          | 0.00/1.97M [00:00<?, ?B/s]

G0005_P0004_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0001_P0007_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0001_P0003_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0005_P0005_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0005_P0006_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0005_P0008_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0005_P0007_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0005_P0010_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0006_P0001_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0006_P0004_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0005_P0011_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0006_P0002_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0006_P0005_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0006_P0006_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0006_P0007_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

G0006_P0003_01.wav:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

In [None]:
# experimenting listening to an audio sample to verify that the noise was added correctly - Listen to examples of the noisy data to understand how each noise level is perceived
from IPython.display import Audio

sample = dataset_snr_3[20]
Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])

In [None]:
sample = dataset_snr_6[20]
Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])

In [None]:
sample = dataset_snr_9[20]
Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])

# Part IV: Create speaker specific subsets

In [None]:
def filter_dataset_for_speaker(dataset, speaker):
  """Return a new dataset holding only the samples whose "speaker" column
  equals ``speaker`` (selected with dataset.filter).
  """
  return dataset.filter(lambda sample: sample["speaker"] == speaker)

In [None]:
# Speaker ids are strings because the "speaker" column is derived from the
# utterance id string.
SPEAKER1 = "1272"
SPEAKER2 = "3000"

# Per-speaker subsets of the clean data
dataset_speaker_1 = filter_dataset_for_speaker(dataset_clean, SPEAKER1)
dataset_speaker_2 = filter_dataset_for_speaker(dataset_clean, SPEAKER2)

# Per-speaker subsets of the noisiest (3 dB SNR) data
dataset_snr_3_speaker_1 = filter_dataset_for_speaker(dataset_snr_3, SPEAKER1)
dataset_snr_3_speaker_2 = filter_dataset_for_speaker(dataset_snr_3, SPEAKER2)

Filter:   0%|          | 0/130 [00:00<?, ? examples/s]

Filter:   0%|          | 0/130 [00:00<?, ? examples/s]

Filter:   0%|          | 0/130 [00:00<?, ? examples/s]

Filter:   0%|          | 0/130 [00:00<?, ? examples/s]

In [None]:
# Sanity check: confirm that each filtered dataset still contains samples.
print(len(dataset_speaker_1))
print(len(dataset_speaker_2))
print(len(dataset_snr_3_speaker_1))
print(len(dataset_snr_3_speaker_2))

5
5
5
5


# Part V: Experiments with wav2vec2

In [None]:
# Hugging Face hub ids of the three wav2vec2 checkpoints passed to
# get_model_and_processor in the experiments.
WAV2VEC2_BASE = "facebook/wav2vec2-base-960h"
WAV2VEC2_LARGE = "facebook/wav2vec2-large-960h"
WAV2VEC2_LARGE_SELF = "facebook/wav2vec2-large-960h-lv60-self"

In [None]:
def get_model_and_processor(model_name):
  """Load the pretrained wav2vec2 CTC model and its processor for
  ``model_name`` and return them as a ``(model, processor)`` pair.
  """
  return (
    Wav2Vec2ForCTC.from_pretrained(model_name),
    Wav2Vec2Processor.from_pretrained(model_name),
  )


def calculate_wer(reference, hypothesis):
  """Return the Word Error Rate, as a percentage, between the reference
  (ground truth) text and the hypothesis (predicted) text.
  """
  error_rate = jiwer.wer(reference, hypothesis)
  return error_rate * 100


def calculate_cer(reference, hypothesis):
  """Return the Character Error Rate, as a percentage, between the
  reference (ground truth) text and the hypothesis (predicted) text.
  """
  error_rate = jiwer.cer(reference, hypothesis)
  return error_rate * 100

In [None]:
def evaluate(model, processor, dataset):
  """Transcribe every sample of ``dataset`` with ``model`` and score the result.

  Args:
      model: wav2vec2 CTC model; it is moved to the selected device and put
        in eval mode.
      processor: Processor matching ``model``; used to prepare the audio and
        to decode the predicted ids.
      dataset: Dataset with an "audio" column and a "text" column holding the
        reference transcription.
  Returns:
      Tuple ``(WER, CER)``, both as percentages over the whole dataset.
  """
  # Use the GPU when one is present, otherwise fall back to the CPU instead
  # of crashing on a runtime without CUDA.
  device = "cuda" if torch.cuda.is_available() else "cpu"
  model = model.to(device)
  model.eval()

  def map_to_pred(batch):
    """
    1. Pass batch["audio"]["array"] through the processor and return
      pytorch (pt) tensors. Get the input values of the resulting object.
    2. Use with torch.no_grad() to disable gradients
      Calculate the logits by passing the input values through the model
    3. Get the predicted ids using argmax on the logits
    4. Get the transcription by using processor.batch_decode
    5. Set batch["transcription"] to the transcription
    """
    # NOTE(review): the sampling rate is hard-coded to 16000 here; it assumes
    # every dataset passed in holds 16 kHz audio - confirm for new datasets.
    inputs = processor(batch["audio"]["array"], sampling_rate=16000, return_tensors="pt", padding=True)

    with torch.no_grad():  # inference only, no gradients needed
      logits = model(inputs.input_values.to(device)).logits

    predicted_ids = torch.argmax(logits, dim=-1)  # most likely token per frame
    # One sample is processed per call, so take the single decoded string.
    batch["transcription"] = processor.batch_decode(predicted_ids)[0]
    return batch

  result = dataset.map(map_to_pred, batched=False, remove_columns=["audio"])
  WER = calculate_wer(result["text"], result["transcription"])
  CER = calculate_cer(result["text"], result["transcription"])
  return WER, CER

In [None]:
# Run experiment on clean data - WAV2VEC2-BASE
# evaluate() returns WER and CER as percentages.
model_base, processor_base = get_model_and_processor(WAV2VEC2_BASE)
wer_clean, cer_clean = evaluate(model_base, processor_base, dataset_clean)
print(f"WER - Dataset clean - wav2vec2 base: {wer_clean}")
print(f"CER - Dataset clean - wav2vec2 base: {cer_clean}")

# DONE: Run the rest of the experiments for the noisy and the single speaker datasets for wav2vec2-base
# DONE: Run the rest of the experiments for the clean, noisy, and the single speaker datasets for wav2vec2-large
# DONE: Run the rest of the experiments for the clean, noisy, and the single speaker datasets for wav2vec2-large-self

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

WER - Dataset clean - wav2vec2 base: 3.6475409836065573
CER - Dataset clean - wav2vec2 base: 1.2459838570644932


In [None]:
# WAV2VEC2-BASE for noisy datasets (SNR 3, 6, 9)
# The base model is loaded again here, so this cell does not depend on the
# clean-data cell having been run first.
model_base, processor_base = get_model_and_processor(WAV2VEC2_BASE)
wer_snr3_base, cer_snr3_base = evaluate(model_base,processor_base,dataset_snr_3)
print(f"WER - SNR 3 - wav2vec2 base: {wer_snr3_base}")
print(f"CER - SNR 3 - wav2vec2 base: {cer_snr3_base}")

wer_snr6_base, cer_snr6_base = evaluate(model_base,processor_base,dataset_snr_6)
print(f"WER - SNR 6 - wav2vec2 base: {wer_snr6_base}")
print(f"CER - SNR 6 - wav2vec2 base: {cer_snr6_base}")

wer_snr9_base, cer_snr9_base = evaluate(model_base,processor_base,dataset_snr_9)
print(f"WER - SNR 9 - wav2vec2 base: {wer_snr9_base}")
print(f"CER - SNR 9 - wav2vec2 base: {cer_snr9_base}")

# WAV2VEC2-BASE for single speaker clean dataset | speakers 1 and 2
wer_clean_speaker1_base, cer_clean_speaker1_base = evaluate(model_base, processor_base, dataset_speaker_1)
print(f"WER - Clean speaker 1 - wav2vec2 base: {wer_clean_speaker1_base}")
print(f"CER - Clean speaker 1 - wav2vec2 base: {cer_clean_speaker1_base}")

wer_clean_speaker2_base, cer_clean_speaker2_base = evaluate(model_base, processor_base, dataset_speaker_2)
print(f"WER - Clean speaker 2 - wav2vec2 base: {wer_clean_speaker2_base}")
print(f"CER - Clean speaker 2 - wav2vec2 base: {cer_clean_speaker2_base}")

# WAV2VEC2-BASE for single speaker noisy (SNR 3) | speakers 1 and 2
wer_noisy_speaker1_base, cer_noisy_speaker1_base = evaluate(model_base, processor_base, dataset_snr_3_speaker_1)
print(f"WER - Noisy SNR 3 speaker 1- wav2vec2 base: {wer_noisy_speaker1_base}")
print(f"CER - Noisy SNR 3 speaker 1- wav2vec2 base: {cer_noisy_speaker1_base}")

wer_noisy_speaker2_base, cer_noisy_speaker2_base = evaluate(model_base, processor_base, dataset_snr_3_speaker_2)
print(f"WER - Noisy SNR 3 speaker 2- wav2vec2 base: {wer_noisy_speaker2_base}")
print(f"CER - Noisy SNR 3 speaker 2- wav2vec2 base: {cer_noisy_speaker2_base}")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

WER - SNR 3 - wav2vec2 base: 46.0655737704918
CER - SNR 3 - wav2vec2 base: 26.126479116056732


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

WER - SNR 6 - wav2vec2 base: 24.016393442622952
CER - SNR 6 - wav2vec2 base: 11.127654572525664


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

WER - SNR 9 - wav2vec2 base: 13.483606557377048
CER - SNR 9 - wav2vec2 base: 5.532481780424732


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

WER - Clean speaker 1 - wav2vec2 base: 6.493506493506493
CER - Clean speaker 1 - wav2vec2 base: 1.932367149758454


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

WER - Clean speaker 2 - wav2vec2 base: 4.3478260869565215
CER - Clean speaker 2 - wav2vec2 base: 1.6326530612244898


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

WER - Noisy SNR 3 speaker 1- wav2vec2 base: 77.92207792207793
CER - Noisy SNR 3 speaker 1- wav2vec2 base: 48.309178743961354


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

WER - Noisy SNR 3 speaker 2- wav2vec2 base: 73.91304347826086
CER - Noisy SNR 3 speaker 2- wav2vec2 base: 42.6530612244898


In [None]:
# Apply Wav2Vec2-Large-960h
# Same experiment grid as for the base model: clean, SNR 3/6/9, and the two
# single-speaker subsets (clean and SNR 3).

model_large, processor_large = get_model_and_processor(WAV2VEC2_LARGE)

# Wav2Vec2-Large for clean dataset
wer_clean_large, cer_clean_large = evaluate(model_large, processor_large, dataset_clean)
print(f"WER - Clean - wav2vec2 large: {wer_clean_large}")
print(f"CER - Clean - wav2vec2 large: {cer_clean_large}")

# Wav2Vec2-Large for SNR noisy dataset
wer_snr3_large, cer_snr3_large = evaluate(model_large, processor_large, dataset_snr_3)
print(f"WER - SNR 3 - wav2vec2 large: {wer_snr3_large}")
print(f"CER - SNR 3 - wav2vec2 large: {cer_snr3_large}")

wer_snr6_large, cer_snr6_large = evaluate(model_large, processor_large, dataset_snr_6)
print(f"WER - SNR 6 - wav2vec2 large: {wer_snr6_large}")
print(f"CER - SNR 6 - wav2vec2 large: {cer_snr6_large}")

wer_snr9_large, cer_snr9_large = evaluate(model_large, processor_large, dataset_snr_9)
print(f"WER - SNR 9 - wav2vec2 large: {wer_snr9_large}")
print(f"CER - SNR 9 - wav2vec2 large: {cer_snr9_large}")

# Wav2Vec2-Large for single speaker clean dataset | speakers 1 and 2
wer_clean_speaker1_large, cer_clean_speaker1_large = evaluate(model_large, processor_large, dataset_speaker_1)
print(f"WER - Clean speaker 1 - wav2vec2 large: {wer_clean_speaker1_large}")
print(f"CER - Clean speaker 1 - wav2vec2 large: {cer_clean_speaker1_large}")

wer_clean_speaker2_large, cer_clean_speaker2_large = evaluate(model_large, processor_large, dataset_speaker_2)
print(f"WER - Clean speaker 2 - wav2vec2 large: {wer_clean_speaker2_large}")
print(f"CER - Clean speaker 2 - wav2vec2 large: {cer_clean_speaker2_large}")

# WAV2VEC2-Large for single speaker noisy (SNR 3) | speakers 1 and 2
wer_noisy_speaker1_large, cer_noisy_speaker1_large = evaluate(model_large, processor_large, dataset_snr_3_speaker_1)
print(f"WER - Noisy SNR 3 speaker 1- wav2vec2 large: {wer_noisy_speaker1_large}")
print(f"CER - Noisy SNR 3 speaker 1- wav2vec2 large: {cer_noisy_speaker1_large}")

wer_noisy_speaker2_large, cer_noisy_speaker2_large = evaluate(model_large, processor_large, dataset_snr_3_speaker_2)
print(f"WER - Noisy SNR 3 speaker 2- wav2vec2 large: {wer_noisy_speaker2_large}")
print(f"CER - Noisy SNR 3 speaker 2- wav2vec2 large: {cer_noisy_speaker2_large}")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

WER - Clean - wav2vec2 large: 3.0737704918032787
CER - Clean - wav2vec2 large: 0.956037928062064


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

WER - SNR 3 - wav2vec2 large: 25.573770491803277
CER - SNR 3 - wav2vec2 large: 13.768513439385627


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

WER - SNR 6 - wav2vec2 large: 13.647540983606557
CER - SNR 6 - wav2vec2 large: 6.261264791160567


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

WER - SNR 9 - wav2vec2 large: 7.745901639344262
CER - SNR 9 - wav2vec2 large: 3.1502233367290966


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

WER - Clean speaker 1 - wav2vec2 large: 6.493506493506493
CER - Clean speaker 1 - wav2vec2 large: 2.657004830917874


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

WER - Clean speaker 2 - wav2vec2 large: 2.1739130434782608
CER - Clean speaker 2 - wav2vec2 large: 0.6122448979591837


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

WER - Noisy SNR 3 speaker 1- wav2vec2 large: 48.05194805194805
CER - Noisy SNR 3 speaker 1- wav2vec2 large: 25.845410628019323


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

WER - Noisy SNR 3 speaker 2- wav2vec2 large: 48.91304347826087
CER - Noisy SNR 3 speaker 2- wav2vec2 large: 28.163265306122447


In [None]:
# Load Wav2Vec2-Large-960h-Lv60 + Self-Training and score it on every dataset.
model_self, processor_self = get_model_and_processor(WAV2VEC2_LARGE_SELF)

# Clean dataset
wer_clean_self, cer_clean_self = evaluate(
    model_self, processor_self, dataset_clean
)
print(
    f"WER - Clean - wav2vec2 large-self: {wer_clean_self}",
    f"CER - Clean - wav2vec2 large-self: {cer_clean_self}",
    sep="\n",
)

# Noisy datasets: SNR 3, SNR 6 and SNR 9
wer_snr3_self, cer_snr3_self = evaluate(
    model_self, processor_self, dataset_snr_3
)
print(
    f"WER - SNR 3 - wav2vec2 large-self: {wer_snr3_self}",
    f"CER - SNR 3 - wav2vec2 large-self: {cer_snr3_self}",
    sep="\n",
)

wer_snr6_self, cer_snr6_self = evaluate(
    model_self, processor_self, dataset_snr_6
)
print(
    f"WER - SNR 6 - wav2vec2 large-self: {wer_snr6_self}",
    f"CER - SNR 6 - wav2vec2 large-self: {cer_snr6_self}",
    sep="\n",
)

wer_snr9_self, cer_snr9_self = evaluate(
    model_self, processor_self, dataset_snr_9
)
print(
    f"WER - SNR 9 - wav2vec2 large-self: {wer_snr9_self}",
    f"CER - SNR 9 - wav2vec2 large-self: {cer_snr9_self}",
    sep="\n",
)

# Clean single-speaker datasets (speakers 1 and 2)
wer_clean_speaker1_self, cer_clean_speaker1_self = evaluate(
    model_self, processor_self, dataset_speaker_1
)
print(
    f"WER - Clean speaker 1 - wav2vec2 large-self: {wer_clean_speaker1_self}",
    f"CER - Clean speaker 1 - wav2vec2 large-self: {cer_clean_speaker1_self}",
    sep="\n",
)

wer_clean_speaker2_self, cer_clean_speaker2_self = evaluate(
    model_self, processor_self, dataset_speaker_2
)
print(
    f"WER - Clean speaker 2 - wav2vec2 large-self: {wer_clean_speaker2_self}",
    f"CER - Clean speaker 2 - wav2vec2 large-self: {cer_clean_speaker2_self}",
    sep="\n",
)

# Noisy (SNR 3) single-speaker datasets (speakers 1 and 2)
wer_noisy_speaker1_self, cer_noisy_speaker1_self = evaluate(
    model_self, processor_self, dataset_snr_3_speaker_1
)
print(
    f"WER - Noisy SNR 3 speaker 1 - wav2vec2 large-self: {wer_noisy_speaker1_self}",
    f"CER - Noisy SNR 3 speaker 1 - wav2vec2 large-self: {cer_noisy_speaker1_self}",
    sep="\n",
)

wer_noisy_speaker2_self, cer_noisy_speaker2_self = evaluate(
    model_self, processor_self, dataset_snr_3_speaker_2
)
print(
    f"WER - Noisy SNR 3 speaker 2 - wav2vec2 large-self: {wer_noisy_speaker2_self}",
    f"CER - Noisy SNR 3 speaker 2 - wav2vec2 large-self: {cer_noisy_speaker2_self}",
    sep="\n",
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

WER - Clean - wav2vec2 large-self: 1.8032786885245904
CER - Clean - wav2vec2 large-self: 0.5720554815453335


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

WER - SNR 3 - wav2vec2 large-self: 14.262295081967213
CER - SNR 3 - wav2vec2 large-self: 7.569939659901262


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

WER - SNR 6 - wav2vec2 large-self: 6.024590163934426
CER - SNR 6 - wav2vec2 large-self: 2.5860042316432885


Map:   0%|          | 0/130 [00:00<?, ? examples/s]

WER - SNR 9 - wav2vec2 large-self: 3.6475409836065573
CER - SNR 9 - wav2vec2 large-self: 1.496747903769297


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

WER - Clean speaker 1 - wav2vec2 large-self: 5.194805194805195
CER - Clean speaker 1 - wav2vec2 large-self: 1.6908212560386473


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

WER - Clean speaker 2 - wav2vec2 large-self: 1.0869565217391304
CER - Clean speaker 2 - wav2vec2 large-self: 0.20408163265306123


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

WER - Noisy SNR 3 speaker 1 - wav2vec2 large-self: 46.75324675324675
CER - Noisy SNR 3 speaker 1 - wav2vec2 large-self: 22.22222222222222


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

WER - Noisy SNR 3 speaker 2 - wav2vec2 large-self: 20.652173913043477
CER - Noisy SNR 3 speaker 2 - wav2vec2 large-self: 10.408163265306122


**Part V: Experiments with wav2vec2**

From here on I look at the exact errors in 5 indicative transcriptions per dataset (not all of them, because of time and computational cost).

Sub-question 3. Inspect the resulting transcriptions and comment on the errors (for each model and dataset)

In [None]:
# The evaluation code is re-defined here so that it also returns the mapped
# dataset: metrics and transcriptions are then available separately, and the
# exact errors (prediction vs. target) can be inspected per dataset and model.
# NOTE: this shadows the earlier two-value `evaluate`; cells that unpack only
# (WER, CER) must be run before this cell or be updated to unpack three values.
def evaluate(model, processor, dataset, device=None):
  """Transcribe every example of `dataset` with `model` and score the output.

  Args:
    model: Model that is moved to `device`, put in eval mode and called on the
      processor's input values; its `.logits` attribute is read.
    processor: Processor used both to build the model inputs and to decode the
      predicted ids back to text.
    dataset: Dataset exposing `.map`, an "audio" column (read through
      `["audio"]["array"]`) and a "text" column with the reference text.
    device: Device to run on. When None (default), "cuda" is used if
      available, otherwise "cpu".

  Returns:
    Tuple `(WER, CER, result)`: the values returned by `calculate_wer` and
    `calculate_cer` for reference vs. predicted text, and the mapped dataset,
    which has a new "transcription" column and no "audio" column.
  """
  if device is None:
    # Fall back to CPU instead of crashing when no GPU is present.
    device = "cuda" if torch.cuda.is_available() else "cpu"

  model = model.to(device)
  model.eval()

  def map_to_pred(batch):
    """Add the predicted transcription of a single example to `batch`."""
    # The audio is passed to the processor with a fixed 16 kHz sampling rate.
    inputs = processor(batch["audio"]["array"], sampling_rate=16000, return_tensors="pt", padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
      logits = model(inputs.input_values.to(device)).logits

    # dim=-1 takes the argmax over the last axis of the logits
    # (presumably the vocabulary axis, i.e. one token id per frame - TODO confirm).
    predicted_ids = torch.argmax(logits, dim=-1)
    # map() is called with batched=False, so exactly one example is decoded;
    # keep the single string rather than a one-element list.
    batch["transcription"] = processor.batch_decode(predicted_ids)[0]
    return batch

  result = dataset.map(map_to_pred, batched=False, remove_columns=["audio"])
  WER = calculate_wer(result["text"], result["transcription"])
  CER = calculate_cer(result["text"], result["transcription"])
  # The mapped dataset is returned too, so ground truth and prediction can be compared.
  return WER, CER, result

In [None]:
# Wav2Vec2-Base on the clean dataset: print the first 5 reference/prediction pairs.
wer_clean, cer_clean, result_clean = evaluate(
    model_base, processor_base, dataset_clean
)

for idx in range(5):
    row = result_clean[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

Map:   0%|          | 0/130 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : NO MISTER HARDWIGG SAID THE CAPTAIN NO FEAR OF THAT
Predicted text : NO MISTER HARDWIG SAID THE CAPTAIN NO FEAR OF THAT

-Sample 2
Ground truth   : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL
Predicted text : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL

-Sample 3
Ground truth   : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE TAKEN AS PENANCES THEY WILL LODGE YOU IN HEAVEN IN A TRICE
Predicted text : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THE

The wav2vec2-base model performs very well on clean data, with most transcriptions matching the ground truth exactly. The word “HARDWIGG” was transcribed as “HARDWIG,” which is a small truncation error. In Sample 4, an extra “A” appears before “HIGHWAYMAN,” changing “BETTER FRIAR THAN HIGHWAYMAN” to “BETTER FRIAR THAN A HIGHWAYMAN,” which is a minor insertion. In general, these small errors are rare and do not significantly affect overall intelligibility.







In [None]:
# Wav2Vec2-Base on the noisy datasets: for each SNR level, evaluate and then
# print the first 5 reference/prediction pairs.

# SNR 3
wer_snr3_base, cer_snr3_base, result_snr3_base = evaluate(
    model_base, processor_base, dataset_snr_3
)
for idx in range(5):
    row = result_snr3_base[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

# SNR 6
wer_snr6_base, cer_snr6_base, result_snr6_base = evaluate(
    model_base, processor_base, dataset_snr_6
)
for idx in range(5):
    row = result_snr6_base[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

# SNR 9
wer_snr9_base, cer_snr9_base, result_snr9_base = evaluate(
    model_base, processor_base, dataset_snr_9
)
for idx in range(5):
    row = result_snr9_base[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

Map:   0%|          | 0/130 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : NO MISTER HARDWIGG SAID THE CAPTAIN NO FEAR OF THAT
Predicted text : OH MISTER HARDWICG SAID THE CAPTAIN I FEAR THAT

-Sample 2
Ground truth   : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL
Predicted text : EACH MUST CRY LOUDER AND WANTER FATHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHT

-Sample 3
Ground truth   : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE TAKEN AS PENANCES THEY WILL LODGE YOU IN HEAVEN IN A TRICE
Predicted text : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNNEY AND PUT YOURSELF EASILY IN WAY OF SALVATION COME WITH MEA AND I WILL SOW YOU GOTTL BE COM TAN MY LAWYER DOT A CALLING WHERIN FUAT HARD SHIPS AND MISHAPS ARE ENCOUNTERED GET IF THEY BE TAKEN HER

Map:   0%|          | 0/130 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : NO MISTER HARDWIGG SAID THE CAPTAIN NO FEAR OF THAT
Predicted text : NO MISTER HARDWICG SAID THE CAPTAIN NO FEAR OF THAT

-Sample 2
Ground truth   : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL
Predicted text : EACH MUST CRY LOUDER AND WANTER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIP

-Sample 3
Ground truth   : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE TAKEN AS PENANCES THEY WILL LODGE YOU IN HEAVEN IN A TRICE
Predicted text : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNY AND PUT YOURSELF EASILY G WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A NIGHT BAST A CALLING WHERE SO MANY HARDSHIPS AND MISHALPS OF ER EN COUNTER L THAT IF THEY BE TAKEN A

Map:   0%|          | 0/130 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : NO MISTER HARDWIGG SAID THE CAPTAIN NO FEAR OF THAT
Predicted text : NO MISTER HARDWICG SAID THE CAPTAIN NO FEAR OF THAT

-Sample 2
Ground truth   : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL
Predicted text : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL

-Sample 3
Ground truth   : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE TAKEN AS PENANCES THEY WILL LODGE YOU IN HEAVEN IN A TRICE
Predicted text : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEYY AND PUT YOURSELF EASILY N WAY SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS OR ENCOUNTERTHAT IF THEY BE TAKEN

The base model's transcriptions of the noisy datasets (SNR 3, SNR 6, and SNR 9) become more accurate as the noise decreases. At SNR 3, the model makes severe errors, including insertions and substitutions. For example, "KNIGHT ERRANT" becomes "MY LAWYER DOT", showing significant distortion, and "NO MISTER HARDWIGG" is turned into "OH MISTER HARDWICG". At SNR 6, the model still struggles, but its performance is better; for example, "PENANCES" is misrecognized as "PENANCELS", which is a substitution. At SNR 9, transcription quality improves further, with only minor distortions: words like "JOURNEY" become "JOURNEYY", indicating an insertion, and "PENANCES" becomes "TENNANCES". Overall, the model handles low noise (SNR 9) fairly well, but under heavier noise (SNR 3) the WER and CER degrade.

In [None]:
# Wav2Vec2-Base on the single-speaker datasets: evaluate each one and print
# its first 5 reference/prediction pairs.

# Speaker 1, clean
wer_clean_speaker1_base, cer_clean_speaker1_base, result_clean_speaker1_base = evaluate(
    model_base, processor_base, dataset_speaker_1
)
for idx in range(5):
    row = result_clean_speaker1_base[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

# Speaker 2, clean
wer_clean_speaker2_base, cer_clean_speaker2_base, result_clean_speaker2_base = evaluate(
    model_base, processor_base, dataset_speaker_2
)
for idx in range(5):
    row = result_clean_speaker2_base[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

# Speaker 1, noisy (SNR 3)
wer_noisy_speaker1_base, cer_noisy_speaker1_base, result_noisy_speaker1_base = evaluate(
    model_base, processor_base, dataset_snr_3_speaker_1
)
for idx in range(5):
    row = result_noisy_speaker1_base[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

# Speaker 2, noisy (SNR 3)
wer_noisy_speaker2_base, cer_noisy_speaker2_base, result_noisy_speaker2_base = evaluate(
    model_base, processor_base, dataset_snr_3_speaker_2
)
for idx in range(5):
    row = result_noisy_speaker2_base[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : THE STRENGTH THAT ENABLES SOMEONE IN A TRANCE TO HOLD HIS BODY STIFF AND UNSUPPORTED EXCEPT AT TWO POINTS THE HEAD AND HEELS
Predicted text : THE STRENGTH THAT ENABLES SOMEONE IN A TRANCE TO HOLD HIS BODY STIFF AND UNSUPPORTED EXCEPT AT TWO POINTS THE HEAD AND HEELS

-Sample 2
Ground truth   : THE METAL FOREST IS IN THE GREAT DOMED CAVERN THE LARGEST IN ALL OUR DOMINIONS REPLIED KALIKO
Predicted text : THE METAL FOREST IS IN THE GREAT DOMED CAVERN THE LARGEST IN ALL OUR DOMINIONS REPLIED CALICO

-Sample 3
Ground truth   : IROLG LOOKED AMAZED AT THE SUDDEN FURY OF THE ATTACK THEN SMILED
Predicted text : IROLD LOOKED AMAZED AT THE SUDDEN FURY OF THE ATTACK THEN SMILED

-Sample 4
Ground truth   : THE OTHER VOICE SNAPPED WITH A HARSH URGENCY CLEARLY USED TO COMMAND
Predicted text : THE UTTER VOICE SNAPPED WITH A HARSH URGENCY CLEARLY USED TO COMMAND

-Sample 5
Ground truth   : THE KING HAS FLED IN DISGRACE AND YOUR FRIENDS ARE ASKING FOR YOU
Predicted text : THE

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : BUT NEITHER THE GLORIFIED WOODS ON THE ONE HAND NOR THE LAKE ON THE OTHER COULD AT FIRST HOLD THE EYE
Predicted text : BUT NEITHER THE GLORIFIED WOODS ON THE ONE HAND NOR THE LAKE ON THE OTHER COULD AT FIRST HOLD THE EYE

-Sample 2
Ground truth   : THE ASCENT OF LASSEN'S BUTTE IS AN EASY WALK AND THE VIEWS FROM THE SUMMIT ARE EXTREMELY TELLING
Predicted text : THE ASCENT OF LASSIN'S BUTE IS AN EASY WALK AND THE VIEWS FROM THE SUMMIT ARE EXTREMELY TELLING

-Sample 3
Ground truth   : IN SETTING OUT FROM STRAWBERRY VALLEY BY BEARING OFF TO THE NORTHWESTWARD A FEW MILES YOU MAY SEE
Predicted text : AND SETTING OUT FROM STRAUGHTBERRY VALLEY BY BEARING OFF TO THE NORTHWESTWARD A FEW MILES YOU MAY SEE

-Sample 4
Ground truth   : GO QUIETLY ALONE NO HARM WILL BEFALL YOU
Predicted text : GO QUIETLY ALONE NO HARM WILL BEFALL YOU

-Sample 5
Ground truth   : THE DUCKS LESS WARY KEPT THEIR PLACES MERELY SWIMMING IN AND OUT THROUGH OPENINGS IN THE RUSHES RIPPLING THE GLAS

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : THE STRENGTH THAT ENABLES SOMEONE IN A TRANCE TO HOLD HIS BODY STIFF AND UNSUPPORTED EXCEPT AT TWO POINTS THE HEAD AND HEELS
Predicted text : STRENGTHEDIN NEHE SOG IN AD FAST TO IS BODY STEAF NIN SUPPORER EFOART  TE HEAD I HEELS

-Sample 2
Ground truth   : THE METAL FOREST IS IN THE GREAT DOMED CAVERN THE LARGEST IN ALL OUR DOMINIONS REPLIED KALIKO
Predicted text : INE FIRST IS IN  GREATOIN CA CR LOGE A ALL EANDS E CA CAE

-Sample 3
Ground truth   : IROLG LOOKED AMAZED AT THE SUDDEN FURY OF THE ATTACK THEN SMILED
Predicted text : I WOKE THEMAZED AT THE SAME FUR OF THE ATTACK AND SNELLED

-Sample 4
Ground truth   : THE OTHER VOICE SNAPPED WITH A HARSH URGENCY CLEARLY USED TO COMMAND
Predicted text : ELER HAW SLET WITHOUT OG O AND SAYIN CILIN IS MAN

-Sample 5
Ground truth   : THE KING HAS FLED IN DISGRACE AND YOUR FRIENDS ARE ASKING FOR YOU
Predicted text : THE KING IS OLD DISGRACE FY OUR FRIEND AS AR ASPOL FOR YOU


Map:   0%|          | 0/5 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : BUT NEITHER THE GLORIFIED WOODS ON THE ONE HAND NOR THE LAKE ON THE OTHER COULD AT FIRST HOLD THE EYE
Predicted text : BET NEITHER THE BLUSHIL A HA ON HER OR THE WAY HE HELDED CULD IT FOT HERG

-Sample 2
Ground truth   : THE ASCENT OF LASSEN'S BUTTE IS AN EASY WALK AND THE VIEWS FROM THE SUMMIT ARE EXTREMELY TELLING
Predicted text : THE ASCENT OF LASOS GANT AS IN ISINAC AND EVENCE FROM THE SENAT ARSTRANI TELL

-Sample 3
Ground truth   : IN SETTING OUT FROM STRAWBERRY VALLEY BY BEARING OFF TO THE NORTHWESTWARD A FEW MILES YOU MAY SEE
Predicted text : AND SETTING OUT FOR STAR BY BEARING OU THE MAT GLASS BOA FUA  O MIGHT SE

-Sample 4
Ground truth   : GO QUIETLY ALONE NO HARM WILL BEFALL YOU
Predicted text : AT GO HERE I ILL YET ALONE LOK ON OE FOR YE

-Sample 5
Ground truth   : THE DUCKS LESS WARY KEPT THEIR PLACES MERELY SWIMMING IN AND OUT THROUGH OPENINGS IN THE RUSHES RIPPLING THE GLASSY WATER AND RAISING SPANGLES IN THEIR WAKE
Predicted text : TDOS LES PA

In **clean conditions, both speakers 1 and 2** are transcribed with high accuracy, with only small errors such as substitutions in rare names (“KALIKO” → “CALICO”) or character-level (CER) changes (“IROLG” → “IROLD”).

However, under **noisy SNR 3 conditions**, transcription quality drops sharply. The model produces severe distortions, including incorrect word insertions. For example, “THE STRENGTH...” becomes “STRENGTHEDIN NEHE SOG...”. This highlights the model’s strong performance on clean data but clear vulnerability to noise.

In [None]:
# Wav2Vec2-Large on the clean dataset: print the first 5 reference/prediction pairs.
wer_clean_large, cer_clean_large, result_clean_large = evaluate(
    model_large, processor_large, dataset_clean
)
for idx in range(5):
    row = result_clean_large[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

Map:   0%|          | 0/130 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : NO MISTER HARDWIGG SAID THE CAPTAIN NO FEAR OF THAT
Predicted text : NO MISTER HARDWIG SAID THE CAPTAIN NO FEAR OF THAT

-Sample 2
Ground truth   : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL
Predicted text : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL

-Sample 3
Ground truth   : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE TAKEN AS PENANCES THEY WILL LODGE YOU IN HEAVEN IN A TRICE
Predicted text : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THE

The wav2vec2-large model shows excellent performance on clean speech, with mostly perfect transcriptions. There are minor differences between the predicted and ground-truth transcriptions, such as “HARDWIGG” being shortened to “HARDWIG” and slight character changes (insertions, substitutions) like “GASCON” → “GASCONE” or “CATALAN” → “CATALIN.” These are low-impact substitutions and do not affect the overall meaning. Overall, the model handles clean audio with high accuracy.

In [None]:
# Wav2Vec2-Large on the noisy datasets: for each SNR level, evaluate and then
# print the first 5 reference/prediction pairs.

# SNR 3
wer_snr3_large, cer_snr3_large, result_snr3_large = evaluate(
    model_large, processor_large, dataset_snr_3
)
for idx in range(5):
    row = result_snr3_large[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

# SNR 6
wer_snr6_large, cer_snr6_large, result_snr6_large = evaluate(
    model_large, processor_large, dataset_snr_6
)
for idx in range(5):
    row = result_snr6_large[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

# SNR 9
wer_snr9_large, cer_snr9_large, result_snr9_large = evaluate(
    model_large, processor_large, dataset_snr_9
)
for idx in range(5):
    row = result_snr9_large[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

Map:   0%|          | 0/130 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : NO MISTER HARDWIGG SAID THE CAPTAIN NO FEAR OF THAT
Predicted text : O MISTER HARDWICK SAID THE CAPTAIN NO FEAR OF THAT

-Sample 2
Ground truth   : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL
Predicted text : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL

-Sample 3
Ground truth   : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE TAKEN AS PENANCES THEY WILL LODGE YOU IN HEAVEN IN A TRICE
Predicted text : AND IF YOU HAVE ANY DESIRE TO SHORTENTHE JOURNEY AND PUT YOURSELF EASILY IN THA WAY OF SALVATION COME WITH ME UP AND I ALL SHOW YO THAT'LL BECOME AN MY LAWYER GOT A CALLIN WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF TH

Map:   0%|          | 0/130 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : NO MISTER HARDWIGG SAID THE CAPTAIN NO FEAR OF THAT
Predicted text : NO MISTER HARDWICK SAID THE CAPTAIN NO FEAR OF THAT

-Sample 2
Ground truth   : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL
Predicted text : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL

-Sample 3
Ground truth   : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE TAKEN AS PENANCES THEY WILL LODGE YOU IN HEAVEN IN A TRICE
Predicted text : AND IF YOU HAVE ANY DESIRE TO SHORTEN THAT JOURMN AND PUT YOURSELF EASILY BY WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A NIGHT BARRT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS OR INCOUNTERED THAT IF THEY BE T

Map:   0%|          | 0/130 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : NO MISTER HARDWIGG SAID THE CAPTAIN NO FEAR OF THAT
Predicted text : NO MISTER HARDWIG SAID THE CAPTAIN NO FEAR OF THAT

-Sample 2
Ground truth   : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL
Predicted text : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL

-Sample 3
Ground truth   : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE TAKEN AS PENANCES THEY WILL LODGE YOU IN HEAVEN IN A TRICE
Predicted text : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE

At SNR 3, the large model still shows many errors in CER and WER, but fewer than those of the base model (SNR 3, SNR 6, SNR 9). Examples include substitutions and meaningless phrases like “MY LAWYER GOT A CALLIN.” At SNR 6, accuracy improves but some distortions remain, such as “PREPER” instead of “COVER.” By SNR 9, the model performs well, with only minor mistakes like “CATALING” for “CATALAN.” Overall, as the SNR decreases, we notice more mistakes in the hypotheses.

In [None]:
# Wav2Vec2-Large on the single-speaker datasets: evaluate each one and print
# its first 5 reference/prediction pairs.

# Speaker 1, clean
wer_clean_speaker1_large, cer_clean_speaker1_large, result_clean_speaker1_large = evaluate(
    model_large, processor_large, dataset_speaker_1
)
for idx in range(5):
    row = result_clean_speaker1_large[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

# Speaker 2, clean
wer_clean_speaker2_large, cer_clean_speaker2_large, result_clean_speaker2_large = evaluate(
    model_large, processor_large, dataset_speaker_2
)
for idx in range(5):
    row = result_clean_speaker2_large[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

# Speaker 1, noisy (SNR 3)
wer_noisy_speaker1_large, cer_noisy_speaker1_large, result_noisy_speaker1_large = evaluate(
    model_large, processor_large, dataset_snr_3_speaker_1
)
for idx in range(5):
    row = result_noisy_speaker1_large[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

# Speaker 2, noisy (SNR 3)
wer_noisy_speaker2_large, cer_noisy_speaker2_large, result_noisy_speaker2_large = evaluate(
    model_large, processor_large, dataset_snr_3_speaker_2
)
for idx in range(5):
    row = result_noisy_speaker2_large[idx]
    print(f"\n-Sample {idx+1}")
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : THE STRENGTH THAT ENABLES SOMEONE IN A TRANCE TO HOLD HIS BODY STIFF AND UNSUPPORTED EXCEPT AT TWO POINTS THE HEAD AND HEELS
Predicted text : THE STRENGTH THAT ENABLES SOMEONE IN A TRANCE TO HOLD HIS BODY STIFF AND UNSUPPORTED EXCEPT AT TWO POINTS THE HEAD AND HEELS

-Sample 2
Ground truth   : THE METAL FOREST IS IN THE GREAT DOMED CAVERN THE LARGEST IN ALL OUR DOMINIONS REPLIED KALIKO
Predicted text : THE MIDDLE FOREST IS IN THE GREAT DOMED CAVERN THE LARGEST IN ALL OUR DOMINIONS REPLIED CALIKO

-Sample 3
Ground truth   : IROLG LOOKED AMAZED AT THE SUDDEN FURY OF THE ATTACK THEN SMILED
Predicted text : ROLK LOOKED AMAZED AT THE SUDDEN FURY OF THE ATTACK THEN SMILED

-Sample 4
Ground truth   : THE OTHER VOICE SNAPPED WITH A HARSH URGENCY CLEARLY USED TO COMMAND
Predicted text : THE OTHER VOICE SNAPPED WITH A HARSH URGENCY CLEARLY USED TO COMMAND

-Sample 5
Ground truth   : THE KING HAS FLED IN DISGRACE AND YOUR FRIENDS ARE ASKING FOR YOU
Predicted text : THE

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : BUT NEITHER THE GLORIFIED WOODS ON THE ONE HAND NOR THE LAKE ON THE OTHER COULD AT FIRST HOLD THE EYE
Predicted text : BUT NEITHER THE GLORIFIED WOODS ON THE ONE HAND NOR THE LAKE ON THE OTHER COULD AT FIRST HOLD THE EYE

-Sample 2
Ground truth   : THE ASCENT OF LASSEN'S BUTTE IS AN EASY WALK AND THE VIEWS FROM THE SUMMIT ARE EXTREMELY TELLING
Predicted text : THE ASCENT OF LASSEN'S BUTE IS AN EASY WALK AND THE VIEWS FROM THE SUMMIT ARE EXTREMELY TELLING

-Sample 3
Ground truth   : IN SETTING OUT FROM STRAWBERRY VALLEY BY BEARING OFF TO THE NORTHWESTWARD A FEW MILES YOU MAY SEE
Predicted text : AND SETTING OUT FROM STRAWBERRY VALLEY BY BEARING OFF TO THE NORTHWESTWARD A FEW MILES YOU MAY SEE

-Sample 4
Ground truth   : GO QUIETLY ALONE NO HARM WILL BEFALL YOU
Predicted text : GO QUIETLY ALONE NO HARM WILL BEFALL YOU

-Sample 5
Ground truth   : THE DUCKS LESS WARY KEPT THEIR PLACES MERELY SWIMMING IN AND OUT THROUGH OPENINGS IN THE RUSHES RIPPLING THE GLASSY 

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : THE STRENGTH THAT ENABLES SOMEONE IN A TRANCE TO HOLD HIS BODY STIFF AND UNSUPPORTED EXCEPT AT TWO POINTS THE HEAD AND HEELS
Predicted text : THE STRENGTH OF A NEBORS SOMWON IN ADFANCE TO HAT WAS BODY STIFF AND CANSUPPORTIT EXCEPT A E PORTS THE HEAD AN HEELS

-Sample 2
Ground truth   : THE METAL FOREST IS IN THE GREAT DOMED CAVERN THE LARGEST IN ALL OUR DOMINIONS REPLIED KALIKO
Predicted text : HE MIDDLE FOREST IS IN THE GREAT BUND CATTLER THE LARGEST IN ALL DEMANS REPLIED THE COWGO

-Sample 3
Ground truth   : IROLG LOOKED AMAZED AT THE SUDDEN FURY OF THE ATTACK THEN SMILED
Predicted text : A LOK LOOK AMAZED AT THE SUDDEN FER OF THE ATTACK AND SCOWED

-Sample 4
Ground truth   : THE OTHER VOICE SNAPPED WITH A HARSH URGENCY CLEARLY USED TO COMMAND
Predicted text : THE UMERHORTER SNAPPED WITH A HORRIS BRILLIANCY CEONLY USED TO COMMAND

-Sample 5
Ground truth   : THE KING HAS FLED IN DISGRACE AND YOUR FRIENDS ARE ASKING FOR YOU
Predicted text : THE KING IS FONE 

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : BUT NEITHER THE GLORIFIED WOODS ON THE ONE HAND NOR THE LAKE ON THE OTHER COULD AT FIRST HOLD THE EYE
Predicted text : BUT NEITHER THE GLOR FIDLE HENENTS HANGN ONE NEAR NOR THE WAKE ON THE OTHER COULD AT FIRST HEARTNOL

-Sample 2
Ground truth   : THE ASCENT OF LASSEN'S BUTTE IS AN EASY WALK AND THE VIEWS FROM THE SUMMIT ARE EXTREMELY TELLING
Predicted text : THE ASCENT OF LASEN'S MUT IS AN ENZY WALK AND THE VENUS FROM THE STOMICH ARE EXTREMELY TELLIBLE

-Sample 3
Ground truth   : IN SETTING OUT FROM STRAWBERRY VALLEY BY BEARING OFF TO THE NORTHWESTWARD A FEW MILES YOU MAY SEE
Predicted text : AND SETTING OUT COSE BEHER N HANE BY BEARING OFF TO THE NORTH WEST QUARTER  HONMINIS OMAIN SCENE

-Sample 4
Ground truth   : GO QUIETLY ALONE NO HARM WILL BEFALL YOU
Predicted text : WEAWILIA O MOON WHO FINWILL BEFOLE YOU

-Sample 5
Ground truth   : THE DUCKS LESS WARY KEPT THEIR PLACES MERELY SWIMMING IN AND OUT THROUGH OPENINGS IN THE RUSHES RIPPLING THE GLASSY WATER 

On the **clean dataset**, the model performs very well for both speakers: most transcriptions are perfect, and the only errors are minor ones such as phonetic substitutions (e.g., “KALIKO” → “CALIKO”) or small character deletions (e.g., “BUTTE” → “BUTE”).

At **SNR 3**, the transcription quality drops significantly. Both speakers show severe errors, including word substitutions and deletions. For example, “REPLIED KALIKO” becomes “REPLIED THE COWGO” and “GO QUIETLY ALONE...” turns into “WEAWILIA O MOON...”. The model struggles with noise, especially in longer sentences.

Overall, wav2vec2-large maintains high accuracy on clean speech but becomes highly error-prone in heavy noise, particularly in single-speaker conditions.

In [None]:
# Evaluate the self-training wav2vec2-large model on the clean dataset.
wer_clean_self, cer_clean_self, result_clean_self = evaluate(
    model_self, processor_self, dataset_clean
)

# Print the first five reference/hypothesis pairs for a qualitative check.
for sample_no in range(1, 6):
    print(f"\n-Sample {sample_no}")
    row = result_clean_self[sample_no - 1]
    print("Ground truth   :", row["text"])
    print("Predicted text :", row["transcription"])

Map:   0%|          | 0/130 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : NO MISTER HARDWIGG SAID THE CAPTAIN NO FEAR OF THAT
Predicted text : NO MISTER HARDWIG SAID THE CAPTAIN NO FEAR OF THAT

-Sample 2
Ground truth   : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL
Predicted text : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL

-Sample 3
Ground truth   : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE TAKEN AS PENANCES THEY WILL LODGE YOU IN HEAVEN IN A TRICE
Predicted text : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THE

The wav2vec2-large self-training model demonstrates exceptional transcription performance on the clean dataset, with only minor errors in hard or rare words. These are mostly small character deletions; for example, “HARDWIGG” is transcribed as “HARDWIG”. The majority of hypotheses contain no substantial errors. Overall, the model handles clean audio with high precision.

In [None]:
# Evaluate the self-training wav2vec2-large model on the noisy datasets
# (SNR 3, 6 and 9) and print a few transcriptions for each condition.


def _print_samples(results, count=5):
    """Print the first `count` reference/hypothesis pairs of `results`.

    `results` is the third value returned by `evaluate`; each row is read
    through its "text" (ground truth) and "transcription" (prediction) keys.
    Fewer pairs are printed when `results` holds fewer than `count` rows,
    instead of failing with an IndexError.
    """
    for index in range(min(count, len(results))):
        print(f"\n-Sample {index + 1}")
        row = results[index]  # fetch the row once instead of once per field
        print("Ground truth   :", row["text"])
        print("Predicted text :", row["transcription"])


# SNR 3
wer_snr3_self, cer_snr3_self, result_snr3_self = evaluate(model_self, processor_self, dataset_snr_3)
_print_samples(result_snr3_self)

# SNR 6
wer_snr6_self, cer_snr6_self, result_snr6_self = evaluate(model_self, processor_self, dataset_snr_6)
_print_samples(result_snr6_self)

# SNR 9
wer_snr9_self, cer_snr9_self, result_snr9_self = evaluate(model_self, processor_self, dataset_snr_9)
_print_samples(result_snr9_self)

Map:   0%|          | 0/130 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : NO MISTER HARDWIGG SAID THE CAPTAIN NO FEAR OF THAT
Predicted text : NO MISTER HARDWICK SAID THE CAPTAIN NO FEAR OF THAT

-Sample 2
Ground truth   : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL
Predicted text : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL

-Sample 3
Ground truth   : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE TAKEN AS PENANCES THEY WILL LODGE YOU IN HEAVEN IN A TRICE
Predicted text : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU A GUN BECOME AND MY LAYER NOT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF TH

Map:   0%|          | 0/130 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : NO MISTER HARDWIGG SAID THE CAPTAIN NO FEAR OF THAT
Predicted text : NO MISTER HARDWIG SAID THE CAPTAIN NO FEAR OF THAT

-Sample 2
Ground truth   : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL
Predicted text : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL

-Sample 3
Ground truth   : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE TAKEN AS PENANCES THEY WILL LODGE YOU IN HEAVEN IN A TRICE
Predicted text : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THE

Map:   0%|          | 0/130 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : NO MISTER HARDWIGG SAID THE CAPTAIN NO FEAR OF THAT
Predicted text : NO MISTER HARDWICK SAID THE CAPTAIN NO FEAR OF THAT

-Sample 2
Ground truth   : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL
Predicted text : EACH MUST CRY LOUDER AND WANDER FARTHER YET MAY LUCK BE WITH THEM BOTH THAT THEY MAY FIND THEIR OWN AT NIGHTFALL

-Sample 3
Ground truth   : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN THE WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY BE TAKEN AS PENANCES THEY WILL LODGE YOU IN HEAVEN IN A TRICE
Predicted text : AND IF YOU HAVE ANY DESIRE TO SHORTEN THE JOURNEY AND PUT YOURSELF EASILY IN T WAY OF SALVATION COME WITH ME AND I WILL SHOW YOU HOW TO BECOME A KNIGHT ERRANT A CALLING WHEREIN SO MANY HARDSHIPS AND MISHAPS ARE ENCOUNTERED THAT IF THEY

At **SNR 3**, the wav2vec2-large self-training model makes errors that break the meaning of the sentence, mainly deletions and substitutions; for example, “HOW TO” becomes “A GUN”. At **SNR 6**, accuracy improves, and the remaining errors (e.g., “AND CATALAN” → “MACATALY”) do not substantially change the meaning. By **SNR 9**, the model produces almost perfect transcriptions, with only minimal differences from the ground truth. Overall, the CER and WER of this model are even better than those of the previous models.

In [None]:
# Evaluate the self-training wav2vec2-large model on the single-speaker
# datasets (clean and SNR 3) and print a few transcriptions for each one.


def _print_samples(results, count=5):
    """Print the first `count` reference/hypothesis pairs of `results`.

    `results` is the third value returned by `evaluate`; each row is read
    through its "text" (ground truth) and "transcription" (prediction) keys.
    Fewer pairs are printed when `results` holds fewer than `count` rows,
    instead of failing with an IndexError.
    """
    for index in range(min(count, len(results))):
        print(f"\n-Sample {index + 1}")
        row = results[index]  # fetch the row once instead of once per field
        print("Ground truth   :", row["text"])
        print("Predicted text :", row["transcription"])


# Clean speaker 1
wer_clean_speaker1_self, cer_clean_speaker1_self, result_clean_speaker1_self = evaluate(model_self, processor_self, dataset_speaker_1)
_print_samples(result_clean_speaker1_self)

# Clean speaker 2
wer_clean_speaker2_self, cer_clean_speaker2_self, result_clean_speaker2_self = evaluate(model_self, processor_self, dataset_speaker_2)
_print_samples(result_clean_speaker2_self)

# Noisy speaker 1 (SNR 3)
wer_noisy_speaker1_self, cer_noisy_speaker1_self, result_noisy_speaker1_self = evaluate(model_self, processor_self, dataset_snr_3_speaker_1)
_print_samples(result_noisy_speaker1_self)

# Noisy speaker 2 (SNR 3)
wer_noisy_speaker2_self, cer_noisy_speaker2_self, result_noisy_speaker2_self = evaluate(model_self, processor_self, dataset_snr_3_speaker_2)
_print_samples(result_noisy_speaker2_self)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : THE STRENGTH THAT ENABLES SOMEONE IN A TRANCE TO HOLD HIS BODY STIFF AND UNSUPPORTED EXCEPT AT TWO POINTS THE HEAD AND HEELS
Predicted text : THE STRENGTH THAT ENABLES SOME ONE IN A TRANCE TO HOLD HIS BODY STIFF AND UNSUPPORTED EXCEPT AT TWO POINTS THE HEAD AND HEELS

-Sample 2
Ground truth   : THE METAL FOREST IS IN THE GREAT DOMED CAVERN THE LARGEST IN ALL OUR DOMINIONS REPLIED KALIKO
Predicted text : THE MIDDLE FOREST IS IN THE GREAT DOMED CAVERN THE LARGEST IN ALL OUR DOMINIONS REPLIED CALICO

-Sample 3
Ground truth   : IROLG LOOKED AMAZED AT THE SUDDEN FURY OF THE ATTACK THEN SMILED
Predicted text : IROLG LOOKED AMAZED AT THE SUDDEN FURY OF THE ATTACK THEN SMILED

-Sample 4
Ground truth   : THE OTHER VOICE SNAPPED WITH A HARSH URGENCY CLEARLY USED TO COMMAND
Predicted text : THE OTHER VOICE SNAPPED WITH A HARSH URGENCY CLEARLY USED TO COMMAND

-Sample 5
Ground truth   : THE KING HAS FLED IN DISGRACE AND YOUR FRIENDS ARE ASKING FOR YOU
Predicted text : T

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : BUT NEITHER THE GLORIFIED WOODS ON THE ONE HAND NOR THE LAKE ON THE OTHER COULD AT FIRST HOLD THE EYE
Predicted text : BUT NEITHER THE GLORIFIED WOODS ON THE ONE HAND NOR THE LAKE ON THE OTHER COULD AT FIRST HOLD THE EYE

-Sample 2
Ground truth   : THE ASCENT OF LASSEN'S BUTTE IS AN EASY WALK AND THE VIEWS FROM THE SUMMIT ARE EXTREMELY TELLING
Predicted text : THE ASCENT OF LASSON'S BUTTE IS AN EASY WALK AND THE VIEWS FROM THE SUMMIT ARE EXTREMELY TELLING

-Sample 3
Ground truth   : IN SETTING OUT FROM STRAWBERRY VALLEY BY BEARING OFF TO THE NORTHWESTWARD A FEW MILES YOU MAY SEE
Predicted text : IN SETTING OUT FROM STRAWBERRY VALLEY BY BEARING OFF TO THE NORTHWESTWARD A FEW MILES YOU MAY SEE

-Sample 4
Ground truth   : GO QUIETLY ALONE NO HARM WILL BEFALL YOU
Predicted text : GO QUIETLY ALONE NO HARM WILL BEFALL YOU

-Sample 5
Ground truth   : THE DUCKS LESS WARY KEPT THEIR PLACES MERELY SWIMMING IN AND OUT THROUGH OPENINGS IN THE RUSHES RIPPLING THE GLASSY 

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : THE STRENGTH THAT ENABLES SOMEONE IN A TRANCE TO HOLD HIS BODY STIFF AND UNSUPPORTED EXCEPT AT TWO POINTS THE HEAD AND HEELS
Predicted text : THE STRENGTH OF THE KNEE WAS SOMEWHAT IN ADVANCE TOTI WAS BODY STIFF AND UND SUPPORTE EXCEPTTEDPONTS THE HAND AND HEELS

-Sample 2
Ground truth   : THE METAL FOREST IS IN THE GREAT DOMED CAVERN THE LARGEST IN ALL OUR DOMINIONS REPLIED KALIKO
Predicted text : THE MITTER FIRST IS IN THE GREAT DIMNED CABIN THE LARGEST AND POLLER OF EMENS EPID E TALAGA

-Sample 3
Ground truth   : IROLG LOOKED AMAZED AT THE SUDDEN FURY OF THE ATTACK THEN SMILED
Predicted text : OR LORD LOKED AMAZED AT THE SUDDEN FERY OF THE ATTACK THEN SMILED

-Sample 4
Ground truth   : THE OTHER VOICE SNAPPED WITH A HARSH URGENCY CLEARLY USED TO COMMAND
Predicted text : THE OTHER VOICE SNAPPED WITH AHOIS IGANCY CLEARLY USED TO COMMAND

-Sample 5
Ground truth   : THE KING HAS FLED IN DISGRACE AND YOUR FRIENDS ARE ASKING FOR YOU
Predicted text : THE KING IS 

Map:   0%|          | 0/5 [00:00<?, ? examples/s]


-Sample 1
Ground truth   : BUT NEITHER THE GLORIFIED WOODS ON THE ONE HAND NOR THE LAKE ON THE OTHER COULD AT FIRST HOLD THE EYE
Predicted text : BUT NEITHER THE GLORIFIED WORDS ON ONE A NOR THE LAKE ON THE OTHER COULD AT FIRST PURE OUT

-Sample 2
Ground truth   : THE ASCENT OF LASSEN'S BUTTE IS AN EASY WALK AND THE VIEWS FROM THE SUMMIT ARE EXTREMELY TELLING
Predicted text : THE ASCENT OF LASEN'S BUTE IS AN EASY WALK AND THE VIEWS FROM THE SUMMIT ARE EXTREMELY TELLY

-Sample 3
Ground truth   : IN SETTING OUT FROM STRAWBERRY VALLEY BY BEARING OFF TO THE NORTHWESTWARD A FEW MILES YOU MAY SEE
Predicted text : IN SETTING OUT FOR STRACATO BY BEARING OFF TO THE NORTH WESTWARD A FEW MILES OU MAY SEE

-Sample 4
Ground truth   : GO QUIETLY ALONE NO HARM WILL BEFALL YOU
Predicted text : GO YOUR WAY ALONE NO HARM WILL BEFALL YOU

-Sample 5
Ground truth   : THE DUCKS LESS WARY KEPT THEIR PLACES MERELY SWIMMING IN AND OUT THROUGH OPENINGS IN THE RUSHES RIPPLING THE GLASSY WATER AND RAISING SPANGL

In the clean single-speaker condition, the model performs very well, with near-perfect transcriptions. There are only low-impact errors, such as phonetic substitutions like “KALIKO” → “CALICO” or “LASSON’S” for “LASSEN’S”.

In the SNR 3 single-speaker condition, performance declines, with more substitutions, insertions, and distortions. Errors such as “THE STRENGTH OF THE KNEE WAS” for “THE STRENGTH THAT ENABLES” and “THE MITTER FIRST” instead of “THE METAL FOREST” change the meaning of the sentence. However, this model still performs better here than the previous models.