## Mounting drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing necessary libraries

In [2]:
%%capture
%pip install datasets==1.18.3
%pip install transformers==4.17.0
%pip install jiwer

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
%%capture
!apt install git-lfs

## Prepare Data, Tokenizer, Feature Extractor

In [5]:
from datasets import load_dataset, load_metric

librispeech_train = load_dataset("librispeech_asr", "clean", split="train.100[:4000]")
librispeech_test = load_dataset("librispeech_asr", "clean", split="test")
librispeech_valid = load_dataset("librispeech_asr", "clean", split="validation")



In [6]:
#sample dataset
librispeech_train

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 4000
})

In [7]:
librispeech_train = librispeech_train.remove_columns(["speaker_id", "chapter_id", "id"])
librispeech_test = librispeech_test.remove_columns(["speaker_id", "chapter_id", "id"])
librispeech_valid = librispeech_valid.remove_columns(["speaker_id", "chapter_id", "id"])

In [8]:
#sample dataset after removing columns
librispeech_train

Dataset({
    features: ['file', 'audio', 'text'],
    num_rows: 4000
})

In [9]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [10]:
show_random_elements(librispeech_train.remove_columns(["audio", "file"]), num_examples=10)

Unnamed: 0,text
0,WHEN YOU ARE TOLD ANYTHING CANNOT BE DONE THERE IS AN END OF IT IT IS MUCH MORE CONVENIENT AT PARIS WHEN ANYTHING CANNOT BE DONE YOU PAY DOUBLE AND IT IS DONE DIRECTLY THAT IS WHAT ALL THE FRENCH SAY RETURNED SIGNOR PASTRINI SOMEWHAT PIQUED
1,THE BIG MOOSE A BULL MOOSE YOU SAY WILL ECHOED BLUFF HIS FACE LIGHTING UP WITH SUDDEN ENERGY THAT'S WHAT I MEAN REPLIED THE OTHER I KNOW WHAT YOU'RE THINKING BLUFF AND THAT I WOULDN'T KNOW A BULL MOOSE IF I SAW ONE
2,HE WAS OF COURSE ALTOGETHER UNCONSCIOUS WHAT GRAND THINGS HIS COUSIN JOHN HAD INTENDED TO DO BY HIM HAD NOT THE HONOURABLE OLD LADY INTERFERED BUT HE HAD MADE UP HIS MIND THAT DUTY REQUIRED HIM TO CALL AT THE HOUSE
3,CAUGHT HIM IN HER CRUEL BEAK AND BEFORE HE COULD SAY A WORD HE HAD DISAPPEARED DOWN HER THROAT THESE TWO UNLUCKY VENTURES DID NOT PREVENT THE PRINCESS FROM TRYING ONCE MORE
4,ONE DAY WHEN HE WAS SEVEN YEARS OLD HE CAME TO THE CURATE OF PALESTRINA AND ASKED TO BE TAUGHT TO READ IT WAS SOMEWHAT DIFFICULT FOR HE COULD NOT QUIT HIS FLOCK BUT THE GOOD CURATE WENT EVERY DAY TO SAY MASS AT A LITTLE HAMLET TOO POOR TO PAY A PRIEST AND WHICH
5,ONE OR TWO OF THE MEN WHO HAD RUN AFTER THE FUGITIVES WERE NOW SLOWLY WORKING THEIR WAY UP THE CLIFF ONE OF THEM REACHED CHAUVELIN'S SIDE AT THE VERY MOMENT THAT THIS HOPE AROSE IN THE ASTUTE DIPLOMATIST'S HEART
6,AND WHY ASKED NICHOLL BECAUSE IF YOU GAIN THE FIRST THE COLUMBIAD WILL HAVE BURST AND THE PROJECTILE WITH IT AND BARBICANE WILL NO LONGER BE THERE TO REIMBURSE YOUR DOLLARS MY STAKE IS DEPOSITED AT THE BANK IN BALTIMORE
7,THEY ARE DISTRIBUTED SO AS TO BE WHERE MOST USEFUL TO THE OWNER A MAN HAVING TWO UMBRELLAS KEEPS ONE AT HIS OFFICE AND THE OTHER AT HOME A STUDENT HAVING TWO BOOKS OF THE SAME KIND
8,WHICH IN THE AFTERNOON ENVELOPED THE UPPER MOUNTAINS IN CLOUDS WE WATCHED THEM ANXIOUSLY AS NOW WE DREADED A SNOW STORM SHORTLY AFTERWARDS WE HEARD THE ROLL OF THUNDER AND LOOKING TOWARD THE VALLEY FOUND IT ALL ENVELOPED IN A THUNDERSTORM
9,THE DIAMOND WEDDING BY EDMUND CLARENCE STEDMAN O LOVE LOVE LOVE WHAT TIMES WERE THOSE LONG ERE THE AGE OF BELLES AND BEAUX AND BRUSSELS LACE AND SILKEN HOSE WHEN IN THE GREEN ARCADIAN CLOSE


In [11]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    return batch

In [12]:
librispeech_train = librispeech_train.map(remove_special_characters)
librispeech_test = librispeech_test.map(remove_special_characters)
librispeech_valid = librispeech_valid.map(remove_special_characters)



0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [13]:
show_random_elements(librispeech_train.remove_columns(["audio", "file"]))

Unnamed: 0,text
0,were there some especially froggy place near europe as there is an especially sandy place the scientific explanation would of course be that all small frogs falling from the sky in europe come from that center of frogeity to start with
1,became the first clavier master he and doctor john bull
2,and hour after hour passed by while we eagerly watched the pale light stealing down the ridge to the hollow where we lay but there was not a trace of that warm flushing sunrise splendor we so long had hoped for
3,what do you not know him i have not that honor you have never heard his name never well then he is a bandit compared to whom the decesaris and the gasparones were mere children
4,now then albert cried franz here is a bandit for you at last i forewarn you signor pastrini that i shall not believe one word of what you are going to tell us having told you this begin
5,leave him there said chauvelin and lead the way now quickly to the cart i'll follow he walked up to where marguerite lay and looked down into her face
6,there came another it must be added and doubtless as a result of something that over the cloth did hang between them when she struck him as having quite answered dear no for what do you take me
7,yet was he forced by a most terrible storm to pitch his camp in the neighboring villages before he could take it but when after a few days time the second legion that came from antony joined themselves to him
8,very young minnows had been caught up in the london times march second eighteen fifty nine is a letter from mister aaron roberts curate of saint peter's carmathon in this letter
9,there being no competition no time or space is required for sensational trash thus if nothing of importance occurs nothing need be transmitted the official news censors decide as to the relative importance of occurrences


In [14]:
def extract_all_chars(batch):
  all_text = " ".join(batch["text"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [15]:
vocab_train = librispeech_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=librispeech_train.column_names)
vocab_test = librispeech_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=librispeech_test.column_names)
vocab_dev = librispeech_valid.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=librispeech_valid.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_dev["vocab"][0]) | set(vocab_test["vocab"][0]) )

In [17]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'i': 0,
 'a': 1,
 't': 2,
 'j': 3,
 'm': 4,
 ' ': 5,
 'l': 6,
 'w': 7,
 'v': 8,
 'c': 9,
 'z': 10,
 'f': 11,
 'o': 12,
 'q': 13,
 'p': 14,
 'g': 15,
 'y': 16,
 'n': 17,
 'h': 18,
 'r': 19,
 'b': 20,
 'e': 21,
 'u': 22,
 "'": 23,
 's': 24,
 'k': 25,
 'd': 26,
 'x': 27}

In [18]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [19]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

30

In [20]:
import json
with open('vocab_SST_medium_accuracy.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [21]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab_SST_medium_accuracy.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

### Create Wav2Vec2 Feature Extractor

In [22]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

In [23]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

## Saving tokenizer

In [24]:
tokenizer.save_pretrained("/content/drive/My Drive/Capstone-Lisa-Mimi")

('/content/drive/My Drive/Capstone-Lisa-Mimi/tokenizer_config.json',
 '/content/drive/My Drive/Capstone-Lisa-Mimi/special_tokens_map.json',
 '/content/drive/My Drive/Capstone-Lisa-Mimi/vocab.json',
 '/content/drive/My Drive/Capstone-Lisa-Mimi/added_tokens.json')

## Preprocess data

In [25]:
import numpy as np
np.object = np.object_


def prepare_dataset(batch):
    audio = batch["audio"]
    audio_array = np.array(audio["array"], dtype=np.float32)

    # Batched output is "un-batched" to ensure mapping is correct
    batch["input_values"] = processor(audio_array, sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids

    return batch

In [26]:
librispeech_train = librispeech_train.map(prepare_dataset, remove_columns=librispeech_train.column_names)
librispeech_valid= librispeech_valid.map(prepare_dataset, remove_columns=librispeech_valid.column_names)
librispeech_test = librispeech_test.map(prepare_dataset, remove_columns=librispeech_test.column_names)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [27]:
max_input_length_in_sec = 4.0
train_dataset= librispeech_train.filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

  0%|          | 0/4 [00:00<?, ?ba/s]

### Set-up Trainer

In [28]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [29]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [30]:
wer_metric = load_metric("wer")

In [31]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [32]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_hid.bias', 'project_hid.weight', 'quantizer.weight_proj.weight', 'quantizer.codevectors', 'project_q.weight', 'quantizer.weight_proj.bias', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [33]:
model.freeze_feature_encoder()

In [34]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="/content/drive/My Drive/Capstone-Lisa-Mimi",
  group_by_length=True,
  per_device_train_batch_size=32,
  per_device_eval_batch_size=32,
  evaluation_strategy="steps",
  num_train_epochs=10,
  fp16=True,
  gradient_checkpointing=True,
  save_steps=500,
  eval_steps=500,
  logging_steps=500,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_steps=1000,
  save_total_limit=2,
)

In [35]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=librispeech_train,
    eval_dataset=librispeech_valid,
    tokenizer=processor.feature_extractor,
)

Using amp half precision backend


## Training

In [36]:
np.bool = np.bool_
trainer.train()

The following columns in the training set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4000
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1250
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss,Validation Loss,Wer
500,3.6757,3.189225,0.991802
1000,0.6104,0.283178,0.213705


The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2703
  Batch size = 32
Saving model checkpoint to /content/drive/My Drive/Capstone-Lisa-Mimi/checkpoint-500
Configuration saved in /content/drive/My Drive/Capstone-Lisa-Mimi/checkpoint-500/config.json
Model weights saved in /content/drive/My Drive/Capstone-Lisa-Mimi/checkpoint-500/pytorch_model.bin
Feature extractor saved in /content/drive/My Drive/Capstone-Lisa-Mimi/checkpoint-500/preprocessor_config.json
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2Fo

TrainOutput(global_step=1250, training_loss=1.7337145278930663, metrics={'train_runtime': 4838.1264, 'train_samples_per_second': 8.268, 'train_steps_per_second': 0.258, 'total_flos': 4.81607620176937e+18, 'train_loss': 1.7337145278930663, 'epoch': 10.0})

In [37]:
trainer.save_model("/content/drive/MyDrive/Capstone-Lisa-Mimi")

Saving model checkpoint to /content/drive/MyDrive/Capstone-Lisa-Mimi
Configuration saved in /content/drive/MyDrive/Capstone-Lisa-Mimi/config.json
Model weights saved in /content/drive/MyDrive/Capstone-Lisa-Mimi/pytorch_model.bin
Feature extractor saved in /content/drive/MyDrive/Capstone-Lisa-Mimi/preprocessor_config.json


## Saving model

In [39]:
# saving model on my drive
model = Wav2Vec2ForCTC.from_pretrained("/content/drive/MyDrive/Capstone-Lisa-Mimi")
processor = Wav2Vec2Processor.from_pretrained("/content/drive/MyDrive/Capstone-Lisa-Mimi")

# Define the path where you want to save the final model
save_directory = "/content/drive/MyDrive/Capstone-Lisa-Mimi/model"

# Save the model and processor
model.save_pretrained(save_directory)
processor.save_pretrained(save_directory)

print(f"Model and processor saved in {save_directory}")

loading configuration file /content/drive/MyDrive/Capstone-Lisa-Mimi/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-base",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_drop

Model and processor saved in /content/drive/MyDrive/Capstone-Lisa-Mimi/model


## Evaluation

In [40]:
from transformers import AutoModelForCTC, Wav2Vec2Processor

model = AutoModelForCTC.from_pretrained("/content/drive/MyDrive/Capstone-Lisa-Mimi/model")
processor = Wav2Vec2Processor.from_pretrained("/content/drive/MyDrive/Capstone-Lisa-Mimi/model")

loading configuration file /content/drive/MyDrive/Capstone-Lisa-Mimi/model/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "/content/drive/MyDrive/Capstone-Lisa-Mimi/model",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_norm": "group",
  "feat_proj_dropou

In [41]:
final_model = model.cuda()

In [42]:
final_model

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder)

In [43]:
def map_to_result(batch):
  with torch.no_grad():
    input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
    logits = model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)

  return batch

In [47]:
results = librispeech_test.map(map_to_result, remove_columns=librispeech_test.column_names)

0ex [00:00, ?ex/s]

  return F.conv1d(input, weight, bias, self.stride,


In [50]:
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))

Test WER: 0.120


In [49]:
show_random_elements(results)

Unnamed: 0,pred_str,text
0,to those duties you have not yet been called and when you are you will be less eager for cealeberiaty,to those duties you have not yet been called and when you are you will be less eager for celebrity
1,siluc slunk away without a word of protest anbroush stood his ground evidently bent on making his pece whih nro me before he leftor seeing tht i was in the way i walked aside toward a glass door at the lower end of the room,silas slunk away without a word of protest ambrose stood his ground evidently bent on making his peace with naomi before he left her seeing that i was in the way i walked aside toward a glass door at the lower end of the room
2,that is the best way to side for the spere will always point somewhere and one thing is as good as another,that is the best way to decide for the spear will always point somewhere and one thing is as good as another
3,if i can get patience,if i can get patients
4,each will therefore serve about equally well during the earlier stages of social growth,each will therefore serve about equally well during the earlier stages of social growth
5,so choose for yourself to make a rush or tarry here,so choose for yourself to make a rush or tarry here
6,he had broken into her courtyard,he had broken into her courtyard
7,she pored into the dish a quantity from each of these bottles,she poured into the dish a quantity from each of these bottles
8,he is my esquire excellency returned robin with dignity,he is my esquire excellency returned robin with dignity
9,their pioty would be like their names like their faces like their clothes and it was idale for him to tell himself that their humble and contrive hearts it might be paid a far richer tribute of devotion than his had ever been a gift ten fold more acceptable than his elaborate admoration,their piety would be like their names like their faces like their clothes and it was idle for him to tell himself that their humble and contrite hearts it might be paid a far richer tribute of devotion than his had ever been a gift tenfold more acceptable than his elaborate adoration
