In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!unzip '/content/gdrive/My Drive/models/pickles.zip'

Archive:  /content/gdrive/My Drive/models/pickles.zip
   creating: pickles/
  inflating: pickles/test_labels     
  inflating: pickles/train_recordings.pickle  
  inflating: pickles/train_labels    
  inflating: pickles/test_recordings  


In [3]:
!pip install transformers
!pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://u

In [4]:
# test to see if labels are accurate

import pickle 
import IPython.display as ipd


# Load pickled data
with open('pickles/train_recordings.pickle', 'rb') as f:
    train_recordings = pickle.load(f)
with open('pickles/train_labels', 'rb') as f:
    train_labels = pickle.load(f)
  


# Choose an index from the train_recordings list
index = 764  # You can change this to any valid index

# Get the corresponding audio element
audio_element = train_recordings[index]

# Play the audio
ipd.display(ipd.Audio(audio_element, rate=16000))  # Adjust the rate if your sampling rate is different
print(train_labels[index])

He looked at the sky and saw the white cumulus built like friendly piles of ice cream and high above were the thin feathers of the cirrus against the high September sky



In [None]:
# train model

import re
import random
import pickle
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, AdamW
from tqdm import tqdm

# Load pickled data
with open('pickles/train_recordings.pickle', 'rb') as f:
    train_recordings = pickle.load(f)
with open('pickles/train_labels', 'rb') as f:
    train_labels = pickle.load(f)

with open('pickles/test_recordings', 'rb') as f:
    test_recordings = pickle.load(f)
with open('pickles/test_labels', 'rb') as f:
    test_labels = pickle.load(f)

# Custom DataLoader
class AudioDataset(Dataset):
    def __init__(self, recordings, labels, processor, tokenizer, sampling_rate=16000):
        self.recordings = recordings
        self.labels = labels
        self.processor = processor
        self.tokenizer = tokenizer
        self.sampling_rate = sampling_rate

    def __len__(self):
        return len(self.recordings)

    def __getitem__(self, idx):
        recording = self.recordings[idx]
        label = self.labels[idx].upper()
        input_values = self.processor(recording, sampling_rate=self.sampling_rate, return_tensors="pt").input_values
        labels_input_ids = self.tokenizer(label, return_tensors="pt").input_ids
        return input_values.squeeze(), labels_input_ids

# Custom collate function
def audio_collate_fn(batch):
    input_values, labels = zip(*batch)

    # Pad input_values
    max_length = max([iv.shape[0] for iv in input_values])
    padded_input_values = torch.zeros(len(input_values), max_length)
    for i, iv in enumerate(input_values):
        padded_input_values[i, :iv.shape[0]] = iv

    max_label_length = max([label.shape[1] for label in labels])
    padded_labels = torch.zeros(len(labels), max_label_length, dtype=torch.long)

    for i, label in enumerate(labels):
        padded_labels[i, :label.shape[1]] = label.squeeze()

    return padded_input_values, padded_labels

# Initialize Wav2Vec2 components
model_link = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_link)
model = Wav2Vec2ForCTC.from_pretrained(model_link)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_link)

# Create DataLoader
batch_size = 2
train_dataset = AudioDataset(train_recordings, train_labels, processor, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=audio_collate_fn)

# Initialize optimizer
optim = AdamW(model.parameters(), lr=1e-5)

# Training settings
num_epochs = 1
start_epoch = 0
device = torch.device('cuda')
model.to(device)
num_training_steps = num_epochs * len(train_loader)
progress_bar = tqdm(range(num_training_steps))
model.train()
new_model_out = '/content/gdrive/My Drive/models/w2v_trained'

# Training loop
for epoch in range(start_epoch, num_epochs):
    for input_values, labels_tokenized in train_loader:
        # Prepare input_values and labels
        input_values = input_values.to(device)
        # labels_tokenized = tokenizer(labels, return_tensors="pt", padding=True)
        labels_tokenized = labels_tokenized.to(device)

        # Compute loss
        output = model(input_values=input_values, labels=labels_tokenized)
        loss = output.loss

        # Optimize model weights
        loss.backward()
        optim.step()
        optim.zero_grad()
        progress_bar.update(1)

model.save_pretrained(new_model_out)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 96/96 [01:19<00:00,  1.21it/s]
100%|██████████| 865/865 [03:31<00:00,  3.76it/s]

In [8]:
#evaluate model

from jiwer import wer
from transformers import  Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC
import torch
import pickle
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm


with open('pickles/test_recordings', 'rb') as f:
    test_recordings = pickle.load(f)
with open('pickles/test_labels', 'rb') as f:
    test_labels = pickle.load(f)

print(len(test_labels))

  # Custom DataLoader
class AudioDataset(Dataset):
    def __init__(self, recordings, labels, processor, sampling_rate=16000):
        self.recordings = recordings
        self.labels = labels
        self.processor = processor
        self.sampling_rate = sampling_rate

    def __len__(self):
        return len(self.recordings)

    def __getitem__(self, idx):
        recording = self.recordings[idx]
        label = self.labels[idx].upper()
        input_values = self.processor(recording, sampling_rate=self.sampling_rate, return_tensors="pt").input_values
        return input_values.squeeze(), label


# Custom collate function
def audio_collate_fn(batch):
    input_values, labels = zip(*batch)

    # Pad input_values
    max_length = max([iv.shape[0] for iv in input_values])
    padded_input_values = torch.zeros(len(input_values), max_length)
    for i, iv in enumerate(input_values):
        padded_input_values[i, :iv.shape[0]] = iv

    return padded_input_values, labels

model_link = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h")

# Create DataLoader
batch_size = 2
test_dataset = AudioDataset(test_recordings, test_labels, processor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=audio_collate_fn)

# Initialize optimizer
device = torch.device('cuda')
model.to(device)
num_testing_steps = len(test_loader)
progress_bar = tqdm(range(num_testing_steps))
model.eval()

predictions_list = []
for batch in test_loader:
    # Unpack the batch tuple and push the input values to the GPU
    padded_input_values, labels = batch
    padded_input_values = padded_input_values.to(device)

    # Get the logits from the model
    with torch.no_grad():
        logits = model(padded_input_values).logits

    # Get the predicted token indices
    predicted_indices = torch.argmax(logits, dim=-1)

    # Decode tokens and save the results
    for output in predicted_indices:
        predictions_list.append(tokenizer.decode(output, skip_special_tokens=True))

    progress_bar.update(1)

print(predictions_list)
test_labels = [i.upper().replace('\n', '') for i in test_labels]

192


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 96/96 [01:35<00:00,  1.00it/s]
100%|██████████| 96/96 [00:07<00:00, 16.57it/s]

['BUT REMEMBER HOW YOU WENT EIGHTY SEVEN DAYS WITHOUT FISH AND THEN WE CAUGHT BIG ONES EVERY DAY FOR THRE WEKS', 'I KNOW YOU DID NOT LEAVE ME BECAUSE YOU DOUBT IT', 'CAN I OFER YOU A BER ON THE TERACE AND THEN WIL TAKE THE STUF HOME', 'NO', 'I CAN STIL ROW AND ROGALIO WIL THROW THE NET', 'I CAN REMEMBER THE TAIL SLAPING AND BANGING AND THE THWART BREAKING AND THE NOISE OF THE CLUBING', 'I PUT THEM IN SALT IN THE BOX', 'BUT ARE YOU STRONG ENOUGH NOW FOR A TRULY BIG FISH', 'THESE WERE RELICS OF HIS WIFE', 'THE BOY ASKED', 'OF COURSE', 'I AM NOT VERY HUNGRY', 'I HAVE THE OLD MAN SAID GETING UP AND TAKING THE NEWSPAPER AND FOLDING IT', 'BLACK BEANS AND RICE FRIED BANANAS AND SOME STEW', "I'M RUNY NOW THE OLD MAN SAID", 'TEL ME ABOUT THE BASEBAL THE BOY ASKED HIM', 'IT WAS A GREAT MISTAKE', 'THEY SAY HIS FATHER WAS A FISHERMAN', 'NO', 'I KNOW OTHERS BETER', 'THERE ARE MANY GOD FISHERMEN AND SOME GREAT ONES', 'THANK YOU', 'BUT HE KNEW HE WOULD SHIVER HIMSELF WARM AND THAT SON HE WOULD BE ROW

In [7]:
wer(predictions_list, test_labels)

0.12271259418729817

In [None]:
print(test_labels)

['BUT REMEMBER HOW YOU WENT EIGHTYSEVEN DAYS WITHOUT FISH AND THEN WE CAUGHT BIG ONES EVERY DAY FOR THREE WEEKS', 'I KNOW YOU DID NOT LEAVE ME BECAUSE YOU DOUBTED', "CAN I OFFER YOU A BEER ON THE TERRACE AND THEN WE'LL TAKE THE STUFF HOME", 'NO', 'I CAN STILL ROW AND ROGELIO WILL THROW THE NET', 'I CAN REMEMBER THE TAIL SLAPPING AND BANGING AND THE THWART BREAKING AND THE NOISE OF THE CLUBBING', 'I PUT THEM IN SALT IN THE BOX', 'BUT ARE YOU STRONG ENOUGH NOW FOR A TRULY BIG FISH', 'THESE WERE RELICS OF HIS WIFE', 'THE BOY ASKED', 'OF COURSE', "I'M NOT VERY HUNGRY", 'I HAVE THE OLD MAN SAID GETTING UP AND TAKING THE NEWSPAPER AND FOLDING IT', 'BLACK BEANS AND RICE FRIED BANANAS AND SOME STEW', "I'M READY NOW THE OLD MAN SAID", 'TELL ME ABOUT THE BASEBALL THE BOY ASKED HIM', 'IT WAS A GREAT MISTAKE', 'THEY SAY HIS FATHER WAS A FISHERMAN', 'NO', 'I KNOW OTHERS BETTER', 'THERE ARE MANY GOOD FISHERMEN AND SOME GREAT ONES', 'THANK YOU', 'BUT HE KNEW HE WOULD SHIVER HIMSELF WARM AND THAT SOON