In [1]:
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor
) 

from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForCTC,
    AutoProcessor,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    Wav2Vec2Processor,
    set_seed,
)
import json
import os
from jiwer import wer

[2023-09-01 06:39:32,914] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2023-09-01 06:39:37.218002: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import datasets
import random
import pandas as pd
import torch
from transformers import AutoTokenizer
from typing import Mapping, Tuple
import librosa
# import en_core_web_sm


import librosa

class BengaliDataset(torch.utils.data.Dataset):
    def __init__(self, df, processor):
        self.df = df
        self.processor = processor


    def __getitem__(self, idx):
        audio_path = self.df.loc[idx]['path']
        audio_array = self.read_audio(audio_path)
        
        inputs = self.processor(
            audio_array,
            sampling_rate=16000,
            return_tensors='pt'  
        )
        
        with self.processor.as_target_processor():
            labels = self.processor(self.df.loc[idx]['sentence']).input_ids
        
        return {'input_values': inputs['input_values'][0], 'labels': labels}
        
    def __len__(self):
        return len(self.df)

    def read_audio(self, mp3_path):
        target_sr = 16000  # Set the target sampling rate
        
        audio, sr = librosa.load(mp3_path, sr=None)  # Load with original sampling rate
        audio_array = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        
        return audio_array


In [3]:
processor = AutoProcessor.from_pretrained("/home/ubuntu/bengali/aisyah/training/mms-1b/checkpoint-16000")
model = Wav2Vec2ForCTC.from_pretrained("/home/ubuntu/bengali/aisyah/training/mms-1b/checkpoint-16000")

In [4]:
processor.tokenizer.set_target_lang("ben")

In [5]:
with open('vocab.json', 'w') as fopen:
    json.dump(processor.tokenizer.vocab['ben'], fopen)

tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json", 
    unk_token="<unk>",
    pad_token="<pad>",
    word_delimiter_token="|"
)

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, 
    sampling_rate=16000, 
    padding_value=0.0, 
    do_normalize=True, 
    return_attention_mask=False
)

# valid_ds = BengaliDataset(val,processor)

In [90]:
test = pd.read_csv('../data/train.csv')

test = test[test['split'] =='valid'].sample(frac=0.005).reset_index(drop=True)

test['path'] = test['id'].apply(lambda x: os.path.join('/home/ubuntu/bengali/data/train_mp3s', x+'.mp3'))

In [69]:
print(test)

             id                                           sentence  split  \
0  d0dbfb20b617  এই বিশাল জলাভূমি সমগ্র উচ্চ পারস্য উপসাগরের বা...  valid   
1  b7c0753186b3           যথারীতি পরের পর্বে উত্তীর্ণ হতে পারেননি।  valid   
2  e243067fc3b2  কুমিল্লা আদর্শ সদর উপজেলার দক্ষিণ-মধ্যাংশে দুর...  valid   

                                                path  
0  /home/ubuntu/bengali/data/train_mp3s/d0dbfb20b...  
1  /home/ubuntu/bengali/data/train_mp3s/b7c075318...  
2  /home/ubuntu/bengali/data/train_mp3s/e243067fc...  


In [104]:
test['sentence'][:5000]

0       তিনি এবং তাঁর মা তাদের পৈতৃক বাড়িতে থেকে প্রত...
1       কৃত্তিবাস রামায়ণ-বহির্ভূত অনেক গল্প এই অনুবাদ...
2       তিনি তার সুশৃঙ্খল সামরিক বাহিনী এবং সুগঠিত শাস...
3       তিনি বিজয়নগর সাম্রাজ্যের বিরুদ্ধে এবং বিজাপুর...
4                             এটি মূলত একটি মরুময় অঞ্চল।
                              ...                        
4995    সেখানে তাদের প্রথম ম্যাচে তারা ভারতের সাথে ড্র...
4996                 তাঁর পিতার নাম আলহাজ্ব জালাল উদ্দিন।
4997    দেশি-বিদেশি জার্নালে বর্তমানে তার দু’শতাধিক গব...
4998                       দানেশ এর প্রথম প্রেসিডেন্ট হন।
4999    এমনকি এর আলোকে প্রায়শ "রূপালি" বলে বর্ণনা করা...
Name: sentence, Length: 5000, dtype: object

In [91]:
import torch

test_ds = BengaliDataset(test,processor)

test_loader = torch.utils.data.DataLoader(
    test_ds, batch_size=1, shuffle=False,num_workers =2)

In [92]:
model.to('cuda')
model.half()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [93]:
from datasets import load_metric, Audio

wer_metric = load_metric("wer")

In [94]:
from tqdm import tqdm

In [96]:
sentences = []
wer_eval = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        x = batch["input_values"]
        x = x.to("cuda", non_blocking=True)
        with torch.cuda.amp.autocast(True):
            y = model(x).logits
        predicted_ids = torch.argmax(y, dim=-1)

        transcription = processor.decode(predicted_ids[0])
        sentences.append(transcription)    

  4%|▍         | 1248/29588 [02:25<1:00:11,  7.85it/s]Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7feedb824820>
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
  4%|▍         | 1267/29588 [02:27<56:16,  8.39it/s]  Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7feedb824820>
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/ub

KeyboardInterrupt: 

In [102]:
# https://www.kaggle.com/code/mbmmurad/detailed-eda-normalizer-and-wer?scriptVersionId=141093405&cellId=72

total_wer = 0
avg_wer = 0
for i in (range(0, 5000)):
    total_wer =+ wer(test['sentence'][i],sentences[i])

avg_wer = total_wer/5000
print("Average Word Error Rate : ", avg_wer)

Average Word Error Rate :  8e-05


In [105]:
wer = wer_metric.compute(predictions=sentences[:5000], references=test['sentences'][:5000])

KeyError: 'sentences'