In [11]:
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor
) 

from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForCTC,
    AutoProcessor,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    Wav2Vec2Processor,
    set_seed,
)
import json
import os

In [12]:
import datasets
import random
import pandas as pd
import torch
from transformers import AutoTokenizer
from typing import Mapping, Tuple
import librosa
# import en_core_web_sm


import librosa

class BengaliDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing processed audio with tokenised transcripts.

    Every item is built from one row of ``df``: the audio file named in the
    ``path`` column is loaded and passed through ``processor``, and the text in
    the ``sentence`` column is turned into label ids by the same processor.

    Args:
        df: DataFrame with a ``path`` column (audio file location) and a
            ``sentence`` column (reference transcript).
        processor: Callable processor. It is called with the audio array plus
            ``sampling_rate`` and ``return_tensors='pt'`` and must return a
            mapping containing ``input_values``; it is called with
            ``text=<sentence>`` and must return an object exposing
            ``input_ids``.
        sampling_rate: Rate in Hz that audio is resampled to on load and that
            is reported to the processor. Defaults to 16000.
    """

    def __init__(self, df, processor, sampling_rate=16000):
        self.df = df
        self.processor = processor
        self.sampling_rate = sampling_rate

    def __len__(self):
        """Return the number of rows (examples) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{'input_values': ..., 'labels': ...}`` for position ``idx``.

        ``input_values`` is the first element of the processor's
        ``input_values`` output for this clip; ``labels`` is the ``input_ids``
        the processor produced for the transcript.
        """
        # Positional lookup: dataset indices are positions 0..len-1, which only
        # coincide with index labels when the frame carries a default index.
        # A single row fetch also avoids looking the row up twice.
        row = self.df.iloc[idx]
        audio_array = self.read_audio(row['path'])

        inputs = self.processor(
            audio_array,
            sampling_rate=self.sampling_rate,
            return_tensors='pt',
        )

        # `text=` routes the sentence to the tokenizer; it replaces the
        # deprecated `as_target_processor()` context manager.
        labels = self.processor(text=row['sentence']).input_ids

        return {'input_values': inputs['input_values'][0], 'labels': labels}

    def read_audio(self, mp3_path):
        """Load ``mp3_path`` and return its samples at ``self.sampling_rate``."""
        # Passing `sr` makes librosa resample while loading, so no separate
        # resample step is needed.
        audio_array, _ = librosa.load(mp3_path, sr=self.sampling_rate)
        return audio_array


In [13]:
# The processor and the CTC model are both restored from the same checkpoint directory.
checkpoint_dir = "/home/ubuntu/bengali/aisyah/training/mms-1b/checkpoint-3600"

processor = AutoProcessor.from_pretrained(checkpoint_dir)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint_dir)

In [14]:
# Select "ben" as the target language on the processor's tokenizer.
# NOTE(review): presumably this switches the tokenizer to the Bengali vocabulary — confirm against the tokenizer docs.
processor.tokenizer.set_target_lang("ben")

In [15]:
# Write the 'ben' entry of the processor tokenizer's vocabulary to disk, then
# build a standalone CTC tokenizer from that file.
vocab_file = "vocab.json"

with open(vocab_file, 'w') as vocab_out:
    json.dump(processor.tokenizer.vocab['ben'], vocab_out)

tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file,
    unk_token="<unk>",
    pad_token="<pad>",
    word_delimiter_token="|",
)

# Feature extractor configured for single-channel 16 kHz input with
# normalisation enabled and no attention mask returned.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)

# valid_ds = BengaliDataset(val,processor)

In [29]:
# Read the metadata CSV and keep only the rows whose split is 'valid',
# renumbering the index from zero.
metadata = pd.read_csv('../data/train.csv')
is_valid = metadata['split'] == 'valid'
test = metadata[is_valid].reset_index(drop=True)

# Derive each clip's mp3 location from its id.
audio_dir = '/home/ubuntu/bengali/data/train_mp3s'
test['path'] = test['id'].apply(lambda clip_id: os.path.join(audio_dir, clip_id + '.mp3'))

In [30]:
# Keep only the first three rows for a quick check.
test = test.iloc[:3]

In [31]:
# Display the leading rows of the truncated validation frame.
test.head()

Unnamed: 0,id,sentence,split,path
0,0000e711c2b1,তিনি এবং তাঁর মা তাদের পৈতৃক বাড়িতে থেকে প্রত...,valid,/home/ubuntu/bengali/data/train_mp3s/0000e711c...
1,00036c2a2d9d,কৃত্তিবাস রামায়ণ-বহির্ভূত অনেক গল্প এই অনুবাদ...,valid,/home/ubuntu/bengali/data/train_mp3s/00036c2a2...
2,00065e317123,তিনি তার সুশৃঙ্খল সামরিক বাহিনী এবং সুগঠিত শাস...,valid,/home/ubuntu/bengali/data/train_mp3s/00065e317...


In [32]:
# Wrap the truncated frame in the dataset; the constructor only stores its
# arguments, audio is read when an item is indexed.
test_ds = BengaliDataset(test,processor)

In [45]:
# Run the model on a single example (position 2 of test_ds).
# NOTE(review): test_ds[2]['input_values'] has already been through the processor
# inside BengaliDataset.__getitem__; it is passed through the processor a second
# time here — confirm the second pass is intended.
input_dict = processor(test_ds[2]['input_values'], sampling_rate=16_000, return_tensors="pt", padding=True)

# Inference only: disable autograd so no graph is recorded for the forward pass.
with torch.no_grad():
    logits = model(input_dict.input_values).logits

# Greedy pick: argmax over the last dimension, then take the first item along dim 0.
pred_ids = torch.argmax(logits, dim=-1)[0]

In [46]:
# Show the decoded prediction next to the lower-cased reference sentence for row 2.
predicted_text = processor.decode(pred_ids)
reference_text = test.loc[2]["sentence"].lower()

print("Prediction:", predicted_text, sep="\n")
print("\nReference:", reference_text, sep="\n")

Prediction:
তিনি তার সুশ্ৃঙখল সামরিক বাহিনী এবং সুগঠিত শাসন কাঠামার মাধ্যমে একটি দঃ্ষর শাসন ব্যবস্থা প্রতিষ্ঠিত করেন।

Reference:
তিনি তার সুশৃঙ্খল সামরিক বাহিনী এবং সুগঠিত শাসন কাঠামোর মাধ্যমে একটি দক্ষ শাসন ব্যবস্থা প্রতিষ্ঠিত করেন।


In [39]:
# Upload the processor files to the Hub repository malaysia-ai/mms-1b-all-bengali.
# The namespace is given as part of the repo id instead of the deprecated
# `organization=` keyword; both resolve to the same repository.
# NOTE(review): only the processor is pushed in this cell, not the model.
processor.push_to_hub('malaysia-ai/mms-1b-all-bengali')

CommitInfo(commit_url='https://huggingface.co/malaysia-ai/mms-1b-all-bengali/commit/89453fc150a7b3a18d0a6f570141bcf007bf76ca', commit_message='Upload processor', commit_description='', oid='89453fc150a7b3a18d0a6f570141bcf007bf76ca', pr_url=None, pr_revision=None, pr_num=None)