In [3]:
import pandas as pd
import os
import torch
import json
import librosa
from transformers import AutoProcessor, Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

2023-09-01 15:19:55.120393: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2023-09-01 15:19:59,425] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
class BengaliDataset(Dataset):
    def __init__(self, df, processor):
        self.df = df
        self.processor = processor

    def __getitem__(self, idx):
        audio_path = self.df.loc[idx]['path']
        audio_array = self.read_audio(audio_path)

        inputs = self.processor(
            audio_array,
            sampling_rate=16_000,
            return_tensors='pt'
        )

        with self.processor.as_target_processor():
            labels = self.processor(self.df.loc[idx]['sentence']).input_ids

        return {'input_values': inputs['input_values'].squeeze(0), 'labels': labels}

    def __len__(self):
        return len(self.df)

    def read_audio(self, mp3_path):
        target_sr = 16000  # Set the target sampling rate

        audio, sr = librosa.load(mp3_path, sr=None)  # Load with original sampling rate
        audio_array = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

        return audio_array

In [4]:
processor = AutoProcessor.from_pretrained("/home/ubuntu/bengali/aisyah/training/mms-1b/checkpoint-16000")
# model = Wav2Vec2ForCTC.from_pretrained("/home/ubuntu/bengali/aisyah/training/mms-1b/checkpoint-16000")

In [5]:
processor.tokenizer.set_target_lang("ben")

In [6]:
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='/home/ubuntu/bengali/aisyah/training/mms-1b/checkpoint-16000', vocab_size=136, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True)

In [7]:
processor.tokenizer

Wav2Vec2CTCTokenizer(name_or_path='/home/ubuntu/bengali/aisyah/training/mms-1b/checkpoint-16000', vocab_size=136, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True)

In [6]:
with open('vocab.json', 'w') as fopen:
    json.dump(processor.tokenizer.vocab['ben'], fopen)

In [3]:
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="<unk>",
    pad_token="<pad>",
    word_delimiter_token="|"
)

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False
)

In [7]:
test = pd.read_csv('../data/train.csv')
test = test[test['split'] =='valid'].reset_index(drop=True)
test['path'] = test['id'].apply(lambda x: os.path.join('/home/ubuntu/bengali/data/train_mp3s', x+'.mp3'))

In [11]:
batch_size = 10
test_ds = BengaliDataset(test, processor)

In [12]:
def custom_collate_fn(batch):
    input_values = [item['input_values'] for item in batch]
    labels = [item['labels'] for item in batch]

    input_values_padded = pad_sequence(input_values, batch_first=True, padding_value=0.0)

    labels = [torch.tensor(label) for label in labels]

    return {'input_values': input_values_padded, 'labels': labels}


In [13]:
test_loader = DataLoader(
    test_ds, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=custom_collate_fn
)

In [14]:
model.to('cuda')
model.half()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [16]:
sentences = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        x = batch["input_values"]
        x = x.to("cuda", non_blocking=True)
        with torch.cuda.amp.autocast(True):
            y = model(x).logits
        predicted_ids = torch.argmax(y, dim=-1)
        
        batch_size = predicted_ids.shape[0]

        for i in range(batch_size):
            transcription = processor.decode(predicted_ids[i])
            sentences.append(transcription)

100%|██████████| 2959/2959 [13:17<00:00,  3.71it/s]


In [19]:
import jiwer

jiwerresult_0 = []

for x in range(len(sentences)):

    jiwer_ = jiwer.wer(sentences[x],test.iloc[x,1])

    jiwerresult_0.append(jiwer_)
    
    
avg_wer_0 = sum(jiwerresult_0)/len(sentences)
print(avg_wer_0)

0.26888149151446894


In [38]:
from bnunicodenormalizer import Normalizer

bnorm = Normalizer()

def postprocess(sentence):
    period_set = set([".", "?", "!", "।"])
    _words = [bnorm(word)['normalized']  for word in sentence.split()]
    sentence = " ".join([word for word in _words if word is not None])
    try:
        if sentence[-1] not in period_set:
            sentence+="।"
    except:
        # print(sentence)
        sentence = "।"
    return sentence

In [39]:
cleaned_sentences = []

for sentence in sentences:
    data = postprocess(sentence)
    cleaned_sentences.append(data)

In [40]:
jiwerresult_1 = []

for x in range(len(cleaned_sentences)):

    jiwer_ = jiwer.wer(cleaned_sentences[x],test.iloc[x,1])

    jiwerresult_1.append(jiwer_)
    
    
avg_wer_1 = sum(jiwerresult_1)/len(cleaned_sentences)
print(avg_wer_1)

0.3337583934118714


In [42]:
unk_sentences = []

for sentence in sentences:
    data = sentence.replace("<unk>","")
    unk_sentences.append(data)

In [44]:
jiwerresult_2 = []

for x in range(len(unk_sentences)):

    jiwer_ = jiwer.wer(unk_sentences[x],test.iloc[x,1])

    jiwerresult_2.append(jiwer_)
    
    
avg_wer_2 = sum(jiwerresult_2)/len(unk_sentences)
print(avg_wer_2)

0.26886597708432697


In [45]:
num = 0.26888149151446894

In [46]:
num_1 = 0.26886597708432697

In [48]:
if num > num_1:
    print("num > num1")
else:
    print("num < num1")

num > num1


In [24]:
sentences[0], test["sentence"][0]

('তিনি এবং তার মা তাদের পৈতৃক বাড়িতে থেকে প্রতিবেশীদের দ্বারা অনেক তিরস্কার সহ্য করেন।',
 'তিনি এবং তাঁর মা তাদের পৈতৃক বাড়িতে থেকে প্রতিবেশীদের দ্বারা অনেক তিরস্কার সহ্য করেন।')

In [27]:
for x in range(10):
    print(f"Predicted: {sentences[x]}")
    print(f"Test Set: {test['sentence'][x]}")
    print("================================")

Predicted: তিনি এবং তার মা তাদের পৈতৃক বাড়িতে থেকে প্রতিবেশীদের দ্বারা অনেক তিরস্কার সহ্য করেন।
Test Set: তিনি এবং তাঁর মা তাদের পৈতৃক বাড়িতে থেকে প্রতিবেশীদের দ্বারা অনেক তিরস্কার সহ্য করেন।
Predicted: কৃত্তিবাস রামায়ন বহির্ভূত অনেক গল্প এই অনুবাদে গ্রহণ করেছিলেন।
Test Set: কৃত্তিবাস রামায়ণ-বহির্ভূত অনেক গল্প এই অনুবাদে গ্রহণ করেছিলেন।
Predicted: তিনি তার সুশৃং্হল সামরিক বাহিনী এবং সুগঠিত শাসন কাঠামোর মাধ্যমে একটি দক্ষ শাসন ব্যবস্থা প্রতিষ্ঠিত করেন।
Test Set: তিনি তার সুশৃঙ্খল সামরিক বাহিনী এবং সুগঠিত শাসন কাঠামোর মাধ্যমে একটি দক্ষ শাসন ব্যবস্থা প্রতিষ্ঠিত করেন।
Predicted: তিনি বিজয়নগজ সাম্রাজ্যের বিরুদ্ধে এবং বিজাপুরের মুসলিম প্রতিবেশীদের বিরুদ্ধেও যুদ্ধ করেছিলেন।
Test Set: তিনি বিজয়নগর সাম্রাজ্যের বিরুদ্ধে এবং বিজাপুরের মুসলিম প্রতিবেশীদের বিরুদ্ধেও যুদ্ধ করেছিলেন।
Predicted: এটি মূলত একটি মরুময় অঞ্চব।
Test Set: এটি মূলত একটি মরুময় অঞ্চল।
Predicted: সড়কটি বিহার পশ্চিমবঙ্গ সীমান্ত অতিক্রম গর়ে পশ্চিমবঙ্গ রাজ্যে প্রবেশ করে উত্তর দিনাজ্পুর জেলা হয়ে।
Test Set: সড়কটি বিহার-পশ্

In [33]:
index = 0 
for sentence in sentences:
    if "unk" in sentence:
#         print(sentence)
#         print(index)
#         break
        index += 1

In [34]:
index

1485

In [29]:
test['sentence'][18]

'এমনকি নামাযের সময়ও সঙ্গে রাখতেন।'

In [None]:
test

In [17]:
len(sentences)

29588

In [20]:
len(sentences)

59286

In [21]:
batch_size

8

In [22]:
len(sentences[29589:])

29697

In [18]:
len(test)

29588

In [25]:
len(sentences[-10_000:])

10000

In [39]:
check = sentences[-29_588:]

In [40]:
test_check = test[-29_588:]

In [41]:
import jiwer

jiwerresult = []

for x in range(len(check)):

    jiwer_ = jiwer.wer(check[x],test_check.iloc[x,1])

    jiwerresult.append(jiwer_)

In [37]:
avg_wer = sum(jiwerresult)/len(jiwerresult)
avg_wer

0.2729789306789587

In [42]:
avg_wer = sum(jiwerresult)/len(jiwerresult)
avg_wer

0.26888149151446894

In [12]:
# sentences = []

# with torch.no_grad():
#     for batch in tqdm(test_loader):
#         x = batch["input_values"]
#         x = x.to("cuda", non_blocking=True)
#         with torch.cuda.amp.autocast(True):
#             y = model(x).logits
#         predicted_ids = torch.argmax(y, dim=-1)

#         for i in range(batch_size):
#             transcription = processor.decode(predicted_ids[i])
#             sentences.append(transcription)

100%|█████████▉| 5917/5918 [13:03<00:00,  7.56it/s]


IndexError: index 3 is out of bounds for dimension 0 with size 3

In [14]:
temp = "একটু বয়েস হলে একটি বিদেশী।"

In [15]:
temp_1 = "একটু বয়স হলে একটি বিদেশী।"

In [16]:
if temp == temp_1:
    print(True)
else:
    print(False)

False


In [17]:
temp == temp_1

False

In [22]:
dadada = "একটু বয়েস হলে একটি বিদেশী।"
sss = "একটু বয়েস হলে একটি বিদেশী।"

dadada == sss

True

In [23]:
def calculate_cer(actual, hyp):
    """
    Calculate CER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')
    return Lev.distance(actual, hyp) / len(actual)

In [27]:
check = calculate_cer("একটু বয়স হলে একটি বিদেশী।", "একটু বয়েস হলে একটি বিদেশী।")
check*100

14.285714285714285

In [28]:
def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    b = set(actual.split() + hyp.split())
    word2char = dict(zip(b, range(len(b))))

    w1 = [chr(word2char[w]) for w in actual.split()]
    w2 = [chr(word2char[w]) for w in hyp.split()]

    return Lev.distance(''.join(w1), ''.join(w2)) / len(actual.split())

In [29]:
check = calculate_wer("একটু বয়স হলে একটি বিদেশী।", "একটু বয়েস হলে একটি বিদেশী।")
check*100

20.0