This notebook is based on https://www.kaggle.com/code/ttahara/bengali-sr-umong-sain-s-wav2vec2-0-w-lm-baseline.

Make sure to run the following scripts before running this notebook:
1. train/finetune_fole_without_unigrams.py
2. train/finetune_with_commonvoice.py

## Install packages 
This step is only needed when running in a Kaggle Notebook. The packages can be obtained from [here](https://www.kaggle.com/datasets/nagohachi/bengaliai-packages/).

In [3]:
# !cp -r ../input/bengaliai-packages ./

# !pip install ./bengaliai-packages/setuptools-65.7.0-py3-none-any.whl -f ./ --no-index
# !pip install ./bengaliai-packages/jiwer-3.0.3-py3-none-any.whl -f ./ --no-index
# !pip install ./bengaliai-packages/bnunicodenormalizer-0.1.6/bnunicodenormalizer-0.1.6 -f ./ --no-index
# !pip install ./bengaliai-packages/attrs-23.1.0-py3-none-any.whl -f ./ --no-index --no-deps
# !pip install ./bengaliai-packages/exceptiongroup-1.1.3-py3-none-any.whl -f ./ --no-index --no-deps
# !pip install ./bengaliai-packages/hypothesis-6.87.0-py3-none-any.whl -f ./ --no-index --no-deps
# !pip install ./bengaliai-packages/pygtrie-2.5.0-py3-none-any.whl -f ./ --no-index --no-deps
# !pip install ./bengaliai-packages/sortedcontainers-2.4.0-py2.py3-none-any.whl -f ./ --no-index --no-deps
# !pip install ./bengaliai-packages/pyctcdecode-0.5.0-py2.py3-none-any.whl -f ./ --no-index --no-deps
# !pip install ./bengaliai-packages/pypi-kenlm-0.1.20220713/pypi-kenlm-0.1.20220713 -f ./ --no-index --no-deps

# !rm -rf ./bengaliai-packages

Looking in links: ./
Processing ./bengaliai-packages/setuptools-65.7.0-py3-none-any.whl
Installing collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 59.8.0
    Uninstalling setuptools-59.8.0:
      Successfully uninstalled setuptools-59.8.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opentelemetry-api 1.18.0 requires importlib-metadata~=6.0.0, but you have importlib-metadata 6.7.0 which is incompatible.
pymc3 3.11.5 requires numpy<1.22.2,>=1.15.0, but you have numpy 1.23.5 which is incompatible.
pymc3 3.11.5 requires scipy<1.8.0,>=1.7.3, but you have scipy 1.11.1 which is incompatible.[0m[31m
[0mSuccessfully installed setuptools-65.7.0
Looking in links: ./
Processing ./bengaliai-packages/jiwer-3.0.3-py3-none-any.whl
Installing collected packages: jiwer
Successfully installed 

In [5]:
from pathlib import Path
from functools import partial

import pandas as pd
import pyctcdecode
from tqdm.notebook import tqdm

import librosa

import pyctcdecode
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC
from bnunicodenormalizer import Normalizer

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [6]:
# --- Filesystem layout: everything hangs off the parent of the working directory ---
ROOT = Path.cwd().parent
INPUT = ROOT.joinpath("input")
DATA = INPUT.joinpath("bengaliai-speech")
TRAIN = DATA.joinpath("train_mp3s")
TEST = DATA.joinpath("test_mp3s")

# --- Audio / model settings ---
# Sampling rate in Hz handed to the audio loader and the feature extractor.
SAMPLING_RATE = 16_000
# Checkpoint that was fine-tuned a second time with Common Voice.
MODEL_PATH = INPUT.joinpath("wav2vec2-small-finetuned-with-commonvoice/")
# Directory containing the n-gram language model file used by the CTC decoder.
LM_PATH = INPUT.joinpath(
    "bengali-sr-download-public-trained-models/wav2vec2-xls-r-300m-bengali/language_model/"
)

### Load model, processor, decoder

In [7]:
# Restore the processor and the CTC acoustic model from the same fine-tuned
# checkpoint directory.
processor = Wav2Vec2Processor.from_pretrained(MODEL_PATH)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_PATH)

In [8]:
# Order the tokenizer vocabulary by token id so the label list handed to the
# decoder lines up with the model's output indices.
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = dict(
    sorted(vocab_dict.items(), key=lambda token_and_id: token_and_id[1])
)

# Beam-search CTC decoder backed by the 5-gram language model file.
decoder = pyctcdecode.build_ctcdecoder(
    list(sorted_vocab_dict),
    str(LM_PATH / "5gram.bin"),
)

In [9]:
# Bundle the feature extractor, tokenizer and LM-backed decoder into a single
# processor; it is used below both for batch collation and for decoding logits.
processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder
)

## Prepare dataloader

In [10]:
class BengaliSRTestDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields one decoded waveform per audio file.

    Args:
        audio_paths: Paths of the audio files, in the order they are served.
        sampling_rate: Target sampling rate passed to ``librosa.load``.
    """

    def __init__(
        self,
        audio_paths: list[str],
        sampling_rate: int
    ):
        self.audio_paths = audio_paths
        self.sampling_rate = sampling_rate

    def __len__(self) -> int:
        """Return the number of audio files."""
        return len(self.audio_paths)

    def __getitem__(self, index: int):
        """Load the ``index``-th file and return its waveform.

        Only the waveform is returned; the sampling rate reported by
        ``librosa.load`` is discarded because it equals ``self.sampling_rate``.
        """
        # mono=True down-mixes multi-channel recordings to a single 1-D signal.
        # With mono=False a stereo file comes back as (channels, samples), which
        # the downstream feature extractor does not expect; for files that are
        # already mono the result is unchanged.
        waveform, _ = librosa.load(
            self.audio_paths[index], sr=self.sampling_rate, mono=True
        )
        return waveform

In [11]:
# The sample submission provides the ids of the test clips; ids are read as
# strings so pandas does not coerce them to a numeric dtype.
test = pd.read_csv(DATA / "sample_submission.csv", dtype={"id": str})
print(test.head())

             id                                           sentence
0  0f3dac00655e  এছাড়াও নিউজিল্যান্ড এ ক্রিকেট দলের হয়েও খেলছ...
1  a9395e01ad21  এছাড়াও নিউজিল্যান্ড এ ক্রিকেট দলের হয়েও খেলছ...
2  bf36ea8b718d  এছাড়াও নিউজিল্যান্ড এ ক্রিকেট দলের হয়েও খেলছ...


In [12]:
# One mp3 path per row of the sample submission, in the same order.
test_audio_paths = [
    str(TEST.joinpath(f"{audio_id}.mp3")) for audio_id in test["id"].tolist()
]

In [13]:
# Dataset that decodes each test clip on demand.
test_dataset = BengaliSRTestDataset(test_audio_paths, SAMPLING_RATE)

# Each batch (a list of waveforms) is handed straight to the feature extractor,
# which is asked to pad the batch and return PyTorch tensors.
collate_func = partial(
    processor_with_lm.feature_extractor,
    sampling_rate=SAMPLING_RATE,
    padding=True,
    return_tensors="pt",
)

test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False,    # predictions must stay aligned with the submission rows
    drop_last=False,  # every clip needs a prediction
    num_workers=2,
    pin_memory=True,
    collate_fn=collate_func,
)

## Inference

In [14]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [15]:
# Move the model to the selected device and switch it to inference mode.
model = model.to(device)
model = model.eval()
# Cast the weights to fp16 only on GPU. Autocast in the inference loop only
# takes effect on CUDA, so on the CPU fallback float32 inputs would meet
# float16 weights and the forward pass would fail.
if device.type == "cuda":
    model = model.half()

In [16]:
pred_sentence_list = []

# Mixed precision is only enabled on GPU; on CPU the autocast context is disabled.
use_amp = device.type == "cuda"

with torch.no_grad():
    for batch in tqdm(test_loader):
        x = batch["input_values"].to(device, non_blocking=True)

        # Forward the padding mask whenever the feature extractor produced one,
        # so padded frames of shorter clips do not influence the batch. When the
        # extractor returns no mask this is None, which is the model's default.
        attention_mask = batch.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device, non_blocking=True)

        with torch.autocast(device_type=device.type, enabled=use_amp):
            y = model(x, attention_mask=attention_mask).logits
        y = y.detach().cpu().numpy()

        # Recorded tuning result: {'alpha': 0.345, 'beta': 0.06, 'beam_width': 768}
        # NOTE(review): beam_width below is 2000, not the recorded 768 — confirm
        # this wider beam is intentional.
        for logits in y:
            sentence = processor_with_lm.decode(
                logits,
                beam_width=2000,
                alpha=0.345,
                beta=0.06
            ).text
            pred_sentence_list.append(sentence)

  0%|          | 0/1 [00:00<?, ?it/s]

## Make Submission

In [17]:
# Shared normalizer instance; postprocess() applies it word by word.
bnorm = Normalizer()

# Characters that already terminate a sentence; no danda is appended after them.
_SENTENCE_ENDINGS = frozenset([".", "?", "!", "।"])


def postprocess(sentence):
    """Normalize a decoded sentence and make sure it ends with punctuation.

    Each whitespace-separated word is passed through ``bnorm`` and words whose
    ``'normalized'`` value is ``None`` are dropped. The remaining words are
    joined with single spaces, and a danda ("।") is appended unless the text
    already ends with one of ".", "?", "!" or "।".

    Args:
        sentence: Raw decoded text.

    Returns:
        The normalized sentence, or just "।" when nothing is left after
        normalization.
    """
    normalized_words = (bnorm(word)["normalized"] for word in sentence.split())
    sentence = " ".join(word for word in normalized_words if word is not None)

    # Explicit empty check instead of catching the IndexError from sentence[-1]
    # with a bare except, which would also swallow unrelated errors.
    if not sentence:
        return "।"
    if sentence[-1] not in _SENTENCE_ENDINGS:
        sentence += "।"
    return sentence

In [18]:
# Post-process every decoded sentence, with a progress bar.
pp_pred_sentence_list = list(map(postprocess, tqdm(pred_sentence_list)))

  0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
# Replace the placeholder sentences with the predictions and write the file.
test = test.assign(sentence=pp_pred_sentence_list)
test.to_csv("submission.csv", index=False)

# Widen the column display so the preview shows longer sentences.
pd.set_option("display.max_colwidth", 100)
print(test.head())

             id  \
0  0f3dac00655e   
1  a9395e01ad21   
2  bf36ea8b718d   

                                                                sentence  
0                                              একটু বয়স হলে একটি বিদেশি।  
1  কী কারণে তুমি এতাবৎ কাল পর্যন্ত এ দারুল দৈব দুর্বিপাকে পতিত ছিলে বলো।  
2             এ কারণে সরকার নির্ধারিত হারে পরিবহনজনিত ক্ষতি অনুমোদন করে।  


## EOF