In [1]:
!pip -q install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode

[0m

In [2]:
!git clone https://github.com/mushrafi88/asr_bangla.git

Cloning into 'asr_bangla'...
remote: Enumerating objects: 231, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 231 (delta 15), reused 15 (delta 5), pack-reused 201[K
Receiving objects: 100% (231/231), 45.92 MiB | 11.81 MiB/s, done.
Resolving deltas: 100% (107/107), done.


In [3]:
import numpy as np
import pandas as pd
import random
import ast
from tqdm import tqdm
from IPython import display as ipd

# visualization
import matplotlib.pyplot as plt
from tabulate import tabulate
from joblib import Parallel, delayed

# pandas helpers (section originally labelled "normalization"):
# start pandarallel with 8 workers and a progress bar, then enable tqdm's pandas integration.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True,nb_workers=8)
tqdm.pandas()

import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
from torchmetrics.functional.audio import scale_invariant_signal_noise_ratio
import librosa


from datasets import Dataset,Audio


# Silence every Python warning for the rest of the session.
# (No environment variable is set here; that happens in the next cell.)
import warnings
warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
import os

# Export WANDB_DISABLED=true for this process
# (presumably read by the Weights & Biases integration to switch logging off — confirm).
os.environ.update(WANDB_DISABLED="true")

In [5]:
# Read the Prothom Alo text corpus; the frame has a saved-index column
# ("Unnamed: 0") and one "sentence" column.
corpus_csv_path = '/kaggle/input/bn-corpus-prothom-alo/bangla_text_data.csv'
df = pd.read_csv(corpus_csv_path)
df

Unnamed: 0,sentence
0,পশ্চিমবঙ্গের বরখাস্ত হওয়া মন্ত্রী পার্থ চট্টোপ...
1,আর জামিনের আবেদন করলেন না অর্পিতা
2,আজ শুক্রবার দিনের ইডি হেফাজত শেষে তাঁদের হাজির...
3,সেখানে পার্থর আইনজীবী জামিনের আবেদন জানালেও অর...
4,অর্পিতা মুখার্জি কারাগারেই
...,...
8418468,চারটি অভিযোগে তাঁকে মৃত্যুদণ্ড দেওয়া হয়
8418469,এর মধ্যে রাজনৈতিক গ্রুপকে গণহত্যার দায়ের বিষয়ট...
8418470,যা প্রথম
8418471,অপর একটি অভিযোগে তাঁকে বছরের কারাদণ্ড দিয়েছেন ...


In [6]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis="index", how="any")

In [7]:
# Dump the corpus to text.txt, the training input for KenLM's lmplz.
# The encoding is pinned to UTF-8 so the Bangla text is written identically
# regardless of the platform's default locale.
# NOTE(review): all sentences are joined with spaces into ONE line, so the
# trainer sees no sentence boundaries — confirm this is intended before
# switching to one sentence per line (the ARPA patch cell depends on it).
with open("text.txt", "w", encoding="utf-8") as file:
    file.write(" ".join(df["sentence"]))

In [8]:
! sudo apt -y install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev
! wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
! mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2
! ls kenlm/build/bin
! kenlm/build/bin/lmplz -o 4 < "text.txt" > "6gram.arpa"




cmake is already the newest version (3.16.3-1ubuntu1).
libboost-system-dev is already the newest version (1.71.0.0ubuntu2).
build-essential is already the newest version (12.8ubuntu1.1).
The following additional packages will be installed:
  bzip2-doc libboost-atomic1.71-dev libboost-atomic1.71.0
  libboost-chrono1.71-dev libboost-chrono1.71.0 libboost-date-time1.71-dev
  libboost-date-time1.71.0 libboost-program-options1.71-dev
  libboost-program-options1.71.0 libboost-serialization1.71-dev
  libboost-serialization1.71.0 libboost-test1.71-dev libboost-test1.71.0
  libboost-thread1.71-dev libboost-thread1.71.0 zlib1g
Suggested packages:
  libeigen3-doc libmpfrc++-dev liblzma-doc
The following NEW packages will be installed:
  bzip2-doc libboost-atomic1.71-dev libboost-atomic1.71.0
  libboost-chrono1.71-dev libboost-chrono1.71.0 libboost-date-time1.71-dev
  libboost-date-time1.71.0 libboost-program-options-dev
  libboost-program-options1.71-dev libboost-program-options1.71.0
  libboo

In [9]:
# Copy 6gram.arpa to 6gram_correct.arpa while adding an end-of-sentence token:
#   * the unigram count in the header ("ngram 1=N") is bumped by one, and
#   * the first line containing "<s>" is duplicated with "<s>" replaced by "</s>".
# Every other line is copied unchanged.
with open("6gram.arpa", "r", encoding="utf-8") as read_file, \
        open("6gram_correct.arpa", "w", encoding="utf-8") as write_file:
    has_added_eos = False
    for line in read_file:
        if not has_added_eos and "ngram 1=" in line:
            # Rewrite only the number after the last "=". A blanket
            # str.replace(count, ...) also hits the "1" in "ngram 1=" when the
            # count itself is "1", turning "ngram 1=1" into "ngram 2=2".
            prefix, _, count = line.rstrip("\n").rpartition("=")
            write_file.write(f"{prefix}={int(count) + 1}\n")
        elif not has_added_eos and "<s>" in line:
            write_file.write(line)
            write_file.write(line.replace("<s>", "</s>"))
            has_added_eos = True
        else:
            write_file.write(line)

In [10]:
import kenlm
# Load the patched ARPA file with the KenLM Python bindings for a quick sanity check.
# NOTE: the name `model` is rebound to the Wav2Vec2 acoustic model in the inference section.
model = kenlm.LanguageModel('./6gram_correct.arpa')

Loading the LM will be faster if you build a binary file.
Reading /kaggle/working/6gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [11]:
# Sanity check: score a phrase and an (apparently misspelled) variant of it,
# both with begin- and end-of-sentence context. One score is printed per phrase.
for phrase in ('প্রধান নায়ক', 'প্রাধান নয়ক'):
    print(model.score(phrase, bos=True, eos=True))

-10.845097541809082
-13.650924682617188


In [12]:
#!cp -r /kaggle/input/wav2vec2-bn/wav2vec2_bn /kaggle/working/

In [13]:
from transformers import Wav2Vec2CTCTokenizer,Wav2Vec2ForCTC,Wav2Vec2Processor,Wav2Vec2FeatureExtractor

# Pull the pretrained processor and read its tokenizer vocabulary (token -> id).
processor = Wav2Vec2Processor.from_pretrained("mushrafi88/wav2vec2_xlsr_300m_bn_6gram_arpa")
vocab_dict = processor.tokenizer.get_vocab()

# Rebuild the mapping in ascending id order with lower-cased tokens.
sorted_vocab_dict = {
    token.lower(): vocab_dict[token]
    for token in sorted(vocab_dict, key=vocab_dict.get)
}

Downloading:   0%|          | 0.00/262 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/404 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/696 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/84.0 [00:00<?, ?B/s]

In [14]:
# Token -> id mapping exactly as returned by the tokenizer.
vocab_dict

{'[PAD]': 63,
 '[UNK]': 62,
 '|': 56,
 'ঁ': 50,
 'ং': 23,
 'ঃ': 20,
 'অ': 55,
 'আ': 6,
 'ই': 30,
 'ঈ': 35,
 'উ': 16,
 'ঊ': 9,
 'ঋ': 24,
 'এ': 29,
 'ঐ': 2,
 'ও': 36,
 'ঔ': 5,
 'ক': 26,
 'খ': 60,
 'গ': 46,
 'ঘ': 8,
 'ঙ': 0,
 'চ': 57,
 'ছ': 44,
 'জ': 21,
 'ঝ': 37,
 'ঞ': 38,
 'ট': 43,
 'ঠ': 28,
 'ড': 42,
 'ঢ': 34,
 'ণ': 17,
 'ত': 53,
 'থ': 54,
 'দ': 1,
 'ধ': 41,
 'ন': 14,
 'প': 7,
 'ফ': 61,
 'ব': 12,
 'ভ': 10,
 'ম': 13,
 'য': 45,
 'র': 59,
 'ল': 31,
 'শ': 32,
 'ষ': 27,
 'স': 15,
 'হ': 3,
 'া': 47,
 'ি': 22,
 'ী': 49,
 'ু': 25,
 'ূ': 58,
 'ৃ': 19,
 'ে': 18,
 'ৈ': 48,
 'ো': 52,
 'ৌ': 4,
 '্': 39,
 'ৎ': 33,
 'ড়': 51,
 'ঢ়': 11,
 'য়': 40}

In [15]:
# The same mapping ordered by id, with lower-cased keys.
sorted_vocab_dict

{'ঙ': 0,
 'দ': 1,
 'ঐ': 2,
 'হ': 3,
 'ৌ': 4,
 'ঔ': 5,
 'আ': 6,
 'প': 7,
 'ঘ': 8,
 'ঊ': 9,
 'ভ': 10,
 'ঢ়': 11,
 'ব': 12,
 'ম': 13,
 'ন': 14,
 'স': 15,
 'উ': 16,
 'ণ': 17,
 'ে': 18,
 'ৃ': 19,
 'ঃ': 20,
 'জ': 21,
 'ি': 22,
 'ং': 23,
 'ঋ': 24,
 'ু': 25,
 'ক': 26,
 'ষ': 27,
 'ঠ': 28,
 'এ': 29,
 'ই': 30,
 'ল': 31,
 'শ': 32,
 'ৎ': 33,
 'ঢ': 34,
 'ঈ': 35,
 'ও': 36,
 'ঝ': 37,
 'ঞ': 38,
 '্': 39,
 'য়': 40,
 'ধ': 41,
 'ড': 42,
 'ট': 43,
 'ছ': 44,
 'য': 45,
 'গ': 46,
 'া': 47,
 'ৈ': 48,
 'ী': 49,
 'ঁ': 50,
 'ড়': 51,
 'ো': 52,
 'ত': 53,
 'থ': 54,
 'অ': 55,
 '|': 56,
 'চ': 57,
 'ূ': 58,
 'র': 59,
 'খ': 60,
 'ফ': 61,
 '[unk]': 62,
 '[pad]': 63}

In [16]:
# Tokens in id order — the label list that is passed to the CTC decoder below.
list(sorted_vocab_dict.keys())

['ঙ',
 'দ',
 'ঐ',
 'হ',
 'ৌ',
 'ঔ',
 'আ',
 'প',
 'ঘ',
 'ঊ',
 'ভ',
 'ঢ়',
 'ব',
 'ম',
 'ন',
 'স',
 'উ',
 'ণ',
 'ে',
 'ৃ',
 'ঃ',
 'জ',
 'ি',
 'ং',
 'ঋ',
 'ু',
 'ক',
 'ষ',
 'ঠ',
 'এ',
 'ই',
 'ল',
 'শ',
 'ৎ',
 'ঢ',
 'ঈ',
 'ও',
 'ঝ',
 'ঞ',
 '্',
 'য়',
 'ধ',
 'ড',
 'ট',
 'ছ',
 'য',
 'গ',
 'া',
 'ৈ',
 'ী',
 'ঁ',
 'ড়',
 'ো',
 'ত',
 'থ',
 'অ',
 '|',
 'চ',
 'ূ',
 'র',
 'খ',
 'ফ',
 '[unk]',
 '[pad]']

In [17]:
from pyctcdecode import build_ctcdecoder

# Labels must be listed in id order; iterating the sorted dict yields exactly that.
ctc_labels = [token for token in sorted_vocab_dict]

# CTC decoder backed by the patched ARPA language model.
decoder = build_ctcdecoder(labels=ctc_labels, kenlm_model_path="6gram_correct.arpa")

Loading the LM will be faster if you build a binary file.
Reading /kaggle/working/6gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [18]:
from transformers import Wav2Vec2ProcessorWithLM

# Bundle the pretrained feature extractor and tokenizer with the KenLM-backed
# decoder into a single processor object.
processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder
)

In [19]:
# Write the processor (including its language_model/ folder) to ./wav2vec2_arpa.
# Saving the acoustic model itself is intentionally left disabled:
#model.save_pretrained('wav2vec2')
processor_with_lm.save_pretrained("wav2vec2_arpa")

In [20]:
# List the working directory as a tree with human-readable sizes.
!tree -h

[01;34m.[00m
├── [3.0G]  6gram.arpa
├── [3.0G]  6gram_correct.arpa
├── [139K]  __notebook__.ipynb
├── [4.0K]  [01;34masr_bangla[00m
│   ├── [ 106]  README.md
│   ├── [7.8K]  bn-corpus-prothom-alo-2022.ipynb
│   ├── [4.0K]  [01;34mcsv_files[00m
│   │   ├── [ 27M]  df_nov_2gram.csv
│   │   ├── [ 34M]  df_nov_3gram.csv
│   │   ├── [ 20M]  df_nov_wav2vec2.csv
│   │   ├── [2.9M]  evaluation.csv
│   │   ├── [1.4M]  submission_38k_no_punctuation.csv
│   │   ├── [2.6M]  submission_with_punctuation.csv
│   │   ├── [6.9M]  symspell.txt
│   │   ├── [1.4M]  test.csv
│   │   ├── [288K]  test_files_duration.csv
│   │   ├── [7.5M]  train_files_duration.csv
│   │   ├── [2.7M]  validation_38k_arpa_3_.csv
│   │   ├── [352K]  validation_error_list.csv
│   │   ├── [288K]  validation_files_duration.csv
│   │   ├── [1.2M]  validation_wer_detail.csv
│   │   ├── [3.9M]  validation_with_punctuation.csv
│   │   ├── [ 870]  vocab.json
│   │   ├── [811K]  wav2vec2_lm_errors.csv
│   │   ├── [1.0M]  wav2vec2_

In [21]:
# Convert the ARPA copy stored inside the saved processor to KenLM's binary format
# (the KenLM loader output above notes that binary files load faster).
!kenlm/build/bin/build_binary wav2vec2_arpa/language_model/6gram_correct.arpa wav2vec2_arpa/language_model/6gram.bin

Reading wav2vec2_arpa/language_model/6gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
SUCCESS


In [22]:
# Delete the text ARPA from the saved processor so only 6gram.bin remains, then show the folder.
!rm wav2vec2_arpa/language_model/6gram_correct.arpa && tree -h wav2vec2_arpa/

[01;34mwav2vec2_arpa/[00m
├── [ 658]  alphabet.json
├── [4.0K]  [01;34mlanguage_model[00m
│   ├── [876M]  6gram.bin
│   ├── [  78]  attrs.json
│   └── [ 11M]  unigrams.txt
├── [ 262]  preprocessor_config.json
├── [  51]  special_tokens_map.json
├── [ 464]  tokenizer_config.json
└── [ 827]  vocab.json

1 directory, 8 files


In [23]:
!rm -rf kenlm
!rm -rf 6gram.arpa
!rm -rf text.txt
!rm -rf 6gram_correct.arpa

In [24]:
%%writefile ./wav2vec2_arpa/tokenizer_config.json
{"unk_token": "[UNK]", "bos_token": null, "eos_token": null, "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "replace_word_delimiter_char": " ", "config": null, "tokenizer_type": "wav2vec2", "processor_class": "Wav2Vec2ProcessorWithLM", "special_tokens_map_file": "wav2vec2_bn/special_tokens_map.json", "name_or_path": "wav2vec2_bn", "tokenizer_class": "Wav2Vec2CTCTokenizer"}

Overwriting ./wav2vec2_arpa/tokenizer_config.json


In [25]:
%%writefile ./wav2vec2_arpa/special_tokens_map.json

{"bos_token": null, "eos_token": null, "unk_token": "[UNK]", "pad_token": "[PAD]"}

Overwriting ./wav2vec2_arpa/special_tokens_map.json


In [26]:
%%writefile ./wav2vec2_arpa/added_tokens.json
{}

Writing ./wav2vec2_arpa/added_tokens.json


In [27]:
%%writefile ./wav2vec2_arpa/alphabet.json
{"labels": ["\u0999", "\u09a6", "\u0990", "\u09b9", "\u09cc", "\u0994", "\u0986", "\u09aa", "\u0998", "\u098a", "\u09ad", "\u09dd", "\u09ac", "\u09ae", "\u09a8", "\u09b8", "\u0989", "\u09a3", "\u09c7", "\u09c3", "\u0983", "\u099c", "\u09bf", "\u0982", "\u098b", "\u09c1", "\u0995", "\u09b7", "\u09a0", "\u098f", "\u0987", "\u09b2", "\u09b6", "\u09ce", "\u09a2", "\u0988", "\u0993", "\u099d", "\u099e", "\u09cd", "\u09df", "\u09a7", "\u09a1", "\u099f", "\u099b", "\u09af", "\u0997", "\u09be", "\u09c8", "\u09c0", "\u0981", "\u09dc", "\u09cb", "\u09a4", "\u09a5", "\u0985", " ", "\u099a", "\u09c2", "\u09b0", "\u0996", "\u09ab", "\u2047", ""], "is_bpe": false}

Overwriting ./wav2vec2_arpa/alphabet.json


Inference part

In [28]:
# Location of the processor-with-LM directory saved earlier in this notebook.
model_path = '/kaggle/working/wav2vec2_arpa'

In [29]:
from transformers import Wav2Vec2CTCTokenizer,Wav2Vec2ForCTC,Wav2Vec2Processor,Wav2Vec2FeatureExtractor,Wav2Vec2ProcessorWithLM
# NOTE: both names are rebound here. `model` (previously the KenLM language model) becomes
# the Wav2Vec2 CTC acoustic model placed on the GPU, and `processor` (previously the plain
# Wav2Vec2Processor) becomes the LM-boosted processor loaded from `model_path`.
model = Wav2Vec2ForCTC.from_pretrained('mushrafi88/wav2vec2_xlsr_300m_bn_6gram_arpa').to("cuda")
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_path)

Downloading:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

In [30]:
# Rebinds `df`: it no longer holds the text corpus but the per-clip table
# shipped in the cloned asr_bangla repository.
df = pd.read_csv('/kaggle/working/asr_bangla/csv_files/df_nov_3gram.csv')

In [31]:
# Wrap the clip table as a Hugging Face Dataset.
submission = Dataset.from_pandas(df)

In [32]:
!mkdir /kaggle/tmp

In [33]:
# Persist the dataset as built from the DataFrame (i.e. before the audio cast) under /kaggle/tmp.
submission.save_to_disk('/kaggle/tmp')
# Re-type the "path" column as 16 kHz audio; the inference loop reads
# submission[i]["path"]["array"] and submission[i]["path"]["path"] from it.
submission = submission.cast_column("path", Audio(sampling_rate=16_000))
submission.cleanup_cache_files()
# NOTE(review): `result` is not read by any cell in view — candidate for removal.
result=[]

In [34]:
# Show the dataset's columns and row count.
submission

Dataset({
    features: ['path', 'id', 'source', 'audio', 'sentence', 'duration', 'snr', 'wav2vec2', '2gram', '3gram'],
    num_rows: 57366
})

In [35]:
# Transcribe every clip with the LM-boosted processor and attach the text to
# `df` as a new "4gram" column (inner join on "path").
# To reuse this cell for another language model, change the '4gram' key below
# and the output file name in the next cell.
records = []  # one {'path', '4gram'} dict per clip, turned into a frame once at the end

for i in tqdm(range(len(submission))):
    # Index the dataset once per clip: every `submission[i]` lookup
    # re-materialises the row, including the decoded audio.
    audio = submission[i]["path"]
    inputs = processor(audio["array"], sampling_rate=16_000, return_tensors="pt").to("cuda")
    with torch.no_grad():
        logits = model(**inputs).logits
    transcription = processor.batch_decode(logits.cpu().numpy()).text
    records.append({'path': audio['path'], '4gram': transcription[0]})
    torch.cuda.empty_cache()

# Build the frame in one go. Growing a DataFrame with `.append` inside the loop
# copies it on every iteration and `DataFrame.append` no longer exists in pandas 2.x.
# Explicit columns keep the merge valid even when there are no rows.
df_tmp = pd.DataFrame(records, columns=['path', '4gram'])
df = df.merge(df_tmp,how='inner',on='path')

100%|██████████| 57366/57366 [7:22:40<00:00,  2.16it/s]


In [36]:
# Save the merged table (now including the '4gram' column) without the index column.
df.to_csv('df_nov_4gram.csv',index=False)