In [None]:
# Bengali Unicode normalizer, used later to clean every word of the transcripts.
!pip install bnunicodenormalizer
# pandarallel supplies the `parallel_apply` used on the sentence column.
!pip install pandarallel
# KenLM Python bindings (from the GitHub master archive) plus pyctcdecode for the LM-backed CTC decoder.
!pip -q install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode

In [None]:
# gdown is the download CLI invoked in the next cell.
!conda install -y gdown

In [None]:
# Download a whole folder by id into the working directory.
# NOTE(review): presumably this is the fine-tuned checkpoint folder loaded later as "wav2vec2_bn" — confirm.
!gdown --id --folder 1BTlleKr18WBYPYu8xFcCBC8JL5r97re2

In [None]:
import numpy as np
import pandas as pd
import random
import ast
from tqdm import tqdm
from IPython import display as ipd

# visualization
import matplotlib.pyplot as plt
from tabulate import tabulate
from joblib import Parallel, delayed

#normalization
from pandarallel import pandarallel
from bnunicodenormalizer import Normalizer 
# Start pandarallel with 8 workers and progress bars; this enables `parallel_apply` on pandas objects.
pandarallel.initialize(progress_bar=True,nb_workers=8)
# Register tqdm's progress-aware pandas helpers.
tqdm.pandas()
# Shared normalizer instance; `normalize` calls it once per word.
bnorm=Normalizer()

# Silence all Python warnings for the rest of the notebook
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
# Set WANDB_DISABLED for this process. Nothing in this notebook reads it directly;
# presumably it is picked up by the training libraries to turn off Weights & Biases logging.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Load the raw metadata tables for the two splits; they are cleaned by `cleaning_csv` further down.
df_train = pd.read_csv('../input/dlsprint/train.csv')
df_validation = pd.read_csv('../input/dlsprint/validation.csv')

In [None]:
def cleaning_csv(df, audio_dir=None):
    """Return a cleaned copy of a split's metadata table with a normalized vote score.

    A ``votes`` column is derived from ``up_votes - down_votes``, shifted by the
    absolute value of its minimum (so it is never negative), zeros are replaced
    by the mean, and the result is divided by its maximum. The speaker/metadata
    columns are dropped, rows that are entirely empty are removed, and ``path``
    is prefixed with the audio directory.

    The input frame is not modified; only the ``votes`` column has its zeros
    replaced (other columns that happen to contain 0 are left untouched).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``up_votes``, ``down_votes``, ``path`` and the columns
        ``client_id``, ``age``, ``gender``, ``accents``, ``locale``.
    audio_dir : str, optional
        Prefix prepended to ``path``. When omitted, the split is guessed from
        the row count as before: more than 200000 rows -> train files, fewer
        than 10000 rows -> validation files, otherwise ``path`` is unchanged.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # Work on a copy so re-running the calling cell never sees a half-mutated input.
    df = df.copy()

    votes = (df['up_votes'] - df['down_votes']).astype(float)
    votes = votes + abs(votes.min())
    # Restrict the zero replacement to the vote score itself.
    votes = votes.replace(0, votes.mean())
    peak = votes.max()
    # If every score is 0 the division would be 0/0; treat all rows as equally weighted.
    df['votes'] = votes / peak if peak > 0 else 1.0

    df = df.drop(['client_id','age','gender','accents','locale','up_votes','down_votes'],axis=1)
    df = df.dropna(how='all')

    if audio_dir is None:
        # Legacy heuristic: infer the split from its size.
        if len(df) > 200000:
            audio_dir = '../input/dlsprint/train_files/'
        elif len(df) < 10000:
            audio_dir = '../input/dlsprint/validation_files/'
    if audio_dir is not None:
        df = df.assign(path=audio_dir + df['path'])
    return df

In [None]:
# Characters removed from every transcript. Kept as a raw string because the set
# contains a literal backslash (a plain string would hold an invalid escape sequence).
# Built once as a frozenset so each membership test is constant-time.
_PUNCTUATIONS = frozenset(r'''````£|¢|Ñ+-*/=EROero৳০১২৩৪৫৬৭৮৯012–34567•89।!()-[]{};:'"“\’…,<>.‚/?@#$%^&*_~‘—॥”‰🤣⚽️✌�￰৷￰''')


def remove_punctuations(my_str):
    """Return ``my_str`` with every character listed in ``_PUNCTUATIONS`` removed.

    Besides punctuation and symbols, the removed set also contains ASCII and
    Bengali digits and the Latin letters ``E R O e r o``; all other characters
    (including spaces) are kept in their original order.
    """
    # join() over a generator avoids the quadratic cost of repeated string concatenation.
    return "".join(char for char in my_str if char not in _PUNCTUATIONS)


In [None]:
def normalize(sen):
    """Normalize ``sen`` word by word with ``bnorm`` and re-join the results with single spaces.

    The sentence is split on whitespace; each word is passed to ``bnorm`` and
    its ``'normalized'`` entry is kept unless that entry is ``None``.
    """
    kept = []
    for token in sen.split():
        cleaned = bnorm(token)['normalized']
        if cleaned is not None:
            kept.append(cleaned)
    return " ".join(kept)

In [None]:
# Clean both splits. `cleaning_csv` drops the columns it needs as input, so this cell
# cannot be re-run on its own output — reload the CSVs first.
df_train = cleaning_csv(df_train)
df_validation = cleaning_csv(df_validation)

In [None]:
# Strip the unwanted characters from every transcript in both splits.
df_train['sentence'] = df_train['sentence'].apply(remove_punctuations)
df_validation['sentence'] = df_validation['sentence'].apply(remove_punctuations)

In [None]:
# Append the validation rows to the training rows and renumber the index from 0.
# From here on `df_train` holds both splits.
df_train = pd.concat([df_train, df_validation], ignore_index=True)

In [None]:
# Discard every row whose sentence still contains one of the capitals V, A or B,
# one letter at a time.
for banned_letter in ('V', 'A', 'B'):
    to_drop_train = df_train[df_train['sentence'].str.contains(banned_letter)]
    df_train = df_train.drop(to_drop_train.index)

In [None]:
# Run `normalize` over every sentence using the pandarallel worker pool.
df_train["sentence"]=df_train["sentence"].parallel_apply(lambda x:normalize(x))

In [None]:
# Write the whole normalized corpus to text.txt, the input for the n-gram estimation below.
# All sentences are joined with single spaces into one line.
# UTF-8 is forced because the transcripts are non-ASCII and the platform default
# encoding is not guaranteed to be able to encode them.
with open("text.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.write(" ".join(df_train["sentence"]))

In [None]:
! sudo apt -y install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev
! wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
! mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2
! ls kenlm/build/bin
! kenlm/build/bin/lmplz -o 6 < "text.txt" > "6gram.arpa"

In [None]:
# Copy 6gram.arpa to 6gram_correct.arpa, adding a "</s>" entry:
#   * the unigram count in the header ("ngram 1=N") is incremented by one, and
#   * the first line containing "<s>" is duplicated with "<s>" replaced by "</s>".
# Everything else is copied verbatim. Both files are opened as UTF-8 explicitly
# because the model vocabulary is non-ASCII.
with open("6gram.arpa", "r", encoding="utf-8") as read_file, \
        open("6gram_correct.arpa", "w", encoding="utf-8") as write_file:
    has_added_eos = False
    for line in read_file:
        if not has_added_eos and "ngram 1=" in line:
            count = line.strip().split("=")[-1]
            # Rewrite only the trailing count. A plain str.replace would also hit the
            # order digit in "ngram 1=" when the count itself is "1".
            head, _, tail = line.rpartition(count)
            write_file.write(f"{head}{int(count) + 1}{tail}")
        elif not has_added_eos and "<s>" in line:
            write_file.write(line)
            write_file.write(line.replace("<s>", "</s>"))
            has_added_eos = True
        else:
            write_file.write(line)

In [None]:
import kenlm
# Load the corrected ARPA file to check that it parses and can score text.
model = kenlm.LanguageModel('./6gram_correct.arpa')

In [None]:
# Sanity check: score two similar phrases with begin- and end-of-sentence handling enabled.
# NOTE(review): the second phrase looks like a misspelling of the first, so it should presumably score lower — confirm.
print(model.score('প্রধান নায়ক', bos=True, eos=True))
print(model.score('প্রাধান নয়ক', bos=True, eos=True))

In [None]:
#!cp -r /kaggle/input/wav2vec2-bn/wav2vec2_bn /kaggle/working/

In [None]:
from transformers import Wav2Vec2CTCTokenizer,Wav2Vec2ForCTC,Wav2Vec2Processor,Wav2Vec2FeatureExtractor

# Load the processor from the local "wav2vec2_bn" folder and read its token -> id vocabulary.
processor = Wav2Vec2Processor.from_pretrained("wav2vec2_bn")
vocab_dict = processor.tokenizer.get_vocab()

# Order the tokens by id and lowercase each one, so the key order matches the id order.
tokens_by_id = sorted(vocab_dict.items(), key=lambda item: item[1])
sorted_vocab_dict = {token.lower(): token_id for token, token_id in tokens_by_id}

In [None]:
# Display the raw token -> id mapping.
vocab_dict

In [None]:
# Display the lowercased mapping, ordered by token id.
sorted_vocab_dict

In [None]:
# Display the label list in id order; the same list is passed to the decoder below.
list(sorted_vocab_dict.keys())

In [None]:
from pyctcdecode import build_ctcdecoder

# Labels are the vocabulary tokens in id order; the decoder is backed by the corrected 6-gram ARPA file.
ctc_labels = list(sorted_vocab_dict)
decoder = build_ctcdecoder(labels=ctc_labels, kenlm_model_path="6gram_correct.arpa")

In [None]:
from transformers import Wav2Vec2ProcessorWithLM

# Bundle the existing feature extractor and tokenizer with the LM-backed decoder into one processor.
processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder
)

In [None]:
# Save the combined processor back into the same "wav2vec2_bn" folder it was loaded from.
processor_with_lm.save_pretrained("wav2vec2_bn")

In [None]:
# Inspect the saved folder with file sizes.
!tree -h wav2vec2_bn/

In [None]:
# Convert the ARPA file inside the saved folder into 6gram.bin with KenLM's build_binary tool.
!kenlm/build/bin/build_binary wav2vec2_bn/language_model/6gram_correct.arpa wav2vec2_bn/language_model/6gram.bin

In [None]:
# Inspect the folder again; both the ARPA and the binary file should now be listed.
!tree -h wav2vec2_bn/

In [None]:
# Delete the ARPA file from the saved folder, leaving 6gram.bin, then list the folder once more.
!rm wav2vec2_bn/language_model/6gram_correct.arpa && tree -h wav2vec2_bn/

In [None]:
# Summarize the disk usage of everything in the working directory.
!du -hs *

In [None]:
# Rename the folder to its final name. After this cell "wav2vec2_bn" no longer exists,
# so the earlier cells that load from it cannot be re-run without restoring it.
!mv wav2vec2_bn wav2vec2_bn_xlsr_300m_38k_6gram_arpa

In [None]:
# Show the final contents of the renamed folder.
!tree wav2vec2_bn_xlsr_300m_38k_6gram_arpa

In [None]:
# Remove the KenLM source/build tree and the intermediate corpus and ARPA files from the working directory.
!rm -rf kenlm
!rm -rf 6gram.arpa
!rm -rf text.txt
!rm -rf 6gram_correct.arpa

In [None]:
# Usage note kept as a bare string (it is evaluated and displayed, not executed).
# NOTE(review): the repository id in the example is not the one named in this notebook's
# final cell — substitute the intended model id when copying it.
'''
how to use it
from transformers import Wav2Vec2ProcessorWithLM

processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")

'''

In [None]:
%%writefile ./wav2vec2_bn_xlsr_300m_38k_6gram_arpa/tokenizer_config.json
{"unk_token": "[UNK]", "bos_token": null, "eos_token": null, "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "replace_word_delimiter_char": " ", "config": null, "tokenizer_type": "wav2vec2", "processor_class": "Wav2Vec2ProcessorWithLM", "special_tokens_map_file": "wav2vec2_bn/special_tokens_map.json", "name_or_path": "wav2vec2_bn", "tokenizer_class": "Wav2Vec2CTCTokenizer"}

In [None]:
%%writefile ./wav2vec2_bn_xlsr_300m_38k_6gram_arpa/special_tokens_map.json

{"bos_token": null, "eos_token": null, "unk_token": "[UNK]", "pad_token": "[PAD]"}

In [None]:
%%writefile ./wav2vec2_bn_xlsr_300m_38k_6gram_arpa/added_tokens.json
{}

In [None]:
%%writefile ./wav2vec2_bn_xlsr_300m_38k_6gram_arpa/alphabet.json
{"labels": ["\u0999", "\u09a6", "\u0990", "\u09b9", "\u09cc", "\u0994", "\u0986", "\u09aa", "\u0998", "\u098a", "\u09ad", "\u09dd", "\u09ac", "\u09ae", "\u09a8", "\u09b8", "\u0989", "\u09a3", "\u09c7", "\u09c3", "\u0983", "\u099c", "\u09bf", "\u0982", "\u098b", "\u09c1", "\u0995", "\u09b7", "\u09a0", "\u098f", "\u0987", "\u09b2", "\u09b6", "\u09ce", "\u09a2", "\u0988", "\u0993", "\u099d", "\u099e", "\u09cd", "\u09df", "\u09a7", "\u09a1", "\u099f", "\u099b", "\u09af", "\u0997", "\u09be", "\u09c8", "\u09c0", "\u0981", "\u09dc", "\u09cb", "\u09a4", "\u09a5", "\u0985", " ", "\u099a", "\u09c2", "\u09b0", "\u0996", "\u09ab", "\u2047", ""], "is_bpe": false}

In [None]:
# This model has been uploaded to the Hugging Face Hub:
#https://huggingface.co/mushrafi88/wav2vec2_xlsr_300m_bn_6gram_arpa