## build KenLM

In [None]:
!sudo apt install \
    build-essential cmake libboost-system-dev libboost-thread-dev \
    libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev \
    libbz2-dev liblzma-dev

In [None]:
!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz

In [None]:
!mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2

## imports

In [None]:
import os
import re
import shutil

import pandas as pd

In [None]:
from google.colab import drive
# Directory inside the runtime filesystem where Google Drive gets mounted
gdrive_mount_dp = '/content/gdrive/'
drive.mount(gdrive_mount_dp)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
# Working directories (relative to the current working directory):
#   out_dp        - language-model files
#   local_data_dp - local copy of the input data
out_dp, local_data_dp = 'lm', 'data'

# Create both up front; exist_ok makes the cell safe to re-run
for work_dp in (out_dp, local_data_dp):
    os.makedirs(work_dp, exist_ok=True)

## build LM from Common Voice 8 (Belarusian)

In [None]:
# Ask for the experiment root directory on the mounted Google Drive.
# The prompt text tells the user what is expected; stripping guards against
# stray whitespace/newlines picked up when the path is pasted.
gdrive_exp_root_dp = input('Experiment root directory on Google Drive: ').strip()

In [None]:
# Copy the sentence list from Google Drive into the local data directory,
# then list that directory to confirm the file arrived.
# (The os.path.join call was missing its closing parenthesis, which made the
# whole cell a SyntaxError.)
data_gdrive_fp = os.path.join(gdrive_exp_root_dp, 'data', 'sentences_for_lm.tsv')
shutil.copy2(data_gdrive_fp, local_data_dp)
os.listdir(local_data_dp)

['sentences_for_lm.tsv']

In [None]:
# Load the tab-separated sentence list and report its (rows, columns) shape
sentences_fp = os.path.join(local_data_dp, 'sentences_for_lm.tsv')
df = pd.read_csv(sentences_fp, sep='\t')
print(df.shape)

(314676, 2)


In [None]:
# Preview the first two rows
df.head(2)

Unnamed: 0,sentence,split
0,"Вы ўжо ўсе, відавочна, адчулі, што нешта не так?",train
1,"Зойдзе да яе, а яна не ведае, з чаго пачынаць ...",train


In [None]:
# Number of sentences per value of the `split` column
df['split'].value_counts()

train    314305
other       371
Name: split, dtype: int64

In [None]:
# Sanity check: no sentence may appear more than once in the corpus
assert not df.duplicated('sentence').any()

## kenlm input format:
* 1 sentence per line

In [None]:
import nltk

In [None]:
# Fetch the Punkt tokenizer data needed by the NLTK tokenization demo below
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Glue the first two sentences together into one small multi-sentence sample
first_two = [df['sentence'].iloc[0], df['sentence'].iloc[1]]
line = ' '.join(first_two)
line

'Вы ўжо ўсе, відавочна, адчулі, што нешта не так? Зойдзе да яе, а яна не ведае, з чаго пачынаць гаворку.'

In [None]:
# Split the sample into sentences, tokenize each into words, and show the
# result as one space-separated sentence per line.
res = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(line)]
for tokens in res:
    print(' '.join(tokens))

Вы ўжо ўсе , відавочна , адчулі , што нешта не так ?
Зойдзе да яе , а яна не ведае , з чаго пачынаць гаворку .


## preprocess texts

In [None]:
# Translation table for str.translate (code point -> replacement character):
# Latin 'n' becomes Cyrillic 'н', and 'ґ' becomes 'г'.
char_map = str.maketrans({'n': 'н', 'ґ': 'г'})
char_map

{110: 'н', 1169: 'г'}

In [None]:
def preprocess_text(text, char_map):
    """Normalize a sentence for language-model training.

    Steps, in order: lowercase the text; drop every character that is not a
    word character, whitespace or an apostrophe; collapse whitespace runs into
    a single space and trim both ends; apply the character translation table.

    Args:
        text: Raw sentence.
        char_map: Translation table accepted by ``str.translate``
            (e.g. the result of ``str.maketrans``).

    Returns:
        The normalized sentence, without leading or trailing whitespace.
    """
    text = text.lower()
    # remove non-word chars (apostrophes are kept)
    text = re.sub(r"[^\w\s']", '', text)
    # sub multiple sequential space chars with single space; strip because
    # punctuation removed at either end (e.g. a leading dash) leaves a
    # dangling space behind
    text = re.sub(r"\s+", ' ', text).strip()
    return text.translate(char_map)

In [None]:
# Store the normalized form of every sentence next to the raw text
df['sentence_p'] = df['sentence'].apply(preprocess_text, args=(char_map,))

In [None]:
# Spot-check the normalization on a few random rows (processed vs. raw).
# A fixed seed makes the drawn rows, and therefore this cell's output,
# reproducible across re-runs.
s = df.sample(5, random_state=0)
print(s['sentence_p'].tolist())
print(s['sentence'].tolist())

['суседнія аўто з беларускай рэгістрацыяй засталіся цэлымі', 'у гэтым вялікая небяспека', 'аднак па словах эксперта нельга сцвярджаць што інфіцыраваных стала нашмат больш', 'а гэта вялікая перавага', 'чытаю на беларускай і расейскай мове']
['Суседнія аўто з беларускай рэгістрацыяй засталіся цэлымі.', 'У гэтым вялікая небяспека.', 'Аднак, па словах эксперта, нельга сцвярджаць, што інфіцыраваных стала нашмат больш.', 'А гэта вялікая перавага.', 'Чытаю на беларускай і расейскай мове.']


In [None]:
# Dump the normalized sentences, one per line, into the LM output directory.
# The file has to live under out_dp: the lmplz cell reads it from
# "lm/sentences_for_lm_processed_1PerLine.txt".
text_dump_fp = os.path.join(out_dp, 'sentences_for_lm_processed_1PerLine.txt')
print(f'text_dump_fp: {text_dump_fp}')

# Explicit UTF-8 so the Cyrillic text is written the same way regardless of
# the platform's default encoding.
with open(text_dump_fp, 'w', encoding='utf-8') as fout:
    joined_text = '\n'.join(df['sentence_p'])
    fout.write(joined_text)
    # Terminate the last line too, so the file is a well-formed
    # "one sentence per line" text file and `wc -l` matches the row count.
    fout.write('\n')

text_dump_fp: lm/sentences_for_lm_processed_1PerLine.txt


In [None]:
!wc $text_dump_fp

  314675  2385268 28378057 lm/sentences_for_lm_processed_1PerLine.txt


In [None]:
!du -sh $out_dp/*

28M	lm/sentences_for_lm_processed_1PerLine.txt


## build LM

In [None]:
# Estimate a 5-gram model (-o 5) from the one-sentence-per-line text dump
# (fed on stdin) and write it in ARPA text format (captured from stdout).
# NOTE(review): --discount_fallback presumably lets lmplz continue with default
# discounts when discount estimation fails - confirm against the KenLM docs.
!kenlm/build/bin/lmplz -o 5 --discount_fallback < "lm/sentences_for_lm_processed_1PerLine.txt" > "lm/cv8be_5gram.arpa"

=== 1/5 Counting and sorting n-grams ===
Reading /content/lm/sentences_for_lm_processed_1PerLine.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
tcmalloc: large alloc 1918697472 bytes == 0x55f2e110a000 @  0x7fe9516651e7 0x55f2dfbbd7e2 0x55f2dfb584fe 0x55f2dfb372eb 0x55f2dfb23066 0x7fe94f7fec87 0x55f2dfb24baa
tcmalloc: large alloc 8953896960 bytes == 0x55f3536da000 @  0x7fe9516651e7 0x55f2dfbbd7e2 0x55f2dfbac80a 0x55f2dfbad248 0x55f2dfb37308 0x55f2dfb23066 0x7fe94f7fec87 0x55f2dfb24baa
****************************************************************************************************
Unigram tokens 2385268 types 144389
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:1732668 2:1063026240 3:1993174400 4:3189078784 5:4650740224
tcmalloc: large alloc 4650745856 bytes == 0x55f2e110a000 @  0x7fe9516651e7 0x55f2dfbbd7e2 0x55f2dfbac80a 0x55f2dfbad248 0x55f2dfb378d7 0x55f2dfb23066 0x7fe94f7fec87 0x55f2dfb24baa
tcmalloc:

In [None]:
# File sizes in lm/ after the ARPA model has been written
!du -sh lm/*

403M	lm/cv8be_5gram.arpa
28M	lm/sentences_for_lm_processed_1PerLine.txt


In [None]:
# Line / word / byte counts of the ARPA file
!wc lm/cv8be_5gram.arpa

  6801446  36230104 422360493 lm/cv8be_5gram.arpa


In [None]:
# Peek at the ARPA header (n-gram counts per order) and the first unigram entries
!head -20 lm/cv8be_5gram.arpa

\data\
ngram 1=144389
ngram 2=1141265
ngram 3=1862784
ngram 4=1931256
ngram 5=1721734

\1-grams:
-6.058665	<unk>	0
0	<s>	-1.0664405
-1.2717469	</s>	0
-3.3857362	вы	-0.36858183
-2.9503407	ўжо	-0.35378957
-3.331054	ўсе	-0.27460152
-3.998521	відавочна	-0.24098675
-4.704899	адчулі	-0.257407
-2.484223	што	-0.57535625
-3.2316258	нешта	-0.3649858
-2.1907144	не	-0.51273227
-2.8073287	так	-0.4867947


## convert .arpa to .bin

In [None]:
# Convert the ARPA text model (first argument) into a KenLM binary file (second argument)
!kenlm/build/bin/build_binary "lm/cv8be_5gram.arpa" "lm/cv8be_5gram.bin"

Reading lm/cv8be_5gram.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
SUCCESS


In [None]:
# Compare the on-disk sizes of the .arpa and .bin models
!du -sh lm/*

403M	lm/cv8be_5gram.arpa
149M	lm/cv8be_5gram.bin
28M	lm/sentences_for_lm_processed_1PerLine.txt


## copy to gdrive

In [None]:
# Destination directory for the model under the experiment root on Google Drive;
# created if missing, left untouched if it already exists
lm_gdrive_dp = os.path.join(gdrive_exp_root_dp, 'lm')
os.makedirs(lm_gdrive_dp, exist_ok=True)

In [None]:
# Copy only the binary model to Google Drive; the trailing `;` suppresses the
# returned destination path in the cell output
shutil.copy2('lm/cv8be_5gram.bin', lm_gdrive_dp);

In [None]:
# Confirm the model file is present in the Drive directory
os.listdir(lm_gdrive_dp)

['cv8be_5gram.bin']