In [1]:
! pip install https://github.com/kpu/kenlm/archive/master.zip

Collecting https://github.com/kpu/kenlm/archive/master.zip
  Downloading https://github.com/kpu/kenlm/archive/master.zip
[2K     [32m\[0m [32m553.6 kB[0m [31m4.3 MB/s[0m [33m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: kenlm
  Building wheel for kenlm (pyproject.toml) ... [?25ldone
[?25h  Created wheel for kenlm: filename=kenlm-0.2.0-cp312-cp312-macosx_14_0_arm64.whl size=498409 sha256=2a10934b044cf31341c94fb4e25b6a2de3c3ca54d9870f175b175422348bf45d
  Stored in directory: /private/var/folders/5c/m8ryn9cx68304n_x62p_9xzh0000gn/T/pip-ephem-wheel-cache-q5h9i99x/wheels/92/c8/12/56d187154e078f0eaa74d059017fc1afe1c4d91fbce02ce8d9
Successfully built kenlm
Installing collected packages: kenlm
Successfully installed kenlm-0.2.0


In [12]:
! brew install kenlm

[34m==>[0m [1mAuto-updating Homebrew...[0m
Adjust how often this is run with `$HOMEBREW_AUTO_UPDATE_SECS` or disable with
`$HOMEBREW_NO_AUTO_UPDATE=1`. Hide these hints with `$HOMEBREW_NO_ENV_HINTS=1` (see `man brew`).
[34m==>[0m [1mDownloading https://ghcr.io/v2/homebrew/core/portable-ruby/blobs/sha256:1c98fa49eacc935640a6f8e10a2bf33f14cfc276804b71ddb658ea45ba99d167[0m
######################################################################### 100.0%
[34m==>[0m [1mPouring portable-ruby-3.4.8.arm64_big_sur.bottle.tar.gz[0m
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 4 taps (hashicorp/tap, txn2/tap, homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
adplay: Command-line player for OPL2 music
astra: Command-Line Interface for DataStax Astra
bookokrat: Terminal EPUB Book Reader
bumpp: Interactive CLI that bumps your version numbers and more
calm-cli: CLI allows you to interact with the Common Architecture Language Model (CALM)
carl: Calendar for the comm

In [18]:
import json
import kenlm
import subprocess
import os
import os.path
from pathlib import Path
from collections import defaultdict
import urllib.request
import gzip
import bz2
import math

In [3]:
DATA_DIR = Path("data")
MODELS_DIR = Path("models")
DATA_DIR.mkdir(exist_ok=True)
MODELS_DIR.mkdir(exist_ok=True)

In [4]:
jsonl_file = "ua_asr_hypotheses_500.jsonl"

data = []
with open(jsonl_file, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

print(f"Loaded {len(data)} utterances")
print(f"\nFirst example:")
print(f"Reference: {data[0]['reference']}")
print(f"Candidates ({len(data[0]['hypotheses'])}):")
for i, hyp in enumerate(data[0]['hypotheses'][:3], 1):
    print(f"  {i}. {hyp}")

Loaded 500 utterances

First example:
Reference: Вайлдер — Ортіс: відео нокауту
Candidates (8):
  1. Вайлдер — Ортіс: відео нокауту
  2. Вайлдер — Ортіз: видео нокауту
  3. Вайлдер — Ортіс: видео нокоуту


In [9]:
# using social corpus for the first attempt
corpus_url = "https://lang.org.ua/static/downloads/ubertext2.0/social/sentenced/ubertext.social.filter_rus_gcld+short.text_only.txt.bz2"
corpus_file = DATA_DIR / "social.txt.bz2"
corpus_txt = DATA_DIR / "corpus.txt"

if not corpus_txt.exists():
    urllib.request.urlretrieve(corpus_url, corpus_file)

    with bz2.open(corpus_file, 'rt', encoding='utf-8') as f_in:
        with open(corpus_txt, 'w', encoding='utf-8') as f_out:
            f_out.write(f_in.read())
    
    print(f"Corpus saved to {corpus_txt}")
else:
    print(f"Corpus already exists at {corpus_txt}")

with open(corpus_txt, 'r', encoding='utf-8') as f:
    lines = sum(1 for _ in f)
    
print(f"Corpus has {lines:,} lines")

Corpus saved to data/corpus.txt
Corpus has 4,494,263 lines


In [10]:
def preprocess_text(text):
    return text.lower().strip()

preprocessed_file = DATA_DIR / "corpus_preprocessed.txt"

with open(corpus_txt, 'r', encoding='utf-8') as f_in:
    with open(preprocessed_file, 'w', encoding='utf-8') as f_out:
        for i, line in enumerate(f_in):
            cleaned = preprocess_text(line)
            if cleaned:  # skip empty lines
                f_out.write(cleaned + '\n')
            
            if (i + 1) % 100000 == 0:
                print(f"  Processed {i+1:,} lines...")


  Processed 100,000 lines...
  Processed 200,000 lines...
  Processed 300,000 lines...
  Processed 400,000 lines...
  Processed 500,000 lines...
  Processed 600,000 lines...
  Processed 700,000 lines...
  Processed 800,000 lines...
  Processed 900,000 lines...
  Processed 1,000,000 lines...
  Processed 1,100,000 lines...
  Processed 1,200,000 lines...
  Processed 1,300,000 lines...
  Processed 1,400,000 lines...
  Processed 1,500,000 lines...
  Processed 1,600,000 lines...
  Processed 1,700,000 lines...
  Processed 1,800,000 lines...
  Processed 1,900,000 lines...
  Processed 2,000,000 lines...
  Processed 2,100,000 lines...
  Processed 2,200,000 lines...
  Processed 2,300,000 lines...
  Processed 2,400,000 lines...
  Processed 2,500,000 lines...
  Processed 2,600,000 lines...
  Processed 2,700,000 lines...
  Processed 2,800,000 lines...
  Processed 2,900,000 lines...
  Processed 3,000,000 lines...
  Processed 3,100,000 lines...
  Processed 3,200,000 lines...
  Processed 3,300,000 line

In [24]:
lmplz_path = f"/Users/mykhailopavliuk/nlp-improving-speech-recognition/nlp-improving-speech-recognition/kenlm/build/bin/lmplz"
build_binary_path = f"/Users/mykhailopavliuk/nlp-improving-speech-recognition/nlp-improving-speech-recognition/kenlm/build/bin/build_binary"

In [33]:
def train_kenlm(input_file, output_file, ngram_order):
    cmd = [
        lmplz_path,
        "-o", str(ngram_order),
        "--text", str(input_file),
        "--arpa", str(output_file),
        "--discount_fallback"
    ]
    
    subprocess.run(cmd, check=True, capture_output=True, text=True)
    print(f"{ngram_order}-gram ARPA model created")
    
    # binary for faster loading
    binary_file = output_file.with_suffix('.bin')
    cmd = [build_binary_path, str(output_file), str(binary_file)]
    result = subprocess.run(cmd, capture_output=True, text=True)
    
    if result.returncode != 0:
        print(f"Binary conversion failed: {result.stderr}")
        return output_file
    else:
        print(f"Binary model saved to {binary_file}")
        return binary_file

models = {}
for n in [2, 3]:
    arpa_file = MODELS_DIR / f"ukrainian_{n}gram.arpa"
    model_file = train_kenlm(preprocessed_file, arpa_file, n)
    models[n] = model_file

for n, path in models.items():
    print(f"  {n}-gram: {path}")

2-gram ARPA model created
Binary model saved to models/ukrainian_2gram.bin
3-gram ARPA model created
Binary model saved to models/ukrainian_3gram.bin
  2-gram: models/ukrainian_2gram.bin
  3-gram: models/ukrainian_3gram.bin


In [34]:
def load_model(model_path):
    return kenlm.Model(str(model_path))

def sentence_logprob(model, sent):
    return model.score(sent.lower(), bos=True, eos=True)

def sentence_perplexity(model, sent):
    words = sent.lower().split()
    if len(words) == 0:
        return float('inf')
    
    logprob = sentence_logprob(model, sent)
    perplexity = 10 ** (-logprob / len(words))
    return perplexity

# 2-gram and 3-gram only
loaded_models = {}
for n, path in models.items():
    loaded_models[n] = load_model(path)
    print(f"Loaded {n}-gram model")

Loaded 2-gram model
Loaded 3-gram model


In [37]:
def rerank_hypotheses(hypotheses, model):
    scored = []
    for hyp in hypotheses:
        ppl = sentence_perplexity(model, hyp)
        scored.append((hyp, ppl))
    
    # sort by perplexity (ascending)
    scored.sort(key=lambda x: x[1])
    return scored

example = data[0]
print(f"Reference: {example['reference']}\n")

for n in [2, 3]:
    print(f"{n}-gram model:")
    ranked = rerank_hypotheses(example['hypotheses'], loaded_models[n])
    
    print(f"Best candidate:  {ranked[0][0]}")
    print(f"Perplexity: {ranked[0][1]:.2f}")
    
    print(f"Worst candidate: {ranked[-1][0]}")
    print(f"Perplexity: {ranked[-1][1]:.2f}")
    print()

Reference: Вайлдер — Ортіс: відео нокауту

2-gram model:
Best candidate:  Вайлдер — Ортіс: відео нокауту
Perplexity: 448904.31
Worst candidate: Войлдер — Ортіз: вітео нокауту
Perplexity: 6729788.32

3-gram model:
Best candidate:  Вайлдер — Ортіс: відео нокауту
Perplexity: 404277.82
Worst candidate: Войлдер — Ортіз: вітео нокауту
Perplexity: 6281724.98



In [38]:
# Check which format we're using
loaded_models = {}
for n, path in models.items():
    loaded_models[n] = load_model(path)
    file_type = "Binary" if str(path).endswith('.bin') else "ARPA"
    print(f"✓ Loaded {n}-gram model ({file_type}: {path.name})")

✓ Loaded 2-gram model (Binary: ukrainian_2gram.bin)
✓ Loaded 3-gram model (Binary: ukrainian_3gram.bin)
