In [48]:
#! pip install https://github.com/kpu/kenlm/archive/master.zip
! pip install pandas

Collecting pandas
  Using cached pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.4.0-cp312-cp312-macosx_14_0_arm64.whl.metadata (6.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl (10.7 MB)
Downloading numpy-2.4.0-cp312-cp312-macosx_14_0_arm64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m30.0 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hUsing cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.3-py2.py3-none-any.whl (348 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [pandas]2m3/4[0m [pandas]
[1A[2KSuccessfully installed 

In [49]:
import json
import kenlm
import subprocess
import os
import os.path
from pathlib import Path
from collections import defaultdict
import urllib.request
import gzip
import bz2
import math
import pandas as pd

In [3]:
DATA_DIR = Path("data")
MODELS_DIR = Path("models")
DATA_DIR.mkdir(exist_ok=True)
MODELS_DIR.mkdir(exist_ok=True)

In [4]:
jsonl_file = "ua_asr_hypotheses_500.jsonl"

data = []
with open(jsonl_file, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

print(f"Loaded {len(data)} utterances")
print(f"\nFirst example:")
print(f"Reference: {data[0]['reference']}")
print(f"Candidates ({len(data[0]['hypotheses'])}):")
for i, hyp in enumerate(data[0]['hypotheses'][:3], 1):
    print(f"  {i}. {hyp}")

Loaded 500 utterances

First example:
Reference: Вайлдер — Ортіс: відео нокауту
Candidates (8):
  1. Вайлдер — Ортіс: відео нокауту
  2. Вайлдер — Ортіз: видео нокауту
  3. Вайлдер — Ортіс: видео нокоуту


In [None]:
CORPORA = {
    'social': {
        'url': 'https://lang.org.ua/static/downloads/ubertext2.0/social/sentenced/ubertext.social.filter_rus_gcld+short.text_only.txt.bz2',
        'size': '87 MB'
    },
    'fiction': {
        'url': 'https://lang.org.ua/static/downloads/ubertext2.0/fiction/sentenced/ubertext.fiction.filter_rus_gcld+short.text_only.txt.bz2',
        'size': '398 MB'
    },
    'news': {
        'url': 'https://lang.org.ua/static/downloads/ubertext2.0/news/sentenced/ubertext.news.filter_rus_gcld+short.text_only.txt.bz2',
        'size': '3.4 GB'
    }
}

def download_and_preprocess_corpus(name, url):
    bz2_file = DATA_DIR / f"{name}.txt.bz2"
    raw_file = DATA_DIR / f"{name}_raw.txt"
    preprocessed_file = DATA_DIR / f"{name}_preprocessed.txt"
    
    if not preprocessed_file.exists():
        if not raw_file.exists():
            urllib.request.urlretrieve(url, bz2_file)
            
            with bz2.open(bz2_file, 'rt', encoding='utf-8') as f_in:
                with open(raw_file, 'w', encoding='utf-8') as f_out:
                    f_out.write(f_in.read())

        with open(raw_file, 'r', encoding='utf-8') as f_in:
            with open(preprocessed_file, 'w', encoding='utf-8') as f_out:
                for i, line in enumerate(f_in):
                    cleaned = line.lower().strip()
                    if cleaned:
                        f_out.write(cleaned + '\n')
        
        print(f"Preprocessed corpus saved")
    else:
        print(f"Preprocessed corpus already exists")
    
    with open(preprocessed_file, 'r', encoding='utf-8') as f:
        lines = sum(1 for _ in f)
    print(f"Corpus size: {lines:,} lines")
    
    return preprocessed_file

corpus_files = {}

# skip social
corpus_files['social'] = DATA_DIR / "corpus_preprocessed.txt"

corpus_files['fiction'] = download_and_preprocess_corpus('fiction', CORPORA['fiction']['url'])
corpus_files['news'] = download_and_preprocess_corpus('news', CORPORA['news']['url'])

In [24]:
lmplz_path = f"/Users/mykhailopavliuk/nlp-improving-speech-recognition/nlp-improving-speech-recognition/kenlm/build/bin/lmplz"
build_binary_path = f"/Users/mykhailopavliuk/nlp-improving-speech-recognition/nlp-improving-speech-recognition/kenlm/build/bin/build_binary"

In [39]:
def train_kenlm(input_file, output_file, ngram_order):
    
    cmd = [
        lmplz_path,
        "-o", str(ngram_order),
        "--text", str(input_file),
        "--arpa", str(output_file),
        "--discount_fallback"
    ]
    
    subprocess.run(cmd, check=True, capture_output=True, text=True)
    
    # convert to binary
    binary_file = output_file.with_suffix('.bin')
    cmd = [build_binary_path, str(output_file), str(binary_file)]
    result = subprocess.run(cmd, capture_output=True, text=True)
    
    if result.returncode == 0:
        output_file.unlink()
        return binary_file
    else:
        print(f"Binary conversion failed, keeping ARPA")
        return output_file

# using 2-gram with context size 1 as 1-gram
models = {}

arpa_file = MODELS_DIR / f"ukrainian_1gram.arpa"
models[1] = train_kenlm(preprocessed_file, arpa_file, 2)

# actual 2-gram and 3-gram
for n in [2, 3]:
    arpa_file = MODELS_DIR / f"ukrainian_{n}gram.arpa"
    models[n] = train_kenlm(preprocessed_file, arpa_file, n)

for n, path in models.items():
    print(f"  {n}-gram: {path.name}")

  1-gram: ukrainian_1gram.bin
  2-gram: ukrainian_2gram.bin
  3-gram: ukrainian_3gram.bin


In [43]:
def load_model(model_path):
    return kenlm.Model(str(model_path))

def sentence_logprob(model, sent):
    return model.score(sent.lower(), bos=True, eos=True)

def sentence_perplexity(model, sent, order=None):
    words = sent.lower().split()
    if len(words) == 0:
        return float('inf')
    
    # for 1-gram, score words independently
    if order == 1:
        logprob = sum(model.score(word, bos=False, eos=False) for word in words)
    else:
        logprob = model.score(sent.lower(), bos=True, eos=True)
    
    perplexity = 10 ** (-logprob / len(words))
    return perplexity

loaded_models = {}
for n, path in models.items():
    loaded_models[n] = load_model(path)
    print(f"Loaded {n}-gram model")

Loaded 1-gram model
Loaded 2-gram model
Loaded 3-gram model


In [44]:
def rerank_hypotheses(hypotheses, model, order=None):
    scored = []
    for hyp in hypotheses:
        ppl = sentence_perplexity(model, hyp, order=order)
        scored.append((hyp, ppl))
    
    # sort by perplexity (ascending)
    scored.sort(key=lambda x: x[1])
    return scored

example = data[0]
print(f"Reference: {example['reference']}\n")

for n in [1, 2, 3]:
    print(f"{n}-gram model:")
    ranked = rerank_hypotheses(example['hypotheses'], loaded_models[n], order=n)
    
    print(f"Best candidate:  {ranked[0][0]}")
    print(f"Perplexity: {ranked[0][1]:.2f}")
    
    print(f"Worst candidate: {ranked[-1][0]}")
    print(f"Perplexity: {ranked[-1][1]:.2f}")
    print()

Reference: Вайлдер — Ортіс: відео нокауту

1-gram model:
Best candidate:  Вайлдер — Ортіс: відео нокауту
Perplexity: 199754.37
Worst candidate: Войлдер — Ортіз: вітео нокауту
Perplexity: 1347445.37

2-gram model:
Best candidate:  Вайлдер — Ортіс: відео нокауту
Perplexity: 448904.31
Worst candidate: Войлдер — Ортіз: вітео нокауту
Perplexity: 6729788.32

3-gram model:
Best candidate:  Вайлдер — Ортіс: відео нокауту
Perplexity: 404277.82
Worst candidate: Войлдер — Ортіз: вітео нокауту
Perplexity: 6281724.98



In [45]:
def evaluate_model(data, model, order=None):
    correct = 0
    total = len(data)
    
    results = []
    
    for item in data:
        reference = item['reference']
        hypotheses = item['hypotheses']

        ranked = rerank_hypotheses(hypotheses, model, order=order)
        best_candidate = ranked[0][0]
        
        is_correct = (best_candidate == reference)
        if is_correct:
            correct += 1
        
        results.append({
            'utt_id': item['utt_id'],
            'reference': reference,
            'predicted': best_candidate,
            'correct': is_correct,
            'perplexity': ranked[0][1]
        })
    
    accuracy = correct / total
    return accuracy, results


all_results = {}
for n in [1, 2, 3]:
    print(f"Evaluating {n}-gram model")
    accuracy, results = evaluate_model(data, loaded_models[n], order=n)
    all_results[n] = {'accuracy': accuracy, 'results': results}
    print(f"Accuracy: {accuracy:.2%} ({int(accuracy * len(data))}/{len(data)})\n")

print("\nSummary:")
for n in [1, 2, 3]:
    acc = all_results[n]['accuracy']
    print(f"{n}-gram: {acc:.2%}")

Evaluating 1-gram model
Accuracy: 60.80% (304/500)

Evaluating 2-gram model
Accuracy: 73.60% (368/500)

Evaluating 3-gram model
Accuracy: 76.00% (380/500)


Summary:
1-gram: 60.80%
2-gram: 73.60%
3-gram: 76.00%


In [50]:
all_corpus_results = {
    'social': all_results
}

for corpus_name in ['fiction', 'news']:
    print(f"Training on {corpus_name} corpus")

    models = {}
    for n in [1, 2, 3]:
        arpa_file = MODELS_DIR / f"{corpus_name}_{n}gram.arpa"
        
        if n == 1:
            model_file = train_kenlm(corpus_files[corpus_name], arpa_file, 2)
        else:
            model_file = train_kenlm(corpus_files[corpus_name], arpa_file, n)
        
        models[n] = model_file
    
    loaded_models = {}
    for n, path in models.items():
        loaded_models[n] = kenlm.Model(str(path))
        print(f"Loaded {n}-gram model")
    
    print(f"\nEvaluating on {corpus_name}")
    corpus_results = {}
    for n in [1, 2, 3]:
        accuracy, results = evaluate_model(data, loaded_models[n], order=n)
        corpus_results[n] = {'accuracy': accuracy, 'results': results}
        print(f"  {n}-gram: {accuracy:.2%}")
    
    all_corpus_results[corpus_name] = corpus_results


Training on fiction corpus
Loaded 1-gram model
Loaded 2-gram model
Loaded 3-gram model

Evaluating on fiction
  1-gram: 55.40%
  2-gram: 61.20%
  3-gram: 63.40%
Training on news corpus
Loaded 1-gram model
Loaded 2-gram model
Loaded 3-gram model

Evaluating on news
  1-gram: 60.00%
  2-gram: 79.60%
  3-gram: 80.60%


In [51]:
results_table = []
for corpus in ['social', 'fiction', 'news']:
    for n in [1, 2, 3]:
        acc = all_corpus_results[corpus][n]['accuracy']
        results_table.append({
            'Corpus': corpus.capitalize(),
            'N-gram': f"{n}-gram",
            'Accuracy': f"{acc:.2%}",
            'Correct': f"{int(acc * len(data))}/{len(data)}"
        })

df = pd.DataFrame(results_table)
print(df.to_string(index=False))

 Corpus N-gram Accuracy Correct
 Social 1-gram   60.80% 304/500
 Social 2-gram   73.60% 368/500
 Social 3-gram   76.00% 380/500
Fiction 1-gram   55.40% 277/500
Fiction 2-gram   61.20% 306/500
Fiction 3-gram   63.40% 317/500
   News 1-gram   60.00% 300/500
   News 2-gram   79.60% 398/500
   News 3-gram   80.60% 403/500
