In [6]:
import pandas as pd
import math
import spacy

from IPython.display import display
from nltk import ngrams
from collections import Counter
from sklearn.model_selection import train_test_split

In [3]:
# import data from data/preprocessed/
with open("../data/preprocessed/PROCESSED_slk_newscrawl_2016_1M-sentences.txt", "r", encoding="utf-8") as f:
    slk_df = pd.DataFrame(
        [line.strip() for line in f.readlines()], columns=["sentence"]
    )
with open("../data/preprocessed/PROCESSED_tur_news_2024_1M-sentences.txt", "r", encoding="utf-8") as f:
    tur_df = pd.DataFrame(
        [line.strip() for line in f.readlines()], columns=["sentence"]
    )

# sanity check English wikipedia data
with open("../data/preprocessed/PROCESSED_eng-simple_wikipedia_2021_300K-sentences.txt", "r", encoding="utf-8") as f:
    eng_wiki_df = pd.DataFrame(
        [line.strip() for line in f.readlines()], columns=["sentence"]
    )

In [9]:
nlp = spacy.load("sk_dep_web_md")

doc = nlp("Toto je testovací slovenský text.")

sentence = ' '.join([token.lemma_ for token in doc])
print(sentence)


toto byť testovací slovenský text .


  with torch.cuda.amp.autocast(self._mixed_precision):


In [3]:
def preprocess_with_unk(corpus, min_freq=2):
    """
    Replace hapax legomena with <unk> token.
    Uses training corpus only for freq counting.
    """
    # Count frequencies on the full training data
    counts = Counter()
    for sentence in corpus['sentence']:
        counts.update(sentence.split())

    # Build vocabulary
    vocab = {w for w, c in counts.items() if c >= min_freq}
    vocab.add("<unk>")

    # Replace rare words with <unk>
    new_sentences = []
    for sentence in corpus['sentence']:
        tokens = [
            w if w in vocab else "<unk>"
            for w in sentence.split()
        ]
        new_sentences.append(" ".join(tokens))

    return {"sentence": new_sentences}, vocab

In [4]:
# test train split
slk_train_df, slk_test_df = train_test_split(slk_df, test_size=0.1, random_state=42)
tur_train_df, tur_test_df = train_test_split(tur_df, test_size=0.1, random_state=42)

eng_train_df, eng_test_df = train_test_split(eng_wiki_df, test_size=0.1, random_state=42)

In [5]:
def train_trigram_model(corpus):
    """
    Train a trigram model with counts for:
      - trigram counts
      - bigram counts (for conditioning)
    """
    trigram_counts = Counter()
    bigram_counts = Counter()

    for sentence in corpus['sentence']:
        tokens = sentence.split()
        for i in range(len(tokens) - 2):
            w1, w2, w3 = tokens[i], tokens[i+1], tokens[i+2]
            trigram_counts[(w1, w2, w3)] += 1
            bigram_counts[(w1, w2)] += 1

    return trigram_counts, bigram_counts

In [6]:
def calculate_perplexity(trigram_counts, bigram_counts, corpus, vocab):
    """
    Trigram perplexity with add-one smoothing:
    P(w3 | w1, w2) = (count(w1,w2,w3) + 1) / (count(w1,w2) + V)
    """
    V = len(vocab)

    log_prob_sum = 0.0
    N = 0

    for sentence in corpus['sentence']:
        tokens = sentence.split()
        for i in range(len(tokens) - 2):
            w1, w2, w3 = tokens[i], tokens[i+1], tokens[i+2]

            count_tri = trigram_counts.get((w1, w2, w3), 0)
            count_bi = bigram_counts.get((w1, w2), 0)

            prob = (count_tri + 1) / (count_bi + V)

            log_prob_sum += math.log(prob)
            N += 1

    return math.exp(-log_prob_sum / N)

In [7]:
# preprocess train with <unk>
slk_trained_processed, slk_vocab = preprocess_with_unk(slk_train_df, min_freq=2)
tur_trained_processed, tur_vocab = preprocess_with_unk(tur_train_df, min_freq=2)

eng_trained_processed, eng_vocab = preprocess_with_unk(eng_train_df, min_freq=2)

In [8]:
# preprocess test set to replace OOV with <unk>
def preprocess_test_set(test_df, vocab):
    test_df_processed, _ = preprocess_with_unk(test_df)
    test_df_processed["sentence"] = [
        " ".join([(w if w in vocab else "<unk>")
                for w in sent.split()])
        for sent in test_df_processed["sentence"]
    ]
    return test_df_processed

In [9]:
# Train, evaluate, and print perplexities
for lang, train_processed, test_processed, vocab in [
    ("Slovak", slk_trained_processed, slk_test_df, slk_vocab),
    ("Turkish", tur_trained_processed, tur_test_df, tur_vocab),
    ("English", eng_trained_processed, eng_test_df, eng_vocab),
]:
    trigram_counts, bigram_counts = train_trigram_model(train_processed)
    test_processed = preprocess_test_set(test_processed, vocab)
    perplexity = calculate_perplexity(trigram_counts, bigram_counts, test_processed, vocab)
    print(f"{lang} Trigram Model Perplexity: {perplexity:.2f}")

Slovak Trigram Model Perplexity: 117241.81
Turkish Trigram Model Perplexity: 119828.92
English Trigram Model Perplexity: 20566.69


In [10]:
# print common trigrams
for lang, train_processed in [
    ("Slovak", slk_trained_processed),
    ("Turkish", tur_trained_processed),
    ("English", eng_trained_processed),
]:
    trigram_counts, _ = train_trigram_model(train_processed)
    common_trigrams = trigram_counts.most_common(5)
    print(f"{lang} Most Common Trigrams:")
    for trigram, count in common_trigrams:
        print(f"  {' '.join(trigram)}: {count}")

Slovak Most Common Trigrams:
  prečítajte si exkluzívne: 12673
  si exkluzívne prognózy: 12673
  exkluzívne prognózy trendu: 12673
  do diskusie sa: 9027
  príspevku do diskusie: 9025
Turkish Most Common Trigrams:
  sosyal medya hesabından: 2498
  bir kez daha: 2315
  Cumhurbaşkanı Recep Tayyip: 2229
  Recep Tayyip Erdoğan: 2157
  Büyükşehir Belediye Başkanı: 1594
English Most Common Trigrams:
  one of the: 2787
  part of the: 1655
  the united states: 1555
  it is the: 1167
  <unk> <unk> <unk>: 1153
