In [2]:
# Download two Project Gutenberg plain-text e-books (IDs 34225 and 37536) and
# store them under the local file names that the `filenames` mapping refers to
# as the "cs" and "en" corpora.
!wget -O "guthenberg-cs.txt" "https://www.gutenberg.org/cache/epub/34225/pg34225.txt"
!wget -O "guthenberg-en.txt" "https://www.gutenberg.org/cache/epub/37536/pg37536.txt"

--2026-01-03 14:48:32--  https://www.gutenberg.org/cache/epub/34225/pg34225.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 769766 (752K) [text/plain]
Saving to: ‘guthenberg-cs.txt’


2026-01-03 14:48:33 (1,18 MB/s) - ‘guthenberg-cs.txt’ saved [769766/769766]

--2026-01-03 14:48:34--  https://www.gutenberg.org/cache/epub/37536/pg37536.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 724943 (708K) [text/plain]
Saving to: ‘guthenberg-en.txt’


2026-01-03 14:48:35 (1,12 MB/s) - ‘guthenberg-en.txt’ saved [724943/724943]



In [12]:
%pip install sacremoses

from sacremoses import MosesTokenizer, MosesDetokenizer
import os
from collections import Counter, defaultdict
import numpy as np

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Language code -> local path of the corpus file written by the wget cell.
filenames = {"cs": "guthenberg-cs.txt","en": "guthenberg-en.txt"}


In [5]:
# Sanity check: print the on-disk size of every corpus file.
for lang_code, path in filenames.items():
    print(f"{lang_code}: {os.path.getsize(path)} bytes")

cs: 769766 bytes
en: 724943 bytes


In [6]:
# Raw text of each corpus, keyed by language code.
corpus_full = {}

for lang, path in filenames.items():
    # Decode explicitly as UTF-8: without `encoding=` Python falls back to the
    # platform locale, which mis-decodes (or fails on) the Czech text on
    # systems whose default encoding is not UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        corpus_full[lang] = f.read()

# Token list per language, produced by the Moses tokenizer for that language.
data = {lang: MosesTokenizer(lang).tokenize(text) for lang, text in corpus_full.items()}

In [8]:
# Boundary markers padded around the token sequence when counting n-grams.
start_special_symbol, end_special_symbol = "<s>", "</s>"


def get_unigrams(tokens):
    """Count every token of the list, plus one start and one end marker."""
    return Counter(tokens + [start_special_symbol, end_special_symbol])


def get_bigrams(tokens):
    """Count adjacent token pairs of the list, padded with one boundary marker per side."""
    left = [start_special_symbol] + tokens
    right = tokens + [end_special_symbol]
    return Counter(zip(left, right))


def get_trigrams(tokens):
    """Count adjacent token triples of the list, padded with two boundary markers per side."""
    first = [start_special_symbol] * 2 + tokens
    second = [start_special_symbol] + tokens + [end_special_symbol]
    third = tokens + [end_special_symbol] * 2
    return Counter(zip(first, second, third))

In [9]:
# Report the full token counts, then keep only a prefix of each corpus
# (30,000 English tokens, 15,000 Czech tokens) and report vocabulary sizes.
# Note: this overwrites `data` in place, so re-running the cell reports the
# already truncated lengths.
en_tokens, cs_tokens = data["en"], data["cs"]
print("Length of english tokenized data:", len(en_tokens))
print("Length of czech tokenized data:", len(cs_tokens))
data["en"] = en_tokens[:30_000]
data["cs"] = cs_tokens[:15_000]
print("Unique tokens in english 30k dataset:", len(set(data["en"])))
print("Unique tokens in czech 15k dataset:", len(set(data["cs"])))

Length of english tokenized data: 149191
Length of czech tokenized data: 136966
Unique tokens in english 30k dataset: 4426
Unique tokens in czech 15k dataset: 4950


In [10]:
# Unigram and bigram counts per language (both include the boundary markers).
uni = {lang: get_unigrams(tokens) for lang, tokens in data.items()}
bi = {lang: get_bigrams(tokens) for lang, tokens in data.items()}

# How many distinct tokens reach the frequency threshold used for each language.
en_frequent = sum(1 for freq in uni["en"].values() if freq >= 50)
cs_frequent = sum(1 for freq in uni["cs"].values() if freq >= 20)
print("English tokens with at least 50 occurrences:", en_frequent)
print("Czech tokens with at least 20 occurrences:", cs_frequent)

English tokens with at least 50 occurrences: 73
Czech tokens with at least 20 occurrences: 77


In [None]:
def get_class_bigrams(word2class, bigrams):
    """
    Aggregate word-bigram counts into class-bigram counts.

    Parameters
    ----------
    word2class : dict mapping word -> class id.
    bigrams : mapping (left_word, right_word) -> count.

    Returns
    -------
    defaultdict(int) mapping (left_class, right_class) -> summed count.

    Bigrams in which either word has no class are skipped, matching
    get_class_bigrams_matrix (previously they raised KeyError).
    """
    class_bigrams = defaultdict(int)
    for (w1, w2), count in bigrams.items():
        if w1 not in word2class or w2 not in word2class:
            continue
        class_bigrams[(word2class[w1], word2class[w2])] += count
    return class_bigrams

def get_class_bigrams_matrix(word2class, bigrams):
    """
    Build the square class-bigram count matrix.

    Parameters
    ----------
    word2class : dict mapping word -> integer class id (ids index the matrix).
    bigrams : mapping (left_word, right_word) -> count.

    Returns
    -------
    np.ndarray of ints where entry [i, j] is the total count of bigrams whose
    left word is in class i and whose right word is in class j.

    The matrix has one row/column per class id (0 .. max id), not per word, so
    it stays correct when several words share a class. For the usual
    one-class-per-word initialisation the shape is unchanged. Bigrams that
    contain a word without a class are ignored.
    """
    # Size by class ids rather than len(word2class): the two only coincide
    # when every word has its own class.
    n_classes = max(word2class.values(), default=-1) + 1
    class_bigrams_matrix = np.zeros((n_classes, n_classes), dtype=int)
    for (l, r), count in bigrams.items():
        if l not in word2class or r not in word2class:
            continue
        class_bigrams_matrix[word2class[l], word2class[r]] += count
    return class_bigrams_matrix

def q(classes_matrix,l,r,N): # q_k(l,r)
    """
    Contribution of the class pair (l, r) to the mutual information.

    Uses the pair count, the row sum of l and the column sum of r from
    `classes_matrix`, normalised by N; returns 0.0 when any of the three
    counts is zero.
    """
    pair_count = classes_matrix[l, r]
    left_total = classes_matrix[l, :].sum()
    right_total = classes_matrix[:, r].sum()
    if pair_count != 0 and left_total != 0 and right_total != 0:
        return pair_count / N * np.log(N * pair_count / (left_total * right_total))
    return 0.0

def merge_matrix(m, a, b):
    """
    Return a new class bigram matrix where classes a and b are merged.

    The merged class keeps the smaller of the two indices; the row and column
    of the larger index are removed, so every class with a higher index shifts
    down by one. This is the same relabelling apply_merge_word2class performs.
    The input matrix is not modified and the total count is preserved.

    Parameters
    ----------
    m : square array of class-bigram counts.
    a, b : indices of the two distinct classes to merge (any order).

    Returns
    -------
    np.ndarray with one row and one column fewer than `m`.

    Raises
    ------
    ValueError if a == b.
    """
    if a == b:
        raise ValueError("cannot merge a class with itself")
    if a > b:
        a, b = b, a

    merged = np.array(m, copy=True)

    # Fold b into a. Adding the row first and the column second makes the
    # diagonal cell end up as m[a,a] + m[a,b] + m[b,a] + m[b,b].
    merged[a, :] += merged[b, :]
    merged[:, a] += merged[:, b]

    # Drop the now-redundant class b.
    merged = np.delete(merged, b, axis=0)
    merged = np.delete(merged, b, axis=1)

    return merged


def apply_merge_word2class(word2class, a, b):
    """
    Merge classes a and b in a word -> class mapping.

    The merged class keeps the smaller of the two ids; ids above the larger
    one are shifted down by one so the ids stay contiguous. The arguments may
    be given in either order (previously a > b only permuted the labels
    without merging anything).

    Parameters
    ----------
    word2class : dict mapping word -> integer class id.
    a, b : ids of the two classes to merge.

    Returns
    -------
    A new dict; the input mapping is left untouched.
    """
    # Normalise so that `b` is the id that disappears, as merge_matrix does.
    if a > b:
        a, b = b, a

    new_word2class = {}
    for w, c in word2class.items():
        if c == b:
            new_word2class[w] = a
        elif c > b:
            new_word2class[w] = c - 1
        else:
            new_word2class[w] = c
    return new_word2class

def mutual_information(classes_matrix, N):
    """
    Mutual information of adjacent classes for a class-bigram count matrix.

    Equivalent to summing q(classes_matrix, i, j, N) over every cell, but the
    row and column totals are computed once instead of once per cell, so one
    evaluation costs O(K^2) instead of O(K^3) for a K x K matrix.

    Parameters
    ----------
    classes_matrix : square array of class-bigram counts.
    N : normalising total count.

    Returns
    -------
    float; 0.0 when no cell contributes.
    """
    counts = np.asarray(classes_matrix)
    row_totals = counts.sum(axis=1)
    col_totals = counts.sum(axis=0)
    marginal_products = row_totals[:, np.newaxis] * col_totals[np.newaxis, :]

    # Same zero handling as q: a cell contributes only when its own count and
    # both of its marginal totals are non-zero.
    contributing = (counts != 0) & (marginal_products != 0)
    if not contributing.any():
        return 0.0

    c = counts[contributing]
    return float(np.sum(c / N * np.log(N * c / marginal_products[contributing])))


def word_classes(initial_words, unigrams, bigrams, target_number=15):
    """
    Greedily merge word classes until `target_number` classes remain.

    Every word of `initial_words` starts in its own class. In each round all
    class pairs are tried and the pair whose merge loses the least mutual
    information is merged. Progress is printed for every round.

    Parameters
    ----------
    initial_words : iterable of words to cluster (a dict's keys are used).
    unigrams : mapping word -> count; its total is used as N.
    bigrams : mapping (left_word, right_word) -> count.
    target_number : number of classes to stop at. Values below 1 are treated
        as 1, because a single class cannot be merged any further.

    Returns
    -------
    (class_bigrams_matrix, word2class) for the final classes.
    """
    N = sum(unigrams.values()) # total number of tokens
    # NOTE(review): N counts every unigram, while the matrix only holds
    # bigrams whose two words are both in `initial_words` - confirm that this
    # normalisation is intended.
    word2class = {w: i for i, w in enumerate(initial_words)} # r function word2class[word] = class_id
    class_bigrams_matrix = get_class_bigrams_matrix(word2class, bigrams) # i,j position -> #bigrams s.t. class i-> class j

    mi = mutual_information(class_bigrams_matrix, N)

    # A merge needs two classes; without this floor a target below 1 ended in
    # a TypeError when unpacking best_pair = None.
    min_classes = max(target_number, 1)

    while class_bigrams_matrix.shape[0] > min_classes:
        print(f"Current number of classes: {class_bigrams_matrix.shape[0]}, MI={mi:.6f}")

        best_loss = float("inf")
        best_pair = None
        best_matrix = None

        K = class_bigrams_matrix.shape[0]

        for a in range(K):
            for b in range(a + 1, K):
                # test merge
                merged_matrix = merge_matrix(class_bigrams_matrix, a, b)
                merged_mi = mutual_information(merged_matrix, N)
                loss = mi - merged_mi

                if loss < best_loss:
                    best_loss = loss
                    best_pair = (a, b)
                    best_matrix = merged_matrix

        # apply best merge
        a, b = best_pair
        class_bigrams_matrix = best_matrix
        word2class = apply_merge_word2class(word2class, a, b)
        mi -= best_loss

        print(
            f"Merged classes {a} and {b}, "
            f"loss={best_loss:.6f}, "
            f"remaining={class_bigrams_matrix.shape[0]}"
        )


    return class_bigrams_matrix, word2class



# Cluster the English words that occur at least 50 times into 15 classes.
m, w2c = word_classes(
    {word: freq for word, freq in get_unigrams(data["en"]).items() if freq >= 50},
    get_unigrams(data["en"]),
    get_bigrams(data["en"]),
    target_number=15,
)

# mi = 0
# N = sum(get_unigrams(data["en"]).values())
# for i in range(m.shape[0]):
#     for j in range(m.shape[1]):
#         mi += q(m, i, j, N)
# print(mi)





Current number of classes: 73, MI=0.719245


In [17]:
# Number of distinct class ids currently assigned in w2c.
len(set(w2c.values()))

73

In [14]:
# Number of words that have a class assignment in w2c.
len(w2c)

73

In [38]:
# List the words assigned to class id 181.
# NOTE(review): 181 is a hard-coded id, and the recorded output is a Czech
# word although the visible word_classes call builds w2c from data["en"] -
# presumably left over from a different run; confirm on a fresh kernel.
[a for a,b in w2c.items() if b == 181]


['že']

In [34]:
# Find the (row, column) positions of m whose value equals 144.
# NOTE(review): 144 is a hard-coded value, and the recorded column index is
# larger than the 73 classes reported by the other cells - presumably output
# from a different run; confirm on a fresh kernel.
np.where(m == 144)

(array([38]), array([181]))

In [28]:
# !wget "https://ufallab.ms.mff.cuni.cz/~helcl/npfl147/cc.en.50.bin"
# !wget "https://ufallab.ms.mff.cuni.cz/~helcl/npfl147/cc.cs.50.bin"
# import fasttext
# ft_en = fasttext.load_model('cc.en.50.bin')