In [74]:
# Fetch two Project Gutenberg plain-text e-books (IDs 34225 and 37536) into the
# working directory. The local names carry the language codes "cs" and "en" and
# must match the paths listed in the `filenames` dict used by the later cells.
# -O pins the output file name, so a re-run overwrites the previous download.
# NOTE(review): "guthenberg" is a misspelling of "gutenberg"; renaming the files
# requires changing the `filenames` dict at the same time.
!wget -O "guthenberg-cs.txt" "https://www.gutenberg.org/cache/epub/34225/pg34225.txt"
!wget -O "guthenberg-en.txt" "https://www.gutenberg.org/cache/epub/37536/pg37536.txt"

--2026-01-04 21:45:43--  https://www.gutenberg.org/cache/epub/34225/pg34225.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 769766 (752K) [text/plain]
Saving to: ‘guthenberg-cs.txt’


2026-01-04 21:45:44 (938 KB/s) - ‘guthenberg-cs.txt’ saved [769766/769766]

--2026-01-04 21:45:44--  https://www.gutenberg.org/cache/epub/37536/pg37536.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 724943 (708K) [text/plain]
Saving to: ‘guthenberg-en.txt’


2026-01-04 21:45:45 (1,15 MB/s) - ‘guthenberg-en.txt’ saved [724943/724943]



In [75]:
%pip install sacremoses

from sacremoses import MosesTokenizer, MosesDetokenizer
import os
from collections import Counter, defaultdict
import numpy as np

Note: you may need to restart the kernel to use updated packages.


In [76]:
# Language code -> path of the downloaded plain-text corpus for that language.
filenames = dict(
    cs="guthenberg-cs.txt",
    en="guthenberg-en.txt",
)


In [77]:
# Sanity check: print the on-disk size of every corpus file.
for name, path in filenames.items():
    size_in_bytes = os.path.getsize(path)
    print(f"{name}: {size_in_bytes} bytes")

cs: 769766 bytes
en: 724943 bytes


In [78]:
# Read each corpus fully into memory, keyed by language code.
corpus_full = {}

for lang in filenames:
    # Decode explicitly as UTF-8: without `encoding=`, open() falls back to the
    # platform locale, which can mis-decode the Czech diacritics and the
    # typographic quotes in these files (or raise UnicodeDecodeError).
    with open(filenames[lang], "r", encoding="utf-8") as f:
        corpus_full[lang] = f.read()

# Tokenize each corpus with a Moses tokenizer configured for its language;
# data[lang] is the resulting token sequence for the whole file.
# NOTE(review): the '&quot;' token in the Czech class listing suggests the
# tokenizer is HTML-escaping punctuation - confirm whether that is wanted.
data = {lang: MosesTokenizer(lang).tokenize(corpus_full[lang]) for lang in corpus_full}

In [79]:
# Sentinel tokens used to pad the token stream at its beginning and end.
start_special_symbol, end_special_symbol = "<s>", "</s>"


def get_unigrams(tokens):
    """Count each token in the list `tokens`, plus one start and one end marker."""
    with_markers = tokens + [start_special_symbol, end_special_symbol]
    return Counter(with_markers)


def get_bigrams(tokens):
    """Count adjacent pairs of the list `tokens` padded with one marker per side."""
    padded = [start_special_symbol] + tokens + [end_special_symbol]
    return Counter(zip(padded, padded[1:]))


def get_trigrams(tokens):
    """Count adjacent triples of the list `tokens` padded with two markers per side."""
    padded = 2 * [start_special_symbol] + tokens + 2 * [end_special_symbol]
    return Counter(zip(padded, padded[1:], padded[2:]))

In [80]:
# Report the size of each tokenized corpus, then keep only its leading tokens
# (30k English, 15k Czech). `data` is overwritten in place, so re-running this
# cell reports the already-truncated lengths.
for lang, label, limit in (
    ("en", "Length of english tokenized data:", 30_000),
    ("cs", "Length of czech tokenized data:", 15_000),
):
    print(label, len(data[lang]))
    data[lang] = data[lang][:limit]

# Vocabulary size of each truncated corpus.
for lang, label in (
    ("en", "Unique tokens in english 30k dataset:"),
    ("cs", "Unique tokens in czech 15k dataset:"),
):
    print(label, len(set(data[lang])))

Length of english tokenized data: 149191
Length of czech tokenized data: 136966
Unique tokens in english 30k dataset: 4426
Unique tokens in czech 15k dataset: 4950


In [123]:
# Unigram and bigram counts per language over the truncated corpora.
uni = {lang: get_unigrams(tokens) for lang, tokens in data.items()}
bi = {lang: get_bigrams(tokens) for lang, tokens in data.items()}

# Words frequent enough to take part in the clustering:
# at least 50 occurrences for English, at least 20 for Czech.
starting_words = {
    lang: {w: c for w, c in uni[lang].items() if c >= threshold}
    for lang, threshold in (("en", 50), ("cs", 20))
}
print("English tokens with at least 50 occurrences:", len(starting_words["en"]))
print("Czech tokens with at least 20 occurrences:", len(starting_words["cs"]))

English tokens with at least 50 occurrences: 73
Czech tokens with at least 20 occurrences: 77


In [None]:
def get_class_bigrams(word2class, bigrams):
    """
    Aggregate word-level bigram counts into class-level bigram counts.

    Parameters
    ----------
    word2class : mapping word -> class id
    bigrams : mapping (w1, w2) -> count

    Returns
    -------
    defaultdict(int) mapping (class of w1, class of w2) -> summed count.

    Bigrams in which either word has no class are skipped, matching
    get_class_bigrams_matrix(); previously such a bigram raised KeyError.
    """
    class_bigrams = defaultdict(int)
    for (w1, w2), count in bigrams.items():
        # Only bigrams fully inside the class vocabulary contribute.
        if w1 not in word2class or w2 not in word2class:
            continue
        class_bigrams[(word2class[w1], word2class[w2])] += count
    return class_bigrams

def get_class_bigrams_matrix(word2class, bigrams):
    """
    Build a square integer matrix of class-to-class bigram counts.

    The matrix has one row and one column per entry of `word2class`;
    cell [i, j] accumulates the counts of all bigrams whose left word is in
    class i and whose right word is in class j. Bigrams containing a word
    that has no class are ignored.
    """
    size = len(word2class)
    counts = np.zeros((size, size), dtype=int)
    for pair, freq in bigrams.items():
        left, right = pair
        if left in word2class and right in word2class:
            counts[word2class[left], word2class[right]] += freq
    return counts

def q(classes_matrix, l, r, N):
    """
    Contribution q_k(l, r) of the class pair (l, r) to the mutual information.

    Uses the cell count, the sum of row `l` and the sum of column `r` of
    `classes_matrix`; returns 0.0 when any of the three is zero, otherwise
    (count / N) * ln(N * count / (row_sum * column_sum)).
    """
    pair_count = classes_matrix[l, r]
    if pair_count == 0:
        return 0.0

    row_total = classes_matrix[l, :].sum()
    col_total = classes_matrix[:, r].sum()
    if row_total == 0 or col_total == 0:
        return 0.0

    weight = pair_count / N
    log_ratio = np.log(N * pair_count / (row_total * col_total))
    return weight * log_ratio

# ok
def mutual_information(classes_matrix, N):
    """Sum q() over every cell of `classes_matrix`, visiting cells row by row."""
    n_rows, n_cols = classes_matrix.shape[0], classes_matrix.shape[1]
    total = 0
    # Explicit accumulation keeps the floating-point summation order fixed.
    for row, col in np.ndindex(n_rows, n_cols):
        total += q(classes_matrix, row, col, N)
    return total

def test_merge(c, a, b, N):
    """
    Evaluate merging classes `a` and `b` without modifying `c`.

    Parameters
    ----------
    c : square matrix, c[i, j] = number of bigrams class i -> class j
    a, b : indices of the two distinct classes to merge (either order)
    N : normaliser forwarded to q()

    Returns
    -------
    (new_matrix, value)
        new_matrix : copy of `c` in which the higher-indexed class has been
            folded into the lower-indexed one and its row/column removed, so
            the merged class sits at index min(a, b)
        value : sum of q() over the merged row/column minus the sum of q()
            over the rows/columns of the two original classes; cells outside
            those rows/columns are not re-evaluated

    Raises
    ------
    ValueError if a == b.
    """
    if a == b:
        raise ValueError("cannot merge a class with itself")
    # Keep the lower index as the survivor: after row/column `b` is deleted,
    # index `a` only still points at the merged class when a < b.
    if a > b:
        a, b = b, a

    # Remove the contribution of every cell in rows a, b and columns a, b.
    value = 0.0
    for i in range(c.shape[0]):
        value -= q(c, i, a, N)
        value -= q(c, i, b, N)
        value -= q(c, a, i, N)
        value -= q(c, b, i, N)

    # The four intersection cells were subtracted twice above; add them back once.
    value += q(c, a, a, N)
    value += q(c, b, b, N)
    value += q(c, a, b, N)
    value += q(c, b, a, N)

    # Fold column b into column a, then row b into row a, then drop b.
    new_matrix = np.copy(c)
    new_matrix[:,a] += new_matrix[:,b]
    new_matrix[a,:] += new_matrix[b,:]
    new_matrix = np.delete(new_matrix, b, axis=0)
    new_matrix = np.delete(new_matrix, b, axis=1)

    # Add the contribution of the merged row and column.
    for i in range(new_matrix.shape[0]):
        value += q(new_matrix, a, i, N)
        value += q(new_matrix, i, a, N)

    # The diagonal cell (a, a) was added twice by the loop above.
    value -= q(new_matrix, a, a, N)

    return new_matrix, value



def word_classes(initial_words, unigrams, bigrams, target_number=15):
    """
    Greedily merge word classes until only `target_number` of them remain.

    Every word of `initial_words` starts in its own class (ids follow the
    dict's iteration order). Each round evaluates every pair of classes with
    test_merge() and applies the merge with the largest returned value.
    Progress is printed after every merge.

    Parameters
    ----------
    initial_words : dict whose keys are the words to cluster (values unused)
    unigrams : unused; kept so existing callers keep working
    bigrams : mapping (w1, w2) -> count
    target_number : stop once this many classes are left

    Returns
    -------
    dict word -> class id; ids stay contiguous from 0 after every merge.
    """
    # NOTE(review): N counts every bigram, while the class matrix only holds
    # bigrams whose two words are both in `initial_words` - confirm intended.
    N = sum(bigrams.values())
    word2class = {w: i for i, (w, _count) in enumerate(initial_words.items())}
    # class_matrix[i, j] = number of bigrams class i -> class j
    class_matrix = get_class_bigrams_matrix(word2class, bigrams)

    mi = mutual_information(class_matrix, N)
    print(f"Initial mutual information is : {mi} N : {N}")

    while class_matrix.shape[0] > target_number:

        best_diff, best_pair, best_matrix = -float("inf"), None, None

        K = class_matrix.shape[0]  # current number of classes
        mi = mutual_information(class_matrix, N)

        # Try every unordered pair (a < b) and remember the best merge.
        for a in range(K):
            for b in range(a + 1, K):
                merged_matrix, diff = test_merge(class_matrix, a, b, N)

                if diff > best_diff:
                    best_diff = diff
                    best_pair = (a, b)
                    best_matrix = merged_matrix

        if best_pair is None:
            # No pair could be evaluated (e.g. a single class left while
            # target_number < 1); stop instead of failing on the unpacking.
            break

        a, b = best_pair
        classA = [word for word, cls in word2class.items() if cls == a]
        classB = [word for word, cls in word2class.items() if cls == b]

        # Relabel: class b joins a, and every id above b shifts down by one so
        # the ids match the rows/columns of the reduced matrix.
        for w, cls in word2class.items():
            if cls == b:
                word2class[w] = a
            elif cls > b:
                word2class[w] = cls - 1

        class_matrix = best_matrix
        mi += best_diff

        print(
            f"Number of classes {class_matrix.shape[0]}, mi={mi:.6f}"
        )
        print(f"Merged classes :\nClass1 : {classA} \nClass2 : {classB}")
        print(" ")


    return word2class


# Cluster the frequent words of each language down to 15 classes.
# Frequency cut-offs: at least 50 occurrences (English), at least 20 (Czech).
w2c = {}
for lang, min_count in (("en", 50), ("cs", 20)):
    lang_unigrams = get_unigrams(data[lang])
    frequent_words = {w: n for w, n in lang_unigrams.items() if n >= min_count}
    w2c[lang] = word_classes(
        frequent_words,
        lang_unigrams,
        get_bigrams(data[lang]),
        target_number=15,
    )


Initial mutual information is : 0.7192609886464977 N : 30001
Number of classes 72, mi=0.718735
Merged classes :
Class1 : ['a'] 
Class2 : ['an']
 
Number of classes 71, mi=0.718206
Merged classes :
Class1 : ['this'] 
Class2 : ['my']
 
Number of classes 70, mi=0.717622
Merged classes :
Class1 : ['convicts'] 
Class2 : ['prisoners']
 
Number of classes 69, mi=0.716940
Merged classes :
Class1 : ['He'] 
Class2 : ['It']
 
Number of classes 68, mi=0.716157
Merged classes :
Class1 : ['are'] 
Class2 : ['were']
 
Number of classes 67, mi=0.715365
Merged classes :
Class1 : ['little'] 
Class2 : ['day']
 
Number of classes 66, mi=0.714573
Merged classes :
Class1 : ['this', 'my'] 
Class2 : ['their']
 
Number of classes 65, mi=0.713756
Merged classes :
Class1 : ['would'] 
Class2 : ['could']
 
Number of classes 64, mi=0.712858
Merged classes :
Class1 : ['has'] 
Class2 : ['had']
 
Number of classes 63, mi=0.711953
Merged classes :
Class1 : ['himself'] 
Class2 : ['been']
 
Number of classes 62, mi=0.7110

In [None]:
def get_classes(dict_lang, n_classes=15):
    """
    Invert a word -> class id mapping into a list of word lists.

    Parameters
    ----------
    dict_lang : mapping word -> class id
    n_classes : number of class ids (0 .. n_classes - 1) to collect; the
        default of 15 matches the previously hard-coded value. Pass the
        actual class count when clustering with a different target, otherwise
        higher ids are silently left out.

    Returns
    -------
    list of length n_classes; element i lists the words of class i in the
    iteration order of `dict_lang` (empty list if the class has no words).
    """
    return [
        [w for w, c in dict_lang.items() if c == class_id]
        for class_id in range(n_classes)
    ]
# Per language: list of word classes, each class being a list of its words.
classes = {lang: get_classes(mapping) for lang, mapping in w2c.items()}

In [None]:
# Header line followed by one Czech word class per line.
print("Czech classes:", *classes["cs"], sep="\n")

['z', 'do', 'u']
['.']
[',']
['to', 'své', 'člověk', 'svého', 'jeho', '—', 'mne', 'tom', 'mně', 'trestnici', 'práci']
[':', 'na', 'k', 'v', 'po', 'pro', 'o', 'za', 'ze', 'při']
['by', 'jsem']
['*', ')']
['V', 'A', 'Ale']
['a', 'jako', 'ale']
['se', 'je', 'si', 'mu', 'trestanci', 'ho']
['s', 've', 'tam', 'jsou', ';', 'skoro', 'všichni', 'jen', 'i', 'tak', 'byl', 'trestanec', 'měl', 'ani', 'ještě', 'ne', 'od', 'bylo', 'už', 'proto', 'byla']
['než', 'že', 'aby', 'když', 'co', 'jak']
['nich', 'práce', 'nás', 'trestnice']
['?', '„', '!']
['&quot;']


In [121]:
# Header line followed by one English word class per line.
print("English classes:", *classes["en"], sep="\n")

English classes:
['The', 'He', 'It', 'I', 'They']
['of', 'for', 'in', 'at', 'with', 'from', 'by', 'on']
['the', 'no', 'this', 'a', 'their', 'his', 'an', 'my', 'some']
['is', 'are', 'will', 'was', 'were', 'would', 'has', 'had', 'could']
['and', 'or', 'but']
['.', '’', '?']
['it', 'you', 'he', 'who', 'they', 'there']
[',', ';']
['not', 'so', 'very', 'himself', 'been', 'up', 'all']
['have', 'be']
['to']
['“', '”']
['that', 'as', 'which', 'when']
['them', 'him', 'me']
['one', 'time', 'little', 'man', 'day', 'convict', 'prison', 'convicts', 'prisoners']


In [1]:
!wget "https://ufallab.ms.mff.cuni.cz/~helcl/npfl147/cc.en.50.bin"
!wget "https://ufallab.ms.mff.cuni.cz/~helcl/npfl147/cc.cs.50.bin"
!pip install fasttext
import fasttext
ft_en = fasttext.load_model('cc.en.50.bin')

--2026-01-04 22:49:11--  https://ufallab.ms.mff.cuni.cz/~helcl/npfl147/cc.en.50.bin
Resolving ufallab.ms.mff.cuni.cz (ufallab.ms.mff.cuni.cz)... 195.113.18.181
Connecting to ufallab.ms.mff.cuni.cz (ufallab.ms.mff.cuni.cz)|195.113.18.181|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1237176312 (1,2G) [application/octet-stream]
Saving to: ‘cc.en.50.bin.1’


2026-01-04 22:50:59 (10,9 MB/s) - ‘cc.en.50.bin.1’ saved [1237176312/1237176312]

--2026-01-04 22:50:59--  https://ufallab.ms.mff.cuni.cz/~helcl/npfl147/cc.cs.50.bin
Resolving ufallab.ms.mff.cuni.cz (ufallab.ms.mff.cuni.cz)... 195.113.18.181
Connecting to ufallab.ms.mff.cuni.cz (ufallab.ms.mff.cuni.cz)|195.113.18.181|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1239419534 (1,2G) [application/octet-stream]
Saving to: ‘cc.cs.50.bin.1’


2026-01-04 22:52:41 (11,6 MB/s) - ‘cc.cs.50.bin.1’ saved [1239419534/1239419534]

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)


In [6]:
# Smoke test of the loaded English model: look up the vector for "king".
# The recorded output is a 50-element float32 array.
ft_en["king"]

array([-0.01845653, -0.10532278,  0.3126318 , -0.04975438,  0.06101633,
        0.00917064,  0.21169627,  0.04756726, -0.3229656 , -0.04121956,
        0.06523221,  0.05770125, -0.0333125 , -0.12769252,  0.0818017 ,
        0.22410385, -0.08463045, -0.15078136,  0.07903202, -0.09024395,
        0.13243525, -0.24416366, -0.14773396, -0.01687124, -0.30026418,
        0.05043682, -0.05043079, -0.13711669,  0.04021271,  0.17762068,
       -0.05184786, -0.10037543, -0.07551235,  0.13325769,  0.0417949 ,
        0.02461179,  0.17158055,  0.102674  ,  0.02734189,  0.04009185,
        0.01790874, -0.00459616,  0.11330321,  0.01320077, -0.01237988,
        0.09954365, -0.08919679,  0.03712962,  0.08594048, -0.07755216],
      dtype=float32)