In [1]:
import numpy as np
import pandas as pd
from tabulate import tabulate

from datasets import load_dataset
from huggingface_hub import list_datasets
from sacremoses import MosesTokenizer, MosesDetokenizer

from collections import defaultdict, Counter
from itertools import pairwise

dataset_name = "ufal/npfl147"

ModuleNotFoundError: No module named 'sacremoses'

In [None]:
# dataset_cs = load_dataset(dataset_name, "cs")
# dataset_en = load_dataset(dataset_name, "en")
# dataset_sk = load_dataset(dataset_name, "sk")

dataset = {
    "cs": load_dataset(dataset_name, "cs"),
    "en": load_dataset(dataset_name, "en"),
    "sk": load_dataset(dataset_name, "sk"),
}

moses_tokenizer = {
    "cs": MosesTokenizer(lang="cs"),
    "en": MosesTokenizer(lang="en"),
    "sk": MosesTokenizer(lang="sk"),
}


In [None]:
corpus = {
    lang: "".join(dataset[lang]["train"]["text"]) for lang in dataset.keys()
}



In [None]:
tokenized = {
    lang: moses_tokenizer[lang].tokenize(corpus[lang]) for lang in dataset.keys()
}


In [None]:
# English stats: Data size
# How many tokens of English did you count after applying MosesTokenizer?

print(len(tokenized["en"])) # 955309

955309


In [None]:
# English stats: Unigrams
# How many unique unigrams of English did you count after applying MosesTokenizer?

print(len(set(tokenized["en"]))) # 71970

71970


In [None]:
start_special_symbol = "<s>"
end_special_symbol = "</s>"

get_unigrams = lambda tokens : Counter(tokens)
get_bigrams = lambda tokens : Counter(zip([start_special_symbol] + tokens, tokens + [end_special_symbol]))
get_trigrams = lambda tokens : Counter(zip(2 * [start_special_symbol] + tokens, tokens + 2 * [end_special_symbol]))


In [None]:
# English stats: Bigrams
# How many unique bigrams of English did you count after applying MosesTokenizer?

print(len(get_bigrams(tokenized["en"]))) # 376077


376077


In [None]:
def entropy_bigrams(bi, uni):
    N_bi, N_uni = sum(bi.values()), sum(uni.values())
    uni[start_special_symbol] = uni[start_special_symbol] if start_special_symbol in uni else 1
    uni[end_special_symbol] = uni[end_special_symbol] if end_special_symbol in uni else 1
    H = 0.0
    for (x, y), count in bi.items():
        p_xy = count / N_bi
        p_y = uni[y] / N_uni
        H += - p_xy * np.log2(p_xy / p_y)
    return H



In [None]:
# English stats: Entropy
# What is the conditional entropy of the English dataset tokenized with MosesTokenizer?

print("English entropy: ", entropy_bigrams(get_bigrams(tokenized["en"]), get_unigrams(tokenized["en"])))

# Czech stats: Entropy
# What is the conditional entropy of the Czech dataset tokenized with MosesTokenizer?

print("Czech entropy: ", entropy_bigrams(get_bigrams(tokenized["cs"]), get_unigrams(tokenized["cs"])))

# Entropy of the other language
# What is the conditional entropy of the dataset of the language you chose tokenized with MosesTokenizer? (If you chose more, pick one)

print("Slovak entropy: ", entropy_bigrams(get_bigrams(tokenized["sk"]), get_unigrams(tokenized["sk"])))

English entropy:  5.67234587839017
Czech entropy:  5.064022671529996
Slovak entropy:  4.52334189845233


In [None]:
def detailed_stats_language(tokens, language_name="unknown", tokenizer="unknown"):
    unigrams = get_unigrams(tokens)
    bigrams = get_bigrams(tokens)
    trigrams = get_trigrams(tokens)

    stats = pd.DataFrame({
        "tokenizer": [tokenizer],
        "language": [language_name],
        "data_size": [len(tokens)],
        "nr_unigrams": [len(unigrams)],
        "nr_bigrams": [len(bigrams)],
        "entropy_bigrams": [entropy_bigrams(bigrams, unigrams)],
    }
    )
    return stats

all_stats = pd.concat([detailed_stats_language(tokens, language, tokenizer="MosesTokenizer") for language, tokens in tokenized.items()])
print(tabulate(all_stats, headers='keys')) # is there a way to drop the id column, that is implicit from pandas?

    tokenizer       language      data_size    nr_unigrams    nr_bigrams    entropy_bigrams
--  --------------  ----------  -----------  -------------  ------------  -----------------
 0  MosesTokenizer  cs               596400          91024        337350            5.06402
 0  MosesTokenizer  en               955309          71970        376077            5.67235
 0  MosesTokenizer  sk               409435          65541        211080            4.52334


In [None]:
# now i need to use xml-r tokenizer
# from transformers import AutoModelForMaskedLM, AutoTokenizer, BitsAndBytesConfig

# xlm = AutoTokenizer.from_pretrained("xlm-roberta-base")

from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained(
    "FacebookAI/xlm-roberta-base"
)
model = AutoModelForMaskedLM.from_pretrained(
    "FacebookAI/xlm-roberta-base",
    dtype=torch.float16,
    device_map="auto",
    attn_implementation="sdpa"
)


ModuleNotFoundError: No module named 'torch'