In [18]:
# Stats Functions
from scripts.data_management import FloresPlusManager
from os.path import join
import tiktoken
from scripts.util import split_sents
ENC = tiktoken.encoding_for_model('gpt-4o')


def get_token_cnt(input_sents: list[str]):
    text = '\n'.join(input_sents)
    return len(ENC.encode(text))


def get_char_cnt(input_sents: list[str]):
    text = '\n'.join(input_sents)
    return len(text)


def get_real_sent_cnt(input_sents: list[str], lang):
    text = '\n'.join(input_sents)
    return len(split_sents(text, lang=lang))

def get_flores_meta(lang: str, num_of_sents: int, *keys):
    '''
    lang: Flores code
    num_of_sents: number of sents to load
    keys: keys used in the JSON structure of each Flores+ entry
    '''
    dm = FloresPlusManager()
    data = dm._load_data_files(join(dm.store, f'{lang}.jsonl'), num_of_sents)
    meta_set = {k: set() for k in keys}
    for o in data:
        for k in keys:
            meta_set[k].add(o[k])
    nums = {key: len(meta_set[key]) for key in meta_set}
    return nums

## Flores+ Stats

In [19]:
from scripts.data_management import FloresPlusManager
dm = FloresPlusManager()
mean_flores_char = 0
mean_flores_token = 0
mean_flores_sent = 0
langs = FloresPlusManager.EURO_ISO_2_FLORES_CODE.keys()
for lang in sorted(langs):
    src_sents, _ = dm.get_sentence_pairs(lang, 'en', num_of_sents=400)
    mean_flores_char += get_char_cnt(src_sents)
    mean_flores_token += get_token_cnt(src_sents)
    mean_flores_sent += get_real_sent_cnt(src_sents, lang)
    print(lang, get_char_cnt(src_sents), get_token_cnt(src_sents), get_real_sent_cnt(src_sents, lang), get_flores_meta(lang, 400, 'url', 'topic'))

mean_flores_char /= 11
mean_flores_token /= 11
mean_flores_sent /= 11
# %%

da 50961 14097 437 {'url': 122, 'topic': 58}
de 57714 13380 442 {'url': 122, 'topic': 58}
el 59462 21925 431 {'url': 122, 'topic': 58}
en 49427 10259 429 {'url': 122, 'topic': 58}
es 58934 13463 436 {'url': 122, 'topic': 58}
fi 53171 15686 448 {'url': 122, 'topic': 58}
fr 58255 13914 431 {'url': 122, 'topic': 58}
it 58720 14891 432 {'url': 122, 'topic': 58}
nl 55471 12753 454 {'url': 122, 'topic': 58}
pt 53279 12442 433 {'url': 122, 'topic': 58}
sv 50031 13702 430 {'url': 122, 'topic': 58}


## Europarl Stats

In [20]:
from scripts.data_management import EuroParlManager
dm = EuroParlManager()
langs = EuroParlManager.EURO_LANGS
mean_ep_char = 0
mean_ep_token = 0
mean_ep_sent = 0
for lang in sorted(langs):
    total_char = 0
    total_token = 0
    total_sent = 0
    for l in sorted(langs):
        if lang == l:
            continue
        src_sents, _ = dm.get_sentence_pairs(lang, l, num_of_sents=400)
        total_char += get_char_cnt(src_sents)
        total_token += get_token_cnt(src_sents)
        total_sent += get_real_sent_cnt(src_sents, lang)
    avg_char = total_char / 10
    avg_token = total_token / 10
    avg_sent = total_sent / 10
    mean_ep_char += avg_char
    mean_ep_token += avg_token
    mean_ep_sent += avg_sent
    print(lang, round(avg_char), round(avg_token), round(avg_sent))

mean_ep_char /= 11
mean_ep_token /= 11
mean_ep_sent /= 11

da 59634 15917 418
de 64846 13790 434
el 72660 24693 409
en 62993 12323 418
es 68227 14494 419
fi 62544 17876 415
fr 69581 14931 413
it 71059 16821 422
nl 67063 14350 462
pt 68643 14671 413
sv 61899 16315 447


## OPUS100 Stats

In [21]:
from scripts.data_management import Opus100Manager
dm = Opus100Manager()
langs = Opus100Manager.EURO_ISO_2_PAIR.keys()
mean_opus_char = 0
mean_opus_token = 0
mean_opus_sent = 0

total_char_en = 0
total_token_en = 0
total_sent_en = 0
for lang in sorted(langs):
    src_sents, _ = dm.get_sentence_pairs(lang, 'en', num_of_sents=400)
    total_char_en += get_char_cnt(_)
    total_token_en += get_token_cnt(_)
    total_sent_en += get_real_sent_cnt(_, lang)
    mean_opus_char += get_char_cnt(src_sents)
    mean_opus_token += get_token_cnt(src_sents)
    mean_opus_sent += get_real_sent_cnt(src_sents, lang)
    print(lang, get_char_cnt(src_sents), get_token_cnt(src_sents), get_real_sent_cnt(src_sents, lang))
print('---')
avg_sent = total_sent_en / 10
avg_token = total_token_en / 10
avg_char = total_char_en / 10
mean_opus_char += avg_char
mean_opus_token += avg_token
mean_opus_sent += avg_sent
print('en', round(avg_char), round(avg_token), round(avg_sent))

mean_opus_char /= 11
mean_opus_token /= 11
mean_opus_sent /= 11

da 25992 7556 421
de 32731 8295 444
el 22259 8648 415
es 34342 8004 430
fi 21774 6843 423
fr 46126 10852 422
it 27914 7556 423
nl 28485 6957 427
pt 27073 6540 419
sv 22505 6498 420
---
en 28320 6393 446


In [22]:
print(f'Mean Flores Char: {round(mean_flores_char)}')
print(f'Mean Flores Token: {round(mean_flores_token)}')
print(f'Mean Flores Sent: {round(mean_flores_sent)}')
print('---')
print(f'Mean EP Char: {round(mean_ep_char)}')
print(f'Mean EP Token: {round(mean_ep_token)}')
print(f'Mean EP Sent: {round(mean_ep_sent)}')
print('---')
print(f'Mean Opus Char: {round(mean_opus_char)}')
print(f'Mean Opus Token: {round(mean_opus_token)}')
print(f'Mean Opus Sent: {round(mean_opus_sent)}')
print('---')
print('OPUS Ratio')
print(f'Char: {round(mean_opus_char / mean_flores_char, 2)}')
print(f'Token: {round(mean_opus_token / mean_flores_token, 2)}')
print(f'Sent: {round(mean_opus_sent / mean_flores_sent, 2)}')
print('---')
print(f'Char: {round(mean_opus_char / mean_ep_char, 2)}')
print(f'Token: {round(mean_opus_token / mean_ep_token, 2)}')
print(f'Sent: {round(mean_opus_sent / mean_ep_sent, 2)}')



Mean Flores Char: 55039
Mean Flores Token: 14228
Mean Flores Sent: 437
---
Mean EP Char: 66286
Mean EP Token: 16016
Mean EP Sent: 424
---
Mean Opus Char: 28866
Mean Opus Token: 7649
Mean Opus Sent: 426
---
OPUS Ratio
Char: 0.52
Token: 0.54
Sent: 0.98
---
Char: 0.44
Token: 0.48
Sent: 1.0
