In [1]:
import pandas as pd
from helpers.corpus_creation import build_vocab, save_vocab, print_corpus_stats, interleave_tokens, simple_join_tokens
from helpers.eurlex import process_pages, load_tokens_from_processed_files

In [2]:
# Output directory for the EUR-Lex corpus artefacts. Later cells build file paths
# by plain string concatenation (CORPUS_PATH + "name.csv"), so the trailing slash
# is required.
CORPUS_PATH = "corpus/eurlex/"

# Vytvoření korpusu

In [3]:
# One-off download of a single Official Journal C-series issue in its Czech (CS)
# and English (EN) versions.
# The year and issue number are defined once so that the two URLs and the label
# handed to process_pages always describe the same issue (the label used to say
# issue 302 while the URLs fetched issue 450).
single_year, single_issue = 2014, 450
single_issue_url = "https://eur-lex.europa.eu/legal-content/{}/TXT/HTML/?uri=OJ:C:{:d}:{:03d}:FULL"
cz_url = single_issue_url.format("CS", single_year, single_issue)
en_url = single_issue_url.format("EN", single_year, single_issue)
# The label follows the "<year>-C<issue>" scheme that the bulk-download loop in
# this notebook passes to process_pages.
# NOTE(review): presumably the label names the processed output - confirm in helpers.eurlex.
res = process_pages(cz_url, en_url, "{:d}-C{:03d}".format(single_year, single_issue))

Download successful: https://eur-lex.europa.eu/legal-content/CS/TXT/HTML/?uri=OJ:C:2014:450:FULL
Download successful: https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=OJ:C:2014:450:FULL


In [28]:
import time

# Bulk download: probe every Official Journal C-series issue number 001-450 for
# the years 2015-2020, in Czech (CS) and English (EN), and accumulate whatever
# process_pages returns for each pair of pages.
oj_c_url = "https://eur-lex.europa.eu/legal-content/{}/TXT/HTML/?uri=OJ:C:{:d}:{:03d}:FULL"
all_tokens = {'EN_tokens': [], 'CZ_tokens': []}
for year in range(2015, 2021):
    for article_nr in range(1, 451):
        cz_url = oj_c_url.format("CS", year, article_nr)
        en_url = oj_c_url.format("EN", year, article_nr)
        res = process_pages(cz_url, en_url, "{:d}-C{:03d}".format(year, article_nr))
        if res:
            # res[0] is appended to the CZ column, res[1] to the EN column.
            all_tokens['CZ_tokens'] += res[0]
            all_tokens['EN_tokens'] += res[1]
        # Throttle after EVERY attempt. Previously a falsy result skipped the
        # pause, so runs of missing issue numbers hit the server back-to-back.
        time.sleep(1)
# pandas requires both lists to have the same length here; this relies on
# process_pages returning equally long CZ and EN lists.
df = pd.DataFrame(all_tokens)

Download successfull: https://eur-lex.europa.eu/legal-content/CS/TXT/HTML/?uri=OJ:C:2015:001:FULL
Download successfull: https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=OJ:C:2015:001:FULL
Download successfull: https://eur-lex.europa.eu/legal-content/CS/TXT/HTML/?uri=OJ:C:2015:002:FULL
Download successfull: https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=OJ:C:2015:002:FULL
Download successfull: https://eur-lex.europa.eu/legal-content/CS/TXT/HTML/?uri=OJ:C:2015:003:FULL
Download successfull: https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=OJ:C:2015:003:FULL
Download successfull: https://eur-lex.europa.eu/legal-content/CS/TXT/HTML/?uri=OJ:C:2015:004:FULL
Download successfull: https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=OJ:C:2015:004:FULL
Download successfull: https://eur-lex.europa.eu/legal-content/CS/TXT/HTML/?uri=OJ:C:2015:005:FULL
Download successfull: https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=OJ:C:2015:005:FULL
Download successfull

In [5]:
# Load the token frame through the project helper. This rebinds `df`, replacing
# the frame assembled by the download loop.
# NOTE(review): presumably reads the files process_pages stored, so the slow
# download does not have to be repeated - confirm in helpers.eurlex.
df = load_tokens_from_processed_files()

# Vytvoření slovníku

In [13]:
# Write one token-frequency table per language: flatten the per-row token lists
# into one token per row, count occurrences, and save the counts as CSV.
for tokens_column, freq_filename in (
    ('EN_tokens', 'en_freqs.csv'),
    ('CZ_tokens', 'cz_freqs.csv'),
):
    token_counts = df[tokens_column].explode().value_counts()
    token_counts.to_csv(CORPUS_PATH + freq_filename)

the                  359806
de                   169458
of                   169246
in                   143708
and                  110084
                      ...  
schendt                   1
woningbouw                1
ongrondwettig             1
opvangplekken             1
bedrijfsonderdeel         1
Name: CZ_tokens, Length: 275867, dtype: int64

In [8]:
# Build the vocabulary from the token frame using the helper imported from
# helpers.corpus_creation. The structure of the returned object is not visible
# in this notebook.
vocab = build_vocab(df)

In [6]:
# Persist the vocabulary via the project helper.
# NOTE(review): CORPUS_PATH is presumably the destination directory - confirm in
# helpers.corpus_creation. Requires `vocab` from the build_vocab cell to exist.
save_vocab(vocab, CORPUS_PATH)

In [9]:
# Print summary statistics for the corpus and vocabulary; the captured output
# lists paragraph, token and unique-token counts for CZ and EN.
print_corpus_stats(df, vocab)

Paragraphs counts: 359628
CZ tokens counts: 10919671
EN tokens counts: 9209719
Unique CZ tokens counts: 275868
Unique EN tokens counts: 261872


# Vytvoření datasetu pro trénování w2v

In [7]:
# Build the interleaved training sequences from the token frame.
# NOTE(review): presumably mixes the CZ and EN tokens of a paragraph into single
# sequences, producing several variants per paragraph (the captured result has
# far more rows than there are paragraphs) - confirm in helpers.corpus_creation.
interleaved = interleave_tokens(df)
# A bare trailing expression makes the notebook display the result.
interleaved

0          [komis, juli, dne, commiss, červenk, decid, ro...
1          [komis, dne, juli, červenk, commiss, rozhodl, ...
2          [komis, dne, červenk, juli, rozhodl, commiss, ...
3          [komis, dne, červenk, rozhodl, juli, zahájit, ...
4          [komis, dne, červenk, rozhodl, zahájit, juli, ...
                                 ...                        
2361551    [ing, ing, zal, zal, het, het, proberen, probe...
2361552    [ing, zal, ing, het, zal, proberen, het, af, p...
2361553    [ing, zal, het, ing, proberen, zal, af, het, t...
2361554    [een, een, monitor, monitoring, truste, truste...
2361555    [een, monitor, een, truste, monitoring, en, tr...
Length: 2361556, dtype: object

In [8]:
# Save the interleaved sequences as CSV under CORPUS_PATH. Default to_csv settings
# are used, so the index is written as well and each token list is stored as its
# string representation.
interleaved.to_csv(CORPUS_PATH + "interleaved.csv")

In [9]:
# Build the simply-joined training sequences from the token frame.
# NOTE(review): presumably concatenates the EN and CZ tokens of each paragraph
# into one sequence (the captured result has one row per paragraph) - confirm in
# helpers.corpus_creation.
simple_joined = simple_join_tokens(df)
# A bare trailing expression makes the notebook display the result.
simple_joined

0         [juli, commiss, decid, initi, proceed, aboveme...
1         [commiss, invit, interest, third, parti, submi...
2         [order, fulli, taken, account, procedur, obser...
3         [notif, concern, follow, undertak, oznámen, tý...
4         [amey, uk, plc, amey, unit, kingdom, belong, f...
                                ...                        
359623    [ing, richt, nederland, met, het, oog, op, afs...
359624    [ing, verbindt, zich, erto, de, afstot, optima...
359625    [voort, ziet, ing, er, gedurend, een, overgang...
359626    [ing, zal, het, proberen, af, te, splitsen, bi...
359627    [een, monitor, truste, en, een, manag, worden,...
Length: 359628, dtype: object

In [10]:
# Save the simply-joined sequences as CSV under CORPUS_PATH. Default to_csv
# settings are used, so the index is written as well and each token list is
# stored as its string representation.
simple_joined.to_csv(CORPUS_PATH + "simple_joined.csv")