# Multiword expression identification and extraction
Mateusz Wojtulewicz

In [2]:
import spacy
import tqdm

from pathlib import Path

In [3]:
nlp = spacy.load("pl_core_news_sm")
tokenizer = nlp.tokenizer

In [4]:
tokenized_acts = {}

acts_dir = Path("../data/ustawy/")
n_acts = len(list(acts_dir.iterdir()))

for act in tqdm.tqdm(acts_dir.iterdir(), desc="Tokenizing acts", total=n_acts):
    act_id = act.stem
    content = act.read_text(encoding="utf8")
    tokens = tokenizer(content)
    tokenized_acts[act_id] = [token.text.lower() for token in tokens]

Tokenizing acts: 100%|██████████| 1179/1179 [00:29<00:00, 40.24it/s]


In [5]:
tokenized_acts["1993_599"][:3], tokenized_acts["1993_599"][1:4]

(['\n\n\n\n', 'dz', '.'], ['dz', '.', 'u'])

In [6]:
from collections import Counter

In [7]:
bigrams = (
    (f, s)
    for tokens in tokenized_acts.values()
    for (f, s) in zip(tokens[:-1], tokens[1:])
)

In [8]:
bigrams_count = Counter(bigrams)

In [9]:
bigrams_count.most_common(10)

[(('art', '.'), 83778),
 (('ust', '.'), 53552),
 (('.', '\n'), 49741),
 (('poz', '.'), 45198),
 ((',', 'poz'), 39655),
 (('-', '-'), 36542),
 (('r', '.'), 33015),
 (('w', 'art'), 30170),
 (('.', '1'), 29734),
 ((',', 'o'), 28739)]

In [10]:
def is_word(s: str) -> bool:
   return tokenizer(s)[0].is_alpha 

In [11]:
bigrams_filtered = (
    (f, s)
    for tokens in tokenized_acts.values()
    for (f, s) in zip(tokens[:-1], tokens[1:])
    if is_word(f) and is_word(s)
)

In [12]:
bigrams_filtered_count = Counter(bigrams_filtered)

In [13]:
bigrams_filtered_count.most_common(20)

[(('w', 'art'), 30170),
 (('mowa', 'w'), 27649),
 (('w', 'ust'), 22238),
 (('których', 'mowa'), 12973),
 (('o', 'których'), 12604),
 (('otrzymuje', 'brzmienie'), 9168),
 (('z', 'dnia'), 8989),
 (('którym', 'mowa'), 8689),
 (('o', 'którym'), 8525),
 (('do', 'spraw'), 8215),
 (('dodaje', 'się'), 7960),
 (('i', 'nr'), 7871),
 (('w', 'brzmieniu'), 6690),
 (('w', 'drodze'), 6623),
 (('stosuje', 'się'), 6232),
 (('na', 'podstawie'), 5906),
 (('w', 'przypadku'), 5371),
 (('której', 'mowa'), 5192),
 (('o', 'której'), 5061),
 (('od', 'dnia'), 4971)]

In [14]:
tokens_count = Counter(
    token for tokens in tokenized_acts.values() for token in tokens
)

In [15]:
tokens_count.most_common(10)

[('.', 437694),
 (',', 341126),
 ('w', 201224),
 ('\n', 181703),
 (')', 100194),
 ('i', 90009),
 ('art', 83804),
 ('z', 82443),
 ('1', 73108),
 ('o', 64776)]

$$
\text{pmi}(x; y) \equiv  \log_2 \frac{p(x, y)}{p(x)p(y)}
$$

In [16]:
import numpy as np

In [17]:
def p_bigram(bigram: tuple[str, str]) -> float:
    tok1, tok2 = bigram
    N = sum(bigrams_count.values())
    return bigrams_filtered_count[(tok1, tok2)] / N

def p_token(tok: str) -> float:
    N = sum(bigrams_count.values())
    return tokens_count[tok] / N

def pmi_bigram(bigram: tuple[str, str]) -> float:
    tok1, tok2 = bigram
    return np.log2(p_bigram(bigram) / (p_token(tok1) * p_token(tok2)))

In [44]:
import pickle

def dump_counter(counter: Counter, name: str) -> None:
    with open(f"{name}.p", "wb") as f:
        pickle.dump(counter, f)
        
def load_counter(name) -> Counter:
    with open(f"{name}.p", "rb") as f:
        return pickle.load(f)

In [46]:
dump_counter(bigrams_count, "bigrams_counter")
dump_counter(bigrams_filtered_count, "bigrams_filtered_counter")
dump_counter(tokens_count, "tokens_counter")

In [None]:
bigrams_pmi = [
    (bigram, pmi_bigram(bigram))
    for bigram in bigrams_filtered_count.keys()
]

In [None]:
sorted(bigrams_pmi.items(), key=lambda x: -x[1])

[(('dotychczasowa', 'treść'), 13.116306854406337),
 (('podatku', 'akcyzowym'), 10.68410880396931),
 (('zmianie', 'ustawy'), 7.925625340522062),
 (('wprowadza', 'się'), 6.791221905648496),
 (('się', 'następujące'), 6.513427944429425),
 (('o', 'zmianie'), 6.124066689390784),
 (('podatku', 'od'), 5.948931080356767),
 (('ustawie', 'z'), 5.548346488207807),
 (('od', 'towarów'), 5.502196529733125),
 (('treść', 'otrzymuje'), 5.496453707015169),
 (('z', 'dnia'), 5.146340032871757),
 (('w', 'ustawie'), 4.6595349327468325),
 (('o', 'podatku'), 4.4465467303567605),
 (('i', 'usług'), 4.3923637697079725),
 (('towarów', 'i'), 4.088129866766435),
 (('usług', 'oraz'), 3.666199941233582),
 (('i', 'nr'), 3.50403826809644),
 (('w', 'art'), 3.3831749124593045),
 (('ustawy', 'o'), 3.2601962081221934),
 (('oraz', 'o'), 1.279755286233716)]

In [None]:
# TODO: multiprocessing