## Morphosyntactic tagging
Morphosyntactic tagging is one of the core algorithms in NLP. It assigns morphological and (in some languages) syntactic tags to the words in a text. E.g. this allows to distinguish between the major grammatical categories, such as nouns and verbs.

In [89]:
from tqdm import tqdm
import sys
import os
import requests
from collections import Counter
from numpy import log
from tabulate import tabulate

In [90]:
def isalpha(s):
    for letter in s:
        if not letter.isalpha():
            return False
    return True 

In [91]:
ustawy_path = '../ustawy'
ustawy_tagged_path = 'tagged_ustawy'

In [92]:
legislation_acts = {}

with tqdm(total=1179, file=sys.stdout) as pbar: 
    for filename in os.listdir(ustawy_path):
            with open(ustawy_path + '/' + filename, 'r', encoding='utf8') as f:
                content = f.read()
                legislation_acts[filename] = content
                pbar.update(1)

100%|████████████████████████████████████████████████████████████████████████████| 1179/1179 [00:00<00:00, 5878.71it/s]


## Use the tool to tag and lemmatize the law corpus.

In [93]:
def tag_text(text):
    response = requests.post("http://localhost:9200", data=text.encode("utf-8"))
    return response.content.decode("utf-8")

In [94]:
legislation_acts_tagged = {}

In [122]:
# with tqdm(total=1179, file=sys.stdout) as pbar:
#     for filename, text in legislation_acts.items():
#         legislation_acts_tagged[filename] = tag_text(text)
#         pbar.update(1)

In [97]:
# for filename, tagged_content in legislation_acts_tagged.items():
#     with open(ustawy_tagged_path + '/' + filename, 'w+', encoding='utf-8') as f:
#         f.write(tagged_content)

In [98]:
with tqdm(total=1179, file=sys.stdout) as pbar:
    for filename in os.listdir(ustawy_tagged_path):
            with open(ustawy_tagged_path + '/' + filename, 'r', encoding='utf8') as f:
                tagged_content = f.read()
                legislation_acts_tagged[filename] = tagged_content
                pbar.update(1)

1180it [00:01, 1075.18it/s]                                                                                            


## Using the tagged corpus compute bigram statistic for the tokens containing: 

- lemmatized, downcased word
- morphosyntactic category of the word (subst, fin, adj, etc.)


In [99]:
def extract_tokens(tagged_text):
    lines = tagged_text.split("\n")
    tokens = [line.strip().split("\t") for line in lines if line.startswith("\t")]
    return [(token[0].lower(), token[1].split(":")[0]) for token in tokens]

In [100]:
tokens_map = {}

for filename, act in tqdm(legislation_acts_tagged.items()):
    tokens_map[filename] = Counter(extract_tokens(act))

100%|██████████████████████████████████████████████████████████████████████████████| 1180/1180 [00:12<00:00, 94.39it/s]


In [101]:
counter_total = Counter()

for _, counter in tqdm(tokens_map.items()):
    counter_total += counter
    
unigrams_total = Counter({x: count for x, count in counter_total.items() if isalpha(x[0])})    

total = sum(unigrams_total.values())

100%|█████████████████████████████████████████████████████████████████████████████| 1180/1180 [00:01<00:00, 718.59it/s]


In [102]:
total

3582293

In [103]:
list(unigrams_total.items())[:10]

[(('dziennik', 'brev'), 8635),
 (('ustawa', 'brev'), 7042),
 (('z', 'prep'), 87987),
 (('rok', 'brev'), 33050),
 (('numer', 'brev'), 44868),
 (('pozycja', 'brev'), 45197),
 (('ustawa', 'subst'), 24933),
 (('dzień', 'subst'), 27885),
 (('grudzień', 'subst'), 2157),
 (('o', 'prep'), 64713)]

In [123]:
bigrams_map = {}

for filename, tokens in tqdm(tokens_map.items()):
    bigrams_map[filename] = []
    previous_token = None
    for token, _ in tokens.items():
        if previous_token is not None:
            bigrams_map[filename].append((previous_token, token))
        previous_token = token
    bigrams_map[filename] = Counter(bigrams_map[filename])

100%|████████████████████████████████████████████████████████████████████████████| 1180/1180 [00:00<00:00, 3608.30it/s]


In [124]:
bigrams_total = Counter()

for _, counter in tqdm(bigrams_map.items()):
    bigrams_total += counter

sum(bigrams_total.values())

100%|██████████████████████████████████████████████████████████████████████████████| 1180/1180 [00:14<00:00, 83.42it/s]


676685

## Discard bigrams containing characters other than letters. Make sure that you discard the invalid entries after computing the bigram counts.

In [127]:
bigrams_total = Counter({x: count for x, count in bigrams_total.items() if isalpha(x[0][0]) and isalpha(x[1][0])})

In [128]:
bigrams_total.most_common()[:10]

[((('ustawa', 'brev'), ('z', 'prep')), 946),
 ((('rok', 'brev'), ('numer', 'brev')), 946),
 ((('ustawa', 'subst'), ('dzień', 'subst')), 798),
 ((('o', 'prep'), ('zmiana', 'subst')), 695),
 ((('wprowadzać', 'fin'), ('się', 'qub')), 560),
 ((('wchodzić', 'fin'), ('życie', 'subst')), 544),
 ((('się', 'qub'), ('następujący', 'adj')), 513),
 ((('otrzymywać', 'fin'), ('brzmienie', 'subst')), 475),
 ((('zwać', 'ppas'), ('daleko', 'adv')), 446),
 ((('który', 'adj'), ('mowa', 'subst')), 423)]

In [129]:
bigrams_number = sum(bigrams_total.values())
bigrams_number

466717

## Compute LLR statistic for this dataset.

In [130]:
def extract_ngram(result):
    return list(map(lambda x: x[0], result))
def extract_llr(result):
    return list(map(lambda x: x[1], result))

def print_table(bigrams, appender = ''):
    bigram_table = {('bigram ' + appender): extract_ngram(bigrams), 'LLR': extract_llr(bigrams)}
    print(tabulate(bigram_table, headers='keys', tablefmt='fancy_grid'))

In [131]:
def H(counts):
    total = float(sum(counts))
    return sum([k * log(k / total + (k==0)) for k in counts])


def llr2(word1, word2):
    together_occ = bigrams_total[(word1, word2)]
    k11 = together_occ
    k12 = unigrams_total[word2] - together_occ
    k21 = unigrams_total[word1] - together_occ
    k22 = bigrams_number - k12 - k21 - k11
    
    return 2 * (H([k11, k12, k21, k22]) - H([k11 + k12, k21 + k22]) - H([k11 + k21, k12 + k22]))

In [132]:
bigrams_total_llr = {bigram: llr2(bigram[0], bigram[1]) for bigram, _  in bigrams_total.items()}

In [133]:
bigrams_llr_top10 = Counter(bigrams_total_llr).most_common()[:10]

In [134]:
print_table(bigrams_llr_top10)

╒══════════════════════════════════════╤══════════╕
│ bigram                               │      LLR │
╞══════════════════════════════════════╪══════════╡
│ (('w', 'prep'), ('artykuł', 'brev')) │ 109214   │
├──────────────────────────────────────┼──────────┤
│ (('artykuł', 'brev'), ('w', 'prep')) │ 108185   │
├──────────────────────────────────────┼──────────┤
│ (('do', 'prep'), ('w', 'prep'))      │  76064.5 │
├──────────────────────────────────────┼──────────┤
│ (('w', 'prep'), ('ustęp', 'brev'))   │  65854   │
├──────────────────────────────────────┼──────────┤
│ (('który', 'adj'), ('w', 'prep'))    │  60618.5 │
├──────────────────────────────────────┼──────────┤
│ (('lub', 'conj'), ('w', 'prep'))     │  55929.6 │
├──────────────────────────────────────┼──────────┤
│ (('w', 'prep'), ('lub', 'conj'))     │  55929.6 │
├──────────────────────────────────────┼──────────┤
│ (('się', 'qub'), ('w', 'prep'))      │  55898.2 │
├──────────────────────────────────────┼──────────┤
│ (('w', 'pr

## Partition the entries based on the syntactic categories of the words, i.e. all bigrams having the form of w1:adj w2:subst should be placed in one partition (the order of the words may not be changed).

In [136]:
partitions = {}

for bigram in tqdm(bigrams_total.keys()):
    key = (bigram[0][1], bigram[1][1])
    if key not in partitions:
        partitions[key] = 1
    else:
        partitions[key] += 1

100%|█████████████████████████████████████████████████████████████████████| 320432/320432 [00:00<00:00, 1377766.54it/s]


In [137]:
sorted_partitions = {k: v for k, v in sorted(partitions.items(), key=lambda item: -item[1])}
sorted_partitions

{('subst', 'subst'): 46803,
 ('adj', 'subst'): 22518,
 ('subst', 'adj'): 21855,
 ('ger', 'subst'): 12942,
 ('subst', 'ger'): 12392,
 ('adj', 'adj'): 9999,
 ('subst', 'fin'): 9338,
 ('subst', 'ppas'): 9148,
 ('ppas', 'subst'): 9091,
 ('fin', 'subst'): 8304,
 ('adj', 'ger'): 5223,
 ('subst', 'pact'): 5118,
 ('ger', 'adj'): 4794,
 ('ger', 'ger'): 4788,
 ('pact', 'subst'): 4606,
 ('adj', 'fin'): 4301,
 ('adj', 'ppas'): 4187,
 ('inf', 'subst'): 3857,
 ('ppas', 'adj'): 3438,
 ('fin', 'adj'): 3418,
 ('subst', 'inf'): 3163,
 ('prep', 'subst'): 2771,
 ('ger', 'fin'): 2639,
 ('fin', 'ger'): 2619,
 ('subst', 'prep'): 2515,
 ('ppas', 'ppas'): 2376,
 ('adj', 'pact'): 2362,
 ('ger', 'ppas'): 2309,
 ('fin', 'fin'): 2239,
 ('ppas', 'ger'): 2210,
 ('subst', 'adv'): 2210,
 ('adv', 'subst'): 2189,
 ('ppas', 'fin'): 2033,
 ('subst', 'praet'): 1828,
 ('fin', 'ppas'): 1824,
 ('subst', 'conj'): 1809,
 ('pact', 'adj'): 1793,
 ('inf', 'adj'): 1675,
 ('conj', 'subst'): 1580,
 ('adj', 'inf'): 1580,
 ('praet', 's

## Select the 10 largest partitions (partitions with the largest number of entries).

In [138]:
largest_partitions = list(sorted_partitions.keys())[:10] 

## Use the computed LLR measure to select 5 bigrams for each of the largest categories.

In [139]:
for partition in largest_partitions:
    partitioned_bigrams = Counter({x: count for x, count in bigrams_total.items() if x[0][1] == partition[0] and x[1][1] == partition[1]})
    bigrams_partitioned_llr = {bigram: llr2(bigram[0], bigram[1]) for bigram, _  in partitioned_bigrams.items()}
    bigrams_partitioned_llr_top5 = Counter(bigrams_partitioned_llr).most_common()[:5]
    print_table(bigrams_partitioned_llr_top5, '(' + partition[0] + ', ' + partition[1] + ')')

╒════════════════════════════════════════════╤═════════╕
│ bigram (subst, subst)                      │     LLR │
╞════════════════════════════════════════════╪═════════╡
│ (('przepis', 'subst'), ('mowa', 'subst'))  │ 2266.16 │
├────────────────────────────────────────────┼─────────┤
│ (('mowa', 'subst'), ('przepis', 'subst'))  │ 2266.16 │
├────────────────────────────────────────────┼─────────┤
│ (('mowa', 'subst'), ('sprawa', 'subst'))   │ 2131.15 │
├────────────────────────────────────────────┼─────────┤
│ (('mowa', 'subst'), ('minister', 'subst')) │ 1997.31 │
├────────────────────────────────────────────┼─────────┤
│ (('osoba', 'subst'), ('mowa', 'subst'))    │ 1952.01 │
╘════════════════════════════════════════════╧═════════╛
╒═════════════════════════════════════════╤═════════╕
│ bigram (adj, subst)                     │     LLR │
╞═════════════════════════════════════════╪═════════╡
│ (('który', 'adj'), ('mowa', 'subst'))   │ 4000.84 │
├─────────────────────────────────────────┼

## Using the results from the previous step answer the following questions: 

### What types of bigrams have been found?


### Which of the category-pairs indicate valuable multiword expressions? Do they have anything in common?

### Which signal: LLR score or syntactic category is more useful for determining genuine multiword expressions?

### Can you describe a different use-case where the morphosyntactic category is useful for resolving a real-world problem?