In [1]:
from pathlib import Path
import regex
import time
from collections import defaultdict, Counter
import math
import requests
import itertools
import tarfile

In [2]:
def read_archive(path):
    """Read the file members of a gzip-compressed tar archive into memory.

    Each member is decoded as UTF-8, newlines and tabs are replaced with
    single spaces, and the text is lower-cased.

    Args:
        path: Location of the ``.tar.gz`` archive.

    Returns:
        dict mapping member name to its normalized text. Members without file
        content (e.g. directories) are skipped.
    """
    files = {}
    # The context manager closes the archive even if reading a member fails.
    with tarfile.open(path, "r:gz") as tar:
        for member in tar.getmembers():
            f = tar.extractfile(member)
            if f is None:
                # extractfile() returns None for members that carry no data,
                # such as directory entries; there is nothing to read.
                continue
            with f:
                text = f.read().decode("utf-8")
            files[member.name] = text.replace('\n', ' ').replace('\t', ' ').lower()
    return files

In [3]:
# Load every archive member as normalized (lower-cased, single-line) text,
# keyed by member name.
files = read_archive('./ustawy.tar.gz')
# Display the container type as a quick sanity check.
type(files)

dict

# 1. Download docker image of KRNNT2.

In [4]:
# Smoke-test the tagger endpoint with a short Polish sentence and print the
# raw response body.
print(requests.post('http://krnnt:9200', data='Ala ma kota.').text)

Ala	none
	Ala	subst:sg:nom:f	disamb
ma	space
	mieć	fin:sg:ter:imperf	disamb
kota	space
	kot	subst:sg:acc:m2	disamb
.	none
	.	interp	disamb




# 3. Use the tool to tag and lemmatize the law corpus.

In [5]:
def lemmatize_bill(bill_content, url='http://krnnt:9200'):
    """Tag and lemmatize one document with the tagger HTTP service.

    The response is read as alternating lines: a token line followed by an
    interpretation line of the form ``"\\t<lemma>\\t<tag>\\t<marker>"``.

    Args:
        bill_content: Text of the document to tag.
        url: Address of the tagging service; defaults to the endpoint used
            throughout this notebook.

    Returns:
        List of ``(lemma, category)`` tuples, where ``category`` is the part of
        the tag before the first ``':'`` (e.g. ``subst`` for
        ``subst:sg:nom:f``).

    Raises:
        requests.HTTPError: If the service answers with an error status.
        ValueError: If an interpretation line does not have four tab-separated
            fields.
    """
    response = requests.post(url, data=bill_content.encode('utf-8'))
    # Fail loudly on an HTTP error instead of parsing an error page as tags.
    response.raise_for_status()

    lines = [line for line in response.text.split('\n') if line != '']
    result = []
    # Every second non-empty line (starting with the second) is an
    # interpretation line; the token lines in between are not needed.
    for interpretation in lines[1::2]:
        _, lemma, tag, _ = interpretation.split('\t')
        result.append((lemma, tag.split(':')[0]))
    return result

In [6]:
# Tag every document once; the result maps archive member name to the list of
# (lemma, coarse category) pairs returned by lemmatize_bill.
lematized_bills = {
    name: lemmatize_bill(text)
    for name, text in files.items()
}

In [7]:
def save(data=None, path='lematized.pck'):
    """Pickle the lemmatized corpus to disk.

    Args:
        data: Object to serialize. When omitted, the notebook-level
            ``lematized_bills`` mapping is written, so a bare ``save()``
            behaves as before.
        path: Destination file; defaults to ``'lematized.pck'``.
    """
    import pickle
    if data is None:
        data = lematized_bills
    with open(path, 'wb') as f:
        pickle.dump(data, f)

In [8]:
# Write the lemmatized corpus to the pickle file.
save()

In [9]:
def load(path='lematized.pck'):
    """Load a previously pickled lemmatized corpus.

    Args:
        path: Pickle file to read; defaults to ``'lematized.pck'``.

    Returns:
        The object stored in the pickle file.
    """
    import pickle
    # NOTE: unpickling can execute arbitrary code. Only load cache files that
    # were written locally by this notebook, never files from untrusted sources.
    with open(path, 'rb') as f:
        return pickle.load(f)
        

In [10]:
# Number of documents that were lemmatized.
print(len(lematized_bills))

1179


In [11]:
# Peek at the first ten (lemma, category) pairs of the second document.
for token, category in list(lematized_bills.values())[1][:10]:
    print(token, category)

dziennik brev
. interp
u prep
. interp
z prep
1993 adj
rok brev
. interp
numer brev
129 num


# 4. Using the tagged corpus compute bigram statistic for the tokens containing:
- lemmatized, downcased word
- morphosyntactic category of the word (subst, fin, adj, etc.)

In [12]:
def get_ngrams(lemmatized_acts, n):
    """Collect every window of ``n`` consecutive tokens from each document.

    Args:
        lemmatized_acts: Iterable of documents, each a sequence of
            ``(token, category)`` pairs.
        n: Window length.

    Returns:
        List of n-grams in document order. Each n-gram is a tuple of
        ``(lower-cased token, category)`` pairs; windows never span two
        documents.
    """
    grams = []
    for act in lemmatized_acts:
        # Lower-case each token once, then slide the window over the result.
        normalized = [(tok.lower(), category) for tok, category in act]
        grams.extend(
            tuple(normalized[start:start + n])
            for start in range(len(act) - n + 1)
        )
    return grams

In [13]:
# Build all pairs of consecutive (lemma, category) tokens within each document.
bigrams = get_ngrams(lematized_bills.values(), 2)

In [14]:
# Collapse the bigram list into occurrence counts.
bigrams = Counter(bigrams)

In [15]:
# Ten most frequent bigrams at this point.
bigrams.most_common(10)

[((('artykuł', 'brev'), ('.', 'interp')), 83645),
 ((('ustęp', 'brev'), ('.', 'interp')), 53345),
 ((('pozycja', 'brev'), ('.', 'interp')), 45081),
 (((',', 'interp'), ('pozycja', 'brev')), 43033),
 ((('.', 'interp'), ('1', 'adj')), 39939),
 ((('-', 'interp'), ('-', 'interp')), 36548),
 ((('rok', 'brev'), ('.', 'interp')), 33025),
 ((('w', 'prep'), ('artykuł', 'brev')), 31973),
 (((',', 'interp'), ('o', 'prep')), 29920),
 ((('o', 'prep'), ('który', 'adj')), 28656)]

# 5. Discard bigrams containing characters other than letters. Make sure that you discard the invalid entries after computing the bigram counts.

In [16]:
# Keep only bigrams whose lemmas both consist solely of letters. The counts
# themselves are left untouched.
bigrams = Counter({
    pair: count
    for pair, count in bigrams.items()
    if all(lemma.isalpha() for lemma, _category in pair)
})

In [17]:
# Ten most frequent bigrams among those currently kept in the counter.
bigrams.most_common(10)

[((('w', 'prep'), ('artykuł', 'brev')), 31973),
 ((('o', 'prep'), ('który', 'adj')), 28656),
 ((('który', 'adj'), ('mowa', 'subst')), 28538),
 ((('mowa', 'subst'), ('w', 'prep')), 28473),
 ((('w', 'prep'), ('ustęp', 'brev')), 23500),
 ((('z', 'prep'), ('dzień', 'subst')), 11360),
 ((('otrzymywać', 'fin'), ('brzmienie', 'subst')), 10533),
 ((('określić', 'ppas'), ('w', 'prep')), 9689),
 ((('do', 'prep'), ('sprawa', 'subst')), 8718),
 ((('ustawa', 'subst'), ('z', 'prep')), 8625)]

# 7. Compute LLR statistic for this dataset.

In [18]:
def denormEntropy(counts):
    '''Compute the entropy of a list of counts, scaled by the sum of the counts.

    If the inputs sum to one, this is the ordinary definition of entropy.
    Zero counts contribute nothing (0 * log(0) is treated as 0).

    Args:
        counts: Iterable of non-negative numbers.

    Returns:
        ``-sum(k * log(k / total))`` over the non-zero counts, or ``0.0`` when
        the input is empty or every count is zero.
    '''
    counts = list(counts)
    total = float(sum(counts))
    if total == 0:
        # Without this guard k / total divides by zero for all-zero input.
        return 0.0
    return -sum(k * math.log(k / total) for k in counts if k != 0)

def llr_2x2(k11, k12, k21, k22):
    '''Log-likelihood ratio for a 2x2 contingency table.

    Args:
        k11, k12: Cell counts of the first row.
        k21, k22: Cell counts of the second row.

    Returns:
        Twice the sum of the row-total and column-total entropies minus the
        entropy of the four cells (all entropies scaled as in denormEntropy).
    '''
    row_entropy = denormEntropy([k11 + k12, k21 + k22])
    column_entropy = denormEntropy([k11 + k21, k12 + k22])
    cell_entropy = denormEntropy([k11, k12, k21, k22])
    return 2 * (row_entropy + column_entropy - cell_entropy)

In [19]:
# Marginal counts: how often each (lemma, category) token occurs as the first
# element (word_a) and as the second element (word_b) of a counted bigram.
bigrams_count_word_a = defaultdict(int)
bigrams_count_word_b = defaultdict(int)

for (word_a, word_b), cnt in bigrams.items():
    bigrams_count_word_a[word_a] += cnt
    bigrams_count_word_b[word_b] += cnt

In [20]:
# Total number of bigram occurrences, summed over all counted bigrams.
total_bigrams = sum(bigrams.values())

In [21]:
# Score every bigram with the LLR of its 2x2 contingency table.
# Each value is stored as (llr, occurrence count).
bigrams_llr = dict()
for (word_a, word_b), cnt in bigrams.items():
    # k11: occurrences of exactly this bigram.
    k11 = cnt
    # k12: bigrams ending in word_b whose first element is not word_a.
    k12 = bigrams_count_word_b[word_b] - cnt
    # k21: bigrams starting with word_a whose second element is not word_b.
    k21 = bigrams_count_word_a[word_a] - cnt
    # k22: all remaining bigram occurrences.
    k22 = total_bigrams - (k12 + k21 + k11)
    bigrams_llr[(word_a, word_b)] = (llr_2x2(k11, k12, k21, k22), cnt)

In [22]:
# Order the (bigram, (llr, count)) items by descending LLR score.
bigrams_llr_sorted = sorted(bigrams_llr.items(), key = lambda x: -x[1][0])

In [23]:
# Print the ten highest-scoring bigrams as "lemma:category lemma:category: llr".
for ((text1, cat1), (text2, cat2)), (llr, _occ) in bigrams_llr_sorted[:10]:
    left = f'{text1}:{cat1}'
    row = f'{left:<20} {text2}:{cat2}'
    print('-' * 80)
    print(f'{row:<40}: {llr}')


--------------------------------------------------------------------------------
który:adj            mowa:subst         : 248052.7143701301
--------------------------------------------------------------------------------
o:prep               który:adj          : 190483.81637835805
--------------------------------------------------------------------------------
mowa:subst           w:prep             : 177052.40660559037
--------------------------------------------------------------------------------
w:prep               artykuł:brev       : 113594.38314276375
--------------------------------------------------------------------------------
otrzymywać:fin       brzmienie:subst    : 110710.45626359055
--------------------------------------------------------------------------------
w:prep               ustęp:brev         : 87868.61976416851
--------------------------------------------------------------------------------
minister:subst       właściwy:adj       : 70827.04352484498
---------

# 8. Partition the entries based on the syntactic categories of the words, i.e. all bigrams having the form of w1:adj w2:subst should be placed in one partition (the order of the words may not be changed).

In [24]:
# Partition bigrams by their (first category, second category) pair. Because
# the input is iterated in LLR-sorted order, every partition's list is itself
# in descending-LLR order. Entries are (lemma1, lemma2, llr, occurrences).
bigrams_grouped = defaultdict(list)

for ((text1, cat1), (text2, cat2)), (llr, occ) in bigrams_llr_sorted:
    bigrams_grouped[(cat1, cat2)].append( (text1, text2, llr, occ) )

In [25]:
# Preview only the first partition: its category pair followed by its first
# ten entries.
for (cat1, cat2), list_of_tokens in itertools.islice(bigrams_grouped.items(), 1):
    words = '\n'.join(f'\t- {t[0]} {t[1]}' for t in list_of_tokens[:10])
    print(f'{cat1} {cat2}\n{words}')

adj subst
	- który mowa
	- niniejszy ustawa
	- następujący zmiana
	- odrębny przepis
	- walny zgromadzenie
	- członkowski unia
	- szczegółowy zasada
	- główny inspektor
	- wojewódzki inspektor
	- państwowy straż


# 9. Select the 10 largest partitions (partitions with the largest number of entries).

In [26]:
# Rank partitions by the total number of bigram occurrences they contain
# (the sum of the per-bigram counts) and keep the ten largest.
# NOTE(review): the task text speaks of the "largest number of entries"; if
# that means distinct bigrams, the key would be len(item[1]) instead - confirm.
biggest_bigrams = sorted(
    bigrams_grouped.items(),
    key=lambda item: -sum(entry[3] for entry in item[1]),
)[:10]

In [27]:
# Print each selected partition with its summed occurrence count.
for (cat1, cat2), list_of_tokens in biggest_bigrams:
    label = f'{cat1}:{cat2}'
    total_occurrences = sum(entry[3] for entry in list_of_tokens)
    print(f'{label:20} {total_occurrences}')

prep:subst           323531
subst:subst          280155
subst:adj            273728
adj:subst            188423
subst:prep           170971
subst:conj           84085
conj:subst           83077
ger:subst            81373
prep:adj             79705
prep:brev            66969


# 10. Use the computed LLR measure to select 5 bigrams for each of the largest categories.

In [28]:
def select_top_5(bigrams_list):
    """Truncate every partition to its first five entries.

    Args:
        bigrams_list: Iterable of ``((cat1, cat2), entries)`` pairs.

    Returns:
        List of ``((cat1, cat2), entries[:5])`` pairs in the input order.
    """
    return [
        ((cat1, cat2), list_of_tokens[:5])
        for (cat1, cat2), list_of_tokens in bigrams_list
    ]

In [29]:
# Keep only the first five entries of each selected partition.
biggest_bigrams = select_top_5(biggest_bigrams)

Let's print the results.

In [30]:
def print_bigrams(bigram_list):
    """Print each partition as a header followed by its bigrams and scores.

    Args:
        bigram_list: Iterable of ``((cat1, cat2), entries)`` pairs, where each
            entry holds the two words at indices 0 and 1 and the score at
            index 2.

    For every partition an 80-dash rule is printed, then ``"cat1 cat2"``, then
    one tab-indented line per entry with the two words padded to 30 columns
    and followed by the score.
    """
    rule = '-' * 80
    for (cat1, cat2), list_of_tokens in bigram_list:
        print(rule)
        rows = []
        for entry in list_of_tokens:
            phrase = entry[0] + ' ' + entry[1]
            rows.append(f'\t- {phrase:<30}: {entry[2]}')
        body = '\n'.join(rows)
        print(f'{cat1} {cat2}\n{body}')

By the number of bigrams first:

In [31]:
# Show the selected bigrams per category pair together with their LLR scores.
print_bigrams(biggest_bigrams)

--------------------------------------------------------------------------------
prep subst
	- z dzień                       : 53443.161432914436
	- na podstawa                   : 47039.98370506626
	- do sprawa                     : 46293.95840313053
	- w droga                       : 31998.72346495255
	- od dzień                      : 31547.64117067482
--------------------------------------------------------------------------------
subst subst
	- droga rozporządzenie          : 54022.582674282894
	- skarb państwo                 : 22069.276876578064
	- rada minister                 : 18278.401483014226
	- terytorium rzeczpospolita     : 14071.448988002143
	- ochrona środowisko            : 14016.563116224948
--------------------------------------------------------------------------------
subst adj
	- minister właściwy             : 70827.04352484498
	- rzeczpospolita polski         : 46132.30327139744
	- jednostka organizacyjny       : 24498.74647179988
	- samorząd terytorialny     

And now by the count of bigrams:

# 11. Using the results from the previous step answer the following questions:

## a. What types of bigrams have been found?

Most of the bigrams contain a noun.

## b. Which of the category-pairs indicate valuable multiword expressions? Do they have anything in common?

Noun + noun and noun + adjective pairs (e.g. `minister finanse`, `rachunek bankowy`). What they have in common is that both are built around a noun.

## c. Which signal: LLR score or syntactic category is more useful for determining genuine multiword expressions?

The LLR score, applied after filtering to the category pairs from point b.

## d. Can you describe a different use-case where the morphosyntactic category is useful for resolving a real-world problem?

- Keywords:

We can take the most common nouns in a text as its keywords.

- More intelligent word correction:

If we know the category, we can provide better correction suggestions.

- Extracting meaning from text

For example extracting subject of sentence when analyzing text.