Piotr Rzeźnik 402194

### Imports

In [2]:
import itertools
import re

from tqdm.notebook import tqdm
import json
from datasets import load_dataset
from elasticsearch import Elasticsearch


### Load dataset

In [3]:
dataset = load_dataset("clarin-knext/fiqa-pl", 'corpus')
pd_dataset = dataset['corpus'].to_pandas()

1. Use SpaCy tokenizer API to tokenize the text from the law corpus.

In [4]:
from spacy.lang.pl import Polish
nlp = Polish()
tokenizer = nlp.tokenizer

In [5]:
pd_dataset.iloc[0]['text']

'Nie mówię, że nie podoba mi się też pomysł szkolenia w miejscu pracy, ale nie możesz oczekiwać, że firma to zrobi. Szkolenie pracowników to nie ich praca – oni tworzą oprogramowanie. Być może systemy edukacyjne w Stanach Zjednoczonych (lub ich studenci) powinny trochę martwić się o zdobycie umiejętności rynkowych w zamian za ich ogromne inwestycje w edukację, zamiast wychodzić z tysiącami zadłużonych studentów i narzekać, że nie są do niczego wykwalifikowani.'

2. Compute bigram counts of downcased tokens. Given the sentence: "The quick brown fox jumps over the lazy dog.", the bigram counts are as follows:
"the quick": 1
"quick brown": 1
"brown fox": 1
...
"dog .": 1


In [6]:
doc = tokenizer('The quick brown fox jumps over the lazy dog.')
bigrams = [(x1.text,x2.text) for x1,x2 in zip(doc[:-1], doc[1:])]
print(bigrams)

[('The', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps'), ('jumps', 'over'), ('over', 'the'), ('the', 'lazy'), ('lazy', 'dog'), ('dog', '.')]


In [7]:
from collections import defaultdict
bigram_dict = defaultdict(lambda: 0)
texts = pd_dataset['text'].apply(lambda s: s.lower())
for doc in tokenizer.pipe(texts, batch_size=50):
    bigrams = [(x1.text,x2.text) for x1,x2 in zip(doc[:-1], doc[1:])]
    for bigram in bigrams:
        bigram_dict[bigram] += 1

3. Discard bigrams containing characters other than letters. Make sure that you discard the invalid entries after computing the bigram counts.

In [9]:
import re

pattern = """[0-9,\.\?\)\("'\!\\/-]"""
filtered_bigram_dict = defaultdict(lambda : 0)
for (s0, s1), count in bigram_dict.items():
    if re.search(pattern, str(s0) + str(s1)) is None:
        filtered_bigram_dict[(s0, s1)] = count

In [10]:
unigram_dict = defaultdict(lambda: 0)
texts = pd_dataset['text'].apply(lambda s: s.lower())
for doc in tokenizer.pipe(texts, batch_size=50):
    for token in doc:
        unigram_dict[token.text] += 1

4. Use pointwise mutual information to compute the measure for all pairs of words.

In [11]:
import math
uni_sum = float(sum(unigram_dict.values()))
bi_sum = float(sum(filtered_bigram_dict.values()))

def pmi(word1, word2, unigram_freq, bigram_freq):
    prob_word1 = unigram_freq[word1] / uni_sum
    prob_word2 = unigram_freq[word2] / uni_sum
    prob_word1_word2 = bigram_freq[(word1, word2)] / bi_sum
    return math.log(prob_word1_word2/float(prob_word1*prob_word2),2) 
  
pmi_dict = defaultdict(lambda : None)
for (s0, s1), count in tqdm(filtered_bigram_dict.items()):
    pmi_dict[(s0, s1)] = pmi(s0,s1, unigram_dict, filtered_bigram_dict)

  0%|          | 0/1659331 [00:00<?, ?it/s]

In [12]:
import pandas as pd 
pmi_df = pd.DataFrame(pmi_dict.items(), columns=['bigram', 'count'])

5. Sort the word pairs according to that measure in the descending order and determine top 10 entries.


In [13]:
print(pmi_df.sort_values(by='count', ascending=False).head(10))

                         bigram      count
189043      (stylistkę, marikę)  23.520279
1159280           (stora, enso)  23.520279
607752         (pedrama, mirza)  23.520279
763356   (cristinie, fernández)  23.520279
1616285   (soltar, estupideces)  23.520279
201040           (pozik, daude)  23.520279
1279434  (calitate, superioară)  23.520279
572711            (boko, haram)  23.520279
201037      (betetzen, dituzte)  23.520279
1365180  (verdadero, estiubalo)  23.520279


6. Filter bigrams with number of occurrences lower than 5. Determine top 10 entries for the remaining dataset (>=5 occurrences).


In [14]:
even_more_filtered_bigram_dict = defaultdict(lambda : 0)
for (s0, s1), count in filtered_bigram_dict.items():
    if count >= 5:
        even_more_filtered_bigram_dict[(s0, s1)] = count

In [15]:
uni_sum = float(sum(unigram_dict.values()))
bi_sum = float(sum(even_more_filtered_bigram_dict.values()))

def pmi(word1, word2, unigram_freq, bigram_freq):
    prob_word1 = unigram_freq[word1] / uni_sum
    prob_word2 = unigram_freq[word2] / uni_sum
    prob_word1_word2 = bigram_freq[(word1, word2)] / bi_sum
    return math.log(prob_word1_word2/float(prob_word1*prob_word2),2) 
  
pmi_dict = defaultdict(lambda : None)
for (s0, s1), count in tqdm(even_more_filtered_bigram_dict.items()):
    pmi_dict[(s0, s1)] = pmi(s0,s1, unigram_dict, even_more_filtered_bigram_dict)

  0%|          | 0/139502 [00:00<?, ?it/s]

In [16]:
pmi_df_2 = pd.DataFrame(pmi_dict.items(), columns=['bigram', 'count'])

In [17]:
print(pmi_df_2.sort_values(by='count', ascending=False).head(10))


                     bigram      count
121548       (мою, команду)  21.876568
119164     (królicza, nora)  21.876568
58624   (klęska, żywiołowa)  21.876568
106236    (bert, hellinger)  21.876568
121549        (моя, группа)  21.876568
121544       (олимп, трейд)  21.876568
121543     (опционы, олимп)  21.876568
121542   (инарные, опционы)  21.876568
135594  (остались, вопросы)  21.876568
133418  (stucco, veneziano)  21.876568


7. Use KRNNT or Clarin-PL API to tag and lemmatize the corpus. Note: Clarin allows to upload a ZIP file with the whole corpus and process it as one request.


In [18]:
import requests

res = requests.post('http://localhost:9200',data=pd_dataset.iloc[0]['text'].encode('utf-8'))
res.text.split()[2::5] # 1


['nie',
 'mówić',
 ',',
 'że',
 'nie',
 'podobać',
 'ja',
 'się',
 'też',
 'pomysł',
 'szkolenie',
 'w',
 'miejsce',
 'praca',
 ',',
 'ale',
 'nie',
 'móc',
 'oczekiwać',
 ',',
 'że',
 'firma',
 'to',
 'zrobić',
 '.',
 'szkolenie',
 'pracownik',
 'to',
 'nie',
 'on',
 'praca',
 '–',
 'on',
 'tworzyć',
 'oprogramowanie',
 '.',
 'być',
 'móc',
 'system',
 'edukacyjny',
 'w',
 'Stany',
 'Zjednoczony',
 '(',
 'lub',
 'on',
 'student',
 ')',
 'powinien',
 'trochę',
 'martwić',
 'się',
 'o',
 'zdobyć',
 'umiejętność',
 'rynkowy',
 'w',
 'zamian',
 'za',
 'on',
 'ogromny',
 'inwestycja',
 'w',
 'edukacja',
 ',',
 'zamiast',
 'wychodzić',
 'z',
 'tysiąc',
 'zadłużyć',
 'student',
 'i',
 'narzekać',
 ',',
 'że',
 'nie',
 'być',
 'do',
 'nic',
 'wykwalifikować',
 '.']

8. Using the tagged corpus compute bigram statistic for the tokens containing: a. lemmatized, downcased word b. morphosyntactic category of the word (subst, fin, adj, etc.)

In [30]:
result = []
for x1,x2 in zip(res.text.split()[2::5], res.text.split()[3::5]):
    result.append(f"{x1}:{x2.split(':')[0]}")
# concat("1:2")

In [31]:
result

['nie:qub',
 'mówić:fin',
 ',:interp',
 'że:comp',
 'nie:qub',
 'podobać:fin',
 'ja:ppron12',
 'się:qub',
 'też:qub',
 'pomysł:subst',
 'szkolenie:subst',
 'w:prep',
 'miejsce:subst',
 'praca:subst',
 ',:interp',
 'ale:conj',
 'nie:qub',
 'móc:fin',
 'oczekiwać:inf',
 ',:interp',
 'że:comp',
 'firma:subst',
 'to:subst',
 'zrobić:fin',
 '.:interp',
 'szkolenie:subst',
 'pracownik:subst',
 'to:pred',
 'nie:qub',
 'on:ppron3',
 'praca:subst',
 '–:interp',
 'on:ppron3',
 'tworzyć:fin',
 'oprogramowanie:subst',
 '.:interp',
 'być:inf',
 'móc:fin',
 'system:subst',
 'edukacyjny:adj',
 'w:prep',
 'Stany:subst',
 'Zjednoczony:adj',
 '(:interp',
 'lub:conj',
 'on:ppron3',
 'student:subst',
 '):interp',
 'powinien:winien',
 'trochę:adv',
 'martwić:inf',
 'się:qub',
 'o:prep',
 'zdobyć:ger',
 'umiejętność:subst',
 'rynkowy:adj',
 'w:prep',
 'zamian:burk',
 'za:prep',
 'on:ppron3',
 'ogromny:adj',
 'inwestycja:subst',
 'w:prep',
 'edukacja:subst',
 ',:interp',
 'zamiast:conj',
 'wychodzić:inf',
 '

In [62]:
bigram_dict_lemma = defaultdict(lambda: 0)
unigram_dict_lemma = defaultdict(lambda: 0)
trigram_dict_lemma = defaultdict(lambda: 0)
for batch in tqdm(range(0, len(pd_dataset)//20, 50)):
    for text in pd_dataset.iloc[batch:batch+50]['text']:
        res = requests.post('http://localhost:9200',data=text.encode('utf-8'))
        doc = []
        for x1,x2 in zip(res.text.split()[2::5], res.text.split()[3::5]):
            doc.append(f"{x1}:{x2.split(':')[0]}")
            unigram_dict_lemma[doc[-1]] += 1
        bigrams = [(x1,x2) for x1,x2 in zip(doc[:-1], doc[1:])]
        for bigram in bigrams:
            bigram_dict_lemma[bigram] += 1
        trigrams = [(x1,x2,x3) for x1,x2,x3 in zip(doc[:-2], doc[1:-1], doc[2:])]
        for trigram in trigrams:
            trigram_dict_lemma[trigram] += 1

  0%|          | 0/58 [00:00<?, ?it/s]

In [63]:
bigram_dict = defaultdict(lambda: 0)
unigram_dict = defaultdict(lambda: 0)
trigram_dict = defaultdict(lambda: 0)
texts = pd_dataset.iloc[0:len(pd_dataset)//20]['text'].apply(lambda s: s.lower())
for doc in tokenizer.pipe(texts, batch_size=50):
    for token in doc:
        unigram_dict[token.text] += 1
    bigrams = [(x1.text,x2.text) for x1,x2 in zip(doc[:-1], doc[1:])]
    for bigram in bigrams:
        bigram_dict[bigram] += 1
    trigrams = [(x1.text,x2.text,x3.text) for x1,x2,x3 in zip(doc[:-2], doc[1:-1], doc[2:])]
    for trigram in trigrams:
        trigram_dict[trigram] += 1
        

10. Compute the same statistics as for the non-lemmatized words (i.e. PMI) and print top-10 entries with at least 5 occurrences.
11. Compute trigram counts for both corpora and perform the same filtering.
12. Use PMI (with 5 occurrence threshold) to compute top 10 results for the trigrams. Devise a method for computing the values, based on the results for bigrams.

tokenized:

In [87]:
uni_sum = float(sum(unigram_dict.values()))
bi_sum = float(sum(bigram_dict.values()))
tri_sum = float(sum(trigram_dict.values()))

uni_sum_lemma = float(sum(unigram_dict_lemma.values()))
bi_sum_lemma = float(sum(bigram_dict_lemma.values()))
tri_sum_lemma = float(sum(trigram_dict_lemma.values()))

def bi_pmi(word1, word2, unigram_freq,bigram_freq, uni_sum, bi_sum_):
    prob_word1 = unigram_freq[word1] / uni_sum
    prob_word2 = unigram_freq[word2] / uni_sum
    prob_word1_word2 = bigram_freq[(word1, word2)] / bi_sum_
    return math.log(prob_word1_word2/float(prob_word1*prob_word2),2)

def tri_pmi(word1, word2, word3, unigram_freq, trigram_freq, uni_sum, tri_sum_):
    prob_word1 = unigram_freq[word1] / uni_sum
    prob_word2 = unigram_freq[word2] / uni_sum
    prob_word3 = unigram_freq[word3] / uni_sum
    prob_word1_word2 = trigram_freq[(word1, word2, word3)] / tri_sum_
    return math.log(prob_word1_word2/float(prob_word1*prob_word2*prob_word3),2)

In [88]:
bigram_pmi_dict = defaultdict(lambda : None)
for (s0, s1), count in tqdm(bigram_dict.items()):
    bigram_pmi_dict[(s0, s1)] = bi_pmi(s0,s1, unigram_dict, bigram_dict, uni_sum, bi_sum)

  0%|          | 0/202005 [00:00<?, ?it/s]

In [89]:
trigram_pmi_dict = defaultdict(lambda : None)
for (s0, s1, s2), count in tqdm(trigram_dict.items()):
    trigram_pmi_dict[(s0, s1, s2)] = tri_pmi(s0,s1,s2, unigram_dict, trigram_dict, uni_sum, tri_sum)

  0%|          | 0/322768 [00:00<?, ?it/s]

lammatized:

In [90]:
bigram_pmi_dict_lemma = defaultdict(lambda: None)
for (s0, s1), count in tqdm(bigram_dict_lemma.items()):
    bigram_pmi_dict_lemma[(s0, s1)] = bi_pmi(s0, s1, unigram_dict_lemma, bigram_dict_lemma, uni_sum_lemma, bi_sum_lemma)

  0%|          | 0/187569 [00:00<?, ?it/s]

In [91]:
trigram_pmi_dict_lemma = defaultdict(lambda: None)
for (s0, s1, s2), count in tqdm(trigram_dict_lemma.items()):
    trigram_pmi_dict_lemma[(s0, s1, s2)] = tri_pmi(s0, s1, s2, unigram_dict_lemma, trigram_dict_lemma, uni_sum_lemma, tri_sum_lemma)

  0%|          | 0/321415 [00:00<?, ?it/s]

 to dataframes:
 

In [93]:
bigram_pmi_df = pd.DataFrame(bigram_pmi_dict.items(), columns=['bigram', 'count'])
print(bigram_pmi_df.sort_values(by='count', ascending=False).head(10))

                                 bigram      count
162416  (ogólnokrajowymi, gospodarzami)  18.610913
167595            (głównemu, oficerowi)  18.610913
105796            (imprezowym, szyldem)  18.610913
105772        (cheerleaderki, milczały)  18.610913
105771  (republikańskie, cheerleaderki)  18.610913
62429       (wydzierżawiona, telekomom)  18.610913
167770          (nakarmiłbym, jebanego)  18.610913
32883              (uzdolnionej, córce)  18.610913
167767              (obrzydliwy, szlam)  18.610913
3513                  (końskiej, pyska)  18.610913


In [94]:
bigram_pmi_df_lemma = pd.DataFrame(bigram_pmi_dict_lemma.items(), columns=['bigram', 'count'])
print(bigram_pmi_df_lemma.sort_values(by='count', ascending=False).head(10))

                                                   bigram      count
126348                  (pomiary:space, obejmujące:space)  18.649206
42462                     (pokazywać:ger, kwotowań:subst)  18.649206
57054                          (myśliwy:subst, se:interj)  18.649206
161531  (none:czerwony, space:dit.com/message/compose?...  18.649206
169125                  (obniżysz:space, francuską:space)  18.649206
57086   (https://www.reuters.com/article/us-usa-sec-co...  18.649206
139368                         (Aarjav:subst, Skin:subst)  18.649206
139369                           (Skin:subst, Care:subst)  18.649206
169106             (podwójnego:none, opodatkowania:space)  18.649206
57195                          (Mini:subst, Punjab:subst)  18.649206


In [95]:
trigram_pmi_df = pd.DataFrame(trigram_pmi_dict.items(), columns=['trigram', 'count'])
print(trigram_pmi_df.sort_values(by='count', ascending=False).head(10))

                                                  trigram      count
99765               (niewolników, nazywano, niewolnikami)  37.221903
224115                         (^również, ^usunie, ^przy)  37.221903
10979   (https://sciencemag.org, http://www.sciencemag...  37.221903
184413                       (rozerwą, społeczną, tkankę)  37.221903
24794                  (kalibracja, pomiarowa, us4782698)  37.221903
34911                             (която, изтрива, данни)  37.221903
224117                   (^przy, ^komentarzu, ^zdobędzie)  37.221903
224116                      (^usunie, ^przy, ^komentarzu)  37.221903
34904                                 (паметта, е, форма)  37.221903
280073               (rozkładów, modelujących, wariancję)  37.221903


In [96]:
trigram_pmi_df_lemma = pd.DataFrame(trigram_pmi_dict_lemma.items(), columns=['trigram', 'count'])
print(trigram_pmi_df_lemma.sort_values(by='count', ascending=False).head(10))


                                                  trigram      count
215942  (disamb:przestrzeni, disamb:długich, disamb:ok...  37.298486
167415           (wielką:space, dojną:space, krową:space)  37.298486
38835        (newline:najechać, space:kursor, space:mysz)  37.298486
69814   (space:wdrażać, space:przyspieszać, space:niew...  37.298486
260748        (CPGgrey's:subst, Humans:subst, Need:subst)  37.298486
69798        (space:głęboki, space:zaleta, space:łańcuch)  37.298486
134667      (Warren:newline, Buffett:space, słynął:space)  37.298486
264372    (narysować:impt, zniesienie:subst, fibba:subst)  37.298486
225953        (analizę:space, due:space, diligence:space)  37.298486
209308  (nacisnąć:inf, pedał:subst, przyspieszenie:subst)  37.298486
