In [39]:
import json
import pandas
import os
import re
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from  itertools import tee, chain, islice
from math import log
from operator import itemgetter
from itertools import takewhile
import requests

In [40]:
root_dir = '/home/marcin/Desktop/SemestrVIII/PJN'
year = "2018"
json_data_dir = f"{root_dir}/data/json"
filesInYearPattern = 'judgments-(316[3-9]|317\d)\.json'

In [41]:
def window(it, size=2):
    yield from zip(*[islice(it, s, None) for s, it in enumerate(tee(it, size))])

In [42]:
def clean_text(line):
    _id, text = line
    notags = re.sub(r"<[^>]*>", " ", text)
    nobreaks =  re.sub(r"-\n", " ", notags)
    nodigits =  re.sub(r"\d+", " ", nobreaks)
    return _id, re.sub(r"\b[XVILMC]+\b", "", nodigits)

In [43]:
def judgement_texts(filename):
    with open(os.path.join(json_data_dir, filename), 'r') as jsonFile:
        judgements = json.load(jsonFile)['items'] 
    year_filtered = filter(lambda item: year in item['judgmentDate'], judgements)
    yield from map(lambda item: (item['id'],item['textContent']), year_filtered)

In [44]:
def process_response(line):
    word, tags = line.split('\t')[1:3]
    grammatical_class = tags.split(':')[0]
    return "{}:{}".format(word.lower(), grammatical_class)

In [45]:
def tag_text(judgement):
    _id,text = judgement 
    r = requests.post("http://localhost:9200", data=text.encode('utf-8'))
    lines= r.content.decode('utf-8').split('\n') 
    relevant = filter(lambda line: line.startswith('\t'),lines)
    tagged =  map(process_response, relevant)
    words =  filter(lambda x: re.match("\w+",x[0]),tagged) 
    words = list(words)
    return _id, words

In [46]:
def write_tagging(tagging):
   with open('tagging','w') as file:
        for _id, tags in tagging:
            file.write("{}; ".format(_id))
            file.write(' '.join(x for x in tags))

In [47]:
json_files = os.listdir(json_data_dir)
judgements_files= filter(lambda name: re.match(filesInYearPattern, name), json_files)
texts =  chain.from_iterable(map( judgement_texts, judgements_files))
cleaned_text = map(clean_text,texts)
cleaned_text = list(cleaned_text)[:3]

In [None]:
tagging_unigrams = map(tag_text, cleaned_text)
tagging_unigrams = list(tagging_unigrams)
unigrams_it1, unigrams_it2, unigrams_it3 =tee(tagging_unigrams,3)
write_tagging(unigrams_it3)

In [32]:
tagging_unigrams


[(324503,
  ['sygnatura:brev',
   'akt:subst',
   'ga:subst',
   'wyrok:subst',
   'w:prep',
   'imię:subst',
   'rzeczpospolita:subst',
   'polski:adj',
   'dzień:subst',
   'styczeń:subst',
   'rok:subst',
   'sąd:subst',
   'okręgowy:adj',
   'w:prep',
   'łódź:subst',
   'wydział:subst',
   'gospodarczy:adj',
   'odwoławczy:adj',
   'w:prep',
   'skład:subst',
   'następujący:adj',
   'przewodnicząca:subst',
   'sso:subst',
   'jolanta:subst',
   'jachowicz:subst',
   'po:prep',
   'rozpoznać:ger',
   'w:prep',
   'dzień:subst',
   'styczeń:subst',
   'rok:subst',
   'w:prep',
   'łódź:subst',
   'na:prep',
   'posiedzenie:subst',
   'niejawny:adj',
   'sprawa:subst',
   'z:prep',
   'powództwo:subst',
   'a:brev',
   'przeciwko:prep',
   'j:brev',
   'b:brev',
   'o:prep',
   'zapłata:subst',
   'na:prep',
   'skutek:subst',
   'apelacja:subst',
   'pozwany:subst',
   'od:prep',
   'wyrok:subst',
   'sąd:subst',
   'rejonowy:adj',
   'w:prep',
   'kalisz:subst',
   'z:prep',
   'd

In [189]:
unigrams =  chain.from_iterable(unigrams_it1)
bigrams = chain.from_iterable(map( lambda it: window(it,2),unigrams_it2))
bigrams = islice(bigrams,3000)
unigrams = islice(unigrams,3000)

In [190]:
counted_unigrams= Counter(unigrams)
counted_bigrams= Counter(bigrams)

unigram_count = sum(counted_unigrams.values())
bigram_count =  sum(counted_bigrams.values())

test
test
test
test


# 10 najpopularniejszych bigramów w 2018

In [191]:
counted_bigrams.most_common(10)

[((('z', 'prep'), ('dzień', 'subst')), 28),
 ((('artykuł', 'brev'), ('ustęp', 'brev')), 18),
 ((('prezes', 'subst'), ('ure', 'subst')), 16),
 ((('prawo', 'subst'), ('energetyczny', 'adj')), 12),
 ((('do', 'prep'), ('sieć', 'subst')), 11),
 ((('warunek', 'subst'), ('przyłączyć', 'ger')), 10),
 ((('styczeń', 'subst'), ('rok', 'brev')), 10),
 ((('zaliczka', 'subst'), ('na', 'prep')), 9),
 ((('artykuł', 'brev'), ('koło', 'brev')), 9),
 ((('koło', 'brev'), ('pan', 'brev')), 9)]

# 10 najpopularniejszych unigramów w 2018

In [192]:
counted_unigrams.most_common(10) 

[(('w', 'prep'), 151),
 (('z', 'prep'), 107),
 (('nie', 'qub'), 55),
 (('na', 'prep'), 49),
 (('dzień', 'subst'), 48),
 (('do', 'prep'), 48),
 (('i', 'conj'), 43),
 (('on', 'ppron3'), 41),
 (('sąd', 'subst'), 34),
 (('o', 'prep'), 33)]

In [193]:
def pmi(bigram):
    prob_word_1 = counted_unigrams[bigram[0]] / unigram_count
    prob_word_2 = counted_unigrams[bigram[1]] / unigram_count
    prob_bigram = counted_bigrams[bigram] /     bigram_count 
    return log( prob_bigram / (prob_word_1 * prob_word_2))    

In [194]:
def top_pmi(min_frequency = 1):
    frequent_bigrams = takewhile(lambda x: x[1] > min_frequency, counted_bigrams.most_common() )
    pmi_vals = map(lambda x: (x[0],pmi(x[0])),frequent_bigrams)
    return list(sorted(pmi_vals,key = itemgetter(1),reverse=True))

In [195]:
from  python_llr.llr import llr_2x2
def loglikelihood(bigram):
   k11 = counted_bigrams[bigram] 
   k12 = counted_unigrams[bigram[0]] - counted_bigrams[bigram]
   k21 = counted_unigrams[bigram[1]] - counted_bigrams[bigram]
   k22 = bigram_count - (counted_unigrams[bigram[0]] + counted_unigrams[bigram[1]] - counted_bigrams[bigram])
   return k11 * k12 *k21*k22, k11, k12,k21,k22, bigram
   #return llr_2x2(k11,k12,k21,k22) 

In [196]:

log_like_vals = filter(lambda bigram: (loglikelihood(bigram)[0] == 0) , counted_bigrams.keys())
asd = map(loglikelihood,log_like_vals)
list(asd)

[(0, 1, 106, 0, 2893, (('z', 'prep'), ('tom', 'brev'))),
 (0, 1, 0, 11, 2988, (('tom', 'brev'), ('pozycja', 'brev'))),
 (0, 3, 0, 38, 2959, (('poprzez', 'prep'), ('on', 'ppron3'))),
 (0, 3, 38, 0, 2959, (('on', 'ppron3'), ('zastosować', 'ger'))),
 (0, 3, 0, 40, 2957, (('zastosować', 'ger'), ('i', 'conj'))),
 (0, 1, 6, 0, 2993, (('brak', 'subst'), ('formalny', 'adj'))),
 (0, 1, 0, 3, 2996, (('formalny', 'adj'), ('jaki', 'adj'))),
 (0, 9, 0, 40, 2951, (('zaliczka', 'subst'), ('na', 'prep'))),
 (0, 3, 0, 7, 2990, (('poczet', 'subst'), ('koszt', 'subst'))),
 (0, 1, 16, 0, 2983, (('który', 'adj'), ('skutkować', 'praet'))),
 (0, 1, 0, 40, 2959, (('skutkować', 'praet'), ('on', 'ppron3'))),
 (0, 2, 39, 0, 2959, (('on', 'ppron3'), ('umorzyć', 'ger'))),
 (0, 1, 1, 0, 2998, (('umorzyć', 'ger'), ('mieć', 'pcon'))),
 (0, 1, 0, 48, 2951, (('mieć', 'pcon'), ('na', 'prep'))),
 (0, 1, 8, 0, 2991, (('powyższy', 'adj'), ('zarzut', 'subst'))),
 (0, 1, 0, 2, 2997, (('zarzut', 'subst'), ('wnieść', 'praet'))

# top 30 bigrams by loglikelihood

In [197]:
log_like_vals = map(lambda bigram: (bigram, loglikelihood(bigram)) , counted_bigrams.keys())
list(sorted(log_like_vals,key = itemgetter(1),reverse=True))[:30]

[((('z', 'prep'), ('dzień', 'subst')),
  (127101520, 28, 79, 20, 2873, (('z', 'prep'), ('dzień', 'subst')))),
 ((('w', 'prep'), ('artykuł', 'brev')),
  (61512880, 8, 143, 19, 2830, (('w', 'prep'), ('artykuł', 'brev')))),
 ((('w', 'prep'), ('w', 'prep')),
  (60727500, 1, 150, 150, 2699, (('w', 'prep'), ('w', 'prep')))),
 ((('w', 'prep'), ('dzień', 'subst')),
  (56023920, 3, 148, 45, 2804, (('w', 'prep'), ('dzień', 'subst')))),
 ((('w', 'prep'), ('sprawa', 'subst')),
  (34920144, 4, 147, 21, 2828, (('w', 'prep'), ('sprawa', 'subst')))),
 ((('rok', 'brev'), ('w', 'prep')),
  (30103200, 3, 24, 148, 2825, (('rok', 'brev'), ('w', 'prep')))),
 ((('ustawa', 'subst'), ('z', 'prep')),
  (29304600, 5, 20, 102, 2873, (('ustawa', 'subst'), ('z', 'prep')))),
 ((('postępowanie', 'subst'), ('w', 'prep')),
  (24995880, 4, 15, 147, 2834, (('postępowanie', 'subst'), ('w', 'prep')))),
 ((('przyłączyć', 'ger'), ('w', 'prep')),
  (23538424, 2, 28, 149, 2821, (('przyłączyć', 'ger'), ('w', 'prep')))),
 ((('po

In [198]:
log_like_vals_weighted = map(lambda bigram: (bigram, log(counted_bigrams[bigram]) * loglikelihood(bigram)) , counted_bigrams.keys())
list(sorted(log_like_vals_weighted,key = itemgetter(1),reverse=True))

TypeError: can't multiply sequence by non-int of type 'float'