In [1]:
import json
import pickle
import re
import string
import tarfile
from collections import Counter, defaultdict
from datetime import datetime
from math import log2
from pprint import pprint

In [2]:
def get_next(tars):
    """Yield the parsed JSON content of every judgment file in an open tar archive.

    Iterates over the members of ``tars``; for each regular file whose name
    starts with ``data/json/judgments`` it prints a running ``"<n>: <name>"``
    progress line (numbering starts at 1) and yields the result of
    ``json.load`` on the extracted member. All other members are skipped.
    """
    seen = 0
    for member in tars:
        if not member.isreg():
            continue
        if not member.name.startswith('data/json/judgments'):
            continue
        seen += 1
        pprint(str(seen) + ": " + member.name)
        yield json.load(tars.extractfile(member))


def given_date(date, year=2015):
    """Return True when ``date`` falls in the requested calendar year.

    Parameters:
        date: date string in ``YYYY-MM-DD`` format.
        year: calendar year to compare against; defaults to 2015, the value
            that was previously hard-coded here.

    Raises:
        ValueError: if ``date`` does not match the ``%Y-%m-%d`` format
            (raised by ``datetime.strptime``).
    """
    return datetime.strptime(date, '%Y-%m-%d').year == year

In [5]:
# Token separator: any run of non-word characters (used to split the text).
st = r"\W+"
com = re.compile(st, flags=re.IGNORECASE)

# Anything enclosed in angle brackets, e.g. markup tags.
res1 = r"(<[^>]*>)"
com1 = re.compile(res1, flags=re.IGNORECASE)

# A hyphen immediately followed by a line-break character.
res2 = r"(-(\n|\r|\r\n|\x0b|\x0c|\x1c|\x1d|\x1e|\x85|\u2028|\u2029))"
com2 = re.compile(res2, flags=re.IGNORECASE)

# Whole-string match of one or more letters (a-z plus Polish diacritics).
res3 = r"^[a-ząćęłńóśźż]+$"
pairs1 = re.compile(res3, flags=re.IGNORECASE)

# Same character class as res3 but also accepts the empty string;
# it is not compiled in this cell.
res4 = r"^[a-ząćęłńóśźż]*$"

# Archive handle consumed later when iterating over the judgments.
tar = tarfile.open(name="../saos-dump-23.02.2018.tar.gz", mode="r:gz")

In [6]:
def get_pairs(fun, itera):
    """Yield ``(previous, current)`` pairs of consecutive accepted tokens.

    Every item of ``itera`` has "§" and all ``string.punctuation`` characters
    removed and is lower-cased; only the normalised tokens for which ``fun``
    returns a truthy value are kept. Pairs are formed between neighbouring
    kept tokens, so tokens rejected by ``fun`` do not break a pair.

    Parameters:
        fun: predicate applied to each normalised token (e.g. a compiled
            pattern's ``match`` method).
        itera: iterable of raw string tokens.

    Yields:
        2-tuples of strings. Nothing is yielded when fewer than two tokens
        pass the filter.
    """
    # Build the deletion table once instead of once per token.
    strip_table = str.maketrans("", "", "§" + string.punctuation)
    kept = filter(fun, (raw.translate(strip_table).lower() for raw in itera))
    try:
        first = next(kept)
    except StopIteration:
        # No token survived the filter. Letting StopIteration escape from a
        # generator body is turned into RuntimeError (PEP 479), which would
        # abort the caller's whole stream, so end this generator cleanly.
        return
    for word in kept:
        yield (first, word)
        first = word


def get_one(fun, itera):
    """Yield each normalised token of ``itera`` that ``fun`` accepts.

    Normalisation removes "§" and every ``string.punctuation`` character from
    the token and lower-cases the result; tokens for which ``fun`` returns a
    falsy value are dropped.
    """
    cleaned = (
        raw.translate(str.maketrans("", "", "§" + string.punctuation)).lower()
        for raw in itera
    )
    yield from filter(fun, cleaned)


def give_me(fun_give):
    """Stream tokens (or token pairs) from every judgment accepted by ``given_date``.

    For each JSON document produced by ``get_next(tar)`` and each entry of its
    ``'items'`` list, the entry's ``'judgmentDate'`` is checked with
    ``given_date``. For accepted entries the ``'textContent'`` has ``com1``
    matches and then ``com2`` matches removed, is split with ``com``, and the
    resulting words are passed to ``fun_give`` together with ``pairs1.match``
    as the filter predicate.

    Parameters:
        fun_give: generator function taking ``(predicate, iterator_of_words)``,
            e.g. ``get_pairs`` or ``get_one``.

    Yields:
        Whatever ``fun_give`` yields for each accepted judgment.

    Entries missing ``'judgmentDate'`` (or, when the date is accepted,
    ``'textContent'``) are reported and skipped.
    """
    for x in get_next(tar):
        for y in x['items']:
            # Only the field look-ups are guarded, so a KeyError raised inside
            # ``fun_give`` is no longer swallowed silently.
            try:
                if not given_date(y['judgmentDate']):
                    continue
                text = y['textContent']
            except KeyError as err:
                # Report which key was missing instead of the bare exception class.
                pprint("missing key: {}".format(err))
                continue
            words = iter(com.split(com2.sub("", com1.sub("", text))))
            yield from fun_give(pairs1.match, words)

In [7]:
# One-off step: compute the bigram and unigram counts and save them to disk
# for future runs. Uncomment to regenerate the cache files.

# with open("two_words", "wb") as f:
#     two_word = Counter(give_me(get_pairs))
#     pickle.dump(two_word, f)
# with open("one_len", "wb") as f:
#     one_word = Counter(give_me(get_one))
#     pickle.dump(one_word, f)


# Load the previously saved counts from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load cache files
# that were produced by this notebook.
with open("two_words", "rb") as f:
    two_word = Counter(pickle.load(f))
with open("one_len", "rb") as f:
    one_word = Counter(pickle.load(f))

In [10]:
def calculate_pmi(one_dict: Counter, two_dict: Counter):
    """Yield ``(pair, score)`` for every key of ``two_dict``.

    The score is ``log2(p_pair / (p_first * p_second))`` where ``p_pair`` is
    the pair's count divided by the sum of all ``two_dict`` counts, and
    ``p_first`` / ``p_second`` are the counts of ``pair[0]`` / ``pair[1]`` in
    ``one_dict`` divided by the sum of all ``one_dict`` counts.
    """
    pair_total = sum(two_dict.values())
    word_total = sum(one_dict.values())
    for pair, pair_count in two_dict.items():
        joint = pair_count / pair_total
        independent = (one_dict[pair[0]] / word_total) * (one_dict[pair[1]] / word_total)
        yield pair, log2(joint / independent)

In [9]:
def entropy(*args):
    """Return the sum of ``k * log2(k / total)`` over all arguments.

    ``total`` is the sum of the arguments. A zero argument contributes
    nothing: 1 is added inside the logarithm when ``k == 0`` so that
    ``log2`` is never evaluated at zero.
    """
    total = sum(args)
    acc = 0
    for count in args:
        zero_guard = 1 if count == 0 else 0
        acc = acc + count * log2(count / total + zero_guard)
    return acc


def calculate_g2(two_dict: Counter, one_dict: Counter):
    """Yield ``(pair, score)`` for every key of ``two_dict``.

    For each pair a 2x2 table is built from the counts:
      * ``both``        - count of the pair itself,
      * ``first_only``  - count of ``pair[0]`` in ``one_dict`` minus ``both``,
      * ``second_only`` - count of ``pair[1]`` in ``one_dict`` minus ``both``,
      * ``neither``     - sum of all ``two_dict`` counts minus the other three.
    The score is ``2 * (entropy(cells) - entropy(row sums) - entropy(column sums))``
    using the module-level ``entropy`` helper.

    Note the argument order: pair counts first, single-word counts second.
    """
    total = sum(two_dict.values())
    for pair, both in two_dict.items():
        first_only = one_dict[pair[0]] - both
        second_only = one_dict[pair[1]] - both
        neither = total - first_only - second_only - both
        cells = entropy(both, second_only, first_only, neither)
        rows = entropy(both + second_only, first_only + neither)
        cols = entropy(both + first_only, second_only + neither)
        yield pair, 2 * (cells - rows - cols)

In [8]:
# One-off step: compute the PMI score of every pair and save the result to
# disk. Uncomment to regenerate "pmi.result".

# with open("pmi.result", "wb") as f:
#     result_pmi = Counter(dict(calculate_pmi(one_word, two_word)))
#     pickle.dump(result_pmi, f)

# Load the previously saved PMI scores from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this notebook.
with open("pmi.result", "rb") as f:
    result_pmi = Counter(pickle.load(f)) 
# Display the 30 pairs with the highest PMI score.
result_pmi.most_common(30)

[(('dzwońców', 'wróbli'), 27.238054789461568),
 (('niepodoranych', 'ściernisk'), 27.238054789461568),
 (('ściernisk', 'niewykoszonych'), 27.238054789461568),
 (('resztkom', 'pożniwnym'), 27.238054789461568),
 (('csysadm', 'securitymodel'), 27.238054789461568),
 (('noindexesj', 'lang'), 27.238054789461568),
 (('offix', 'ibra'), 27.238054789461568),
 (('mokrzyńskiej', 'łukaszczuk'), 27.238054789461568),
 (('księżyc', 'ida'), 27.238054789461568),
 (('prezydenccy', 'legislatorzy'), 27.238054789461568),
 (('wagonowego', 'dźwigownika'), 27.238054789461568),
 (('biomasse', 'nachhaltigkeitsverordnung'), 27.238054789461568),
 (('matiasa', 'kryspina'), 27.238054789461568),
 (('biomassestrom', 'nachhaltigkeitverordnung'), 27.238054789461568),
 (('biokreftstoff', 'nachhaltigkeitsverodrnung'), 27.238054789461568),
 (('lrner', 'cntlr'), 27.238054789461568),
 (('qtp', 'bugzilla'), 27.238054789461568),
 (('brudnicy', 'mniszki'), 27.238054789461568),
 (('przewyż', 'szanie'), 27.238054789461568),
 (('me

In [2]:
# One-off step: compute the G2 score of every pair and save the result to
# disk. Uncomment to regenerate "g2.result". calculate_g2 takes the pair
# counts first and the single-word counts second.

# with open("g2.result", "wb") as f:
#     result_g2 = Counter(dict(calculate_g2(two_word, one_word)))
#     pickle.dump(result_g2, f)

# Load the previously saved G2 scores from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this notebook.
with open("g2.result", "rb") as f:
    result_g2 = Counter(pickle.load(f)) 
# Display the 30 pairs with the highest G2 score.
result_g2.most_common(30)

[(('z', 'dnia'), 5736386.5536103565),
 (('art', 'k'), 4885128.856697525),
 (('sygn', 'akt'), 3259341.7825914305),
 (('na', 'podstawie'), 3253185.8683400936),
 (('art', 'ust'), 3164934.1066213544),
 (('w', 'dniu'), 2857699.348271306),
 (('k', 'p'), 2794214.014423307),
 (('sąd', 'okręgowy'), 2689075.7413860047),
 (('p', 'c'), 2424782.0621353434),
 (('na', 'rzecz'), 2158916.0221890975),
 (('ubezpieczeń', 'społecznych'), 2152198.7170862355),
 (('sąd', 'rejonowy'), 1947503.0188561736),
 (('zgodnie', 'z'), 1942563.6789397376),
 (('pozbawienia', 'wolności'), 1777693.4506057038),
 (('w', 'sprawie'), 1740916.4168041379),
 (('kwotę', 'zł'), 1689158.956167014),
 (('dz', 'u'), 1674477.0599903688),
 (('z', 'art'), 1645395.933555342),
 (('zw', 'z'), 1562595.5519716053),
 (('k', 'c'), 1518648.6422379364),
 (('w', 'tym'), 1480513.2995926293),
 (('podstawie', 'art'), 1476633.460659851),
 (('sądu', 'najwyższego'), 1403991.993600442),
 (('sądu', 'rejonowego'), 1389707.4174451528),
 (('w', 'zw'), 1327099.

In [35]:
# Of the 50 highest-scoring G2 pairs, print those in which both words are
# longer than one character, one "first second score" entry per line.
pprint(''.join(
    "{} {} {} \n".format(pair[0], pair[1], score)
    for pair, score in result_g2.most_common(50)
    if len(pair[0]) > 1 and len(pair[1]) > 1
))

('sygn akt 3259341.7825914305 \n'
 'na podstawie 3253185.8683400936 \n'
 'art ust 3164934.1066213544 \n'
 'sąd okręgowy 2689075.7413860047 \n'
 'na rzecz 2158916.0221890975 \n'
 'ubezpieczeń społecznych 2152198.7170862355 \n'
 'sąd rejonowy 1947503.0188561736 \n'
 'pozbawienia wolności 1777693.4506057038 \n'
 'kwotę zł 1689158.956167014 \n'
 'podstawie art 1476633.460659851 \n'
 'sądu najwyższego 1403991.993600442 \n'
 'sądu rejonowego 1389707.4174451528 \n'
 'sądu okręgowego 1301353.5693028604 \n'
 'materiału dowodowego 1250152.5790557137 \n'
 'ust ustawy 1212241.1220036373 \n'
 'rzeczypospolitej polskiej 1158164.172824277 \n'
 'nr poz 1132052.2957682116 \n'
 'skarbu państwa 1125151.5306956181 \n'
 'art kk 1083183.3534579908 \n'
 'organ rentowy 1060304.5778583575 \n'
 'ocenie sądu 1056265.3140831762 \n'
 'niniejszej sprawie 1052486.6372797196 \n'
 'tym samym 1051294.2378502362 \n'
 'co do 998364.6733399269 \n'
 'kwoty zł 980790.8143469584 \n'
 'od dnia 979244.6362658457 \n'
 'zastępst