In [1]:
import json
import pickle
import re
import string
import tarfile
from collections import Counter, defaultdict
from datetime import datetime
from math import log2
from pprint import pprint

In [2]:
def get_next(tars):
    """Yield the parsed JSON document of every judgment file in an open tar archive.

    Only regular members whose name starts with ``data/json/judgments`` are
    read. Before each document is yielded, a 1-based running number and the
    member name are printed as a progress indicator.
    """
    judgment_members = (
        member for member in tars
        if member.isreg() and member.name.startswith('data/json/judgments')
    )
    for position, member in enumerate(judgment_members, start=1):
        pprint(str(position) + ": " + member.name)
        yield json.load(tars.extractfile(member))


def given_date(date, year=2015):
    """Tell whether a ``YYYY-MM-DD`` date string falls in the given year.

    Parameters:
        date: date string in ``%Y-%m-%d`` format.
        year: calendar year to compare against; defaults to 2015, the year
            that was previously hard-coded.

    Returns:
        True when the parsed date's year equals ``year``, otherwise False.

    Raises:
        ValueError: if ``date`` does not match the ``%Y-%m-%d`` format.
    """
    return datetime.strptime(date, '%Y-%m-%d').year == year

In [3]:
# Token separator: any run of non-word characters.
st = r"\W+"
# Anything enclosed in angle brackets (e.g. markup tags).
res1 = r"(<[^>]*>)"
# A hyphen immediately followed by a line break, i.e. a word hyphenated across
# lines. "\r\n" has to be tried before "\r": with "\r" first only "-\r" was
# consumed and the leftover "\n" still split the word in two.
res2 = r"(-(\r\n|\n|\r|\x0b|\x0c|\x1c|\x1d|\x1e|\x85|\u2028|\u2029))"
# A whole string made of one or more basic Latin / Polish letters.
res3 = r"^[a-ząćęłńóśźż]+$"
# Same as res3 but also accepts the empty string; not compiled in this cell.
res4 = r"^[a-ząćęłńóśźż]*$"
com = re.compile(st, re.IGNORECASE)
com1 = re.compile(res1, re.IGNORECASE)
com2 = re.compile(res2, re.IGNORECASE)
pairs1 = re.compile(res3, re.IGNORECASE)

# Module-level handle on the gzip-compressed dump, opened for reading; give_me
# iterates it through get_next(tar).
# NOTE(review): no close() is visible in these cells, so the handle appears to
# stay open for as long as the kernel lives - confirm this is intended.
tar = tarfile.open("../saos-dump-23.02.2018.tar.gz", "r:gz")

In [4]:
def get_pairs(fun, itera):
    """Yield consecutive ``(previous, current)`` pairs of accepted words.

    Every item of ``itera`` has "§" and ASCII punctuation removed and is
    lower-cased; only the results for which ``fun`` returns a truthy value are
    kept. Each kept word is paired with the kept word that preceded it.

    Parameters:
        fun: predicate applied to each normalised word.
        itera: iterable of raw word strings.

    Yields:
        2-tuples of adjacent accepted words. Nothing is yielded when fewer
        than two words are accepted.
    """
    # Build the translation table once instead of once per word.
    strip_table = str.maketrans("", "", "§" + string.punctuation)
    accepted = filter(fun, (raw.translate(strip_table).lower() for raw in itera))
    # A bare next() would let StopIteration escape the generator when no word
    # is accepted, which Python turns into RuntimeError (PEP 479).
    first = next(accepted, None)
    if first is None:
        return
    for word in accepted:
        yield (first, word)
        first = word


def get_one(fun, itera):
    """Yield each normalised word of ``itera`` that ``fun`` accepts.

    Normalisation removes "§" and ASCII punctuation and lower-cases the word;
    a word is yielded when ``fun`` returns a truthy value for its normalised
    form.
    """
    strip_table = str.maketrans("", "", "§" + string.punctuation)
    for raw in itera:
        token = raw.translate(strip_table).lower()
        if fun(token):
            yield token


def give_me(fun_give):
    """Stream the output of ``fun_give`` over every judgment accepted by ``given_date``.

    For each document produced by ``get_next(tar)`` and each entry of its
    ``'items'`` list, the judgment text has angle-bracketed spans and
    hyphen-plus-line-break sequences removed, is split on non-word characters,
    and the resulting word iterator is handed to ``fun_give`` together with
    the ``pairs1.match`` predicate.

    Parameters:
        fun_give: callable ``(predicate, words)`` returning an iterable, e.g.
            ``get_pairs`` or ``get_one``.

    Yields:
        Whatever ``fun_give`` yields, judgment after judgment.

    Judgments lacking ``'judgmentDate'`` or ``'textContent'`` are reported
    and skipped.
    """
    for document in get_next(tar):
        for judgment in document['items']:
            # Only the key lookups are guarded, so a KeyError raised inside
            # fun_give is no longer silently swallowed.
            try:
                if not given_date(judgment['judgmentDate']):
                    continue
                text = judgment['textContent']
            except KeyError as missing:
                # Print the exception instance so the missing key is visible,
                # rather than the bare KeyError class.
                pprint(missing)
                continue
            words = iter(com.split(com2.sub("", com1.sub("", text))))
            yield from fun_give(pairs1.match, words)

In [5]:
# Bigram counts are read from the "two_words" pickle file. The commented-out
# lines are the statements that build that file from the archive through
# give_me(get_pairs).
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# produced yourself.
# with open("two_words", "wb") as f:
#     two_word = Counter(give_me(get_pairs))
#     pickle.dump(two_word, f)
with open("two_words", "rb") as f:
    two_word = Counter(pickle.load(f))
    
# Single-word counts, cached the same way in "one_len"; the commented-out
# lines rebuild the file through give_me(get_one).
# with open("one_len", "wb") as f:
#     one_word = Counter(give_me(get_one))
#     pickle.dump(one_word, f)
with open("one_len", "rb") as f:
    one_word = Counter(pickle.load(f))

In [10]:
def calculate_pmi(one_dict: Counter, two_dict: Counter):
    """Yield ``(pair, score)`` for every key of ``two_dict``.

    The score is ``log2(P(pair) / (P(first) * P(second)))``, where the pair
    probability is the pair count divided by the sum of all ``two_dict``
    values and each word probability is the word count divided by the sum of
    all ``one_dict`` values.
    """
    pair_total = sum(two_dict.values())
    word_total = sum(one_dict.values())
    for pair, pair_count in two_dict.items():
        head, tail = pair[0], pair[1]
        joint = pair_count / pair_total
        independent = (one_dict[head] / word_total) * (one_dict[tail] / word_total)
        yield (pair, log2(joint / independent))

In [9]:
def entropy(*args):
    """Return ``sum(k * log2(k / N))`` over the given counts, with ``N = sum(args)``.

    Zero counts contribute 0 to the result.

    Raises:
        ZeroDivisionError: when at least one count is given and they sum to 0.
    """
    tmp = sum(args)
    # int(k == 0) makes the log argument 1 for an empty cell, so the term is
    # 0 * log2(1) = 0 instead of a math domain error on log2(0).
    return sum((k * log2(k / tmp + int(k == 0)) for k in args))


def calculate_g2(two_dict: Counter):
    """Yield ``((first, second), score)`` for every pair key of ``two_dict``.

    For each pair a 2x2 table is formed from the pair counts:
    k11 = count of (first, second); k21 = count of pairs starting with
    ``first`` but ending otherwise; k12 = count of pairs ending with
    ``second`` but starting otherwise; k22 = everything else. The score is
    ``2 * (entropy(cells) - entropy(k11 + k12, k21 + k22)
    - entropy(k11 + k21, k12 + k22))``.

    Parameters:
        two_dict: mapping from 2-tuples of words to their counts.
    """
    total = sum(two_dict.values())
    # Marginal totals are accumulated once up front; re-summing a word's whole
    # row/column for every pair that contains it was needlessly expensive for
    # frequent words.
    first_totals = Counter()
    second_totals = Counter()
    for (fst, sec), count in two_dict.items():
        first_totals[fst] += count
        second_totals[sec] += count
    for (fst, sec), k11 in two_dict.items():
        k21 = first_totals[fst] - k11
        k12 = second_totals[sec] - k11
        k22 = total - k21 - k12 - k11
        yield ((fst, sec), 2 * (entropy(k11, k12, k21, k22) - entropy(k11 + k12, k21 + k22)
                                    - entropy(k11 + k21, k12 + k22)))

In [11]:
# PMI scores are read from the "pmi.result" pickle file; the commented-out
# lines recompute them with calculate_pmi(one_word, two_word) and write the file.
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# produced yourself.
# with open("pmi.result", "wb") as f:
#     result_pmi = Counter(dict(calculate_pmi(one_word, two_word))) 
#     pickle.dump(result_pmi, f)
with open("pmi.result", "rb") as f:
    result_pmi = Counter(pickle.load(f)) 
# Display the ten pairs with the highest score.
result_pmi.most_common(10)

[(('dzwońców', 'wróbli'), 27.238054789461568),
 (('niepodoranych', 'ściernisk'), 27.238054789461568),
 (('ściernisk', 'niewykoszonych'), 27.238054789461568),
 (('resztkom', 'pożniwnym'), 27.238054789461568),
 (('csysadm', 'securitymodel'), 27.238054789461568),
 (('noindexesj', 'lang'), 27.238054789461568),
 (('offix', 'ibra'), 27.238054789461568),
 (('mokrzyńskiej', 'łukaszczuk'), 27.238054789461568),
 (('księżyc', 'ida'), 27.238054789461568),
 (('prezydenccy', 'legislatorzy'), 27.238054789461568)]

In [12]:
# G2 scores are read from the "g2.result" pickle file; the commented-out
# lines recompute them with calculate_g2(two_word) and write the file.
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# produced yourself.
# with open("g2.result", "wb") as f:
#     result_g2 = Counter(dict(calculate_g2(two_word)))
#     pickle.dump(result_g2, f)
with open("g2.result", "rb") as f:
    result_g2 = Counter(pickle.load(f)) 
# Display the ten pairs with the highest score.
result_g2.most_common(10)

[(('z', 'dnia'), 5736386.5536103565),
 (('art', 'k'), 4885128.856697525),
 (('sygn', 'akt'), 3259341.7825914305),
 (('na', 'podstawie'), 3253185.8683400936),
 (('art', 'ust'), 3164934.1066213544),
 (('w', 'dniu'), 2857699.348271306),
 (('k', 'p'), 2794214.014423307),
 (('sąd', 'okręgowy'), 2689075.7413860047),
 (('p', 'c'), 2424782.0621353434),
 (('na', 'rzecz'), 2158916.0221890975)]