In [140]:
import json
import pandas
import os
import re
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from  itertools import tee, chain, islice, groupby
from math import log
from operator import itemgetter
from itertools import takewhile
import requests
import re

### Remarks
- 2018 has 990 judgements. 786 has justification. That means 20% of judgements has no justification. Quite weird. It turns out that half of these judgements contains word `oddalać (remove)` in some form. What about rest of judgements? Why they do not contain justification?   

In [141]:
root_dir = '/home/marcin/Desktop/SemestrVIII/PJN'
year = "2018"
json_data_dir = f"{root_dir}/data/json"

patterns = {
    re.compile('A?C.*') : 'civil',
    re.compile('A?U.*') : 'insurance',
    re.compile('A?K.*') : 'criminal', 
    re.compile('G.*') :  'economic',
    re.compile('A?P.*'): 'work', 
    re.compile('R.*'): 'family', 
    re.compile('W.*') : 'violations', 
    re.compile('Am.*'): 'competition'  ,
    re.compile('.*'): 'other' 
    
    }

common_words = ['w', 'z', 'na', 'i', 'do',
 'nie', 'o', 'k', 'r', 'że',
 'art', 'dnia', 'się', 'od', 'a',
 'przez', 'sąd', 'roku', 'pracy', 'za']

def extract_file_number(filename):
    return int(filename.split('-')[1].split('.')[0])

def is_2017_in_file(filename):
    return 2716 <= extract_file_number(filename) <= 3163 

def is_2018_in_file(filename):
    return 3163 <= extract_file_number(filename) <=  3173

def clean_text(line):
    justification = line.split('<h2>UZASADNIENIE</h2>')[1]
    notags = re.sub(r"<[^>]*>", " ", justification)
    nobreaks =  re.sub(r"-\n", " ", notags)
    nodigits =  re.sub(r"\d+", " ", nobreaks)
    noromans = re.sub(r"\b[XVILMC]+\b", "", nodigits)
    lower = noromans.lower()
    words =  filter(lambda x: x not in common_words, re.findall(r"\w+",lower))
    return words

def filter_judgements(judgement):
    return year in judgement['judgmentDate']  and \
    judgement['courtType'] in ['COMMON', 'SUPREME'] and \
    '<h2>UZASADNIENIE</h2>' in judgement['textContent']

def map_category(judgement):
    caseNumber = judgement['courtCases'][0]['caseNumber'].split(' ')
    _, label = next(filter(lambda pat : pat[0].match(caseNumber[1]),patterns.items()))
    return label, caseNumber

def judgements_raw(filename):
    with open(os.path.join(json_data_dir, filename), 'r') as jsonFile:
        judgements = json.load(jsonFile)['items'] 
    return judgements

In [142]:
judgements_files= filter(is_2018_in_file, os.listdir(json_data_dir))
texts =  chain.from_iterable(map( judgements_raw, judgements_files))
filtered = filter(filter_judgements,texts)
with_categories = map(lambda j: (j,map_category(j)), filtered)
cleaned = map(lambda j: (j[0]['id'], clean_text(j[0]['textContent']),j[1][0],j[1][1]) ,
             with_categories) 
sorted_categories = sorted(cleaned,key = itemgetter(2))
groups = groupby(sorted_categories,itemgetter(2))
grouped = dict((k, list(g)) for k, g in groups)    

In [143]:
grouped['other']

[(323022, <filter at 0x7fb1760544a8>, 'other', ['I', 'Ns', '514/15']),
 (320990, <filter at 0x7fb17d17c588>, 'other', ['I', 'Ns', '96/17']),
 (321229, <filter at 0x7fb17d152668>, 'other', ['I', 'Ns', '92/16']),
 (322819, <filter at 0x7fb17cea4518>, 'other', ['I', 'Ns', '613/14']),
 (326268, <filter at 0x7fb17a81df98>, 'other', ['III', 'Nsm', '815/17']),
 (324484, <filter at 0x7fb178ba97f0>, 'other', ['X', 'Ns', '753/17']),
 (324523, <filter at 0x7fb1789cbc18>, 'other', ['XVI', 'Ns', '577/15']),
 (326357, <filter at 0x7fb176b9bf98>, 'other', ['I', 'Ns', '90/16']),
 (326355, <filter at 0x7fb17882bda0>, 'other', ['I', 'Ns', '252/17']),
 (326356, <filter at 0x7fb178807cc0>, 'other', ['I', 'Ns', '452/16']),
 (326196, <filter at 0x7fb17c331f60>, 'other', ['I', 'Ns', '31/17']),
 (324819, <filter at 0x7fb17ad3a5f8>, 'other', ['IV', 'Nsm', '907/17']),
 (325510, <filter at 0x7fb17a394630>, 'other', ['I', 'Ns', '228/17']),
 (325951, <filter at 0x7fb176114828>, 'other', ['II', 'S', '3/17']),
 (325