In [1]:
import json
import pandas
import os
import re
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from itertools import tee, chain, islice
from math import log
from operator import itemgetter
from itertools import takewhile
import urllib
import requests
from tqdm import tqdm
import time

In [2]:
# Absolute root directory of the project data on the author's machine.
root_dir = '/home/marcin/Desktop/SemestrVIII/PJN'
# Year, as a string, that judgments are filtered by (matched against 'judgmentDate').
year = "2018"
# Directory containing the judgments-*.json input files.
json_data_dir = f"{root_dir}/data/json"
# Raw string so that \d and \. reach the regex engine unchanged instead of being
# treated as (invalid) string escapes. Matches judgments-3163.json through
# judgments-3179.json — presumably the files holding judgments from `year`.
filesInYearPattern = r'judgments-(316[3-9]|317\d)\.json'
# Directory (relative to the working directory) where service responses are stored.
out_dir = 'data/responses'

In [3]:
def judgement_texts(filename):
    """Yield the judgments from one JSON file whose date contains ``year``.

    The file is read from ``json_data_dir``; its top-level ``'items'`` entry is
    expected to be a list of judgment dicts.

    Args:
        filename: name of the JSON file inside ``json_data_dir``.

    Yields:
        Tuples ``(textContent, judgmentDate, id)`` for every item whose
        ``'judgmentDate'`` value contains the module-level ``year`` string.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if the document or an item lacks one of the expected keys.
    """
    path = os.path.join(json_data_dir, filename)
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which differs between machines.
    with open(path, 'r', encoding='utf-8') as jsonFile:
        judgements = json.load(jsonFile)['items']
    for item in judgements:
        # Substring test, kept from the original filter.
        if year in item['judgmentDate']:
            yield item['textContent'], item['judgmentDate'], item['id']

In [4]:
def clean_text(line):
    """Flatten raw judgment text into a single line of plain text.

    Substitutions are applied in this order:
      1. every ``<...>`` markup tag becomes a space,
      2. a hyphen immediately followed by a newline becomes a space,
      3. the literal placeholder ``(...)`` becomes a space,
      4. every remaining newline becomes a space,
      5. every non-breaking space (U+00A0) becomes a regular space.

    Runs of spaces produced by these steps are not collapsed.

    Args:
        line: the text to clean.

    Returns:
        The cleaned text.
    """
    # All patterns are raw strings so that regex escapes are not interpreted
    # (or rejected as invalid) by the Python string parser.
    without_tags = re.sub(r"<[^>]*>", " ", line)
    # NOTE(review): a hyphenated line break is replaced by a space, which leaves
    # the two halves of the word separated — confirm this is intended.
    without_hyphen_breaks = re.sub(r"-\n", " ", without_tags)
    without_ellipsis = re.sub(r"\(\.\.\.\)", " ", without_hyphen_breaks)
    single_line = re.sub(r"\n", " ", without_ellipsis)
    return single_line.replace('\xa0', ' ')

In [5]:
def send_request(text, timeout=600):
    """POST ``text`` to the CLARIN-PL ``nlprest2`` service and return the reply body.

    The request is a JSON document carrying the ``lpmn`` pipeline string, the
    ``user`` identifier and the text itself, sent to ``<base url>/process/``.

    Args:
        text: the text to process (str).
        timeout: seconds passed to ``requests.post`` as its timeout; ``None``
            waits indefinitely, which was the previous behaviour.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.Timeout: if the service does not answer in time.
        requests.exceptions.HTTPError: if the service answers with a 4xx/5xx
            status, so that an error page is never mistaken for a result.
    """
    url = "http://ws.clarin-pl.eu/nlprest2/base"
    lpmn = """any2txt|wcrft2|liner2({"model":"n82"})"""
    # NOTE(review): looks like a placeholder value — confirm the service accepts it.
    user = "mojadresemail"
    json_data = {
        'lpmn': lpmn,
        'user': user,
        # The former str(text.encode('utf-8'), 'utf-8') round trip returned the
        # same string, so the text is passed through directly.
        'text': text
    }
    resp = requests.post(
        url + '/process/',
        data=json.dumps(json_data),
        headers={'Content-Type': 'application/json'},
        timeout=timeout)
    resp.raise_for_status()
    return resp.text

In [9]:
def process_texts(texts, delay=60):
    """Send every text to the service and store each response in ``out_dir``.

    Args:
        texts: iterable of ``(id, text)`` pairs; the id (converted with
            ``str``) is used as the output file name.
        delay: seconds to wait between two consecutive requests.

    Each response is written to ``out_dir/<id>``, overwriting any existing file.
    Exceptions raised by ``send_request`` or by the file write propagate to the
    caller and stop the loop.
    """
    # Create the target directory up front so that a response obtained after a
    # slow request is not lost to a missing-directory error.
    os.makedirs(out_dir, exist_ok=True)
    for index, (_id, text) in enumerate(tqdm(texts)):
        if index:
            # Throttle between requests only; there is nothing to wait for
            # after the final one.
            time.sleep(delay)
        response = send_request(text)
        # Explicit UTF-8 so the stored file does not depend on the locale.
        with open(os.path.join(out_dir, str(_id)), 'w',
                  encoding='utf-8') as out_file:
            out_file.write(response)

In [7]:
# os.listdir() returns names in arbitrary order; sorting them makes the
# selection below reproducible from run to run.
json_files = sorted(os.listdir(json_data_dir))
judgements_files = [name for name in json_files
                    if re.match(filesInYearPattern, name)]
# Flatten the per-file generators into one list of
# (textContent, judgmentDate, id) tuples.
text_date = list(chain.from_iterable(map(judgement_texts, judgements_files)))
# Keep the first 100 entries ordered by the judgmentDate string (chronological
# if the dates are ISO formatted — TODO confirm). sorted() is stable, so
# entries with equal dates keep their file order.
to_process = sorted(text_date, key=itemgetter(1))[:100]
# Pair each judgment id with its cleaned text, the shape process_texts expects.
texts = [(judgment_id, clean_text(content))
         for content, _date, judgment_id in to_process]

In [10]:
# Run the whole batch: every (id, text) pair in `texts` is sent to the service
# and the response is written to a file named after the id inside out_dir.
# Long-running: process_texts waits 60 seconds between requests.
process_texts(texts)


  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [01:38<2:41:52, 98.11s/it][A
  2%|▏         | 2/100 [03:06<2:32:31, 93.38s/it][A
  3%|▎         | 3/100 [04:26<2:23:33, 88.79s/it][A
  4%|▍         | 4/100 [05:34<2:13:45, 83.59s/it][A
  5%|▌         | 5/100 [07:06<2:15:09, 85.36s/it][A
  6%|▌         | 6/100 [08:52<2:19:01, 88.74s/it][A
  7%|▋         | 7/100 [10:42<2:22:15, 91.78s/it][A
  8%|▊         | 8/100 [11:52<2:16:37, 89.11s/it][A
  9%|▉         | 9/100 [13:10<2:13:10, 87.81s/it][A
 10%|█         | 10/100 [14:27<2:10:08, 86.76s/it][A
 11%|█         | 11/100 [15:56<2:09:00, 86.97s/it][A
 12%|█▏        | 12/100 [17:50<2:10:49, 89.20s/it][A
 13%|█▎        | 13/100 [20:31<2:17:21, 94.73s/it][A
 14%|█▍        | 14/100 [21:47<2:13:54, 93.42s/it][A
 15%|█▌        | 15/100 [23:04<2:10:45, 92.30s/it][A
 16%|█▌        | 16/100 [24:50<2:10:27, 93.18s/it][A
 17%|█▋        | 17/100 [26:37<2:09:57, 93.95s/it][A
 18%|█▊        | 18/100 [27:43<2:06:18, 92.43s/