In [None]:
import base64
import http.client as client
import json
import xml
import xml.etree.ElementTree as ET
from datetime import datetime
from http.client import HTTPResponse
from http.client import UnknownProtocol
from io import BytesIO
from itertools import groupby
from pathlib import Path

import requests
from deepdiff import DeepDiff

In [None]:
class FakeSocket:
    """Socket stand-in that serves a fixed byte string.

    It provides only ``makefile()``, which hands back an in-memory
    ``BytesIO`` wrapping the bytes given at construction time.
    """

    def __init__(self, response_bytes):
        # One shared buffer: every makefile() call returns this same object.
        self._file = BytesIO(response_bytes)

    def makefile(self, *args, **kwargs):
        """Return the in-memory buffer; all arguments are ignored."""
        return self._file

def parse_response(res):
    """Parse raw HTTP response bytes into an ``HTTPResponse`` object.

    The bytes are wrapped in a ``FakeSocket`` so ``HTTPResponse`` can read
    them through ``makefile()``. ``begin()`` is called before returning;
    the body is not read here and is left for the caller to ``read()``.
    """
    parsed = HTTPResponse(FakeSocket(res))
    parsed.begin()
    return parsed


def extract_problems(p_raw):
    """Collect problem records from the HTTP responses stored in an XML file.

    Every child of the XML root must have exactly one ``response`` child
    element whose text is a base64-encoded raw HTTP response. A response
    contributes to the result only when its body is a JSON object holding
    both a ``"total"`` and a ``"data"`` key; the entries of ``"data"`` are
    then appended. Responses with an empty body, with a different payload
    shape, or that fail parsing with ``UnknownProtocol`` are skipped.

    Parameters
    ----------
    p_raw : path-like
        Location of the XML file to read.

    Returns
    -------
    list
        The concatenated ``"data"`` entries, in document order.

    Raises
    ------
    AssertionError
        If an item does not have exactly one ``response`` child.
    """
    root = ET.parse(str(p_raw)).getroot()
    problems = []
    for item in root:
        try:
            responses = item.findall("response")
            assert len(responses) == 1
            raw_response = base64.b64decode(responses[0].text)
            response = parse_response(raw_response)
            body = response.read()
            if len(body) == 0:
                continue
            payload = json.loads(body.decode())
            # Only JSON objects can carry the "total"/"data" keys; scalars
            # or other payload types are not problem listings.
            if not isinstance(payload, dict):
                continue
            if "total" not in payload or "data" not in payload:
                continue
            problems.extend(payload["data"])
        except UnknownProtocol:
            # Best effort: a response whose status line cannot be parsed
            # is skipped rather than aborting the whole extraction.
            continue
    return problems

def argmax(iterable):
    """Return the index of the largest element of ``iterable``.

    When several elements are equally large, the index of the first one is
    returned. An empty iterable raises ``ValueError`` (from ``max``).
    """
    best_index, _ = max(enumerate(iterable), key=lambda pair: pair[1])
    return best_index

def parse_datetime(dstr):
    """Parse a timestamp string into a ``datetime``.

    Two layouts are accepted, tried in this order:
    ``%Y-%m-%dT%H:%M:%S.%f`` (with fractional seconds) and
    ``%Y-%m-%dT%H:%M:%S`` (without).

    Parameters
    ----------
    dstr : str
        The timestamp text to parse.

    Returns
    -------
    datetime
        The parsed timestamp.

    Raises
    ------
    ValueError
        If ``dstr`` matches neither layout.
    """
    formats = ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S")
    for date_form in formats:
        try:
            return datetime.strptime(dstr, date_form)
        except ValueError:
            # Not this layout; fall through to the next candidate.
            continue
    raise ValueError("Unknown format {}".format(dstr))

def get_unique_problems(problems):
    """Return one problem per ``"apiId"``, keeping the most recently updated.

    Problems are grouped by their ``"apiId"`` value. A group with a single
    entry is kept as is (its ``"dateUpdated"`` is not inspected). For a
    group with several entries, the one whose ``"dateUpdated"`` parses to
    the latest timestamp is kept; on equal timestamps the first entry wins.

    The input list is left untouched; a new list is returned.

    Parameters
    ----------
    problems : list of dict
        Problem records; each must have an ``"apiId"`` key, and duplicated
        records must also have a ``"dateUpdated"`` key.

    Returns
    -------
    list of dict
        The selected records, ordered by ``"apiId"``.
    """
    def api_id(problem):
        return problem["apiId"]

    def updated_at(problem):
        return parse_datetime(problem["dateUpdated"])

    unique = []
    # groupby only merges adjacent items, so the records are sorted by the
    # grouping key first (on a copy, to avoid reordering the caller's list).
    for _, group in groupby(sorted(problems, key=api_id), key=api_id):
        candidates = list(group)
        if len(candidates) == 1:
            unique.append(candidates[0])
        else:
            # The same problem exists multiple times: keep the latest one.
            unique.append(max(candidates, key=updated_at))
    return unique

In [None]:
# Directory layout: raw XML files are read from data/raw, and the extracted
# problem lists are written to data/problems.
P_ROOT = Path("./data")
P_RAW = P_ROOT.joinpath("raw")
P_PROB = P_ROOT.joinpath("problems")

# Dataset name -> raw XML file for that dataset.
P_RAWS = dict(
    moon_2016_45=P_RAW.joinpath("moon_2016_45d.xml"),
    moon_2019_45=P_RAW.joinpath("moon_2019_45d.xml"),
    moon_2019_25=P_RAW.joinpath("moon_2019_25d.xml"),
    moon_2017_25=P_RAW.joinpath("moon_2017_25d.xml"),
    moon_2017_45=P_RAW.joinpath("moon_2017_45d.xml"),
)

# Fail early, naming the offending path, if an input file is missing.
for p_raw in P_RAWS.values():
    assert p_raw.exists(), p_raw

In [None]:
# Make sure the output directory exists before the first file is written;
# otherwise open() fails on a fresh checkout.
P_PROB.mkdir(parents=True, exist_ok=True)

# Extract, de-duplicate and save the problems of every raw file.
for name, p_raw in P_RAWS.items():
    problems = get_unique_problems(extract_problems(p_raw))
    print("{:>14}: {:6d}".format(name, len(problems)))
    # One JSON file per dataset, named after its key in P_RAWS.
    data_file_name = "{}.json".format(name)
    p_dest = P_PROB / data_file_name
    with open(p_dest, "w", encoding="utf-8") as file:
        json.dump(problems, file)

In [None]:
# Path of the JSON file written in the last iteration of the loop above.
p_dest

In [None]:
# Preview only the first few problems of the last processed dataset
# instead of dumping the whole list into the notebook output.
problems[:5]