In [1]:
import json
from html import unescape
from urllib import parse
from bs4 import BeautifulSoup

In [2]:
# Load the scraped problem pages. The context manager closes the file handle
# deterministically instead of leaving it to garbage collection.
with open('./problems.json') as in_file:
    raw_data = json.load(in_file)

In [3]:
def parse_example(s):
    """Split one example line of the form ``"<call> → <result>"``.

    Returns ``None`` when ``s`` is ``None``; otherwise a dict with the text
    before the arrow under ``'input'`` and the text after it under
    ``'output'``.

    Raises ``ValueError`` if the arrow delimiter does not occur exactly once,
    because the split result is unpacked into exactly two names.
    """
    if s is None:
        return None

    pieces = s.split(' → ')
    call_text, result_text = pieces  # strict two-way unpack, as before
    return dict(input=call_text, output=result_text)

In [4]:
# Build one cleaned record (dict) per entry of raw_data and collect them in
# cleaned_data. Each raw entry maps a key (stored as the record's 'url') to
# metadata holding 'title', 'section' and the page HTML under 'raw'.
cleaned_data = []
for problem, metadata in raw_data.items():
    problem_url = problem
    title = metadata['title']
    section = metadata['section']
    
    # Parse the full page first; `soup` is narrowed to a single <td> below,
    # so the difficulty must be read while the whole document is still in hand.
    soup = BeautifulSoup(metadata['raw'], 'html5lib')
    
    # Difficulty lives in the third-from-last <p>; keep the piece that follows
    # the first ': ' in its text (still a string here, converted further down).
    difficulty = soup.find_all('p')[-3]
    difficulty = difficulty.text.split(': ')[1]
    
    # From here on `soup` is the second-to-last <td> of the page.
    # unwrap() replaces the first <div> under it with that div's children,
    # presumably so the description nodes show up in soup.contents
    # -- TODO confirm against the page markup.
    soup = soup.find_all('td')[-2]
    soup.div.unwrap()
    
    # The first child node of the cell is taken as the problem summary.
    summary = soup.contents[0]
    
    examples = []
    # `i` indexes soup.contents[1:], so `el` is soup.contents[i+1] and
    # soup.contents[i+1+1] is the node immediately after `el`.
    for i, el in enumerate(soup.contents[1:]):
        # Only nodes exposing a truthy `is_empty_element` trigger a parse
        # (presumably <br> separators); nodes without the attribute are skipped.
        if getattr(el, 'is_empty_element', False):
            try:
                res = parse_example(soup.contents[i+1+1])
                if res:
                    examples.append(res)
            except TypeError:
                # NOTE(review): presumably raised when the following node is
                # not a plain string -- confirm.
                pass # only 1 instance...funky edge case
            
    # Check for an embedded solution: a gray "show solution" button carries the
    # solution text inside its inline onclick JavaScript.
    button = soup.find_all('button', class_='gray')
    if button and button[0].text.lower() == 'show solution':
        inline_javascript = button[0]['onclick']
        # Strip the fixed JavaScript prefix and suffix by length, leaving only
        # the escaped solution payload between them.
        skip_front = len('document.getElementById("results").innerHTML="<b>Solution:</b><pre>"+unescape("')
        skip_back = len('" + "</pre>"); sendRemark("h2");')
        escaped_solution = inline_javascript[skip_front:-skip_back]
        # Percent-decode first, then resolve HTML entities (html.unescape).
        solution = unescape(parse.unquote(escaped_solution))
    else:
        solution = None
    
    # Signature is the text of the element with id 'ace_div' up to the first '{'.
    signature = soup.find(id='ace_div').text.split('{')[0]
    
    info = {
        'url': problem_url,
        'section': section,
        'title': title,
        # use split strategy to deal with "post-solution available" case
        # (only the first line of the difficulty text is converted to float)
        'difficulty': float(difficulty.split('\n')[0].strip()),
        'summary': summary,
        'signature': signature.strip(),
        'examples': examples,
        # Always a list: one element when a solution was found, else empty.
        'solution': [solution] if solution is not None else []
    }
    cleaned_data.append(info)

In [5]:
# Write the cleaned records to disk. The context manager guarantees the file
# is flushed and closed, even if serialisation fails part-way through.
with open('./problems_with_builtin_answers.json', 'w') as out_file:
    json.dump(cleaned_data, out_file)