In [1]:
import mechanicalsoup
import numpy as np
from collections import namedtuple
import pandas as pd
from pandarallel import pandarallel
import pathlib
import re
import os
import sqlite3
from time import sleep
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm import tqdm

In [2]:
import logging

# Debug log for the scraper. Mode 'w' truncates the file every time this cell runs.
filehandler = logging.FileHandler('scrap-items-debug.log', 'w')
formatter = logging.Formatter('%(asctime)-15s::%(levelname)s::%(filename)s::%(funcName)s::%(lineno)d::%(message)s')
filehandler.setFormatter(formatter)

# NOTE: on Python >= 3.9 getLogger('root') returns the real root logger; on
# older versions it returns an ordinary logger that just happens to be named 'root'.
log = logging.getLogger('root')

# Re-running the cell must not stack up file handlers. Iterate over a copy
# because the handler list is modified inside the loop.
for hdlr in log.handlers[:]:
    if isinstance(hdlr, logging.FileHandler):
        log.removeHandler(hdlr)
        hdlr.close()  # release the file handle instead of leaking it until GC

log.addHandler(filehandler)
# Record everything from DEBUG upwards.
log.setLevel(logging.DEBUG)


In [3]:
# Open the local SQLite file and run the query that lists every Codeforces
# problem (name, difficulty letter, link, contest id), ordered by contest.
conn = sqlite3.connect('./links.sqlite3')
cursor = conn.cursor().execute(
    'SELECT name, difficulty, link, contest FROM codeforces ORDER BY contest;'
)

In [4]:
# Build the frame and name its columns in a single step: with zero rows,
# assigning `df.columns` afterwards fails with a length-mismatch ValueError,
# whereas `columns=` simply yields an empty frame with the right columns.
# NOTE: fetchall() drains the cursor, so re-run the query cell before
# re-running this one, otherwise the frame will be empty.
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[description[0] for description in cursor.description],
)
df.head()

Unnamed: 0,name,difficulty,link,contest
0,Ancient Berland Circus,C,http://codeforces.com/problemset/problem/1/C,1
1,Spreadsheet,B,http://codeforces.com/problemset/problem/1/B,1
2,Theatre Square,A,http://codeforces.com/problemset/problem/1/A,1
3,Commentator problem,C,http://codeforces.com/problemset/problem/2/C,2
4,The least round way,B,http://codeforces.com/problemset/problem/2/B,2


In [5]:
# Single MechanicalSoup browser instance; scrap_item() issues every request through it.
browser = mechanicalsoup.StatefulBrowser()

In [6]:
# Make sure ./problems/ exists: missing parents are created, and an existing
# directory is not an error. (No cell shown here writes into it yet.)
pathlib.Path("./problems/").mkdir(parents=True, exist_ok=True)

In [7]:
# Separate INFO-level progress log with its own file. Mode 'w' truncates the
# file every time this cell runs.
info_filehandler = logging.FileHandler('scrap-items-info.log', 'w')
info_formatter = logging.Formatter('%(asctime)-15s::%(levelname)s::%(funcName)s::%(lineno)d  %(message)s')
info_filehandler.setFormatter(info_formatter)

info_log = logging.getLogger('info')  # a named logger, not the root logger

# Re-running the cell must not stack up file handlers. Iterate over a copy
# because the handler list is modified inside the loop.
for hdlr in info_log.handlers[:]:
    if isinstance(hdlr, logging.FileHandler):
        info_log.removeHandler(hdlr)
        hdlr.close()  # release the file handle instead of leaking it until GC

info_log.addHandler(info_filehandler)
# Only INFO and above are recorded by this logger.
info_log.setLevel(logging.INFO)


In [8]:
def get_time_limit(page):
    """Return the time limit shown on a parsed problem page.

    Args:
        page: a BeautifulSoup-like object exposing ``select``; its first
            ``.time-limit`` element holds a title ``div`` followed by the value.

    Returns:
        float: the first number found in the element, fractional part
        included (e.g. ``0.5``). The unit is not parsed - presumably seconds,
        confirm against the site.

    Raises:
        IndexError: the page has no ``.time-limit`` element.
        ValueError: the element contains no number.

    Side effect: the title ``div`` is removed from ``page`` (if still present).
    """
    time_limit = page.select('.time-limit')[0]
    # Drop the caption so only the value remains. Guarded so that a second
    # call on the same page (caption already gone) does not crash.
    if time_limit.div is not None:
        time_limit.div.replace_with('')
    # Raw string, and an optional fractional part: plain (\d+) turned
    # "0.5 seconds" into 0.0.
    match = re.search(r'\d+(?:\.\d+)?', time_limit.text)
    if match is None:
        raise ValueError(f'no time limit found in {time_limit.text!r}')
    return float(match.group(0))

# Multipliers (binary, matching the original 1024 * 1024 for megabytes) keyed
# by the singular, lower-cased unit name.
_BYTES_PER_UNIT = {
    'byte': 1,
    'kilobyte': 1024,
    'megabyte': 1024 ** 2,
    'gigabyte': 1024 ** 3,
}


def get_memory_limit(page):
    """Return the memory limit shown on a parsed problem page, in bytes.

    Args:
        page: a BeautifulSoup-like object exposing ``select``; its first
            ``.memory-limit`` element holds a title ``div`` followed by the
            value and its unit (e.g. ``256 megabytes``).

    Returns:
        float: the limit converted to bytes. The result is always a float;
        when the unit is missing or not recognised the bare number is
        returned unscaled, as the original code did for non-megabyte units.

    Raises:
        IndexError: the page has no ``.memory-limit`` element.
        ValueError: the element contains no number.

    Side effect: the title ``div`` is removed from ``page`` (if still present).
    """
    memory_limit = page.select('.memory-limit')[0]
    # Drop the caption; guarded so a repeated call on the same page is safe.
    if memory_limit.div is not None:
        memory_limit.div.replace_with('')
    # One raw-string pattern for both parts. The unit is letters only, so
    # trailing whitespace or punctuation cannot break the unit comparison.
    match = re.search(r'(\d+(?:\.\d+)?)\s*([A-Za-z]+)?', memory_limit.text)
    if match is None:
        raise ValueError(f'no memory limit found in {memory_limit.text!r}')
    amount = float(match.group(1))
    unit = (match.group(2) or '').lower().rstrip('s')  # 'Megabytes' -> 'megabyte'
    return amount * _BYTES_PER_UNIT.get(unit, 1)

def _get_sample_texts(page, css_class):
    """Collect the text of every sample block of one kind from a problem page.

    Args:
        page: a BeautifulSoup-like object exposing ``select``.
        css_class: class of the ``div`` blocks to collect inside the first
            ``.sample-tests`` element (``'input'`` or ``'output'``).

    Returns:
        list[str]: a plain list with one string per block, in page order,
        with each block's ``.title`` caption removed.

    Raises:
        IndexError: the page has no ``.sample-tests`` element (scrap_item
        catches this and records the problem without samples).

    Side effect: the caption elements are removed from ``page``.
    """
    sample_tests = page.select('.sample-tests')[0]
    texts = []
    for block in sample_tests.find_all('div', attrs={'class': css_class}):
        # Strip the caption if it is still there; a previous call on the same
        # page may already have removed it.
        titles = block.select('.title')
        if titles:
            titles[0].replace_with('')
        # NOTE(review): .text joins text nodes only, so line breaks written as
        # tags (e.g. <br>) inside a sample would be lost - confirm on real pages.
        texts.append(block.text)
    return texts


def get_outputs(page):
    """Return the sample outputs of a problem page as a list of strings."""
    return _get_sample_texts(page, 'output')


def get_inputs(page):
    """Return the sample inputs of a problem page as a list of strings."""
    return _get_sample_texts(page, 'input')


# One scraped problem: the two limits plus the sample inputs/outputs as lists of strings.
problem = namedtuple("problem", ["time_limit", "memory_limit", "inputs", "outputs"])


def scrap_item(row):
    """Download one problem page and extract its limits and sample tests.

    Args:
        row: a DataFrame row with a ``link`` attribute holding the problem
            URL. ``row.name`` (the row's index label) is used only in log
            messages.

    Returns:
        problem: ``time_limit`` / ``memory_limit`` are floats, or NaN when the
        page could not be fetched, is not HTML, or the limits could not be
        parsed. ``inputs`` / ``outputs`` are lists of strings, empty when the
        samples could not be parsed.

    Failures are written to the debug log and reported through NaN / empty
    values rather than raised, so one bad page does not abort a bulk run.
    """
    url = row.link
    info_log.info(f"scrapping {row.name}th link at /{'/'.join(url.split('/')[-2:])} ...")

    # Fetch the page exactly once: open() returns the HTTP response and also
    # leaves the parsed document in browser.page.
    try:
        resp = browser.open(url)
    except Exception as e:
        log.debug(f"request for {url} failed\nerror: {e}")
        return problem(time_limit=np.nan, memory_limit=np.nan, inputs=[], outputs=[])

    # A missing Content-Type header counts as "not HTML" rather than raising
    # KeyError; browser.page is None when the response was not parsed as HTML.
    page = browser.page
    if page is None or not resp.headers.get('Content-Type', '').startswith('text/html'):
        log.debug(f'url {url} is not html and is malformed')
        return problem(time_limit=np.nan, memory_limit=np.nan, inputs=[], outputs=[])

    try:
        time_limit = get_time_limit(page)
        memory_limit = get_memory_limit(page)
    except Exception as e:
        log.debug(f"could not read the limits of {row.name}th link ...\nerror: {e}")
        time_limit = memory_limit = np.nan

    try:
        inputs = get_inputs(page)
        outputs = get_outputs(page)
    except Exception as e:
        log.debug(f"an error occured while scraping {row.name}th link ...\nerror: {e}")
        inputs, outputs = [], []

    return problem(time_limit=time_limit, memory_limit=memory_limit, inputs=inputs, outputs=outputs)


In [9]:
# Hook tqdm into pandas (none of the cells shown here use it afterwards).
tqdm.pandas()
# Set up pandarallel with its progress bar enabled; the scraping cell calls
# DataFrame.parallel_apply.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
%%timeit
problems = pd.DataFrame.from_records(df.parallel_apply(lambda r: scrap_item(r), axis=1), columns=problem._fields)
problem.head()


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1578), Label(value='0 / 1578'))), …