In [10]:
import mechanicalsoup
import numpy as np
from collections import namedtuple
import pandas as pd
from pandarallel import pandarallel
import pathlib
import re
import os
import sqlite3
from time import sleep
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm import tqdm

In [11]:
# Open the local SQLite file and select the stored Codeforces problem links,
# ordered by contest. `conn` and `cursor` are left open for the next cell,
# which reads the rows and the column names from `cursor`.
conn = sqlite3.connect('./links.sqlite3')
cursor = conn.cursor().execute(
    'SELECT name, difficulty, link, contest FROM codeforces ORDER BY contest;'
)

In [12]:
# Materialise the query result as a DataFrame. The column names come from
# `cursor.description` and are handed to the constructor directly: assigning
# `df.columns` afterwards raises a length-mismatch ValueError when the query
# returns no rows (an empty frame has zero columns).
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[description[0] for description in cursor.description],
)
df.head()

Unnamed: 0,name,difficulty,link,contest
0,Ancient Berland Circus,C,http://codeforces.com/problemset/problem/1/C,1
1,Spreadsheet,B,http://codeforces.com/problemset/problem/1/B,1
2,Theatre Square,A,http://codeforces.com/problemset/problem/1/A,1
3,Commentator problem,C,http://codeforces.com/problemset/problem/2/C,2
4,The least round way,B,http://codeforces.com/problemset/problem/2/B,2


In [13]:
# Shared MechanicalSoup browser; scrap_item() sends every HTTP request through it.
browser = mechanicalsoup.StatefulBrowser()

In [14]:
# Make sure the ./problems/ directory exists (created with parents if needed,
# left untouched when already present).
# NOTE(review): presumably the destination for scraped problems - no visible cell writes to it yet.
problems_dir = pathlib.Path("./problems/")
problems_dir.mkdir(parents=True, exist_ok=True)

In [15]:
def get_time_limit(page):
    """Extract the time limit shown on a parsed problem page.

    Parameters
    ----------
    page
        Parsed HTML document exposing ``select`` (``scrap_item`` passes
        ``browser.page``).

    Returns
    -------
    float
        The first number found in the ``.time-limit`` element, including any
        fractional part (e.g. ``1.5``). The unit is not inspected.

    Raises
    ------
    IndexError
        If the page has no ``.time-limit`` element.
    ValueError
        If the element's text contains no number.

    Notes
    -----
    The caption ``<div>`` nested in the element is removed from ``page`` as a
    side effect, so that only the value text is searched.
    """
    time_limit = page.select('.time-limit')[0]
    # Guarded so that calling this twice on the same page does not fail once
    # the caption has already been stripped.
    caption = time_limit.div
    if caption is not None:
        caption.replace_with('')
    # Raw-string pattern with an optional fractional part: the previous
    # '(\d+)' truncated "1.5 seconds" to 1.0 and "0.5 seconds" to 0.0.
    match = re.search(r'\d+(?:\.\d+)?', time_limit.text)
    if match is None:
        raise ValueError(f'no time limit found in {time_limit.text!r}')
    return float(match.group(0))

def get_memory_limit(page):
    """Extract the memory limit shown on a parsed problem page, in bytes.

    Parameters
    ----------
    page
        Parsed HTML document exposing ``select`` (``scrap_item`` passes
        ``browser.page``).

    Returns
    -------
    float
        The limit converted to bytes using 1024-based multipliers for
        ``bytes``, ``kilobytes``, ``megabytes`` and ``gigabytes`` (singular
        forms and any letter case accepted). When the unit is missing or not
        recognised, the bare number is returned unscaled - always as a float,
        never as a string.

    Raises
    ------
    IndexError
        If the page has no ``.memory-limit`` element.
    ValueError
        If the element's text contains no number.

    Notes
    -----
    The caption ``<div>`` nested in the element is removed from ``page`` as a
    side effect, so that only the value text is searched.
    """
    memory_limit = page.select('.memory-limit')[0]
    # Guarded so that calling this twice on the same page does not fail once
    # the caption has already been stripped.
    caption = memory_limit.div
    if caption is not None:
        caption.replace_with('')
    # One search captures both number and unit. The unit is letters only, so
    # trailing whitespace can no longer defeat the unit comparison.
    match = re.search(r'(\d+(?:\.\d+)?)\s*([A-Za-z]*)', memory_limit.text)
    if match is None:
        raise ValueError(f'no memory limit found in {memory_limit.text!r}')
    amount = float(match.group(1))
    unit = match.group(2).lower().rstrip('s')
    bytes_per_unit = {
        'byte': 1,
        'kilobyte': 1024,
        'megabyte': 1024 ** 2,
        'gigabyte': 1024 ** 3,
    }
    return amount * bytes_per_unit.get(unit, 1)

def _get_sample_texts(page, css_class):
    """Collect the text of every ``div`` of class ``css_class`` in the samples.

    Looks inside the first ``.sample-tests`` element of ``page``. The first
    ``.title`` element of each matching block is removed from the page before
    its text is read, so the returned strings hold only the sample data.

    Returns a new plain ``list`` of ``str``; an empty list when the page has
    no ``.sample-tests`` element.
    """
    sections = page.select('.sample-tests')
    if not sections:
        # No sample section on this page: report "no samples" instead of
        # raising IndexError and aborting the whole scrape.
        return []
    texts = []
    for block in sections[0].find_all('div', attrs={'class': css_class}):
        titles = block.select('.title')
        if titles:
            titles[0].replace_with('')
        texts.append(block.text)
    return texts


def get_outputs(page):
    """Return the sample outputs of a parsed problem page as a list of strings.

    ``page`` is a parsed HTML document exposing ``select``. The result is
    empty when the page has no sample section. The title element of each
    output block is stripped from ``page`` as a side effect.
    """
    return _get_sample_texts(page, 'output')


def get_inputs(page):
    """Return the sample inputs of a parsed problem page as a list of strings.

    ``page`` is a parsed HTML document exposing ``select``. The result is
    empty when the page has no sample section. The title element of each
    input block is stripped from ``page`` as a side effect.
    """
    return _get_sample_texts(page, 'input')


# Record produced for every scraped problem page.
problem = namedtuple("problem", ["time_limit", "memory_limit", "inputs", "outputs"])


def scrap_item(row, delay=0.3):
    """Download one problem page and extract its limits and sample tests.

    Parameters
    ----------
    row
        Object exposing a ``link`` attribute holding the problem URL (a
        DataFrame row when used with ``apply(..., axis=1)``).
    delay : float, optional
        Seconds to sleep before the request, as a crude rate limit.
        Defaults to the previously hard-coded 0.3.

    Returns
    -------
    problem
        ``time_limit`` and ``memory_limit`` from ``get_time_limit`` /
        ``get_memory_limit``, and ``inputs`` / ``outputs`` from ``get_inputs``
        / ``get_outputs``. When the response is not HTML, the limits are NaN
        and both sample lists are empty.

    Notes
    -----
    Uses the module-level ``browser``. Exceptions raised by the request or by
    the ``get_*`` parsers propagate to the caller.
    """
    url = row.link
    sleep(delay)
    # Fetch the page once: open() returns the response and also makes the
    # parsed document available as browser.page. Previously the URL was
    # downloaded twice (get() followed by open()).
    resp = browser.open(url)
    # .get() so that a response without a Content-Type header is reported as
    # malformed instead of raising KeyError.
    content_type = resp.headers.get('Content-Type', '')
    if not content_type.startswith('text/html'):
        print(f'url {url} is not html and is malformed')
        return problem(time_limit=np.nan, memory_limit=np.nan, inputs=[], outputs=[])
    page = browser.page
    return problem(
        time_limit=get_time_limit(page),
        memory_limit=get_memory_limit(page),
        inputs=get_inputs(page),
        outputs=get_outputs(page),
    )


In [16]:
# Enable tqdm's pandas integration.
# NOTE(review): the visible cells only call parallel_apply, so this may be unused - confirm.
tqdm.pandas()
# Initialise pandarallel (with its progress bar) for the DataFrame.parallel_apply call in the scraping cell.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [17]:
%%timeit
problems = pd.DataFrame.from_records(df.parallel_apply(lambda r: scrap_item(r), axis=1), columns=problem._fields)
problem.head()


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1578), Label(value='0 / 1578'))), …

Process ForkPoolWorker-22:
Process ForkPoolWorker-21:
Process ForkPoolWorker-19:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/mohammad/.pyenv/versions/3.8.7/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/mohammad/.pyenv/versions/3.8.7/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Process ForkPoolWorker-20:
  File "/home/mohammad/.pyenv/versions/3.8.7/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/mohammad/.pyenv/versions/3.8.7/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/mohammad/.pyenv/versions/3.8.7/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/mohammad/.pyenv/versions/3.8.7/lib/python3.8/multiprocessing/process.py", line 108, in run

KeyboardInterrupt: 