In [1]:
from joblib import Parallel, delayed
import mechanicalsoup
import multiprocessing
import numpy as np
from collections import namedtuple
import pandas as pd
from pandarallel import pandarallel
import pathlib
import re
import os
import sqlite3
from time import sleep
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm import tqdm

In [2]:
import logging
# logging.basicConfig(filename='scrap-items.log', level=logging.DEBUG)
filehandler = logging.FileHandler('scrap-items-debug.log', 'w')
# formatter = logging.Formatter('%(asctime)-15s::%(levelname)s::%(filename)s::%(funcName)s::%(lineno)d::%(message)s')
formatter = logging.Formatter('%(message)s')
filehandler.setFormatter(formatter)
log = logging.getLogger('root')  # root logger - Good to get it only once
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr,logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)      # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.DEBUG)


In [3]:
conn = sqlite3.connect('./links.sqlite3')
cursor = conn.execute('SELECT name, difficulty, link, contest FROM codeforces ORDER BY contest;')

In [4]:
df = pd.DataFrame(cursor.fetchall())
df.columns = [description[0] for description in cursor.description]
df.head()

Unnamed: 0,name,difficulty,link,contest
0,Ancient Berland Circus,C,http://codeforces.com/problemset/problem/1/C,1
1,Spreadsheet,B,http://codeforces.com/problemset/problem/1/B,1
2,Theatre Square,A,http://codeforces.com/problemset/problem/1/A,1
3,Commentator problem,C,http://codeforces.com/problemset/problem/2/C,2
4,The least round way,B,http://codeforces.com/problemset/problem/2/B,2


In [5]:
user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.10; rv:62.0) Gecko/20100101 Firefox/49.0'
browser = mechanicalsoup.StatefulBrowser(user_agent=user_agent)

In [6]:
pathlib.Path("./problems/").mkdir(parents=True, exist_ok=True)

In [7]:
info_filehandler = logging.FileHandler('scrap-items-info.log', 'w')
info_formatter = logging.Formatter('%(asctime)-15s::%(levelname)s::%(funcName)s::%(lineno)d  %(message)s')
info_filehandler.setFormatter(info_formatter)
info_log = logging.getLogger('info')  # root logger - Good to get it only once
for hdlr in info_log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr,logging.FileHandler):
        info_log.removeHandler(hdlr)
info_log.addHandler(info_filehandler)      # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
info_log.setLevel(logging.INFO)


In [8]:
def get_time_limit(page):
    time_limit = page.select('.time-limit')[0]
    time_limit.div.replace_with('')
    time = re.search('(\d+)[^\d]*', time_limit.text).group(1)
    return float(time)

def get_memory_limit(page):
    memory_limit = page.select('.memory-limit')[0]
    memory_limit.div.replace_with('')
    memory = re.search('(\d+)([^\d]+)', memory_limit.text).group(1)
    unit = re.search('(\d+)\s*([^\d]+)', memory_limit.text).group(2)
    if unit == 'megabytes':
        memory = float(memory) * 1024 * 1024
    return memory

def get_outputs(page):
    sample_tests = page.select('.sample-tests')[0]
    output = sample_tests.find('div', attrs={'class': 'output'}).find('pre')
    for br in output.find_all('br'):
        br.replace_with('\n')
    tests = output.text.strip().rstrip().split('\n')
    return tests

def get_inputs(page):
    sample_tests = page.select('.sample-tests')[0]
    _input = sample_tests.find('div', attrs={'class': 'input'}).find('pre')
    for br in _input.find_all('br'):
        br.replace_with('\n')
    _inputs = _input.text.strip().rstrip().split('\n')
    return _inputs


problem = namedtuple("problem", ["problem_name", "difficulty", "link", "contest", "time_limit", "memory_limit", "inputs", "outputs"])

def scrap_item(row):
    name = row.name
    link = row.link
    difficulty = row.difficulty
    contest = row.contest
    info_log.info(f"scrapping {row.name}th link at /{'/'.join(link.split('/')[-2:])} ...")
    browser = mechanicalsoup.StatefulBrowser(user_agent=user_agent)
    resp = browser.get(link)
    if not resp.headers['Content-Type'].startswith('text/html'):
        log.debug(f'{row.name}')
        browser.close()
        return problem(problem_name=name, link=link, difficulty=difficulty, contest=contest, time_limit=np.nan, memory_limit=np.nan, inputs=[], outputs=[])
    sleep(0.08)
    browser.open(link)
    resp = browser.page
    try:
        time_limit = get_time_limit(resp)
    except Exception as e:
#         log.debug(f"an error occured while scraping {row.name}th time-limit ...\nerror: {e}")
        log.debug(f"{row.name}")
        browser.close()
        return problem(problem_name=name, link=link, difficulty=difficulty, contest=contest, time_limit=np.nan, memory_limit=np.nan, inputs=[], outputs=[])
    try:
        memory_limit = get_memory_limit(resp)
    except Exception as e:
        log.debug(f"{row.name}")
        browser.close()
        return problem(problem_name=name, link=link, difficulty=difficulty, contest=contest, time_limit=time_limit, memory_limit=np.nan, inputs=[], outputs=[])        
    try:
        inputs = get_inputs(resp)
    except Exception as e:
        log.debug(f"{row.name}")
        browser.close()
        return problem(problem_name=name, link=link, difficulty=difficulty, contest=contest, time_limit=time_limit, memory_limit=memory_limit, inputs=[], outputs=[])
    try:
        outputs = get_outputs(resp)
    except Exception as e:
        log.debug(f"{row.name}")
        browser.close()
        return problem(problem_name=name, link=link, difficulty=difficulty, contest=contest, time_limit=time_limit, memory_limit=memory_limit, inputs=inputs, outputs=[])
    browser.close()
    return problem(problem_name=name, link=link, difficulty=difficulty, contest=contest, time_limit=time_limit, memory_limit=memory_limit, inputs=inputs, outputs=outputs)


In [9]:
# %%timeit
# tqdm.pandas()
# pandarallel.initialize(progress_bar=True)
# problems = pd.DataFrame.from_records(df.parallel_apply(lambda r: scrap_item(r), axis=1), columns=problem._fields)
# problem.head()

In [10]:
def apply_on_chunk(chunk: pd.core.frame.DataFrame):
    out = pd.DataFrame()
    for i in tqdm(range(len(chunk))):
        try:
            processed_row = pd.DataFrame([scrap_item(chunk.iloc[i])], columns=problem._fields)
            out = out.append(processed_row)
        except Exception as e:
            log.debug(f"{chunk.iloc[i].name}")
    return out


if not pathlib.Path('problems.pkl').exists():
    num_cores = multiprocessing.cpu_count()
    inputs = np.array_split(df, num_cores)
    problems = Parallel(n_jobs=-1, prefer='threads')(delayed(apply_on_chunk)(i) for i in inputs)
    problems_df = pd.concat(problems)
    problems_df.to_pickle('problems.pkl')
else:
    problems_df = pd.read_pickle('problems.pkl')

In [11]:
def get_name(row):
    link = row.link
    browser = mechanicalsoup.StatefulBrowser(user_agent=user_agent)
    browser.open(link)
    page = browser.page
    title = page.select('.title')[0].text
    name = re.sub(r'^[a-zA-Z0-9]{1,2}\. (.+)$', r'\1', title)
    return name

def chunk_name_df(chunk: pd.core.frame.DataFrame):
    names = []
    for i in tqdm(range(len(chunk))):
        try:
            names.append(get_name(chunk.iloc[i]))
        except Exception as e:
            print(f"an error occured while getting {chunk.iloc[i].link} ...")
            return pd.DataFrame(columns='problem_name')
    return pd.DataFrame(names, columns='problem_name')


num_cores = multiprocessing.cpu_count()
inputs = np.array_split(problems_df, num_cores)
names = Parallel(n_jobs=-1, prefer='threads')(delayed(chunk_name_df)(i) for i in inputs)




  0%|          | 0/1577 [00:00<?, ?it/s]

  0%|          | 0/1578 [00:00<?, ?it/s]

  0%|          | 0/1578 [00:00<?, ?it/s]

  0%|          | 0/1578 [00:00<?, ?it/s]

an error occured while getting http://codeforces.com/problemset/problem/417/E ...
an error occured while getting http://codeforces.com/problemset/problem/1170/B ...
an error occured while getting http://codeforces.com/problemset/problem/32/E ...


TypeError: Index(...) must be called with a collection of some kind, 'problem_name' was passed

an error occured while getting http://codeforces.com/problemset/problem/799/F ...
