In [1]:
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
import mechanicalsoup
import multiprocessing
import numpy as np
from collections import namedtuple
import pandas as pd
import pathlib
import re
import os
import sqlite3
from time import sleep
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm import tqdm

In [2]:
import logging
# logging.basicConfig(filename='scrap-items.log', level=logging.DEBUG)
filehandler = logging.FileHandler('scrap-items-debug.log', 'w')
# formatter = logging.Formatter('%(asctime)-15s::%(levelname)s::%(filename)s::%(funcName)s::%(lineno)d::%(message)s')
formatter = logging.Formatter('%(message)s')
filehandler.setFormatter(formatter)
log = logging.getLogger('root')  # root logger - Good to get it only once
for hdlr in log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr,logging.FileHandler):
        log.removeHandler(hdlr)
log.addHandler(filehandler)      # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
log.setLevel(logging.DEBUG)


In [3]:
conn = sqlite3.connect('./links.sqlite3')
cursor = conn.execute('SELECT problem_name, difficulty, link, contest FROM codeforces ORDER BY contest;')

In [4]:
df = pd.DataFrame(cursor.fetchall())
df.columns = [description[0] for description in cursor.description]
df.head()

Unnamed: 0,problem_name,difficulty,link,contest
0,Ancient Berland Circus,C,http://codeforces.com/problemset/problem/1/C,1
1,Spreadsheet,B,http://codeforces.com/problemset/problem/1/B,1
2,Theatre Square,A,http://codeforces.com/problemset/problem/1/A,1
3,Commentator problem,C,http://codeforces.com/problemset/problem/2/C,2
4,The least round way,B,http://codeforces.com/problemset/problem/2/B,2


In [5]:
user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.10; rv:62.0) Gecko/20100101 Firefox/49.0'
browser = mechanicalsoup.StatefulBrowser(user_agent=user_agent)

In [6]:
info_filehandler = logging.FileHandler('scrap-items-info.log', 'w')
info_formatter = logging.Formatter('%(asctime)-15s::%(levelname)s::%(funcName)s::%(lineno)d  %(message)s')
info_filehandler.setFormatter(info_formatter)
info_log = logging.getLogger('info')  # root logger - Good to get it only once
for hdlr in info_log.handlers[:]:  # remove the existing file handlers
    if isinstance(hdlr,logging.FileHandler):
        info_log.removeHandler(hdlr)
info_log.addHandler(info_filehandler)      # set the new handler
# set the log level to INFO, DEBUG as the default is ERROR
info_log.setLevel(logging.INFO)


In [7]:
problem = pd.DataFrame(columns=[
            "problem_name",
            "difficulty",
            "link",
            "contest",
            "input_file",
            "output_file",
            "title",
            "time_limit",
            "memory_limit",
            "problem_text",
            "input_specification",
            "output_specification",
            "inputs",
            "outputs",
            "note"
        ])

problem.columns.tolist()

['problem_name',
 'difficulty',
 'link',
 'contest',
 'input_file',
 'output_file',
 'title',
 'time_limit',
 'memory_limit',
 'problem_text',
 'input_specification',
 'output_specification',
 'inputs',
 'outputs',
 'note']

In [8]:
def scrap(row: pd.DataFrame):
    out = {}
    out['link'] = row['link']
    out['problem_name'] = row['problem_name']
    out['difficulty'] = row['difficulty']
    out['contest'] = row['contest']
    browser.open(row['link'])
    sleep(0.2 + np.random.rand())
    page = browser.page
    problem_statement = page.find('div', class_='problem-statement')
    try:
        title = problem_statement.find('div', class_='title').text
        out['title'] = title
    except Exception as e:
        info_log.warning(f'an error occured while scrapping {row.name}th title\n{e}')


    try:
        redundant_classes = ['property-title', 'section-title']
        for c in redundant_classes:
            for pt in problem_statement.find_all(class_=c):
                pt.decompose()

        time_limit = problem_statement.find('div', class_='time-limit').get_text()
        out['time_limit'] = time_limit
    except Exception as e:
        info_log.warning(f'an error occured while scrapping {row.name}th time-limit\n{e}')
    
    try:
        memory_limit = problem_statement.find('div', class_='memory-limit').get_text()
        out['memory_limit'] = memory_limit
    except Exception as e:
        info_log.warning(f'an error occured while scrapping {row.name}th memory-limit\n{e}')

    try:
        input_file = problem_statement.find('div', class_='input-file').get_text()
        out['input_file'] = input_file
    except Exception as e:
        info_log.warning(f'an error occured while scrapping {row.name}th input_file\n{e}')

    try:
        output_file = problem_statement.find('div', class_='output-file').get_text()
        out['output_file'] = output_file
    except Exception as e:
        info_log.warning(f'an error occured while scrapping {row.name}th output_file\n{e}')

    try:
        problem_text = problem_statement.find('div', class_='header').next_sibling.get_text()
        out['problem_text'] = problem_text
    except Exception as e:
        info_log.warning(f'an error occured while scrapping {row.name}th problem_text\n{e}')

    try:
        input_specification = problem_statement.find('div', class_='input-specification').text.replace(u"\u2009", ' ')
        out['input_specification'] = input_specification
    except Exception as e:
        info_log.warning(f'an error occured while scrapping {row.name}th input_specification\n{e}')

    try:
        output_specification = problem_statement.find('div', class_='output-specification').text.replace(u"\u2009", ' ')
        out['output_specification'] = output_specification
    except Exception as e:
        info_log.warning(f'an error occured while scrapping {row.name}th output_specification\n{e}')

    try:
        for t in problem_statement.find('div', class_='sample-tests').find_all('div', class_='title'):
            t.decompose()

        sample_tests = problem_statement.find('div', class_='sample-tests')
    except Exception as e:
        info_log.warning(f'an error occured while scrapping {row.name}th sample-tests\n{e}')


    try:
        inputs, outputs = [], []
        for _input in sample_tests.find_all('div', class_='input'):
            inputs.append(_input.get_text('\n'))

        out['inputs'] = inputs
    except Exception as e:
        info_log.warning(f'an error occured while scrapping {row.name}th inputs\n{e}')


    try:
        for _output in sample_tests.find_all('div', class_='output'):
            outputs.append(_output.get_text('\n'))
            
        out['outputs'] = outputs
    except Exception as e:
        info_log.warning(f'an error occured while scrapping {row.name}th outputs\n{e}')

        

    try:
        for e in problem_statement.find('div', class_='note').find_all(class_='tex-span'):
            e.replace_with(e.text)

        note = problem_statement.find('div', class_='note').text
        
        out['note'] = note
    except Exception as e:
        info_log.warning(f'an error occured while scrapping {row.name}th note\n{e}')

        
    return out



In [9]:
num_cores = multiprocessing.cpu_count()
chunks = np.array_split(df, num_cores)

def scrap_chunk(chunk: pd.DataFrame):
    df = pd.DataFrame()
    for i in tqdm(range(len(chunk))):
        try:
            scrapped = scrap(chunk.iloc[i])
            processed = pd.DataFrame([scrapped])
            df = df.append(processed)
            sleep(1)

        except Exception as e:
            log.warning(chunk.iloc[i].name)
            info_log.warning("{chunk.iloc[i].name}th row raised an unexpected exception:\n{e}")

    return df

if not pathlib.Path('problems.pkl').exists():
    outputs = Parallel(n_jobs=-1, prefer='threads')(delayed(scrap_chunk)(chunk) for chunk in chunks)
    problems_df = pd.concat(outputs)
    problems_df = problems_df[problems_df['problem_text'].notna()]
    problems_df = problems_df[(~ problems_df['memory_limit'].str.contains('мегабайт'))]
    meg_to_byte = lambda r: int(re.search(r"^(\d+)", r).group(1)) * 1024 * 1024
    problems_df['memory_limit'] = problems_df['memory_limit'].apply(meg_to_byte)
    time_to_float = lambda t: float(re.search(r"^([0-9.]+)", t).group(1))
    problems_df['time_limit'] = problems_df['time_limit'].apply(time_to_float)
    problems_df.to_pickle('problems.pkl')
else:
    problems_df = pd.read_pickle('problems.pkl')




In [39]:
problems_df.head()

Unnamed: 0,link,problem_name,difficulty,contest,title,time_limit,memory_limit,input_file,output_file,problem_text,input_specification,output_specification,inputs,outputs,note
0,http://codeforces.com/problemset/problem/1/C,Ancient Berland Circus,C,1,C. Ancient Berland Circus,2.0,67108864,standard input,standard output,Nowadays all circuses in Berland have a round ...,"The input file consists of three lines, each o...",Output the smallest possible area of the ancie...,[0.000000 0.000000\n1.000000 1.000000\n0.00000...,[1.00000000],
0,http://codeforces.com/problemset/problem/1/B,Spreadsheet,B,1,C. Captains Mode,2.0,268435456,standard input,standard output,Kostya is a progamer specializing in the disci...,The first line contains a single integer n (2 ...,Print a single integer — the difference betwee...,"[2\n2 1\n2\np 1\np 2, 6\n6 4 5 4 5 5\n4\nb 2\n...","[1, 0, -2]",
0,http://codeforces.com/problemset/problem/1/A,Theatre Square,A,1,B. Preparing for the Contest,2.0,268435456,standard input,standard output,Soon there will be held the world's largest pr...,The first line contains three space-separated ...,If the university can't correct all bugs print...,"[3 4 9\n1 3 1 2\n2 1 3\n4 3 6, 3 4 10\n2 3 1 2...","[YES\n2 3 2 3, YES\n1 3 1 3, YES\n3 3 2 3, NO]",Consider the first sample.The third student (w...
0,http://codeforces.com/problemset/problem/2/C,Commentator problem,C,2,A. Maze,2.0,268435456,standard input,standard output,Pavel loves grid mazes. A grid maze is an n × ...,"The first line contains three integers n, m, k...",Print n lines containing m characters each: th...,"[3 4 2\n#..#\n..#.\n#..., 5 4 5\n#...\n#.#.\n....","[#.X#\nX.#.\n#..., #XXX\n#X#.\nX#..\n...#\n.#.#]",
0,http://codeforces.com/problemset/problem/2/B,The least round way,B,2,B. Semifinals,1.0,268435456,standard input,standard output,Two semifinals have just been in the running t...,The first line contains a single integer n (1 ...,"Print two strings consisting of n characters, ...",[4\n9840 9920\n9860 9980\n9930 10020\n10040 10...,"[1110\n1100, 1100\n1100]",Consider the first sample. Each semifinal has ...


In [42]:
if not pathlib.Path('./problems.csv').exists():
    problems_df.to_csv('problems.csv')
