In [4]:
from joblib import Parallel, delayed
import mechanicalsoup
import multiprocessing
import pandas as pd
import re
import sqlite3
from sklearn.preprocessing import MultiLabelBinarizer
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm import tqdm
from urllib.parse import urljoin, urlparse

In [5]:
browser = mechanicalsoup.StatefulBrowser()
url = "http://codeforces.com/problemset"

In [6]:
def get_links(page: str):
    browser.open(urljoin(base_url, page))
    table = browser.page.select('.problems')[0]
    problems = table.find_all('tr')
    out = []
    for i in range(1, len(problems)):
        data = problems[i].find_all('td')[0:2]
        ida = data[0].a['href']
        difficulty = urlparse(ida).path.rpartition('/')[2]
        if not difficulty.isalpha():
            continue
        name = data[1].select('div:nth-child(1)')[0]
        tags = [tag.text for tag in data[1].select('div:nth-child(2)')[0].find_all('a')]
        out.append({
            'link': urljoin(base_url, ida),
            'difficulty': difficulty[0],
            'problem_name': name.a.text.strip(),
            'tags': tags
        })
    return out

In [7]:
resp = browser.open(url)
next_page_anchor = browser.page.find('a', text='→')
last_page_url = next_page_anchor.parent.parent.select('li:nth-last-child(2)')[0].span.a['href']
last_page = urljoin(url, last_page_url)
pattern = r'.+/(\d+)$'
lp = int(re.search(pattern, last_page).groups(1)[0])
base_url = urljoin(last_page, '.')

In [8]:
num_cores = multiprocessing.cpu_count()
inputs = tqdm(range(1, lp + 1))
scraped = Parallel(n_jobs=num_cores, prefer="threads")(delayed(get_links)(f'{i}') for i in inputs)

  0%|          | 0/67 [00:00<?, ?it/s]

In [9]:
df = pd.DataFrame(columns=['problem_name', 'difficulty', 'link', 'tags'])
for page in scraped:
    links = pd.DataFrame(page, columns=['problem_name', 'difficulty', 'link', 'tags'])
    links['tags'] = links['tags'].apply(lambda tags: [re.sub(r'[^A-Za-z0-9 ]+', '', tag).replace(' ', '_') for tag in tags])
    df = df.append(links)

df.head()

Unnamed: 0,problem_name,difficulty,link,tags
0,AB Tree,F,http://codeforces.com/problemset/problem/1481/F,"[dp, greedy, trees]"
1,Sorting Books,E,http://codeforces.com/problemset/problem/1481/E,"[data_structures, dp]"
2,AB Graph,D,http://codeforces.com/problemset/problem/1481/D,"[constructive_algorithms, graphs, greedy, impl..."
3,Fence Painting,C,http://codeforces.com/problemset/problem/1481/C,"[brute_force, constructive_algorithms, greedy,..."
4,New Colony,B,http://codeforces.com/problemset/problem/1481/B,"[brute_force, greedy, implementation]"


In [10]:
df['contest'] = df.apply(lambda r: urlparse(r.link).path.rpartition('/')[0].rpartition('/')[2], axis=1)
df['contest'] = df['contest'].apply(int)
df.head()

Unnamed: 0,problem_name,difficulty,link,tags,contest
0,AB Tree,F,http://codeforces.com/problemset/problem/1481/F,"[dp, greedy, trees]",1481
1,Sorting Books,E,http://codeforces.com/problemset/problem/1481/E,"[data_structures, dp]",1481
2,AB Graph,D,http://codeforces.com/problemset/problem/1481/D,"[constructive_algorithms, graphs, greedy, impl...",1481
3,Fence Painting,C,http://codeforces.com/problemset/problem/1481/C,"[brute_force, constructive_algorithms, greedy,...",1481
4,New Colony,B,http://codeforces.com/problemset/problem/1481/B,"[brute_force, greedy, implementation]",1481


In [11]:
df.to_csv('links.csv', index=False)

In [12]:
mlb = MultiLabelBinarizer()
dft = df.join(pd.DataFrame(mlb.fit_transform(df['tags']),columns=mlb.classes_))
dft.drop('tags', axis=1, inplace=True)
dft.head()

Unnamed: 0,problem_name,difficulty,link,contest,2sat,binary_search,bitmasks,brute_force,chinese_remainder_theorem,combinatorics,...,probabilities,schedules,shortest_paths,sortings,special_problem,string_suffix_structures,strings,ternary_search,trees,two_pointers
0,AB Tree,F,http://codeforces.com/problemset/problem/1481/F,1481,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
0,Add to Neighbour and Remove,D,http://codeforces.com/problemset/problem/1462/D,1462,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
0,Engineer Artem,C,http://codeforces.com/problemset/problem/1438/C,1438,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
0,Rescue Nibel!,D,http://codeforces.com/problemset/problem/1420/D,1420,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
0,Binary String To Subsequences,D,http://codeforces.com/problemset/problem/1399/D,1399,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [13]:
conn = sqlite3.connect('./links.sqlite3')

In [14]:
dft.to_sql('codeforces', conn, if_exists='replace')
conn.commit()

In [15]:
tags = dft.columns.tolist()[3:]
tags

['contest',
 '2sat',
 'binary_search',
 'bitmasks',
 'brute_force',
 'chinese_remainder_theorem',
 'combinatorics',
 'constructive_algorithms',
 'data_structures',
 'dfs_and_similar',
 'divide_and_conquer',
 'dp',
 'dsu',
 'expression_parsing',
 'fft',
 'flows',
 'games',
 'geometry',
 'graph_matchings',
 'graphs',
 'greedy',
 'hashing',
 'implementation',
 'interactive',
 'math',
 'matrices',
 'meetinthemiddle',
 'number_theory',
 'probabilities',
 'schedules',
 'shortest_paths',
 'sortings',
 'special_problem',
 'string_suffix_structures',
 'strings',
 'ternary_search',
 'trees',
 'two_pointers']

In [16]:
pd.DataFrame(tags, columns=['tags']).to_sql('tags', conn, if_exists='replace')
conn.commit()
conn.close()