In [11]:
import mechanicalsoup
import pandas as pd
import re
import sqlite3
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from urllib.parse import urljoin, urlparse

In [2]:
browser = mechanicalsoup.StatefulBrowser()
url = "http://codeforces.com/problemset"

In [3]:
def get_links(page: str):
    browser.open(urljoin(base_url, page))
    table = browser.page.select('.problems')[0]
    problems = table.find_all('tr')
    out = []
    for i in range(1, len(problems)):
        data = problems[i].find_all('td')[0:2]
        ida = data[0].a['href']
        difficulty = urlparse(ida).path.rpartition('/')[2]
        if not difficulty.isalpha():
            continue
        name = data[1].select('div:nth-child(1)')[0]
        tags = [tag.text for tag in data[1].select('div:nth-child(2)')[0].find_all('a')]
        out.append({
            'link': urljoin(base_url, ida),
            'difficulty': difficulty[0],
            'name': name.a.text.strip(),
            'tags': tags
        })
    return out

In [4]:
resp = browser.open(url)
next_page_anchor = browser.page.find('a', text='→')
last_page_url = next_page_anchor.parent.parent.select('li:nth-last-child(2)')[0].span.a['href']
last_page = urljoin(url, last_page_url)
pattern = r'.+/(\d+)$'
lp = int(re.search(pattern, last_page).groups(1)[0])
base_url = urljoin(last_page, '.')

In [5]:
scraped = []
for i in tqdm(range(1, lp + 1)):
    scraped.append(get_links(f'{i}'))


100%|██████████████████████████████████████████| 67/67 [01:03<00:00,  1.06it/s]


In [15]:
df = pd.DataFrame(columns=['name', 'difficulty', 'link', 'tags'])
for page in scraped:
    links = pd.DataFrame(page, columns=['name', 'difficulty', 'link', 'tags'])
    links['tags'] = links['tags'].apply(lambda tags: [re.sub(r'[^A-Za-z0-9 ]+', '', tag).replace(' ', '_') for tag in tags])
    df = df.append(links)

df.head()

Unnamed: 0,name,difficulty,link,tags
0,Nezzar and Symmetric Array,C,http://codeforces.com/problemset/problem/1478/C,"[implementation, math, sortings]"
1,Nezzar and Lucky Number,B,http://codeforces.com/problemset/problem/1478/B,"[brute_force, dp, greedy, math]"
2,Nezzar and Colorful Balls,A,http://codeforces.com/problemset/problem/1478/A,"[brute_force, greedy]"
3,Nezzar and Chocolate Bars,F,http://codeforces.com/problemset/problem/1477/F,"[combinatorics, fft, math, probabilities]"
4,Nezzar and Tournaments,E,http://codeforces.com/problemset/problem/1477/E,"[data_structures, greedy]"


In [16]:
df['contest'] = df.apply(lambda r: urlparse(r.link).path.rpartition('/')[0].rpartition('/')[2], axis=1)
df['contest'] = df['contest'].apply(int)
df.head()

Unnamed: 0,name,difficulty,link,tags,contest
0,Nezzar and Symmetric Array,C,http://codeforces.com/problemset/problem/1478/C,"[implementation, math, sortings]",1478
1,Nezzar and Lucky Number,B,http://codeforces.com/problemset/problem/1478/B,"[brute_force, dp, greedy, math]",1478
2,Nezzar and Colorful Balls,A,http://codeforces.com/problemset/problem/1478/A,"[brute_force, greedy]",1478
3,Nezzar and Chocolate Bars,F,http://codeforces.com/problemset/problem/1477/F,"[combinatorics, fft, math, probabilities]",1477
4,Nezzar and Tournaments,E,http://codeforces.com/problemset/problem/1477/E,"[data_structures, greedy]",1477


In [17]:
df.to_csv('links.csv', index=False)

In [18]:
mlb = MultiLabelBinarizer()
dft = df.join(pd.DataFrame(mlb.fit_transform(df['tags']),columns=mlb.classes_))
dft.drop('tags', axis=1, inplace=True)
dft.head()

Unnamed: 0,name,difficulty,link,contest,2sat,binary_search,bitmasks,brute_force,chinese_remainder_theorem,combinatorics,...,probabilities,schedules,shortest_paths,sortings,special_problem,string_suffix_structures,strings,ternary_search,trees,two_pointers
0,Nezzar and Symmetric Array,C,http://codeforces.com/problemset/problem/1478/C,1478,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
0,Water Level,E,http://codeforces.com/problemset/problem/1461/E,1461,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
0,Minimal Height Tree,D,http://codeforces.com/problemset/problem/1437/D,1437,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
0,Decryption,E,http://codeforces.com/problemset/problem/1419/E,1419,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
0,Two Types of Spells,E,http://codeforces.com/problemset/problem/1398/E,1398,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [19]:
conn = sqlite3.connect('./links.sqlite3')

In [20]:
dft.to_sql('codeforces', conn, if_exists='replace')
conn.commit()

In [21]:
tags = dft.columns.tolist()[3:]
tags

['contest',
 '2sat',
 'binary_search',
 'bitmasks',
 'brute_force',
 'chinese_remainder_theorem',
 'combinatorics',
 'constructive_algorithms',
 'data_structures',
 'dfs_and_similar',
 'divide_and_conquer',
 'dp',
 'dsu',
 'expression_parsing',
 'fft',
 'flows',
 'games',
 'geometry',
 'graph_matchings',
 'graphs',
 'greedy',
 'hashing',
 'implementation',
 'interactive',
 'math',
 'matrices',
 'meetinthemiddle',
 'number_theory',
 'probabilities',
 'schedules',
 'shortest_paths',
 'sortings',
 'special_problem',
 'string_suffix_structures',
 'strings',
 'ternary_search',
 'trees',
 'two_pointers']

In [22]:
pd.DataFrame(tags, columns=['tags']).to_sql('tags', conn, if_exists='replace')
conn.commit()
conn.close()