In [1]:
import time

import requests
from bs4 import BeautifulSoup

In [2]:
# Root of the site being scraped; the site-relative paths used below
# ('/java', the section hrefs, the problem hrefs) are appended to it.
BASE_URL = 'http://codingbat.com'

In [3]:
def get_matching_urls(page_url, url_prefix):
    """Fetch a page and collect the links whose href starts with a prefix.

    Args:
        page_url: Absolute URL of the page to download.
        url_prefix: Only anchors whose ``href`` starts with this string are kept.

    Returns:
        A ``(texts, urls)`` tuple of two parallel lists: the anchor texts and
        the matching ``href`` values, in document order. Duplicates are kept.
        Returns ``None`` (after printing the status code and ``'bad get'``)
        when the response status is not 200.
    """
    resp = requests.get(page_url)
    if resp.status_code != 200:
        print(resp.status_code)
        print('bad get')
        return None

    soup = BeautifulSoup(resp.content, 'html.parser')
    # Two parallel lists (rather than a dict) so duplicate links are preserved.
    ret_text, ret_url = [], []
    for anchor_el in soup.find_all('a'):
        # Anchors without an href (e.g. <a name="...">) would raise KeyError
        # on item access, so read the attribute with .get() and skip them.
        href = anchor_el.get('href')
        if href is None or not href.startswith(url_prefix):
            continue
        ret_text.append(anchor_el.text)
        ret_url.append(href)
    return ret_text, ret_url

In [4]:
# Fetch the Java index page and keep the links whose href starts with '/java/'.
# `sections` is whatever get_matching_urls returns: a (texts, urls) pair where
# index 0 holds the link texts and index 1 the hrefs, or None if the request
# did not come back with status 200.
sections = get_matching_urls(BASE_URL + '/java', '/java/')

In [5]:
# Visit every section page and collect its '/prob/' links.
# problem_data[i] is the (texts, urls) pair for the i-th section in `sections`.
problem_data = []
for section in sections[1]:
    print(section)
    res = get_matching_urls(BASE_URL + section, '/prob/')
    if res is None:
        # get_matching_urls has already printed the failure. Store an empty
        # (texts, urls) pair so problem_data stays index-aligned with
        # `sections` and later cells can still index every entry instead of
        # failing with a TypeError on None.
        res = ([], [])
    problem_data.append(res)

    # Pause between requests (presumably a politeness delay for the server).
    time.sleep(10)

/java/Warmup-1
/java/Warmup-2
/java/String-1
/java/Array-1
/java/Logic-1
/java/Logic-2
/java/String-2
/java/String-3
/java/Array-2
/java/Array-3
/java/AP-1
/java/Recursion-1
/java/Recursion-2
/java/Map-1
/java/Map-2
/java/Functional-1
/java/Functional-2


In [6]:
# Flatten the per-section URL lists into one list, then display the total
# count next to the distinct count as a quick duplicate check.
problem_urls = [url for _texts, urls in problem_data for url in urls]
(len(problem_urls), len(set(problem_urls)))

(317, 317)

In [7]:
# Build the lookup table: problem URL -> {'section': ..., 'title': ...}.
# The i-th entry of problem_data belongs to the i-th section text.
problems = {}
for i, subsection in enumerate(sections[0]):
    titles, urls = problem_data[i]
    for title, url in zip(titles, urls):
        problems[url] = dict(section=subsection, title=title)

In [8]:
def get_problem(url):
    """Download a page and return its text with ``<br>`` rewritten as ``<br/>``.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response text with every ``<br>`` replaced by ``<br/>``, or
        ``None`` (after printing the status code) when the status is not 200.
    """
    response = requests.get(url)
    if response.status_code == 200:
        # Self-close the <br> tags before handing the markup back.
        return response.text.replace('<br>', '<br/>')
    print(response.status_code)
    return None

In [9]:
import datetime

In [11]:
# Download the page of every problem and store it under the record's 'raw' key.
# A timestamp is printed before and after the run.
print('{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))
for problem_url, record in problems.items():
    # Progress output: URLs ending in '1' are printed in full as occasional
    # markers; every other URL just prints a dot on the same line.
    if not problem_url.endswith('1'):
        print('.', end='')
    else:
        print(problem_url)
    # get_problem returns None on a non-200 response, so 'raw' may be None.
    record['raw'] = get_problem(BASE_URL + problem_url)
    time.sleep(5)
print('done')
print('{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))

2017-03-30 16:43:57
/prob/p136351
............../prob/p159531
../prob/p139411
............................/prob/p163411
.............../prob/p191991
......../prob/p143461
./prob/p128461
............/prob/p105771
/prob/p136041
...../prob/p194491
....../prob/p170371
/prob/p115011
..................../prob/p194781
........../prob/p177181
....../prob/p196441
..................../prob/p172021
./prob/p105031
/prob/p110141
.................................../prob/p130781
............../prob/p165701
........./prob/p105671
..../prob/p183071
/prob/p184031
....../prob/p170181
./prob/p113261
..../prob/p199171
............../prob/p170221
.........../prob/p141061
...../prob/p167011
..../prob/p165941
......................../prob/p186031
.......done
2017-03-30 17:10:57


In [13]:
import json

In [14]:
# Persist the collected problem records as JSON in the working directory.
with open('./problems.json', 'w') as out_file:
    json.dump(problems, out_file)