In [1]:
# Our jupyter/datascience-notebook Docker container comes with 
# BeautifulSoup4 and requests, both popular libraries!

from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

In [2]:
# Entry point for the scrape: the Brickset listing page for year 2016
# (per the URL path).
START_URL = 'https://brickset.com/sets/year-2016'

In [3]:
# Exercise #1: Get the titles for each "brickset" on the first page

def get_titles(soup):
    """Return the title text of every Lego set listed on a parsed page.

    Exercise #1: use Beautiful Soup to select the text of each set's title.

    Args:
        soup: The parsed HTML document, as returned by ``BeautifulSoup(...)``.

    Returns:
        list[str]: One title per ``<article class="set">`` inside
        ``<section class="setlist">``, read from the ``<h1>`` within the
        article's ``<div class="meta">``. A title looks like
        "10252: Volkswagen Beetle"; the text is returned exactly as
        ``get_text()`` yields it (no whitespace normalisation).
        An empty list is returned when the page has no set list, and
        articles lacking a title heading are skipped.
    """
    set_list = soup.find('section', attrs={'class': 'setlist'})
    if set_list is None:
        # ``find`` returns None when nothing matches (e.g. an error page).
        return []

    titles = []
    for article in set_list.find_all('article', attrs={'class': 'set'}):
        meta = article.find('div', attrs={'class': 'meta'})
        heading = meta.find('h1') if meta is not None else None
        if heading is None:
            # Malformed entry: skip it rather than abort the whole page.
            continue
        titles.append(heading.get_text())
    return titles

def parse_bricks(url, timeout=10):
    """Fetch a Lego Bricks page and extract the set titles from it.

    Makes a GET request to ``url`` with ``requests``, parses the HTML body
    of the response with Beautiful Soup, and hands the parsed document to
    ``get_titles``.

    Args:
        url (str): Address of the page to download.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        list[str]: The titles returned by ``get_titles`` for the page.

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return get_titles(soup)



In [4]:
# Scrape the titles from the first listing page only (performs a network request).
bricks = parse_bricks(url=START_URL)

In [5]:
# Spot-check two known entries of the first page. The expected strings keep
# the double space after the colon. On failure the actual value is shown.
assert bricks[0] == '10251:  Brick Bank', bricks[0]
assert bricks[9] == '10722:  Snake Showdown', bricks[9]

In [None]:
# Exercise #2

# Now write code that gets you all the links from ALL the pages.

# HINT: you will probably want to extract the URL in the "next" button at 
# the bottom of the search pagination, which looks like ">".

# HINT HINT: Think of the previous exercise on APIs and internet data.
# The Pokemon API returned JSON, which we converted to a dictionary with
# a nice structure. In particular, there were two top-level keys of interest:
# one held the "results" in a list, the other was the "next" URL to call to get
# more items. If you can replicate this return structure, you will be able to
# reuse almost all of the while loop you had there!

# HINT HINT HINT: There's no reason you shouldn't be able to reuse the previous 
# functions (get_titles and parse_bricks)

def next_page(soup):
    """Return the URL behind the pagination's "next" button, or False.

    Looks for ``<div class="pagination">``, then ``<li class="next">``
    inside it, then the ``<a>`` inside that, and returns its ``href``.

    Args:
        soup: The parsed HTML document, as returned by ``BeautifulSoup(...)``.

    Returns:
        str | bool: The ``href`` value of the "next" link, or ``False`` when
        any element in that chain (or the ``href`` itself) is missing.
    """
    pagination = soup.find('div', attrs={'class': 'pagination'})
    if pagination is None:
        # ``find`` returns None when nothing matches: no pagination at all.
        return False

    next_item = pagination.find('li', attrs={'class': 'next'})
    if next_item is None:
        return False

    link = next_item.find('a')
    if link is None:
        return False

    # A missing or empty href also means there is nowhere to go next.
    return link.get('href') or False

def find_all_titles(url, timeout=10):
    """Collect the set titles from the page at ``url`` and all following pages.

    Each page is downloaded and parsed, its titles are gathered with
    ``get_titles``, and ``next_page`` supplies the address of the next page
    until it reports that there is none.

    Args:
        url (str): Address of the first listing page.
        timeout (float): Seconds to wait for each response before giving up.

    Returns:
        list[str]: Titles from every visited page, in visiting order.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    titles_all = []
    visited = set()
    # ``next_page`` signals the end with False. Also stop when a page links
    # back to one already scraped, so a circular "next" link cannot loop forever.
    while url and url not in visited:
        visited.add(url)
        response = requests.get(url, timeout=timeout)
        # Fail loudly on an HTTP error instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        titles_all.extend(get_titles(soup))
        next_href = next_page(soup)
        # Resolve relative hrefs against the current page; absolute hrefs
        # pass through ``urljoin`` unchanged.
        url = urljoin(url, next_href) if next_href else False
    return titles_all

        
        
# Walk every listing page starting from START_URL (one network request per page).
titles = find_all_titles(url=START_URL)

