In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# Notebook-wide pandas display settings, applied one (option, value) pair at a
# time: a 500-character display width, up to 100 columns shown, and the HTML
# representation enabled for notebook output.
for _option, _value in (
    ('display.width', 500),
    ('display.max_columns', 100),
    ('display.notebook_repr_html', True),
):
    pd.set_option(_option, _value)
import seaborn as sns
# Plot appearance for the whole notebook: the "whitegrid" style preset first,
# then the "poster" context preset.
sns.set_style(style="whitegrid")
sns.set_context(context="poster")

In [2]:
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests

In [3]:
# Download the Wikipedia page for the Billboard Year-End Hot 100 singles of 1970.
# The timeout keeps this cell from hanging forever if the server never answers,
# and raise_for_status() turns an HTTP error response (4xx/5xx) into an exception
# here instead of letting an error page flow into the parsing cells below.
# HTTPS is requested directly rather than relying on a redirect from plain HTTP.
t1970=requests.get(
    "https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1970",
    timeout=30,
)
t1970.raise_for_status()

In [4]:
# Take the text body of the downloaded response and wrap it in a PyQuery
# document so it can be queried with selectors in the following cells.
page_html = t1970.text
d_ = pq(page_html)

In [5]:
# Select every "tr" found inside an element with the "wikitable" class, drop the
# first match (presumably the table's header row), and re-wrap the remaining
# rows in a PyQuery object that can be iterated over.
table_rows = d_('.wikitable tr')
d_rows = pq(table_rows[1:])

In [6]:
# Start by creating an empty list. Resetting it here also keeps this cell
# safe to re-run: the list is rebuilt from scratch each time.
songs=[]

# Iterate over the elements of d_rows. In this case "r" will
# receive each value from "d_rows" in turn.
for r in d_rows:
    # Extract the "td" (data cell) elements from the current value of r.
    d_td=pq(r)('td')
    # The code below reads three cells (position, title, artist). Rows with
    # fewer "td" cells -- for example header rows built only from "th" cells --
    # are skipped rather than crashing the whole loop with an IndexError.
    if len(d_td) < 3:
        continue
    # Get the text from the first (index zero) "td" element, and convert
    # it to an integer. If you have the page open, this is the value on
    # the first column of the row (i.e. the song position on the chart).
    ranking = int(pq(d_td[0]).text())
    # The second column usually holds an "a" element, so we extract that
    # before getting the text data. This is the song title. When the cell has
    # no link we fall back to the cell's own text (minus surrounding double
    # quotes) so that the title is not silently recorded as an empty string.
    title_cell = pq(d_td[1])
    title=title_cell('a').text() or title_cell.text().strip('"')
    # We then get the singer name from the links in the third column, again
    # falling back to the plain cell text when there is no link.
    band_singer_cell = pq(d_td[2])
    band_singer_links = band_singer_cell('a')
    band_singer=band_singer_links.text() or band_singer_cell.text()
    # Along with the singer name, we also want the URL of their Wikipedia page,
    # which is held in the "href" attribute of the "a" element.
    # Notice that we are still looking at the same cell we used to get the name
    # (index 2). The value is None when the cell contains no link.
    band_singer_url=band_singer_links.attr.href
    # Next we'll place all this information in a dictionary (also called
    # a map, an associative array, a hash, etc.). We will use dictionaries
    # a lot, so it's worth doing some quick research on this versatile Python
    # structure.
    songdict=dict(ranking=ranking, title=title, band_singer=band_singer, url=band_singer_url)
    # Lastly, we add the dictionary with the row information to the list we
    # created at the beginning of the cell.
    # Lists will also be used extensively during the course,
    # so you might as well do some research on them too.
    songs.append(songdict)

In [7]:
# By the way, slices are half-open: the start index is included and the end
# index is excluded.
# So songs[2:4] will give us the third and fourth elements (indexes 2 and 3),
# but not the fifth.
songs[2:4]

[{'band_singer': 'The Guess Who',
  'ranking': 3,
  'title': 'American Woman',
  'url': '/wiki/The_Guess_Who'},
 {'band_singer': 'B.J. Thomas',
  'ranking': 4,
  'title': "Raindrops Keep Fallin' on My Head",
  'url': '/wiki/B.J._Thomas'}]