In [1]:
# scrape USN's top 60 countries
import requests
import re
import warnings
import pandas as pd                # for export
import pickle                      # for export
from bs4 import BeautifulSoup   # html parser
from collections import namedtuple # like a dict
from urlparse import urljoin 

In [2]:
def dl_data(url, timeout=30):
    """Download `url` with an HTTP GET and return the response.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without it a stalled connection would block the notebook forever.

    Returns
    -------
    requests.Response
        The successful response; callers read its ``.text``.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure, timeout, or a 4xx/5xx status code.
    """
    print("Downloading data...")
    r = requests.get(url, timeout=timeout)
    # Fail here, with the status code in the message, instead of handing an
    # error page to the HTML parsers further down.
    r.raise_for_status()
    print("Done.")
    return r


def get_country_info(item):
    """Wrap one country anchor tag in a CountryStore record.

    The tag's text becomes ``name`` and its ``href`` attribute becomes
    ``url``. A tag without an ``href`` raises ``KeyError``.
    """
    return CountryStore(name=item.text, url=item['href'])


def get_countries(raw):
    """Extract the country links from the downloaded page.

    Parameters
    ----------
    raw : response object
        Must expose the page HTML as ``raw.text``.

    Returns
    -------
    list
        One record per ``<a>`` tag, as built by ``get_country_info``;
        ``None`` results are dropped.

    Raises
    ------
    IndexError
        If the page has fewer than two ``div.small-12.column`` blocks, or
        the second one contains no ``<p>``.
    """
    soup = BeautifulSoup(raw.text, "html.parser")
    divs = soup.find_all('div', attrs={'class': 'small-12 column'})
    # The links are read from the first paragraph of the second matching div.
    p = divs[1].select('p')[0]
    # Build each record once; the filter then reuses it instead of calling
    # get_country_info a second time per link.
    infos = (get_country_info(c) for c in p.select('a'))
    return [info for info in infos if info is not None]


def get_subrank_info(item):
    """Return ``(name, url)`` for a sub-ranking anchor tag.

    ``name`` is the tag's text; ``url`` is its ``href`` attribute with every
    occurrence of ``BASEURL`` removed.
    """
    return item.text, item['href'].replace(BASEURL, "")


def tofloat(text):
    """Convert scraped text to a float.

    Surrounding whitespace and any ``"("`` characters are removed before
    conversion.

    Parameters
    ----------
    text : str
        Raw text expected to contain a number.

    Returns
    -------
    float or bool
        The parsed value, or ``False`` (after emitting a warning) when the
        cleaned text is not a valid float. Note that ``False == 0.0``, so
        callers that need to tell failure from zero must test with ``is False``.
    """
    t = text.strip().replace("(", "")
    try:
        # Convert once; the result is both the validity check and the value.
        return float(t)
    except ValueError:
        warnings.warn("Warning: Did not get a float where needed.")
        return False

    
def get_attributes(raw):
    """Parse the methodology page into one AttributeStore per sub-ranking.

    Parameters
    ----------
    raw : response object
        Must expose the page HTML as ``raw.text``.

    Returns
    -------
    list of AttributeStore
        One record per weight found on the page, with fields ``subranking``
        and ``url`` (from the i-th link), ``weight`` (via ``tofloat``, so
        ``False`` if unparsable) and ``attributes`` (a one-element list
        wrapping the list of attribute strings).

    Raises
    ------
    IndexError
        If the content span is missing, or there are fewer links or
        attribute lists than weights.
    """
    soup = BeautifulSoup(raw.text, "html.parser")
    # All data is read from a single span identified by this hard-coded id.
    div = soup.select('span#docs-internal-guid-c27701e2-0e7e-7a4a-d5bd-469d49804385')
    content = div[0]
    # name, url:
    alinks = content.find_all('a')
    # weights: bold "<number> percent" fragments. Raw string so the regex
    # escapes are not treated as (invalid) string escapes; the optional \S
    # tolerates one stray character such as "(" around the number.
    wts = re.findall(r'<b>(\S?\d+\.\d+ )percent\S?<\/b>', str(div))
    # lists of attributes: only every third nested span (0, 3, 6, ...) is used.
    aa = content.find_all('span')
    la = []
    for span in aa[::3]:
        t = span.text.replace(u"\xa0", "").replace("<span>", "")
        t = t.replace("): ", "")
        la.append([t.split(", ")])
    # The attribute list for the last sub-ranking is not scraped; it is
    # supplied by hand and appended after the scraped ones.
    la.append(["a good job market, affordable, economically stable, family friendly, income equality, politically stable, safe, well-developed public education system, well-developed public health system".split(", ")])
    # COMBINE: the i-th weight, i-th link and i-th attribute list belong together.
    res = []
    for i, raw_weight in enumerate(wts):
        weight = tofloat(raw_weight)
        name, url = get_subrank_info(alinks[i])
        res.append(AttributeStore(subranking=name, url=url, weight=weight, attributes=la[i]))
    return res

In [16]:
# Site root: page paths below are joined onto it, and get_subrank_info strips
# it from scraped links.
BASEURL = "http://www.usnews.com"

# Record types built by the parsing helpers.
CountryStore = namedtuple("CountryStore", 'name, url')
AttributeStore = namedtuple("AttributeStore", 'subranking, url, weight, attributes')

# Pages to scrape: the data explorer (country links) and the methodology
# article (sub-rankings, weights and attributes).
post1 = "/news/best-countries/data-explorer"
post2 = "/news/best-countries/articles/methodology"
r1 = dl_data(urljoin(BASEURL, post1))
r2 = dl_data(urljoin(BASEURL, post2))

countries = get_countries(r1)
print("Found %i countries." % len(countries))  # 60

attributes = get_attributes(r2)
print("Found %i subrankings." % len(attributes))  # 9

Downloading data...
Done.
Downloading data...
Done.
Found 60 countries.
Found 9 subrankings.


In [17]:
# Tabulate the scraped records; column names come from the namedtuple fields.
dfc = pd.DataFrame.from_records(countries, columns=CountryStore._fields)
dfa = pd.DataFrame.from_records(attributes, columns=AttributeStore._fields)

print(dfc.head())
print(dfa)

# Export to pickle files (currently disabled):
#outfile = open("../../data/countries.pickle", "wb")
#pickle.dump(dfc, outfile)
#outfile.close()

#outfile = open("../../data/attributes.pickle", "wb")
#pickle.dump(dfa, outfile)
#outfile.close()

         name                              url
0     Algeria     /news/best-countries/algeria
1   Argentina   /news/best-countries/argentina
2   Australia   /news/best-countries/australia
3     Austria     /news/best-countries/austria
4  Azerbaijan  /news/best-countries/azerbaijan
           subranking                                              url  \
0           Adventure          /news/best-countries/adventure-rankings   
1         Citizenship        /news/best-countries/citizenship-rankings   
2  Cultural Influence          /news/best-countries/influence-rankings   
3    Entrepreneurship   /news/best-countries/entrepreneurship-rankings   
4            Heritage           /news/best-countries/heritage-rankings   
5   Open for Business  /news/best-countries/open-for-business-rankings   
6              Movers             /news/best-countries/movers-rankings   
7               Power              /news/best-countries/power-rankings   
8     Quality of Life    /news/best-countries/qualit