## Data Acquisition 

This notebook seeks the genres for the artists responsible for top 100 before 1992.   It uses the approach of HW1 first. 

In [21]:
# The stuff from HW1
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." contructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

from pyquery import PyQuery as pq

import requests
import json
import os

Now the scraping routine adjusted to include years 1970 to 1991

In [22]:
readflag=False
if os.access('tempdata/yearinfo.json', os.R_OK)== True:
    readflag=True
    print "Reading from File, Not Internet"
    with open("tempdata/yearinfo.json", "r") as fd:
        yearinfo = json.load(fd)

In [23]:
if not readflag:
    years=range(1970,2015)
    print years
    yearstext={}
    for y in years:
        print y
        yreq=requests.get("http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_%i" % y)
        yearstext[y]=yreq.text
        time.sleep(1)


[1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014]
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014


Wikipedia's format for the billboard 100 changed between 1981 and 1982. Before 1981 The rankings were introduced with table rows; after, it was table headers. So the function `parse_year()` will work for 1982-1991.  Something else is necessary for earlier ones. 

In [24]:
def parse_year(year, ytextdict):
    #print "Year", year
    # Common over all years
    d_ = pq(ytextdict[year])
    d_rows = pq(d_('.wikitable tr')[1:])
    songs=[]
    for r in d_rows:
        # Now we have to handle pre-1982 separately from post- 
        # Wikipedia decided to change the table format along the way, and place the song position on a "th" element
        # instead of the "td" element from the first page we parsed.
        if year < 1982:
            #first cell has ranking
            ranking = int(pq(r)('td').eq(0).text())
            songid = str(year)+'-'+str(ranking)
            #second has title info
            d_title=pq(r)('td').eq(1)
            titletext = d_title.text()
            bst=pq(pq(d_title)('a'))
            #third has artist info
            d_artist = pq(r)('td').eq(2)
            tdtext=pq(d_artist.text()) 
            bs=pq(pq(d_artist)('a'))
        else:
            d_th=pq(r)('th')
            ranking = int(pq(d_th[0]).text())
            songid = str(year)+'-'+str(ranking)
            d_td=pq(r)('td')
            titletext=pq(d_td[0]).text()
            bst=pq(pq(d_td[0])('a'))
            tdtext=pq(d_td[1]).text()
            bs=pq(pq(d_td[1])('a'))
        
        songanchors=[pq(e) for e in bst]
        songnames=[item.text() for item in songanchors]
        songurls=[item.attr.href for item in songanchors]
        # And do the same for the song names and URLs.
        band_singer=[pq(e).text() for e in bs]
        band_singer_url=[pq(e).attr.href for e in bs]
        # Some singers and songs don't actually have an "a" element, and therefore don't have a URL link.
        # In those cases just use the text between the tags as the song name and set the URL field to None.
        if len(songanchors)==0:
            # Python accepts negative indexes, which count from the end of the string.
            # There is no "-0" position, to the last element is the "-1" element. In this case we are saying
            # we only want the second to next to last elements. We do this to remove the quotes around the string.
            song=[titletext[1:-1].strip()] # The strip method removes whitespaces at the begining and end of a string.
            songurl=[None]
        songdict=dict(ranking=ranking, song=songnames, songid=songid, songurl=songurls, titletext=titletext,  band_singer=band_singer, url=band_singer_url)
        songs.append(songdict)
    return songs

#yearstext[1970]
#parse_year(1980, yearstext)[:5]

In [25]:
yearinfo = {y:parse_year(y, yearstext) for y in years}

In [26]:
yearinfo[1970][0]

{'band_singer': ['Simon & Garfunkel'],
 'ranking': 1,
 'song': ['Bridge over Troubled Water'],
 'songid': '1970-1',
 'songurl': ['/wiki/Bridge_over_Troubled_Water_(song)'],
 'titletext': '" Bridge over Troubled Water "',
 'url': ['/wiki/Simon_%26_Garfunkel']}

In [27]:
if not readflag:
    fd = open("tempdata/yearinfo.json","w")
    json.dump(yearinfo, fd)
    fd.close()


Here we should be able to follow HW1 for making a flat dataframe:

In [28]:
yeardict={}
for y in yearinfo.keys():
    yearlist=yearinfo[y]
    yearlist2=[]
    for idict in yearlist:
        singers=idict['band_singer']
        for i,s in enumerate(singers):
            songs=idict['song']
            for j,so in enumerate(songs):#now inside each singer song combination
                nd={}
                nd['band_singer']=s
                nd['url']=idict['url'][i]
                nd['song']=so
                nd['songurl']=idict['songurl'][j]
                nd['ranking']=idict['ranking']
                yearlist2.append(nd)
    yeardict[y]=pd.DataFrame(yearlist2)#one for each year
yearspanel=pd.Panel.from_dict(yeardict, orient="minor")#stack dataframes into a panel
hierframe=yearspanel.to_frame() #flattening leads to a hierarchical index
flatframe = hierframe.reset_index()
flatframe = flatframe.rename(columns={'minor':'year'})
del flatframe['major']
flatframe.head(8)


Unnamed: 0,year,band_singer,ranking,song,songurl,url
0,1970,Simon & Garfunkel,1,Bridge over Troubled Water,/wiki/Bridge_over_Troubled_Water_(song),/wiki/Simon_%26_Garfunkel
1,1971,Three Dog Night,1,Joy to the World,/wiki/Joy_to_the_World_(Hoyt_Axton_song),/wiki/Three_Dog_Night
2,1972,Roberta Flack,1,The First Time Ever I Saw Your Face,/wiki/The_First_Time_Ever_I_Saw_Your_Face,/wiki/Roberta_Flack
3,1973,Tony Orlando and Dawn,1,Tie a Yellow Ribbon Round the Ole Oak Tree,/wiki/Tie_a_Yellow_Ribbon_Round_the_Ole_Oak_Tree,/wiki/Tony_Orlando_and_Dawn
4,1974,Barbra Streisand,1,The Way We Were,/wiki/The_Way_We_Were_(song),/wiki/Barbra_Streisand
5,1975,Captain & Tennille,1,Love Will Keep Us Together,/wiki/Love_Will_Keep_Us_Together,/wiki/Captain_%26_Tennille
6,1976,Wings,1,Silly Love Songs,/wiki/Silly_Love_Songs,/wiki/Wings_(band)
7,1977,Rod Stewart,1,Tonight's the Night (Gonna Be Alright),/wiki/Tonight%27s_the_Night_(Gonna_Be_Alright),/wiki/Rod_Stewart


In [30]:
flatframe.year = flatframe.year.astype(int)
flatframe.dtypes

year             int32
band_singer     object
ranking        float64
song            object
songurl         object
url             object
dtype: object

In [31]:
# Again there  is a change in construction of Genres section of artist page
# 1970- genres are comma-sparated list, not <li> tagged
# same access method, just eliminate the li argument to nextAll 
# since one ignores the other, just run both
for th in pq(requests.get("http://en.wikipedia.org/wiki/Jason_Derulo").text)(".infobox tr th"):
    if pq(th).text() == "Genres":
        for e in pq(th).nextAll("td  a"):
            if pq(e).attr.href.find("#cite_note") == -1:
                print pq(e).attr.href, pq(e).attr.title




for th in pq(requests.get("http://en.wikipedia.org/wiki/Jason_Derulo").text)(".infobox tr th"):
    if pq(th).text() == "Genres":
        for e in pq(th).nextAll("td li a"):
            if pq(e).attr.href.find("#cite_note") == -1:
                print pq(e).attr.href, pq(e).attr.title

/wiki/Contemporary_R%26B Contemporary R&B
/wiki/Hip_hop_music Hip hop music
/wiki/Pop_music Pop music
/wiki/Contemporary_R%26B Contemporary R&B
/wiki/Hip_hop_music Hip hop music
/wiki/Pop_music Pop music


In [50]:
def get_page(url):
    # Check if URL has already been visited.
    if (url not in urlcache) or (urlcache[url]==1) or (urlcache[url]==2):
        time.sleep(1)
        # try/except blocks are used whenever the code could generate an exception (e.g. division by zero).
        # In this case we don't know if the page really exists, or even if it does, if we'll be able to reach it.
        try:
            r = requests.get("http://en.wikipedia.org%s" % url)
            if r.status_code == 200:
                urlcache[url] = r.text
                #print url
            else:
                urlcache[url] = 1
                print "error not 200",url
        except:
            print "error",url
            urlcache[url] = 2
    
    return urlcache[url]


In [51]:
flatframe=flatframe.sort('year')
len(flatframe)

5073

In [52]:
%%time

readflag2=False
#if the file is there, just open it. 
if os.access('tempdata/artistinfo.json', os.R_OK)== True:
    readflag2=True
    print "Reading artist info from File, Not Internet"
    with open("tempdata/artistinfo.json") as fd:
        urlcache = json.load(fd)
else:
    # file not there, have to build it
    urlcache={}
    flatframe["url"].apply(get_page)

Wall time: 40min 26s


In [53]:
print np.sum([(urlcache[k]==1) or (urlcache[k]==2) for k in urlcache])# no one or 0's
print len(flatframe.url.unique())==len(urlcache)#we got all of the urls


0
True


In [54]:
if not readflag2:                #didnt just load from disk, so built urlcache; write it
    with open("tempdata/artistinfo.json","w") as fd:
        json.dump(urlcache, fd)
        print "writing to disk"
    


writing to disk


In [55]:
len(urlcache)

1833

In [56]:
def singer_band_info(url, page_text):
    genres=['NA']
    born=False
    ya=False

    for th in pq(page_text)(".infobox tr th"):
        if pq(th).text() == "Genres":
            genres=[]
            for e in pq(th).nextAll("td a"):
                if pq(e).attr.href.find("#cite_note") == -1:
                    genres.append(pq(e).attr.href)
        if pq(th).text() == "Born":
            for e in pq(th).nextAll("td .bday"):
                born=pq(e).text()
        if pq(th).text() == "Years active":
            for e in pq(th).nextAll("td"):
                ya=pq(e).text()
    return dict(url=url, genres=genres, born=born, ya=ya)

In [57]:
singer_band_info_list=[]
for k,v in urlcache.items():
    singer_band_info_list.append(singer_band_info(k, v))


In [58]:
tempdf=pd.DataFrame(singer_band_info_list)
tempdf.head()

Unnamed: 0,born,genres,url,ya
0,False,[/wiki/Hard_rock],/wiki/Survivor_(band),1978–1988 1993–present
1,1973-11-19,[/wiki/Country_music],/wiki/Billy_Currington,1996–present
2,False,[NA],/wiki/Buckner_%26_Garcia,False
3,False,"[/wiki/Philadelphia_soul, /wiki/Soul_music]",/wiki/Blue_Magic_(band),1972–present
4,1944-01-16,"[/wiki/Country_music, /wiki/Pop_music]",/wiki/Jim_Stafford,1974–present


In [59]:
largedf=flatframe.merge(tempdf, on="url")
largedf.head()

Unnamed: 0,year,band_singer,ranking,song,songurl,url,born,genres,ya
0,1970,Simon & Garfunkel,1,Bridge over Troubled Water,/wiki/Bridge_over_Troubled_Water_(song),/wiki/Simon_%26_Garfunkel,False,[/wiki/Folk_rock],"1957–1970 (Reunions: 1975, 1981–83, 1990, 1993..."
1,1970,Simon & Garfunkel,49,Cecilia,/wiki/Cecilia_(Simon_%26_Garfunkel_song),/wiki/Simon_%26_Garfunkel,False,[/wiki/Folk_rock],"1957–1970 (Reunions: 1975, 1981–83, 1990, 1993..."
2,1970,Marmalade,43,Reflections of My Life,/wiki/Reflections_of_My_Life,/wiki/Marmalade_(band),False,"[/wiki/Beat_music, /wiki/Pop_music, /wiki/Psyc...",1966–present
3,1970,Chicago,59,Make Me Smile,/wiki/Make_Me_Smile,/wiki/Chicago_(band),False,"[/wiki/Rock_music, /wiki/Soft_rock, /wiki/Jazz...",1967–present
4,1970,Chicago,61,25 or 6 to 4,/wiki/25_or_6_to_4,/wiki/Chicago_(band),False,"[/wiki/Rock_music, /wiki/Soft_rock, /wiki/Jazz...",1967–present


In [None]:
genres = set()
for g in largedf.genres:
    genres.update(g)
genres = sorted(genres)

#make a column for each genre
for genre in genres:
    largedf[genre] = [genre in singergenres for singergenres in largedf.genres]         

genres

In [68]:
# make dict of genres with list of songids within each.
# first a dict keyed by genre with blank list as value
genredict={}
for genre in genres:
    genredict[genre]=[]
# for each song, append its id 

In [85]:
# for each song, append its id to each genre list is it associated with 
counter=0
for song in largedf.iterrows():
    counter += 1
    #if counter>9:
        #break
    songid= str(song[1][0])+'-'+str(int(song[1][2]))
    for i in range(0,len(song[1][7])):
        genredict[song[1][7][i]].append(songid) 
        
    

In [87]:
    with open("songsbygenre.json","w") as fd:
        json.dump(genredict, fd)