In [None]:
## Getting Genres All the Way Back to 1970 

This notebook collects the genres of the artists behind the Billboard Year-End Hot 100 singles from 1970-2014. It uses the approach of HW1 and extends it to collect genres from earlier years, when the wiki pages were formatted differently. 

In [1]:
# The stuff from HW1
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." constructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

from pyquery import PyQuery as pq

import requests
import json
import os

Now the scraping routine adjusted to include years 1970 to 1991

In [2]:
# Use the locally cached scrape when it is readable instead of going back to
# Wikipedia. readflag records which path was taken; later cells branch on it.
readflag = os.access('tempdata/yearinfo.json', os.R_OK)
if readflag:
    print("Reading from File, Not Internet")
    with open("tempdata/yearinfo.json", "r") as fd:
        yearinfo = json.load(fd)


Reading from File, Not Internet


In [3]:
years = range(1970, 2015)
if readflag:
    print("Skipping the http requests because the result is already stored.")
else:
    # No cached copy: download one year-end chart page per year, keyed by year,
    # with a one-second pause after each request.
    print(years)
    yearstext = {}
    for y in years:
        print(y)
        yreq = requests.get(
            "http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_%i" % y
        )
        yearstext[y] = yreq.text
        time.sleep(1)


Skipping the http requests because the result is already stored.


Wikipedia's table format for the Billboard year-end Hot 100 changes between 1981 and 1982. Through 1981 the ranking sits in an ordinary table cell (`td`); from 1982 on it sits in a table header cell (`th`). So the HW1 logic in `parse_year()` also works for 1982-1991; the earlier years need a separate branch. 

In [4]:
def parse_year(year, ytextdict):
    """Extract the chart rows from one year's Billboard year-end page.

    Parameters
    ----------
    year : int
        Chart year. Used as the key into ``ytextdict``, to pick the row
        layout (before 1982 vs. 1982 onward) and to build each ``songid``.
    ytextdict : dict
        Maps year -> HTML text of that year's Wikipedia page.

    Returns
    -------
    list of dict
        One dict per table row with keys ``ranking`` (int), ``song`` (list of
        song names), ``songid`` ("<year>-<ranking>"), ``songurl`` (list of
        hrefs parallel to ``song``), ``titletext`` (full text of the title
        cell), ``band_singer`` (list of linked artist names) and ``url``
        (list of artist hrefs parallel to ``band_singer``).

    Raises
    ------
    ValueError
        If a row's ranking cell does not contain an integer.
    """
    d_ = pq(ytextdict[year])
    # Every row of every ".wikitable" except the very first one
    # (presumably the column-header row).
    d_rows = pq(d_('.wikitable tr')[1:])
    songs = []
    for r in d_rows:
        row = pq(r)
        cells = row('td')
        if year < 1982:
            # Older pages: ranking, title and artist are three "td" cells.
            rank_cell = cells.eq(0)
            title_cell = cells.eq(1)
            artist_cell = cells.eq(2)
        else:
            # 1982 onward: the ranking sits in a "th"; title and artist are
            # the first two "td" cells.
            rank_cell = row('th').eq(0)
            title_cell = cells.eq(0)
            artist_cell = cells.eq(1)

        ranking = int(rank_cell.text())
        songid = str(year) + '-' + str(ranking)
        titletext = title_cell.text()

        # Linked song titles and linked artists, each with its href.
        song_links = [pq(a) for a in title_cell('a')]
        songnames = [a.text() for a in song_links]
        songurls = [a.attr.href for a in song_links]
        artist_links = [pq(a) for a in artist_cell('a')]
        band_singer = [a.text() for a in artist_links]
        band_singer_url = [a.attr.href for a in artist_links]

        if not songnames:
            # The title has no link: fall back to the cell text with the
            # surrounding quote characters trimmed, and record no URL.
            # (Previously this fallback was computed but never stored, so
            # such rows ended up with empty song lists.)
            songnames = [titletext[1:-1].strip()]
            songurls = [None]
        # No equivalent fallback is applied to artists: a row whose artist
        # cell has no links keeps empty band_singer/url lists.

        songs.append(dict(ranking=ranking, song=songnames, songid=songid,
                          songurl=songurls, titletext=titletext,
                          band_singer=band_singer, url=band_singer_url))
    return songs



In [5]:
if not readflag:
    # JSON object keys are always strings, so a yearinfo loaded from the cache
    # is keyed by "1970", "1971", ... Key the freshly parsed data the same way
    # so lookups such as yearinfo[str(1970)] work on both paths.
    yearinfo = {str(y): parse_year(y, yearstext) for y in years}

In [6]:
# Spot-check the first parsed row for 1970 (the lookup uses a string key).
yearinfo[str(1970)][0]

{u'band_singer': [u'Simon & Garfunkel'],
 u'ranking': 1,
 u'song': [u'Bridge over Troubled Water'],
 u'songid': u'1970-1',
 u'songurl': [u'/wiki/Bridge_over_Troubled_Water_(song)'],
 u'titletext': u'" Bridge over Troubled Water "',
 u'url': [u'/wiki/Simon_%26_Garfunkel']}

In [7]:
if not readflag:
    # Only freshly scraped data needs saving; otherwise it came from this file.
    # Make sure the target directory exists, and use a context manager so the
    # file is closed even if json.dump raises.
    if not os.path.isdir("tempdata"):
        os.makedirs("tempdata")
    with open("tempdata/yearinfo.json", "w") as fd:
        json.dump(yearinfo, fd)


Here we should be able to follow HW1 for making a flat dataframe:

In [8]:
# Flatten yearinfo: for every chart entry emit one record per
# (artist, song) pairing, collected into one DataFrame per year.
yeardict = {}
for y, yearlist in yearinfo.items():
    yearlist2 = [
        dict(
            band_singer=s,
            url=idict['url'][i],
            song=so,
            songurl=idict['songurl'][j],
            ranking=idict['ranking'],
        )
        for idict in yearlist
        for i, s in enumerate(idict['band_singer'])
        for j, so in enumerate(idict['song'])
    ]
    yeardict[y] = pd.DataFrame(yearlist2)

# Stack the per-year frames into a Panel with the year on the minor axis, then
# flatten it; to_frame() produces a hierarchical (major, minor) index.
# NOTE(review): pd.Panel was removed in pandas 0.25, so this cell needs an
# older pandas.
yearspanel = pd.Panel.from_dict(yeardict, orient="minor")
hierframe = yearspanel.to_frame()
flatframe = (
    hierframe
    .reset_index()
    .rename(columns={'minor': 'year'})
    .drop('major', axis=1)
)
flatframe.head(8)


Unnamed: 0,year,band_singer,ranking,song,songurl,url
0,1970,Simon & Garfunkel,1,Bridge over Troubled Water,/wiki/Bridge_over_Troubled_Water_(song),/wiki/Simon_%26_Garfunkel
1,1971,Three Dog Night,1,Joy to the World,/wiki/Joy_to_the_World_(Hoyt_Axton_song),/wiki/Three_Dog_Night
2,1972,Roberta Flack,1,The First Time Ever I Saw Your Face,/wiki/The_First_Time_Ever_I_Saw_Your_Face,/wiki/Roberta_Flack
3,1973,Tony Orlando and Dawn,1,Tie a Yellow Ribbon Round the Ole Oak Tree,/wiki/Tie_a_Yellow_Ribbon_Round_the_Ole_Oak_Tree,/wiki/Tony_Orlando_and_Dawn
4,1974,Barbra Streisand,1,The Way We Were,/wiki/The_Way_We_Were_(song),/wiki/Barbra_Streisand
5,1975,Captain & Tennille,1,Love Will Keep Us Together,/wiki/Love_Will_Keep_Us_Together,/wiki/Captain_%26_Tennille
6,1976,Wings,1,Silly Love Songs,/wiki/Silly_Love_Songs,/wiki/Wings_(band)
7,1977,Rod Stewart,1,Tonight's the Night (Gonna Be Alright),/wiki/Tonight%27s_the_Night_(Gonna_Be_Alright),/wiki/Rod_Stewart


In [9]:
# Cast the year column to integers, then show the resulting column types.
flatframe['year'] = flatframe['year'].astype(int)
flatframe.dtypes

year             int32
band_singer     object
ranking        float64
song            object
songurl         object
url             object
dtype: object

Again, there is a change in how the Genres section of the artist page is constructed.
For the 1970-era pages, genres are a comma-separated list rather than `<li>`-tagged items. The access method is the same; just drop the `li` from the selector passed to `nextAll`. Since one ignores the other, just run both. It looks like this:

`for th in pq(requests.get("http://en.wikipedia.org/wiki/Jason_Derulo").text)(".infobox tr th"):
    if pq(th).text() == "Genres":
        for e in pq(th).nextAll("td  a"):
            if pq(e).attr.href.find("#cite_note") == -1:
                print pq(e).attr.href, pq(e).attr.title`

`for th in pq(requests.get("http://en.wikipedia.org/wiki/Jason_Derulo").text)(".infobox tr th"):
    if pq(th).text() == "Genres":
        for e in pq(th).nextAll("td li a"):
            if pq(e).attr.href.find("#cite_note") == -1:
                print pq(e).attr.href, pq(e).attr.title`


Slurp up the pages if they aren't already available.

In [10]:
def get_page(url, cache=None):
    """Return the HTML for a Wikipedia path, fetching it only when needed.

    Parameters
    ----------
    url : str
        Site-relative path such as "/wiki/Elvis_Presley"; it is appended to
        "http://en.wikipedia.org".
    cache : dict, optional
        Mapping of url -> page text or failure code. Defaults to the
        notebook-level ``urlcache``, which must exist by the time this
        function is called.

    Returns
    -------
    The cached page text, or 1 if the latest attempt returned a non-200
    status, or 2 if the latest attempt raised an exception.
    """
    if cache is None:
        cache = urlcache
    # Fetch when the url is unseen, or when an earlier attempt left a
    # failure code (1 or 2) behind.
    if url not in cache or cache[url] in (1, 2):
        time.sleep(1)
        # The page may not exist or may be unreachable, so failures are
        # recorded in the cache rather than raised.
        try:
            r = requests.get("http://en.wikipedia.org%s" % url, timeout=30)
            if r.status_code == 200:
                cache[url] = r.text
            else:
                cache[url] = 1
                print("error not 200 %s" % url)
        except Exception:
            # "except Exception" (not a bare except) so that Ctrl-C /
            # SystemExit can still stop a long crawl.
            print("error %s" % url)
            cache[url] = 2

    return cache[url]


In [11]:
# Order the rows by year and report how many rows there are.
# NOTE(review): DataFrame.sort() was removed in pandas 0.20 (sort_values()
# replaces it), so this cell needs an older pandas.
flatframe=flatframe.sort('year')
len(flatframe)

5073

In [12]:
%%time
# Even the cached path takes a while, hence the %%time above.
# readflag2 records whether urlcache came from disk; a later cell uses it to
# decide whether the cache needs to be written out.
readflag2 = os.access('tempdata/artistinfo.json', os.R_OK)
if not readflag2:
    # No cache file: start empty and let get_page() fill urlcache as it is
    # applied to every artist url in the frame.
    urlcache = {}
    flatframe["url"].apply(get_page)
else:
    print("Reading artist info from File, Not Internet")
    with open("tempdata/artistinfo.json") as fd:
        urlcache = json.load(fd)

Reading artist info from File, Not Internet
Wall time: 3.38 s


In [13]:
# Sanity checks on the cache. First line: number of entries still holding a
# failure code (1 or 2). Second line: whether the number of distinct artist
# urls in the frame equals the number of cache entries.
print(np.sum([page == 1 or page == 2 for page in urlcache.values()]))
print(len(flatframe.url.unique()) == len(urlcache))


0
True


In [14]:
if not readflag2:
    # urlcache was built from the network in this session, so persist it.
    # Create the directory first: losing the write after the whole crawl
    # because "tempdata" is missing would be costly.
    if not os.path.isdir("tempdata"):
        os.makedirs("tempdata")
    with open("tempdata/artistinfo.json", "w") as fd:
        json.dump(urlcache, fd)
    print("writing to disk")

In [15]:
# Number of entries (one per fetched url) in the page cache.
len(urlcache)

1833

In [16]:
# HW1 slightly modified
def singer_band_info(url, page_text):
    """Pull genres, birth date and active years out of an artist infobox.

    Parameters
    ----------
    url : str
        The artist's page path; passed through unchanged into the result.
    page_text : str
        HTML of the artist's Wikipedia page.

    Returns
    -------
    dict
        ``url``; ``genres`` - list of hrefs linked from the "Genres" row
        (``['NA']`` when the infobox has no "Genres" header); ``born`` -
        text of the ``.bday`` element in the "Born" row, else False;
        ``ya`` - text of the "Years active" cell, else False.
    """
    genres = ['NA']
    born = False
    ya = False

    for th in pq(page_text)(".infobox tr th"):
        header = pq(th)
        label = header.text()  # read once instead of once per comparison
        if label == "Genres":
            genres = []
            # "td a" matches links anywhere under the cell, so it covers both
            # plain comma-separated genre lists and <li>-wrapped ones.
            for e in header.nextAll("td a"):
                href = pq(e).attr.href
                # Skip anchors that carry no href (attr.href is None for
                # those) and footnote links.
                if href is not None and "#cite_note" not in href:
                    genres.append(href)
        elif label == "Born":
            # If several elements match, the last one wins.
            for e in header.nextAll("td .bday"):
                born = pq(e).text()
        elif label == "Years active":
            for e in header.nextAll("td"):
                ya = pq(e).text()
    return dict(url=url, genres=genres, born=born, ya=ya)

In [17]:
# Parse every cached page into a dict of url / genres / born / ya.
singer_band_info_list = [singer_band_info(k, v) for k, v in urlcache.items()]

In [18]:
# One row per cached url, with the fields returned by singer_band_info().
tempdf=pd.DataFrame(singer_band_info_list)
tempdf.head()

Unnamed: 0,born,genres,url,ya
0,False,[/wiki/Hard_rock],/wiki/Survivor_(band),1978–1988 1993–present
1,1973-11-19,[/wiki/Country_music],/wiki/Billy_Currington,1996–present
2,False,[NA],/wiki/Buckner_%26_Garcia,False
3,False,"[/wiki/Philadelphia_soul, /wiki/Soul_music]",/wiki/Blue_Magic_(band),1972–present
4,1944-01-16,"[/wiki/Country_music, /wiki/Pop_music]",/wiki/Jim_Stafford,1974–present


In [19]:
# Attach the artist fields to every chart row by matching on the artist url.
# merge() defaults to an inner join, so only rows with a url present in both
# frames are kept.
largedf=flatframe.merge(tempdf, on="url")
largedf.head()

Unnamed: 0,year,band_singer,ranking,song,songurl,url,born,genres,ya
0,1970,Simon & Garfunkel,1,Bridge over Troubled Water,/wiki/Bridge_over_Troubled_Water_(song),/wiki/Simon_%26_Garfunkel,False,[/wiki/Folk_rock],"1957–1970 (Reunions: 1975, 1981–83, 1990, 1993..."
1,1970,Simon & Garfunkel,49,Cecilia,/wiki/Cecilia_(Simon_%26_Garfunkel_song),/wiki/Simon_%26_Garfunkel,False,[/wiki/Folk_rock],"1957–1970 (Reunions: 1975, 1981–83, 1990, 1993..."
2,1970,The Sandpipers,90,Come Saturday Morning,/wiki/Come_Saturday_Morning_(song),/wiki/The_Sandpipers,False,[/wiki/Folk_rock],1966–1975
3,1970,Elvis Presley,72,The Wonder of You,/wiki/The_Wonder_of_You,/wiki/Elvis_Presley,1935-01-08,"[/wiki/Rock_and_roll, /wiki/Pop_music, /wiki/R...",1953–1977
4,1972,Elvis Presley,48,Burning Love,/wiki/Burning_Love,/wiki/Elvis_Presley,1935-01-08,"[/wiki/Rock_and_roll, /wiki/Pop_music, /wiki/R...",1953–1977


In [20]:
# Sorted list of every distinct genre entry found in any row's genre list.
genres = sorted(set(
    entry
    for row_genres in largedf['genres']
    for entry in row_genres
))

# One boolean indicator column per genre: True where the row's genre list
# contains that genre.
for genre in genres:
    largedf[genre] = [genre in row_genres for row_genres in largedf['genres']]

In [22]:
# Give every row a song id ("<year>-<ranking>") and build two lookup dicts:
#   genredict:     genre -> list of song ids
#   songgenredict: song id -> genre list of the row
# Columns are read by name rather than by position so the loop does not depend
# on the column order of largedf.
genredict = {genre: [] for genre in genres}
songgenredict = {}
songids = []
for _, row in largedf.iterrows():
    songid = str(row['year']) + '-' + str(int(row['ranking']))
    songids.append(songid)
    songgenredict[songid] = row['genres']
    for genre in row['genres']:
        genredict[genre].append(songid)

# Store the id as a column and also use it as the index.
largedf['song_key'] = songids
largedf = largedf.set_index(largedf['song_key'])
# NOTE(review): DataFrame.sort() was removed in pandas 0.20 (sort_values()
# replaces it), so this cell needs an older pandas.
largedf = largedf.sort(columns=['year','ranking'])
largedf.head()

Unnamed: 0_level_0,year,band_singer,ranking,song,songurl,url,born,genres,ya,/w/index.php?title=Country_Rap&action=edit&redlink=1,/wiki/2_Tone,/wiki/A_cappella,/wiki/Acid_house,/wiki/Acid_jazz,/wiki/Acid_rock,/wiki/Acoustic_music,/wiki/Acoustic_rock,/wiki/Adult_Contemporary,/wiki/Adult_Contemporary_Music,/wiki/Adult_contemporary,/wiki/Adult_contemporary_music,/wiki/Adult_contemporary_music#Soft_adult_contemporary,/wiki/Afrobeat,/wiki/Album-oriented_rock,/wiki/Alternative_R%26B,/wiki/Alternative_country,/wiki/Alternative_dance,/wiki/Alternative_dance#Indietronica,/wiki/Alternative_hip_hop,/wiki/Alternative_metal,/wiki/Alternative_pop,/wiki/Alternative_rock,/wiki/Ambient_house,/wiki/Ambient_music,/wiki/American_folk_music,/wiki/Americana_(music),/wiki/Anarcho-punk,/wiki/Anti-folk,/wiki/Arena_rock,/wiki/Art_pop,/wiki/Art_punk,/wiki/Art_rock,/wiki/Avant-garde_music,/wiki/Baggy,/wiki/Baroque_pop,/wiki/Bass_music,/wiki/Battle_rap,/wiki/Beat_music,/wiki/Bhangra_(music),/wiki/Big_band,...,/wiki/Southern_Rock,/wiki/Southern_gospel,/wiki/Southern_hip_hop,/wiki/Southern_rap,/wiki/Southern_rock,/wiki/Southern_soul,/wiki/Space_disco,/wiki/Space_rock,/wiki/Spoken_word,/wiki/Sunshine_pop,/wiki/Surf_music,/wiki/Surf_rock,/wiki/Swamp_pop,/wiki/Swamp_rock,/wiki/Swing_(genre),/wiki/Swing_music,/wiki/Symphonic_rock,/wiki/Synthpop,/wiki/Talking_blues,/wiki/Tech_house,/wiki/Techno,/wiki/Techno_music,/wiki/Teen_pop,/wiki/Tejano_music,/wiki/Thrash_metal,/wiki/Traditional_pop,/wiki/Traditional_pop_music,/wiki/Trance_music,/wiki/Trap_music,/wiki/Trip_hop,/wiki/UK_funky,/wiki/UK_garage,/wiki/Underground_hip_hop,/wiki/Urban_adult_contemporary,/wiki/Urban_contemporary,/wiki/Urban_contemporary_gospel,/wiki/Urban_music,/wiki/Vocal_music,/wiki/West_Coast_Rap,/wiki/West_Coast_hip_hop,/wiki/West_coast_hip_hop,/wiki/Western_music_(North_America),/wiki/Western_swing,/wiki/Witch_house,/wiki/World_music,/wiki/Worldbeat,/wiki/Worship_music,/wiki/Zydeco,NA,song_key
song_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
1970-1,1970,Simon & Garfunkel,1,Bridge over Troubled Water,/wiki/Bridge_over_Troubled_Water_(song),/wiki/Simon_%26_Garfunkel,False,[/wiki/Folk_rock],"1957–1970 (Reunions: 1975, 1981–83, 1990, 1993...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1970-1
1970-2,1970,The Carpenters,2,(They Long to Be) Close to You,/wiki/(They_Long_to_Be)_Close_to_You,/wiki/The_Carpenters,False,"[/wiki/Pop_music, /wiki/Soft_rock, /wiki/Adult...",1969–1983,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1970-2
1970-3,1970,The Guess Who,3,American Woman,/wiki/American_Woman_(song),/wiki/The_Guess_Who,False,"[/wiki/Pop_rock, /wiki/Psychedelic_rock, /wiki...",(As The Guess Who) 1965 ( 1965 ) -1975 1977 ( ...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1970-3
1970-4,1970,B.J. Thomas,4,Raindrops Keep Fallin' on My Head,/wiki/Raindrops_Keep_Fallin%27_on_My_Head,/wiki/B.J._Thomas,1942-08-07,"[/wiki/Country_music, /wiki/Pop_music, /wiki/C...",1966–present,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1970-4
1970-5,1970,Edwin Starr,5,War,/wiki/War_(Edwin_Starr_song),/wiki/Edwin_Starr,1942-01-21,"[/wiki/Soul_music, /wiki/Disco, /wiki/Rhythm_a...",1951–2003,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1970-5


In [23]:
# (rows, columns) of the wide frame, genre indicator columns included.
largedf.shape

(5073, 445)

In [24]:
# Build the two lookup dicts from the finished, sorted frame:
#   genredict:     genre -> list of song ids
#   songgenredict: song id -> genre list of the row
# Both are rebuilt from scratch here. Appending to dicts that were already
# populated would list every song id twice under each of its genres.
# NOTE(review): a song credited to several artists has one row per artist, so
# songgenredict keeps the genre list of the last such row.
genredict = {genre: [] for genre in genres}
songgenredict = {}
for _, row in largedf.iterrows():
    songid = str(row['year']) + '-' + str(int(row['ranking']))
    songgenredict[songid] = row['genres']
    for genre in row['genres']:
        genredict[genre].append(songid)
largedf.head()

Unnamed: 0_level_0,year,band_singer,ranking,song,songurl,url,born,genres,ya,/w/index.php?title=Country_Rap&action=edit&redlink=1,/wiki/2_Tone,/wiki/A_cappella,/wiki/Acid_house,/wiki/Acid_jazz,/wiki/Acid_rock,/wiki/Acoustic_music,/wiki/Acoustic_rock,/wiki/Adult_Contemporary,/wiki/Adult_Contemporary_Music,/wiki/Adult_contemporary,/wiki/Adult_contemporary_music,/wiki/Adult_contemporary_music#Soft_adult_contemporary,/wiki/Afrobeat,/wiki/Album-oriented_rock,/wiki/Alternative_R%26B,/wiki/Alternative_country,/wiki/Alternative_dance,/wiki/Alternative_dance#Indietronica,/wiki/Alternative_hip_hop,/wiki/Alternative_metal,/wiki/Alternative_pop,/wiki/Alternative_rock,/wiki/Ambient_house,/wiki/Ambient_music,/wiki/American_folk_music,/wiki/Americana_(music),/wiki/Anarcho-punk,/wiki/Anti-folk,/wiki/Arena_rock,/wiki/Art_pop,/wiki/Art_punk,/wiki/Art_rock,/wiki/Avant-garde_music,/wiki/Baggy,/wiki/Baroque_pop,/wiki/Bass_music,/wiki/Battle_rap,/wiki/Beat_music,/wiki/Bhangra_(music),/wiki/Big_band,...,/wiki/Southern_Rock,/wiki/Southern_gospel,/wiki/Southern_hip_hop,/wiki/Southern_rap,/wiki/Southern_rock,/wiki/Southern_soul,/wiki/Space_disco,/wiki/Space_rock,/wiki/Spoken_word,/wiki/Sunshine_pop,/wiki/Surf_music,/wiki/Surf_rock,/wiki/Swamp_pop,/wiki/Swamp_rock,/wiki/Swing_(genre),/wiki/Swing_music,/wiki/Symphonic_rock,/wiki/Synthpop,/wiki/Talking_blues,/wiki/Tech_house,/wiki/Techno,/wiki/Techno_music,/wiki/Teen_pop,/wiki/Tejano_music,/wiki/Thrash_metal,/wiki/Traditional_pop,/wiki/Traditional_pop_music,/wiki/Trance_music,/wiki/Trap_music,/wiki/Trip_hop,/wiki/UK_funky,/wiki/UK_garage,/wiki/Underground_hip_hop,/wiki/Urban_adult_contemporary,/wiki/Urban_contemporary,/wiki/Urban_contemporary_gospel,/wiki/Urban_music,/wiki/Vocal_music,/wiki/West_Coast_Rap,/wiki/West_Coast_hip_hop,/wiki/West_coast_hip_hop,/wiki/Western_music_(North_America),/wiki/Western_swing,/wiki/Witch_house,/wiki/World_music,/wiki/Worldbeat,/wiki/Worship_music,/wiki/Zydeco,NA,song_key
song_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
1970-1,1970,Simon & Garfunkel,1,Bridge over Troubled Water,/wiki/Bridge_over_Troubled_Water_(song),/wiki/Simon_%26_Garfunkel,False,[/wiki/Folk_rock],"1957–1970 (Reunions: 1975, 1981–83, 1990, 1993...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1970-1
1970-2,1970,The Carpenters,2,(They Long to Be) Close to You,/wiki/(They_Long_to_Be)_Close_to_You,/wiki/The_Carpenters,False,"[/wiki/Pop_music, /wiki/Soft_rock, /wiki/Adult...",1969–1983,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1970-2
1970-3,1970,The Guess Who,3,American Woman,/wiki/American_Woman_(song),/wiki/The_Guess_Who,False,"[/wiki/Pop_rock, /wiki/Psychedelic_rock, /wiki...",(As The Guess Who) 1965 ( 1965 ) -1975 1977 ( ...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1970-3
1970-4,1970,B.J. Thomas,4,Raindrops Keep Fallin' on My Head,/wiki/Raindrops_Keep_Fallin%27_on_My_Head,/wiki/B.J._Thomas,1942-08-07,"[/wiki/Country_music, /wiki/Pop_music, /wiki/C...",1966–present,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1970-4
1970-5,1970,Edwin Starr,5,War,/wiki/War_(Edwin_Starr_song),/wiki/Edwin_Starr,1942-01-21,"[/wiki/Soul_music, /wiki/Disco, /wiki/Rhythm_a...",1951–2003,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1970-5


### Now add the lyrics 
from the dataframe `use-this-master-lyricsdf-extracted.csv`

store the big, wide dataframe and some handy dictionaries to make it easy to link songs to genres and vice versa.


In [27]:
# Load the lyrics table (path is relative to this notebook's directory) and
# index it by song_key; the song_key column itself is kept as well.
dfly=pd.read_csv("../../data/conditioned/use-this-master-lyricsdf-extracted.csv")
dfly = dfly.set_index(dfly['song_key'])
dfly.head(2)


Unnamed: 0_level_0,index,position,year,title.href,title,artist,lyrics,decade,song_key,lyrics_url,lyrics_abstract
song_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1970-1,0,1,1970,https://en.wikipedia.org/wiki/Bridge_over_Trou...,Bridge over Troubled Water,Simon and Garfunkel,When you're weary. Feeling small. When tears a...,1970,1970-1,http://lyrics.wikia.com/Simon_And_Garfunkel:Br...,When you're weary. Feeling small. When tears a...
1970-2,1,2,1970,https://en.wikipedia.org/wiki/(They_Long_to_Be...,(They Long to Be) Close to You,The Carpenters,Why do birds suddenly appear. Everytime you ar...,1970,1970-2,http://lyrics.wikia.com/Carpenters:%28They_Lon...,Why do birds suddenly appear. Everytime you ar...


In [29]:
# Keep only the lyric-side columns needed here.
dfg = dfly[["song_key", "lyrics", "lyrics_url", "lyrics_abstract", "decade", "artist", "title"]]
# Join on song_key explicitly rather than on "whatever columns the two frames
# happen to share", so an added same-named column cannot silently change the
# join. The inner join keeps only songs present in both frames.
ydf = pd.merge(dfg, largedf, how="inner", on="song_key")
ydf.head(2)

Unnamed: 0,song_key,lyrics,lyrics_url,lyrics_abstract,decade,artist,title,year,band_singer,ranking,song,songurl,url,born,genres,ya,/w/index.php?title=Country_Rap&action=edit&redlink=1,/wiki/2_Tone,/wiki/A_cappella,/wiki/Acid_house,/wiki/Acid_jazz,/wiki/Acid_rock,/wiki/Acoustic_music,/wiki/Acoustic_rock,/wiki/Adult_Contemporary,/wiki/Adult_Contemporary_Music,/wiki/Adult_contemporary,/wiki/Adult_contemporary_music,/wiki/Adult_contemporary_music#Soft_adult_contemporary,/wiki/Afrobeat,/wiki/Album-oriented_rock,/wiki/Alternative_R%26B,/wiki/Alternative_country,/wiki/Alternative_dance,/wiki/Alternative_dance#Indietronica,/wiki/Alternative_hip_hop,/wiki/Alternative_metal,/wiki/Alternative_pop,/wiki/Alternative_rock,/wiki/Ambient_house,/wiki/Ambient_music,/wiki/American_folk_music,/wiki/Americana_(music),/wiki/Anarcho-punk,/wiki/Anti-folk,/wiki/Arena_rock,/wiki/Art_pop,/wiki/Art_punk,/wiki/Art_rock,/wiki/Avant-garde_music,...,/wiki/Soul_music,/wiki/Southern_Rock,/wiki/Southern_gospel,/wiki/Southern_hip_hop,/wiki/Southern_rap,/wiki/Southern_rock,/wiki/Southern_soul,/wiki/Space_disco,/wiki/Space_rock,/wiki/Spoken_word,/wiki/Sunshine_pop,/wiki/Surf_music,/wiki/Surf_rock,/wiki/Swamp_pop,/wiki/Swamp_rock,/wiki/Swing_(genre),/wiki/Swing_music,/wiki/Symphonic_rock,/wiki/Synthpop,/wiki/Talking_blues,/wiki/Tech_house,/wiki/Techno,/wiki/Techno_music,/wiki/Teen_pop,/wiki/Tejano_music,/wiki/Thrash_metal,/wiki/Traditional_pop,/wiki/Traditional_pop_music,/wiki/Trance_music,/wiki/Trap_music,/wiki/Trip_hop,/wiki/UK_funky,/wiki/UK_garage,/wiki/Underground_hip_hop,/wiki/Urban_adult_contemporary,/wiki/Urban_contemporary,/wiki/Urban_contemporary_gospel,/wiki/Urban_music,/wiki/Vocal_music,/wiki/West_Coast_Rap,/wiki/West_Coast_hip_hop,/wiki/West_coast_hip_hop,/wiki/Western_music_(North_America),/wiki/Western_swing,/wiki/Witch_house,/wiki/World_music,/wiki/Worldbeat,/wiki/Worship_music,/wiki/Zydeco,NA
0,1970-1,When you're weary. Feeling small. When tears a...,http://lyrics.wikia.com/Simon_And_Garfunkel:Br...,When you're weary. Feeling small. When tears a...,1970,Simon and Garfunkel,Bridge over Troubled Water,1970,Simon & Garfunkel,1,Bridge over Troubled Water,/wiki/Bridge_over_Troubled_Water_(song),/wiki/Simon_%26_Garfunkel,False,[/wiki/Folk_rock],"1957–1970 (Reunions: 1975, 1981–83, 1990, 1993...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,1970-2,Why do birds suddenly appear. Everytime you ar...,http://lyrics.wikia.com/Carpenters:%28They_Lon...,Why do birds suddenly appear. Everytime you ar...,1970,The Carpenters,(They Long to Be) Close to You,1970,The Carpenters,2,(They Long to Be) Close to You,/wiki/(They_Long_to_Be)_Close_to_You,/wiki/The_Carpenters,False,"[/wiki/Pop_music, /wiki/Soft_rock, /wiki/Adult...",1969–1983,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [30]:
# Write the genre -> song ids mapping to the current working directory.
with open("songsbygenre.json","w") as fd:
    json.dump(genredict, fd)

In [31]:
# Write the song id -> genre list mapping to the current working directory.
with open("genresbysong.json","w") as fd:
    json.dump(songgenredict, fd)

In [40]:
# Drop song '1971-52' and renumber the rows so the index has no gap.
# The earlier bare odf.reindex() call discarded its result, so it never
# changed odf; reset_index(drop=True) does the renumbering.
# NOTE(review): the reason for excluding '1971-52' is not recorded here.
odf = ydf[ydf.song_key != '1971-52'].reset_index(drop=True)
odf.head()


Unnamed: 0,song_key,lyrics,lyrics_url,lyrics_abstract,decade,artist,title,year,band_singer,ranking,song,songurl,url,born,genres,ya,/w/index.php?title=Country_Rap&action=edit&redlink=1,/wiki/2_Tone,/wiki/A_cappella,/wiki/Acid_house,/wiki/Acid_jazz,/wiki/Acid_rock,/wiki/Acoustic_music,/wiki/Acoustic_rock,/wiki/Adult_Contemporary,/wiki/Adult_Contemporary_Music,/wiki/Adult_contemporary,/wiki/Adult_contemporary_music,/wiki/Adult_contemporary_music#Soft_adult_contemporary,/wiki/Afrobeat,/wiki/Album-oriented_rock,/wiki/Alternative_R%26B,/wiki/Alternative_country,/wiki/Alternative_dance,/wiki/Alternative_dance#Indietronica,/wiki/Alternative_hip_hop,/wiki/Alternative_metal,/wiki/Alternative_pop,/wiki/Alternative_rock,/wiki/Ambient_house,/wiki/Ambient_music,/wiki/American_folk_music,/wiki/Americana_(music),/wiki/Anarcho-punk,/wiki/Anti-folk,/wiki/Arena_rock,/wiki/Art_pop,/wiki/Art_punk,/wiki/Art_rock,/wiki/Avant-garde_music,...,/wiki/Soul_music,/wiki/Southern_Rock,/wiki/Southern_gospel,/wiki/Southern_hip_hop,/wiki/Southern_rap,/wiki/Southern_rock,/wiki/Southern_soul,/wiki/Space_disco,/wiki/Space_rock,/wiki/Spoken_word,/wiki/Sunshine_pop,/wiki/Surf_music,/wiki/Surf_rock,/wiki/Swamp_pop,/wiki/Swamp_rock,/wiki/Swing_(genre),/wiki/Swing_music,/wiki/Symphonic_rock,/wiki/Synthpop,/wiki/Talking_blues,/wiki/Tech_house,/wiki/Techno,/wiki/Techno_music,/wiki/Teen_pop,/wiki/Tejano_music,/wiki/Thrash_metal,/wiki/Traditional_pop,/wiki/Traditional_pop_music,/wiki/Trance_music,/wiki/Trap_music,/wiki/Trip_hop,/wiki/UK_funky,/wiki/UK_garage,/wiki/Underground_hip_hop,/wiki/Urban_adult_contemporary,/wiki/Urban_contemporary,/wiki/Urban_contemporary_gospel,/wiki/Urban_music,/wiki/Vocal_music,/wiki/West_Coast_Rap,/wiki/West_Coast_hip_hop,/wiki/West_coast_hip_hop,/wiki/Western_music_(North_America),/wiki/Western_swing,/wiki/Witch_house,/wiki/World_music,/wiki/Worldbeat,/wiki/Worship_music,/wiki/Zydeco,NA
0,1970-1,When you're weary. Feeling small. When tears a...,http://lyrics.wikia.com/Simon_And_Garfunkel:Br...,When you're weary. Feeling small. When tears a...,1970,Simon and Garfunkel,Bridge over Troubled Water,1970,Simon & Garfunkel,1,Bridge over Troubled Water,/wiki/Bridge_over_Troubled_Water_(song),/wiki/Simon_%26_Garfunkel,False,[/wiki/Folk_rock],"1957–1970 (Reunions: 1975, 1981–83, 1990, 1993...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,1970-2,Why do birds suddenly appear. Everytime you ar...,http://lyrics.wikia.com/Carpenters:%28They_Lon...,Why do birds suddenly appear. Everytime you ar...,1970,The Carpenters,(They Long to Be) Close to You,1970,The Carpenters,2,(They Long to Be) Close to You,/wiki/(They_Long_to_Be)_Close_to_You,/wiki/The_Carpenters,False,"[/wiki/Pop_music, /wiki/Soft_rock, /wiki/Adult...",1969–1983,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,1970-3,"Mmm, da da da. Mmm, mmm, da da da. Mmm, mmm, d...",http://lyrics.wikia.com/The_Guess_Who:American...,"Mmm, da da da. Mmm, mmm, da da da. Mmm, mmm, d...",1970,The Guess Who,American Woman,1970,The Guess Who,3,American Woman,/wiki/American_Woman_(song),/wiki/The_Guess_Who,False,"[/wiki/Pop_rock, /wiki/Psychedelic_rock, /wiki...",(As The Guess Who) 1965 ( 1965 ) -1975 1977 ( ...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,1970-4,Raindrops are falling on my head. And just lik...,http://lyrics.wikia.com/B.J._Thomas:Raindrops_...,Raindrops are falling on my head. And just lik...,1970,B.J. Thomas,Raindrops Keep Fallin' on My Head,1970,B.J. Thomas,4,Raindrops Keep Fallin' on My Head,/wiki/Raindrops_Keep_Fallin%27_on_My_Head,/wiki/B.J._Thomas,1942-08-07,"[/wiki/Country_music, /wiki/Pop_music, /wiki/C...",1966–present,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,1970-5,"War, huh, yeah. What is it good for? Absolutel...",http://lyrics.wikia.com/Edwin_Starr:War,"War, huh, yeah. What is it good for? Absolutel...",1970,Edwin Starr,War,1970,Edwin Starr,5,War,/wiki/War_(Edwin_Starr_song),/wiki/Edwin_Starr,1942-01-21,"[/wiki/Soul_music, /wiki/Disco, /wiki/Rhythm_a...",1951–2003,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [41]:
# Write the filtered frame to the conditioned-data folder as UTF-8 CSV.
# The DataFrame index is written too (pandas' default), as an unnamed first column.
odf.to_csv(
    path_or_buf='../../data/conditioned/master_lyrics_with_all_years_genres.csv',
    encoding='utf-8',
)
