In [3]:
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." contructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [70]:
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests

In [7]:
end_year=requests.get("http://www.billboard.com/charts/year-end/2014/hot-100-songs").text

In [49]:
from datetime import date, timedelta

def allsats(year, fs):
    days=[]
    d = date(year, 1, fs)       
    while d.year == year:
        days.append(d.strftime("%Y-%m-%d"))
        d += timedelta(days = 7)
    return days

In [50]:
def allsats(syear, years, fs):
    days=[]
    d = date(syear, 1, fs)
    while d.year in years:
        days.append(d.strftime("%Y-%m-%d"))
        d += timedelta(days = 7)
    return days

In [53]:
years = [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014]
sats = allsats(2000, years, 1)

In [60]:
satdict={}
satdict['end_year']=end_year
for sat in sats:
    satdict[sat]=requests.get("http://www.billboard.com/charts/hot-100/"+sat).text
    time.sleep(1)

In [21]:
import json

In [62]:
with open("tempdata/satdict.json","w") as fd:
    json.dump(satdict, fd)
del satdict
with open("tempdata/satdict.json") as fd:
    satdict=json.load(fd)

In [63]:
def rank_scrape(articlerows):
    songdicts = []
    songdict = {}
    ranking = 1
    for article in articlerows:
        songdict["ranking"] = ranking
        title = article.find("div", attrs={"class": "row-title"}).find("h2").get_text()
        artist = article.find("div", attrs={"class": "row-title"}).find("h3").get_text()
        title = title.strip()
        sep1 = 'Featuring'
        sep2 = ','
        sep3 = '&'
        artist = artist.split(sep1, 1)[0]
        artist = artist.split(sep2, 2)[0]
        artist = artist.split(sep3, 3)[0]
        artist = artist.strip()
        songdict["title"] = title
        songdict["artist"] = artist
        songdicts.append(dict(songdict))
        ranking += 1
    return songdicts

def get_weekly(weekstring, indict):
    raw_chart = indict[weekstring]
    text = BeautifulSoup(raw_chart, "html.parser")
    rows = text.find_all("article", attrs={"class": "chart-row"})
    
    testdicts = rank_scrape(rows)
    for test in testdicts:
        test['date'] = weekstring
    return testdicts

In [64]:
weekinfo=[]
for k in sats:
    weekinfo.append(get_weekly(k, satdict))

In [66]:
datum = []
for week in weekinfo:
    datum.append(pd.DataFrame(week))
data2014 = pd.concat(datum, ignore_index=True)

In [95]:
newdata2014 = data2014.drop_duplicates(subset=['artist', 'title'])

In [96]:
newdata2014.to_csv('tempdata/rawdata.csv', encoding='utf-8')

In [44]:
newdata2014 = pd.read_csv('tempdata/rawdata.csv')

In [103]:
lisa2014 = newdata2014[0:2888]
steven2014 = newdata2014[2888:]
lisa2014.to_csv('tempdata/lisa2014.csv')
steven2014.to_csv('tempdata/steven2014.csv')

Unnamed: 0,artist,date,ranking,title
0,Santana,2000-01-01,1,Smooth
1,Brian McKnight,2000-01-01,2,Back At One
2,Jessica Simpson,2000-01-01,3,I Wanna Love You Forever
3,Whitney Houston,2000-01-01,4,My Love Is Your Love
4,Savage Garden,2000-01-01,5,I Knew I Loved You


In [153]:
import urllib2
#all_songs = []
#featurings = []
for index, row in steven2014[780:].iterrows():
    song = urllib2.quote(row["title"].encode("utf-8"))
    artist = urllib2.quote(row["artist"].encode("utf-8"))
    link = requests.get('http://developer.echonest.com/api/v4/song/search?api_key=GT3HIFAMRWWCOWIAZ' + 
                        '&bucket=artist_discovery&bucket=artist_discovery_rank&bucket=artist_familiarity' +
                        '&bucket=artist_familiarity_rank&bucket=artist_hotttnesss&bucket=artist_hotttnesss_rank' +
                        '&bucket=artist_location&bucket=song_currency&bucket=song_currency_rank' +
                        '&bucket=song_discovery&bucket=song_discovery_rank&bucket=song_hotttnesss'+
                        '&bucket=song_hotttnesss_rank&bucket=song_type&bucket=audio_summary' +
                        '&results=1&title=%(song)s&artist=%(artist)s' % \
                        {"song": song, "artist": artist})
    result_json = json.loads(link.text)
    if not result_json["response"]["songs"]:
        featurings.append(index)
    all_songs.append(result_json)
    time.sleep(4)

In [155]:
len(featurings)

176

In [156]:
fd=open("tempdata/steven_songs.json","w")
json.dump(all_songs, fd)
fd.close()

In [22]:
with open("tempdata/steven_songs.json") as json_file:
    test_songs = json.load(json_file)

In [354]:
link = requests.get('http://developer.echonest.com/api/v4/song/search?api_key=GT3HIFAMRWWCOWIAZ' + 
                        '&bucket=artist_discovery&bucket=artist_discovery_rank&bucket=artist_familiarity' +
                        '&bucket=artist_familiarity_rank&bucket=artist_hotttnesss&bucket=artist_hotttnesss_rank' +
                        '&bucket=artist_location&bucket=song_currency&bucket=song_currency_rank' +
                        '&bucket=song_discovery&bucket=song_discovery_rank&bucket=song_hotttnesss'+
                        '&bucket=song_hotttnesss_rank&bucket=song_type&bucket=audio_summary' +
                        '&results=1&title=%(song)s&artist=%(artist)s' % \
                        {"song": "come join", "artist": 'white buffalo'})
blah = json.loads(link.text)
blah

{u'response': {u'songs': [{u'artist_discovery': 0.4672030233883395,
    u'artist_discovery_rank': 17629,
    u'artist_familiarity': 0.523893,
    u'artist_familiarity_rank': 8074,
    u'artist_hotttnesss': 0.581075,
    u'artist_hotttnesss_rank': 3538,
    u'artist_id': u'ARRU4V71187B9993CA',
    u'artist_location': {u'latitude': 33.973951,
     u'location': u'Los Angeles, CA',
     u'longitude': -118.248405},
    u'artist_name': u'White Buffalo',
    u'audio_summary': {u'acousticness': 0.669595,
     u'analysis_url': u'http://echonest-analysis.s3.amazonaws.com/TR/vKxQVgUajJWafjbOT9XUzyZ1gBzCbDY7IZWR9vjgYOQRCdl1BPzie7qmZ_esE9ckf7RWt-I5PvTj-w4mA%3D/3/full.json?AWSAccessKeyId=AKIAJRDFEY23UEVW42BQ&Expires=1449727069&Signature=8UKGRRvgv2CjOYricHitnJnSeRM%3D',
     u'audio_md5': u'',
     u'danceability': 0.529344,
     u'duration': 448.09333,
     u'energy': 0.445162,
     u'instrumentalness': 0.031966,
     u'key': 5,
     u'liveness': 0.105663,
     u'loudness': -7.156,
     u'mode': 1,


In [358]:
newdata2014['artist'][2931] = "Usher"
newdata2014['title'][3020] = "Bartender Song"
newdata2014['artist'][3067] = "Kid Rock"
newdata2014['artist'][3105] = "Kid Rock"
newdata2014['title'][3249] = "Move"
newdata2014['artist'][3209] = "Kenny Chesney"
newdata2014['artist'][3131] = "Kenny Chesney"
newdata2014['artist'][3142] = "Brad Paisley"
newdata2014['title'][3028] = "Out Here Grindin'"
newdata2014['title'][3072] = "Somethin' Special"
newdata2014['artist'][3075] = "Estelle"
newdata2014['artist'][3273] = "Soulja Boy"
newdata2014['artist'][3229] = "Soulja Boy"
newdata2014['artist'][3302] = "Soulja Boy"
newdata2014['artist'][3895] = "Soulja Boy"
newdata2014['artist'][3278] = "A.R. Rahman"
newdata2014['title'][3278] = "Jai Ho"
newdata2014['title'][3357] = "3 a.m."
newdata2014['artist'][3265] = "Coldplay"
newdata2014['title'][3230] = "Jizz in My Pants"
newdata2014['title'][3268] = "Fuck You"
newdata2014['artist'][3373] = "Fast Life Yungstaz"
newdata2014['title'][3411] = "Wetter"
newdata2014['artist'][3436] = "Keyshia Cole"
newdata2014['title'][3439] = "Fallin' For You"
newdata2014['artist'][3443] = "Demi Lovato"
newdata2014['artist'][3454] = "OneRepublic"
newdata2014['title'][3491] = "Gettin' You Home"
newdata2014['artist'][3492] = "Kenny Chesney"
newdata2014['artist'][3517] = "Jay-Z"
newdata2014['title'][3524] = "I'm Goin In"
newdata2014['artist'][3539] = "P!nk"
newdata2014['artist'][3569] = "Thirty Seconds to Mars"
newdata2014['artist'][3687] = "Jay-Z"
newdata2014['title'][3695] = "Stranded"
newdata2014['title'][3702] = "My City of Ruins"
newdata2014['artist'][3707] = "P!nk"
newdata2014['title'][3712] = "Fuck Today"
newdata2014['artist'][3746] = "Three 6 Mafia"
newdata2014['artist'][3875] = "V. V. Brown"
newdata2014['title'][3891] = "9AM in Dallas"
newdata2014['artist'][3897] = "Dirty Heads"
newdata2014['title'][3949] = "2012"
newdata2014['title'][3977] = "Forget You"
newdata2014['title'][4084] = "The Best Thing About Me is You"
newdata2014['artist'][4087] = "Jason Aldean"
newdata2014['artist'][4109] = "Michael Jackson"
newdata2014['title'][4234] = "Did It On Em"
newdata2014['artist'][4327] = "Enrique Iglesias"
newdata2014['artist'][4377] = "Brad Paisley"
newdata2014['title'][4403] = "The Man Who"
newdata2014['artist'][4449] = "Jay-Z"
newdata2014['artist'][4458] = "Jay-Z"
newdata2014['artist'][4462] = "Jay-Z"
newdata2014['artist'][4475] = "New Boyz"
newdata2014['artist'][4505] = "Chris Brown"
newdata2014['title'][4562] = "Stronger"
newdata2014['title'][4597] = "HYFR"
newdata2014['artist'][4652] = "Jay-Z"
newdata2014['title'][4679] = "4AM"
newdata2014['title'][4744] = "I Don't Really Care"
newdata2014['artist'][4845] = "Big Time Rush"
newdata2014['artist'][4863] = "Pusha T"
newdata2014['artist'][4928] = "Lovers' Eyes"
newdata2014['artist'][4901] = "Kanye West"
newdata2014['artist'][4970] = "Swedish House Mafia"
newdata2014['artist'][4972] = "Jason Aldean"
newdata2014['title'][4976] = "A Thousand Years"
newdata2014['artist'][4977] = "Christina Aguilera"
newdata2014['title'][5029] = "We Still In This Bitch"
newdata2014['artist'][5080] = "Tim McGraw"
newdata2014['title'][5092] = "Rich as Fuck"
newdata2014['artist'][5129] = "T.I./B.O.B./Kendrick Lamar"
newdata2014['title'][5139] = "Getting Over You"
newdata2014['title'][5161] = "Grandpa"
newdata2014['artist'][5228] = "Rich Gang"
newdata2014['artist'][5268] = "August Alsina"
newdata2014['title'][5268] = "I Luv this Shit"
newdata2014['artist'][5471] = "Fitz & The Tantrums"
newdata2014['artist'][5273] = "Zedd"
newdata2014['artist'][5277] = "Keith Urban"
newdata2014['title'][5339] = "Let Me Be Lonely"
newdata2014['artist'][5368] = "August Alsina"
newdata2014['title'][5368] = "I Luv this Shit"
newdata2014['title'][5506] = "We are One"
newdata2014['artist'][5507] = "K Camp"
newdata2014['title'][5532] = "Hold on"
newdata2014['artist'][5556] = "Miranda Lambert"
newdata2014['title'][5560] = "Help Falling In Love"
newdata2014['artist'][5588] = "X Ambassadors"
newdata2014['artist'][5573] = "Florida Georgia Line"
newdata2014['title'][5679] = "I Don't Fuck with You"
newdata2014['title'][5715] = "GDFR"
newdata2014['title'][5736] = "Shut Up and Dance"
newdata2014['artist'][5746] = "Nicki Minaj"
newdata2014['artist'][5762]= "Lilly Wood"
newdata2014['artist'][5771] = "White Buffalo"
newdata2014['title'][3695] = "Stranded"

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.

In [361]:
newdata2014['artist'][5771]

'White Buffalo'

In [364]:
featurings = []
for song in range(len(test_songs)):
    if not test_songs[song]["response"]["songs"]:
        featurings.append(song)
new_featurings = [x+2888 for x in featurings]
print test_songs[featurings[0]]
juliefixessteven2014 = newdata2014.ix[new_featurings]
juliefixessteven2014.head()

{u'response': {u'status': {u'message': u'Success', u'code': 0, u'version': u'4.2'}, u'songs': []}}


Unnamed: 0.1,Unnamed: 0,artist,date,ranking,title
2913,43426,Carrie Underwood,2008-04-26,27,Praying For Time
2914,43442,American Idol Top 8,2008-04-26,43,Shout To The Lord
2916,43479,Annie Lennox,2008-04-26,80,Many Rivers To Cross
2931,43678,Usher,2008-05-10,79,"Love In This Club, Part II"
2936,43695,Ashlee Simpson With Tom Higgenson,2008-05-10,96,Little Miss Obsessive


In [379]:
import urllib2
#juliefixes = []
#julie_leftovers = []
for index, row in juliefixessteven2014[164:].iterrows():
    song = urllib2.quote(row["title"].encode("utf-8"))
    artist = urllib2.quote(row["artist"].encode("utf-8"))
    link = requests.get('http://developer.echonest.com/api/v4/song/search?api_key=GT3HIFAMRWWCOWIAZ' + 
                        '&bucket=artist_discovery&bucket=artist_discovery_rank&bucket=artist_familiarity' +
                        '&bucket=artist_familiarity_rank&bucket=artist_hotttnesss&bucket=artist_hotttnesss_rank' +
                        '&bucket=artist_location&bucket=song_currency&bucket=song_currency_rank' +
                        '&bucket=song_discovery&bucket=song_discovery_rank&bucket=song_hotttnesss'+
                        '&bucket=song_hotttnesss_rank&bucket=song_type&bucket=audio_summary' +
                        '&results=1&title=%(song)s&artist=%(artist)s' % \
                        {"song": song, "artist": artist})
    result_json = json.loads(link.text)
    print result_json
    if not result_json["response"]["songs"]:
        julie_leftovers.append(index)
    juliefixes.append(result_json)
    time.sleep(4.5)

{u'response': {u'status': {u'code': 0, u'message': u'Success', u'version': u'4.2'}, u'songs': []}}
{u'response': {u'status': {u'code': 0, u'message': u'Success', u'version': u'4.2'}, u'songs': []}}
{u'response': {u'status': {u'code': 0, u'message': u'Success', u'version': u'4.2'}, u'songs': []}}
{u'response': {u'status': {u'code': 0, u'message': u'Success', u'version': u'4.2'}, u'songs': []}}
{u'response': {u'status': {u'code': 0, u'message': u'Success', u'version': u'4.2'}, u'songs': [{u'song_hotttnesss': 0.741112, u'song_discovery': 0.0, u'song_currency_rank': 4737, u'artist_discovery_rank': 8550, u'title': u'GDFR', u'artist_discovery': 0.4809879873557384, u'song_hotttnesss_rank': 41, u'artist_name': u'Flo Rida', u'song_type': [u'studio', u'electric', u'vocal'], u'artist_hotttnesss': 0.831246, u'artist_familiarity_rank': 578, u'audio_summary': {u'time_signature': 4, u'analysis_url': u'http://echonest-analysis.s3.amazonaws.com/TR/HLIM0g82Qf182SojQi-FxDEcomqoD3tskahG-MiERWxzL3IMJPqfYXX

In [384]:
juliefixessteven2014.to_csv('tempdata/juliefixessteven2014.csv', encoding='utf-8')
fd=open("tempdata/julie_fixes.json","w")
json.dump(juliefixes, fd)
fd.close()

In [508]:
lisa_featurings = pd.read_csv('tempdata/lisa_featurings.csv')
lisa_featurings[180:]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,artist,date,ranking,title
180,2165,33091,Angels And Airwaves,2006-04-29,92,The Adventure
181,2204,33642,Chris Daughtry,2006-06-10,43,Wanted Dead Or Alive
182,2211,33697,Busta Rhymes,2006-06-10,98,I Love My B****
183,2214,33785,Mary J. Blige And U2,2006-06-17,86,One
184,2234,34072,Janet Duet With Nelly,2006-07-08,73,Call On Me
185,2265,34485,30 Seconds To Mars,2006-08-05,86,The Kill (Bury Me)
186,2282,34705,Sean Paul Feat. Keyshia Cole,2006-08-26,6,(When You Gonna) Give It Up To Me
187,2324,35269,Aly,2006-09-30,70,Chemicals React
188,2353,35680,Lady Sovereign,2006-10-28,81,Love Me Or Hate Me (F**k You!!!!)
189,2358,35777,Tenacious D,2006-11-04,78,The Pick Of Destiny


In [517]:
link = requests.get('http://developer.echonest.com/api/v4/song/search?api_key=GT3HIFAMRWWCOWIAZ' + 
                        '&bucket=artist_discovery&bucket=artist_discovery_rank&bucket=artist_familiarity' +
                        '&bucket=artist_familiarity_rank&bucket=artist_hotttnesss&bucket=artist_hotttnesss_rank' +
                        '&bucket=artist_location&bucket=song_currency&bucket=song_currency_rank' +
                        '&bucket=song_discovery&bucket=song_discovery_rank&bucket=song_hotttnesss'+
                        '&bucket=song_hotttnesss_rank&bucket=song_type&bucket=audio_summary' +
                        '&results=1&title=%(song)s&artist=%(artist)s' % \
                        {"song": "Crank That (Soulja Boy)", "artist": "Soulja Boy"})
blah = json.loads(link.text)
blah

{u'response': {u'songs': [{u'artist_discovery': 0.3428039160603859,
    u'artist_discovery_rank': 261579,
    u'artist_familiarity': 0.728608,
    u'artist_familiarity_rank': 463,
    u'artist_hotttnesss': 0.609224,
    u'artist_hotttnesss_rank': 2374,
    u'artist_id': u'ARXHGWB1187FB557F5',
    u'artist_location': {u'latitude': 41.8842,
     u'location': u'Chicago, IL, US',
     u'longitude': -87.6324},
    u'artist_name': u'Soulja Boy',
    u'audio_summary': {u'acousticness': 0.545036,
     u'analysis_url': u'http://echonest-analysis.s3.amazonaws.com/TR/vJVXVgp-SlH7i5W8isc3_eVX0rFSNgZqlB64sZA7p0WeZihRQn8AP43Ovi4ZlNryrGIhefEd6RcKOdRLg%3D/3/full.json?AWSAccessKeyId=AKIAJRDFEY23UEVW42BQ&Expires=1449742682&Signature=KP80ykzMqSBl9a77L4vYJ/U6CdE%3D',
     u'audio_md5': u'',
     u'danceability': 0.688805,
     u'duration': 197.78667,
     u'energy': 0.686087,
     u'instrumentalness': 0.0,
     u'key': 0,
     u'liveness': 0.905182,
     u'loudness': -4.795,
     u'mode': 1,
     u'speech

In [26]:
weekinfo=[]
for k in sats:
    weekinfo.append(get_weekly(k, satdict))
yearend=get_weekly('end_year', satdict)
weektuples=zip(sats, weekinfo)

In [16]:
def get_for_title(weektups):
    titles={}
    weekindex=1
    for weekdate, weeksrankings in weektups:
        for rankingtuple in weeksrankings:#iterate over the week's top-100
            ranking = rankingtuple[0]
            weekdict=rankingtuple[1]
            if not titles.has_key(weekdict['title']):
                titles[weekdict['title']]=[]
            titles[weekdict['title']].append((weekindex, ranking))
        weekindex=weekindex+1
    return titles

In [17]:
titles = get_for_title(weektuples)

In [None]:
tdict={}
for title in titles.keys():
    wtlist=titles[title]
    weeks=[e[0] for e in wtlist]
    ranks=[e[1] for e in wtlist]
    rankmean=np.mean(ranks)
    rankstd=np.std(ranks, ddof=1)#numpy standard deviation is population based, make it sample based
    ranklen=len(ranks)
    tdict[title.lower()]={'ranks':(rankmean, rankstd, ranklen)}