In [1]:
import requests
import html
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup as bs

import time
import os
import seaborn as sns
import pandas as pd
import scipy as sc
import numpy as np

import statsmodels.formula.api as sm

import matplotlib.pyplot as plt 
plt.style.use('ggplot')
%matplotlib inline  
plt.rcParams['figure.figsize'] = (10, 6) 

## Important Note:<br>
## If you want to build the DataFrame yourself and download the files needed for this task, start from here.<br> 
## If you want to run the code with the existing DataFrame and the already-downloaded files, please skip to Phase 2 - Learning Algorithm & Performance Evaluation

# Phase 1 - Data Acquisition & Data Cleaning & Data Vectorization

In [2]:
# Assemble the Wikipedia address of the Billboard Year-End Hot 100 page for every year of interest.
years = ["2016", "2017", "2018"]
billboardUrls = [
    'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_' + year
    for year in years
]
print(billboardUrls)

['https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2016', 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2017', 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2018']


In [3]:
#saving 3 billboard pages on the laptop
def save_html(url):
    """Download ``url`` and store the raw response bytes under ./data/top_100/.

    The file name is the part of ``url`` starting at 'Billboard' with '.html'
    appended; an existing file of that name is overwritten. The target folder
    is created when it does not exist yet.

    Raises:
        ValueError: if ``url`` does not contain 'Billboard'.
        urllib.error.URLError: (or a subclass) when the download fails.
    """
    target_dir = './data/top_100/'
    # Derive the file name first so a malformed URL fails before any download starts.
    file_name = url[url.index('Billboard'):] + '.html'

    # The with-block releases the connection once the body has been read.
    with urllib.request.urlopen(url) as response1:
        webContent = response1.read()

    # open() creates a missing file, but not missing parent folders.
    os.makedirs(target_dir, exist_ok=True)
    # The with-block guarantees the handle is flushed and closed.
    with open(target_dir + file_name, 'wb') as f:
        f.write(webContent)

# Fetch every year-end chart page listed in billboardUrls and store it on disk via save_html.
for url in billboardUrls:
    save_html(url)

In [4]:
# Build, for each chart year, the list of song titles and the parallel list of
# artist credits from the saved Billboard chart pages.
songs={"2016": [],"2017": [], "2018":[]}
artists={"2016": [],"2017": [], "2018":[]}

# (old, new) substitutions applied in order to every song title / artist credit.
TITLE_REPLACEMENTS = [('\n',''), (',',''), ('(',''), (')',''), ('é','e'), ('?',''), ('"',''), ('!','')]
ARTIST_REPLACEMENTS = [('é','e'), ('í','i'), ('\n',''), ('+',''), ('.','')]


def _replace_all(text, replacements):
    """Return ``text`` after applying each (old, new) pair of ``replacements`` in order."""
    for old, new in replacements:
        text = text.replace(old, new)
    return text


for i in range(6,9):
    year = "201{}".format(i)
    # The with-block closes the file as soon as the page has been parsed.
    with open("./data/top_100/Billboard_Year-End_Hot_100_singles_of_" + year + ".html", encoding='utf-8') as content:
        soup_html = bs(content, "html.parser")

    titles = soup_html.find_all('td')
    # The last five <td> cells are left out — presumably they are not chart rows (TODO confirm).
    chart_cells = titles[0:len(titles)-5]
    # Cells alternate: even positions are read as song titles, odd positions as artist credits.
    for title in chart_cells[0::2]:
        songs[year].append(_replace_all(title.text, TITLE_REPLACEMENTS))
    for title in chart_cells[1::2]:
        artists[year].append(_replace_all(title.text, ARTIST_REPLACEMENTS))


print(artists["2018"][80]) 
print(songs["2018"][80])

Nio Garcia, Darell and Casper Mágico featuring Bad Bunny, Nicky Jam and Ozuna
Te Bote


In [5]:
# One DataFrame per chart year ("2016_df", "2017_df", "2018_df"), each holding
# that year's song titles and artist credits.
songs_df = {}
for year in ("2016", "2017", "2018"):
    # Each frame is filled from the lists of its own year.
    songs_df[year + "_df"] = pd.DataFrame({'Title': songs[year], 'Artist': artists[year]})
songs_df["2016_df"].head()

Unnamed: 0,Title,Artist
0,Love Yourself,Justin Bieber
1,Sorry,Justin Bieber
2,One Dance,Drake featuring Wizkid and Kyla
3,Work,Rihanna featuring Drake
4,Stressed Out,Twenty One Pilots


In [6]:
# Per chart year, the Wikipedia article URL of every song on the saved chart page.
songs_urls_wiki={"2016": [],"2017": [], "2018": []}

for i in range(6,9):
    year = "201{}".format(i)
    # The with-block closes the file as soon as the page has been parsed.
    with open("./data/top_100/Billboard_Year-End_Hot_100_singles_of_" + year + ".html", encoding='utf-8') as content:
        soup_html = bs(content, "html.parser")

    titles = soup_html.find_all('td')
    # Same cell selection as for the song titles: every second <td>, last five cells left out.
    for title in titles[0:len(titles)-5:2]:
        # Read the href attribute of the first link in the cell. Slicing the tag's
        # text between 'href' and 'title' breaks when another attribute sits between
        # them or when the href itself contains 'title'.
        link = title.find('a', href=True)
        if link is None:
            raise ValueError("no article link in chart cell: {!r}".format(title.text))
        name = 'https://en.wikipedia.org' + link['href']
        songs_urls_wiki[year].append(name)

songs_urls_wiki

{'2016': ['https://en.wikipedia.org/wiki/Love_Yourself',
  'https://en.wikipedia.org/wiki/Sorry_(Justin_Bieber_song)',
  'https://en.wikipedia.org/wiki/One_Dance',
  'https://en.wikipedia.org/wiki/Work_(Rihanna_song)',
  'https://en.wikipedia.org/wiki/Stressed_Out',
  'https://en.wikipedia.org/wiki/Panda_(song)',
  'https://en.wikipedia.org/wiki/Hello_(Adele_song)',
  'https://en.wikipedia.org/wiki/Don%27t_Let_Me_Down_(The_Chainsmokers_song)',
  'https://en.wikipedia.org/wiki/Can%27t_Stop_the_Feeling!',
  'https://en.wikipedia.org/wiki/Closer_(The_Chainsmokers_song)',
  'https://en.wikipedia.org/wiki/Cheap_Thrills_(song)',
  'https://en.wikipedia.org/wiki/7_Years_(Lukas_Graham_song)',
  'https://en.wikipedia.org/wiki/Needed_Me',
  'https://en.wikipedia.org/wiki/My_House_(Flo_Rida_song)',
  'https://en.wikipedia.org/wiki/I_Took_a_Pill_in_Ibiza',
  'https://en.wikipedia.org/wiki/Work_from_Home',
  'https://en.wikipedia.org/wiki/This_Is_What_You_Came_For',
  'https://en.wikipedia.org/wiki

In [8]:
#saving 300 songs wikipedia pages of 3 years
def save_html2(url, k, name_of_song=None):
    """Download ``url`` and store it as ./data/300_songs_wiki/201<k>_<name_of_song>.html.

    Args:
        url: address of the page to download.
        k: last digit of the chart year (6, 7 or 8 in this notebook); the file
            name is prefixed with "201" + str(k).
        name_of_song: file-name stem. When omitted, the text after the last '/'
            of ``url`` is used, so the function no longer depends on a
            module-level variable being set before the call.

    Raises:
        urllib.error.URLError: (or a subclass) when the download fails.
    """
    if name_of_song is None:
        name_of_song = str(url).split('/')[-1]
    target_dir = './data/300_songs_wiki/'

    # The with-block releases the connection once the body has been read.
    with urllib.request.urlopen(url) as response1:
        webContent = response1.read()

    # open() creates a missing file, but not missing parent folders.
    os.makedirs(target_dir, exist_ok=True)
    # The with-block guarantees the handle is flushed and closed.
    with open(target_dir+"201"+str(k)+"_"+name_of_song+'.html', 'wb') as f:
        f.write(webContent)

    
# Walk the three chart years and download the Wikipedia page of every song.
for i in (6, 7, 8):
    year_key = "201" + str(i)
    for url in songs_urls_wiki[year_key]:
        # The last path segment of the URL is the file-name stem for this song.
        temp = str(url).split('/')
        name_of_song = temp[-1]
        save_html2(url, i)


Collecting each song's details (release date, genre, producer) from its Wikipedia page

In [142]:
#get release, genre and producer for each song, via wiki page.
wiki300_dir='./data/300_songs_wiki/'
wiki300_dir_fse = os.fsencode(wiki300_dir)


def _joined_cell_text(table, tag, css_class):
    """Return the text of every ``tag`` element in ``table`` matching ``css_class``, joined with ', '."""
    return ', '.join(x.text for x in table.find_all(tag, {"class": css_class}))


# ---------- all songs ---------------

# sorted() makes the processing order independent of the file system's listing order.
for filename in sorted(os.listdir(wiki300_dir)):
    # Only the saved song pages are parsed; other files in the folder are skipped.
    if not filename.endswith('.html'):
        continue

    # The with-block closes the file as soon as the page has been parsed.
    with open("{}{}".format(wiki300_dir, filename), encoding='utf-8') as content:
        soup_wiki_song = bs(content, "html.parser")

    print(filename)

    table = soup_wiki_song.find("table", {"class": "infobox vevent"})
    if table is None:
        # Without the infobox there is nothing to extract; report it and move on.
        print("no 'infobox vevent' table found")
        print("\n")
        continue

    # get release info
    text_rel = _joined_cell_text(table, 'span', "bday dtstart published updated")
    print (text_rel)

    # get genre
    text_genre = _joined_cell_text(table, 'td', "category hlist")
    print (text_genre)

    # get producer
    # NOTE(review): the single class name "hlist" also matches cells that carry further
    # classes (e.g. "category hlist"), so this text holds more than the producer names —
    # narrow the selection before using it as a feature.
    text_producer = _joined_cell_text(table, 'td', "hlist")
    print (text_producer)
    print("\n")

2016_2_Phones.html
2015-11-05
Hip hop
Digital download, Hip hop, BWAAtlantic, Kevin GilyardBrittany Hazzard


2016_679_(song).html
2015-06-29
Hip hop
Digital download, Hip hop, 
RGF
300
, 
Willie Maxwell
Remy Boyz
, 
Brian "Peoples" Garcia
Micah Street



2016_7_Years_(Lukas_Graham_song).html

Soul-pop
Soul-pop, CopenhagenWarner Bros., Lukas ForchammerStefan ForrestMorten RistorpMorten Pilegaard, Future AnimalsPilo


2016_Adventure_of_a_Lifetime.html


Pop[1]
disco[1][2][3]
funk[1]

Digital download, 
Pop[1]
disco[1][2][3]
funk[1]
, 
Parlophone
Atlantic
, 
Guy Berryman
Jonny Buckland
Will Champion
Mikkel S. Eriksen
Tor Erik Hermansen
Chris Martin
, 
Rik Simpson
Stargate



2016_All_In_My_Head_(Flex).html
2016-05-31
Reggae-trap
Digital download, Reggae-trap, EpicSyco, 
Various


Mikkel Eriksen
Tor Hermansen
Benjamin Levin
Willie Maxwell
Skye Sweetnam
Daystar Peterson
Nolan Lambroza
Julia Cavazos
Brian Garcia
Ewart Brown
Clifton Dillon
Richard Foulks
Camila Cabello
Lauren Jauregui
Norman

KeyboardInterrupt: 

In [None]:
#TO TAKE OUT THE GENRE OF THE SONG FROM WIKI


#content = open('/Users/alisamakarova/Documents/DS_Project/300_songs_wiki'+"2016"+"_"+songs["2016"][0].replace(" ","_")+'.html', 'wb')
# content=open('/Users/alisamakarova/Documents/DS_Project/300_songs_wiki/2016_Hotline_Bling.html', encoding='utf-8')
# txt=content.read()
# content.close()
# soup=bs(txt,'html.parser')
# print(soup)


In [None]:
import requests

#THE PART OF Q&A PROCESS:
# Scratch checks of how individual chart entries turn into a metrolyrics URL.
# Every case below overwrites `name` / `artist_str`; only the last case reaches
# the URL printed at the end of the cell.


#the case of & in songs:
name_of_song_for_lyrics=songs["2017"][77].replace("& ","")
name_of_song_for_lyrics=name_of_song_for_lyrics.replace(" ","-")

#the case of featuring:
name=songs["2017"][68]
name=name.replace(" ","-")
# `parts` rather than `list`: rebinding the builtin would break every later list(...) call.
parts=artists["2017"][68].split('featuring')
#we will always take the first artist
artist_str=parts[0]


#the case of and and ':
name=songs["2017"][26].replace("'","")
name=name.replace(" ","-")
# Split on the whole word only; a bare 'and' also cuts inside names such as "Grande".
parts=artists["2017"][26].split(' and ')
#we will always take the first artist
artist_str=parts[0]

#the case of & in artists:
name=songs["2017"][62]
name=name.replace(" ","-")
parts=artists["2017"][62].split('&')
#we will always take the first artist
artist_str=parts[0]


#the case of , and featuring in artists:
songs["2018"][44]='halsey' #(Him & I different name of the song)
name=songs["2018"][44]
name=name.replace(" ","-")
parts=artists["2018"][44].split(',')
#we will always take the first artist
artist_str=parts[0]
parts=artist_str.split(' and ')
artist_str=parts[0]
parts=artist_str.split('featuring')
artist_str=parts[0]


url='https://www.metrolyrics.com/'+name+'-lyrics-'+artist_str.replace('"','').replace(" ","-")+'.html'
#the case of "Maroon-5-.html"
url=url.replace("-.",".")                              
print(url)


In [None]:
#creating the dictionary of urls for each song's lyrics
lyrics_urls={"2016": [],"2017": [], "2018": []}

#handling with the specific cases 
songs["2018"][44]='halsey' 
songs["2018"][83]='18002738255' 
songs["2016"][37]=songs["2016"][37].replace("U","you")
songs["2016"][48]='holy'
artists["2018"][82]='nerd-the-neptunes'


def _first_artist(credit):
    """Return the lead artist of a chart credit: the text before the first ',', ' and ' or 'featuring'.

    ' and ' is matched as a whole word so that names containing the letters
    "and" (e.g. "Ariana Grande") are not cut in the middle.
    """
    lead = credit.split(',')[0]
    lead = lead.split(' and ')[0]
    return lead.split('featuring')[0]


for i in range(6,9):
    year = "201{}".format(i)
    for k in range(0,100):
        #the case of & in songs:
        name=songs[year][k].replace("& ","").replace("'","")
        name=name.replace(" ","-")

        #we will always take the first artist
        artist_str=_first_artist(artists[year][k])

        url='https://www.metrolyrics.com/'+name+'-lyrics-'+artist_str.replace('"','').replace(" ","-")+'.html'
        #the case of "Maroon-5-.html"
        url=url.replace("-.",".") 

        lyrics_urls[year].append(url)
   
print(len(lyrics_urls["2018"]))  

In [None]:
import time
time.sleep(3)

#the part of creation of the lyrics_texts dictionary which contain the lyrics of each song in top 2016,2017,2018
lyrics_texts={"2016": [],"2017": [], "2018": []}

for i in range(6,9):
    year = "201{}".format(i)
    for k in range(0,100):
        url = lyrics_urls[year][k]
        try:
            # Without a timeout a stalled connection would block the whole run indefinitely.
            response = requests.get(url, timeout=30)
        except requests.RequestException as error:
            # Keep one entry per song so positions stay aligned with lyrics_urls;
            # a failed download is stored like a page without verses (empty text).
            print("could not fetch {}: {}".format(url, error))
            temp = ""
        else:
            data = response.text
            soup = bs(data,'html.parser')
            # The lyrics are read from the <p class="verse"> elements of the page.
            verses = soup.find_all("p",{"class":"verse"})
            temp = "".join(str(p.text) for p in verses)
        lyrics_texts[year].append(temp)
       

In [None]:
# Spot check: print the current value of `temp` (the lyrics-download loop assigns it once per song).
print(temp)

# Phase 2 - Learning Algorithm & Performance Evaluation