In [51]:
import requests
import html
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup as bs

import time
import os
import seaborn as sns
import pandas as pd
import scipy as sc
import numpy as np

import statsmodels.formula.api as sm

import matplotlib.pyplot as plt 
plt.style.use('ggplot')
%matplotlib inline  
plt.rcParams['figure.figsize'] = (10, 6) 

## Important Note:<br>
## If you want to build the DataFrame by your own and download the relevant files to achive this task then begin from here.<br> 
## If you want to run the code when you already using existed DataFrame and downloaded files, Please skip to Phase 2 - Learning Algorithm & Performance Evaluation

# Phase 1 - Data Acquisition & Data Cleaning & Data Vectorization

In [52]:
#creating the list of Billboard urls
billboardUrls = []
years = ["2016","2017","2018"]
for i in years:
    url = 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_' + str(i)
    billboardUrls.append(url)   
print(billboardUrls)

['https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2016', 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2017', 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2018']


In [53]:
#saving 3 billboard pages on the laptop   
def save_html(url):
    response1 = urllib.request.urlopen(url)
    webContent = response1.read()
    
    f = open('./data/top_100/'+url[url.index('Billboard'):]+'.html', 'wb') #if there is no such file the function open creates it 
    f.write(webContent)
    f.close

for url in billboardUrls:
    save_html(url)

In [54]:
#creating lists of lists for each year
songs_wiki={"2016": [],"2017": [], "2018":[]}
artists_wiki={"2016": [],"2017": [], "2018":[]}

for i in range(6,9):
    content = open("./data/top_100/Billboard_Year-End_Hot_100_singles_of_201"+str(i)+".html", encoding='utf-8')
    soup_html = bs(content, "html.parser")

    titles=soup_html.findAll('td')
    for title in titles[0:len(titles)-5:2]:
        songs_wiki["201{}".format(i)].append(title.text.replace('\n','').replace(',','').replace('(','').replace(')','').replace('-','')
                                        .replace('é','e').replace('?','').replace('"','').replace("!",""))
    for title in titles[1:len(titles)-5:2]:    
        artists_wiki["201{}".format(i)].append(title.text.replace('é','e').replace('í','i').replace('\n','')
                                          .replace('+','').replace('.','').replace('ó','o'))
 

print(artists_wiki["2016"][87]) 
print(songs_wiki["2018"][83])

Silento
18002738255


In [55]:
#creating a dictionary of a df for each year for name of the song & name of artist
songs_df_wiki={"2016_df": pd.DataFrame(),"2017_df": pd.DataFrame(), "2018_df":pd.DataFrame()}
i=6
for key, value in songs_df_wiki.items():
    songs_df_wiki[key]['Title'] = songs_wiki["2016"]
    songs_df_wiki[key]['Artist'] = artists_wiki["2016"]
    songs_df_wiki[key]['Year']="201{}".format(i)
    songs_df_wiki[key]['Is_top100']=1
    i+=1
songs_df_wiki["2016_df"].head()

Unnamed: 0,Title,Artist,Year,Is_top100
0,Love Yourself,Justin Bieber,2016,1
1,Sorry,Justin Bieber,2016,1
2,One Dance,Drake featuring Wizkid and Kyla,2016,1
3,Work,Rihanna featuring Drake,2016,1
4,Stressed Out,Twenty One Pilots,2016,1


In [56]:
#creating one dataframe for all of the songs in top 2016,2017,2018
df_wiki=pd.concat([songs_df_wiki["2016_df"],songs_df_wiki["2017_df"],songs_df_wiki["2018_df"]],axis=0,sort=False, ignore_index=False)
mid = len(df_wiki)/2
df_wiki.iloc[np.r_[0:2,mid:mid+2, -2:0]]
# df_wiki.head()

Unnamed: 0,Title,Artist,Year,Is_top100
0,Love Yourself,Justin Bieber,2016,1
1,Sorry,Justin Bieber,2016,1
50,Into You,Ariana Grande,2017,1
51,Gold,Kiiara,2017,1
98,See You Again,Wiz Khalifa featuring Charlie Puth,2018,1
99,Perfect,One Direction,2018,1


In [57]:
#creating a dictionary of lists which contain url for each song of top 100 for each year in wikipedia
songs_urls_wiki={"2016": [],"2017": [], "2018": []}

for i in range(6,9):
    content = open("./data/top_100/Billboard_Year-End_Hot_100_singles_of_201"+str(i)+".html", encoding='utf-8')
    soup_html = bs(content, "html.parser")

    titles=soup_html.findAll('td')
    for title in titles[0:len(titles)-5:2]:
        name='https://en.wikipedia.org'+str(title)[str(title).index('href')+6:str(title).index('title')-2]
        songs_urls_wiki["201{}".format(i)].append(name)
        
songs_urls_wiki

{'2016': ['https://en.wikipedia.org/wiki/Love_Yourself',
  'https://en.wikipedia.org/wiki/Sorry_(Justin_Bieber_song)',
  'https://en.wikipedia.org/wiki/One_Dance',
  'https://en.wikipedia.org/wiki/Work_(Rihanna_song)',
  'https://en.wikipedia.org/wiki/Stressed_Out',
  'https://en.wikipedia.org/wiki/Panda_(song)',
  'https://en.wikipedia.org/wiki/Hello_(Adele_song)',
  'https://en.wikipedia.org/wiki/Don%27t_Let_Me_Down_(The_Chainsmokers_song)',
  'https://en.wikipedia.org/wiki/Can%27t_Stop_the_Feeling!',
  'https://en.wikipedia.org/wiki/Closer_(The_Chainsmokers_song)',
  'https://en.wikipedia.org/wiki/Cheap_Thrills_(song)',
  'https://en.wikipedia.org/wiki/7_Years_(Lukas_Graham_song)',
  'https://en.wikipedia.org/wiki/Needed_Me',
  'https://en.wikipedia.org/wiki/My_House_(Flo_Rida_song)',
  'https://en.wikipedia.org/wiki/I_Took_a_Pill_in_Ibiza',
  'https://en.wikipedia.org/wiki/Work_from_Home',
  'https://en.wikipedia.org/wiki/This_Is_What_You_Came_For',
  'https://en.wikipedia.org/wiki

In [58]:
#saving 300 songs wikipedia pages of 3 years
def save_html2(url,k):
    response1 = urllib.request.urlopen(url)
    webContent = response1.read()
    
    f = open('./data/300_songs_wiki/'+"201"+str(k)+"_"+name_of_song+'.html', 'wb') #if there is no such file the function open creates it 
    f.write(webContent)
    f.close

    
for i in range (6,9):    
    for url in songs_urls_wiki["201{}".format(i)]:
        temp=str(url).split('/')
        name_of_song=temp[len(temp)-1]
        save_html2(url,i)


In [59]:
#TO ADD HERE OREN's DATAFRAME AND MERGE WITH MY DF







In [60]:
#TO TAKE OUT THE GENRE OF THE SONG FROM WIKI
genres={"2016": [],"2017": [], "2018":[]}
producers={"2016": [],"2017": [], "2018":[]}

for i in range(6,9):
    for k in range(0,100):
        url=songs_urls_wiki["201{}".format(i)][k]
        temp=str(url).split('/')
        name_of_song=temp[len(temp)-1]
        content=open('./data/300_songs_wiki/'+"201"+str(i)+"_"+name_of_song+'.html', encoding='utf-8')
        txt=content.read()
        content.close()
        soup=bs(txt,'html.parser')
        
        table=soup.find("table",{"class":"infobox vevent"})
        t_genre = table.findAll('td', {"class": "category hlist"})
        list_of_inner_text = [x.text for x in t_genre]
        text_genre = ', '.join(list_of_inner_text)
        genres["201{}".format(i)].append(str(text_genre).replace("\n","").replace("[1]","").replace("[2]","").replace("[3]","").replace("[4]",""))
        
print(genres["2016"])           
 

['Acoustic pop', 'Dancehall-poptropical housemoombahton', 'Dancehallafrobeatspop', 'Dancehallreggae-popR&B', 'Rap rockalternative rock', 'Hip hoptrap', 'Soul', 'EDMtrap', 'Discopopsoul', 'Future basspop', 'Synthpopdancehall', 'Soul-pop', 'Electro-R&Btrap', 'Pop-rap', 'Folk pop (original version)Tropical house (Seeb remix)', 'PopR&B', 'EDM', 'Nu-discofunk', 'Hip hopR&B', 'Reggae fusionelectropophip hopalternative rock', 'Rap rock', 'Alternative R&Belectronic', 'Pop', 'PopR&B', '', 'PopR&B', 'Dance-pop', 'Pop', 'Dancehallsoca', 'Traphip hop', 'Poptropical house', 'Alternative R&Btrap', '', 'Hip hoptrap', 'R&Bhip hoptrap', 'PopR&Bretro-soul', 'Hip hoptrap', 'Pop', 'R&B', 'Electropopsynth-pop', 'Dancehall', 'Soul', 'Pop soul', 'Indie rocksoulblues rocksoft rock', 'Dance-popR&B', 'Danceelectropop', 'EDMtropical house', 'Hip hop', 'Country', 'Pop', 'EDMhouse', 'Electropoptrap', 'R&Btrap', 'Hip hop', 'R&Btrap', 'Dance-popsynth-pop', 'Hip hop', 'R&Belectropop', 'Hip hop', 'Future bassalternati

In [61]:
import requests

#THE PART OF Q&A PROCESS:


#the case of & in songs:
name_of_song_for_lyrics=songs["2017"][77].replace("& ","")
name_of_song_for_lyrics=name_of_song_for_lyrics.replace(" ","-")

#the case of featuring:
name=songs["2017"][68]
name=name.replace(" ","-")
list=artists["2017"][68].split('featuring')
#we will always take the first artist
artist_str=list[0]


#the case of and and ':
name=songs["2017"][26].replace("'","")
name=name.replace(" ","-")
list=artists["2017"][26].split('and')
#we will always take the first artist
artist_str=list[0]

#the case of & in artists:
name=songs["2017"][62]
name=name.replace(" ","-")
list=artists["2017"][62].split('&')
#we will always take the first artist
artist_str=list[0]


#the case of , and featuring in artists:
songs["2018"][44]='halsey' #(Him & I different name of the song)
name=songs["2018"][44]
name=name.replace(" ","-")
list=artists["2018"][44].split(',')
#we will always take the first artist
artist_str=list[0]
list=artist_str.split('and')
artist_str=list[0]
list=artist_str.split('featuring')
artist_str=list[0]


url='https://www.metrolyrics.com/'+name+'-lyrics-'+artist_str.replace('"','').replace(" ","-")+'.html'
#the case of "Maroon-5-.html"
url=url.replace("-.",".")                              
print(url)


NameError: name 'songs' is not defined

In [63]:
#creating the dictionary of urls for each song's lyrics
lyrics_urls={"2016": [],"2017": [], "2018": []}

#handling with the specific cases so we have more results that included in top100
songs_wiki["2018"][44]='halsey' 
songs_wiki["2016"][37]=songs_wiki["2016"][37].replace("U","you")
songs_wiki["2016"][48]='holy'
artists_wiki["2018"][82]='nerd-the-neptunes'

for i in range(6,9):
    for k in range(0,100):
        #the case of & in songs:
        name=songs_wiki["201{}".format(i)][k].replace("& ","").replace("'","")
        name=name.replace(" ","-")

        list=artists_wiki["201{}".format(i)][k].split(',')
        #we will always take the first artist
        artist_str=list[0]
        list=artist_str.split(' and ')
        artist_str=list[0]
        list=artist_str.split('featuring')
        artist_str=list[0]

        url='https://www.metrolyrics.com/'+name+'-lyrics-'+artist_str.replace('"','').replace(" ","-")+'.html'
        #the case of "Maroon-5-.html"
        url=url.replace("-.",".") 

        lyrics_urls["201{}".format(i)].append(url)
   
print(lyrics_urls["2018"])  

['https://www.metrolyrics.com/Gods-Plan-lyrics-Drake.html', 'https://www.metrolyrics.com/Perfect-lyrics-Ed-Sheeran.html', 'https://www.metrolyrics.com/Meant-to-Be-lyrics-Bebe-Rexha.html', 'https://www.metrolyrics.com/Havana-lyrics-Camila-Cabello.html', 'https://www.metrolyrics.com/Rockstar-lyrics-Post-Malone.html', 'https://www.metrolyrics.com/Psycho-lyrics-Post-Malone.html', 'https://www.metrolyrics.com/I-Like-It-lyrics-Cardi-B.html', 'https://www.metrolyrics.com/The-Middle-lyrics-Zedd.html', 'https://www.metrolyrics.com/In-My-Feelings-lyrics-Drake.html', 'https://www.metrolyrics.com/Girls-Like-You-lyrics-Maroon-5.html', 'https://www.metrolyrics.com/Nice-for-What-lyrics-Drake.html', 'https://www.metrolyrics.com/Lucid-Dreams-lyrics-Juice-Wrld.html', 'https://www.metrolyrics.com/Better-Now-lyrics-Post-Malone.html', 'https://www.metrolyrics.com/Finesse-lyrics-Bruno-Mars.html', 'https://www.metrolyrics.com/Bood-Up-lyrics-Ella-Mai.html', 'https://www.metrolyrics.com/New-Rules-lyrics-Dua-Li

In [69]:
import time
time.sleep(3)

#the part of creation of the lyrics_texts dictionary which contain the lyrics of each song in top 2016,2017,2018
lyrics_texts={"2016": [],"2017": [], "2018": []}

for i in range(6,9):
    for k in range(0,100):
        url = lyrics_urls["201{}".format(i)][k]
        response = requests.get(url)
        data = response.text
        soup = bs(data,'html.parser')
        couplets = soup.findAll("p",{"class":"verse"})
        temp = ""
        for p in couplets:
            temp = str(temp) + str(p.text)
            temp=temp+"\n"
        #cleaning the text from unnecessary punctuation    
        temp=temp.replace(",","").replace("!","").replace("...","").replace('(','').replace(')','').replace('?','')
         #for the further work with text we would like to lowercase all of the words in text   
        lyrics_texts["201{}".format(i)].append(temp.lower())
        

In [70]:
print(lyrics_texts["2016"][87])

you already know who it is
silento
silento
gonna do it for you
now watch me whip
now watch me nae nae
now watch me whip whip
watch me nae nae
now watch me whip kill it
watch me nae nae okay
now watch me whip whip
watch me nae nae can you do it
now watch me
ooh watch me watch me
ooh watch me watch me
ooh watch me watch me
ooh ooh ooh ooh
ooh watch me watch me
ooh watch me watch me
ooh watch me watch me
ooh ooh ooh ooh
do the stanky leg do the stanky leg
do the stanky leg do the stanky leg
do the stanky leg do the stanky leg
do the stanky leg do the stanky leg
now break your legs
break your legs
tell 'em "break your legs"
break your legs
now break your legs
break your legs
now break your legs
break your legs
now watch me
bop bop bop bop bop bop bop bop bop
now watch me
bop bop bop bop bop bop bop bop bop
now watch me whip kill it
now watch me nae nae okay
now watch me whip whip
watch me nae nae want me do it
now watch me whip kill it
watch me nae nae okay
now watch me whip whip
watch me 

In [None]:
#we'd like to now how many words appear in our lyrics

words_count={"2016": [],"2017": [], "2018": []}

for i in range(6,9):
    for k in range(0,100):
        words_count["201{}".format(i)].append(len(lyrics_texts["201{}".format(i)][k]))
        
print(words_count)        

In [None]:
res_list = [] 
for i in range(0, 100) : 
    if words_count["2016"][i] == 0 : 
        res_list.append(i)
print(res_list)        

In [None]:
print(lyrics_urls["2016"][87]) 

In [None]:
import nltk

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
import re
#here we would like to count how many positive words can we see in each song lyrics




count_pos={"2016": [0]*100,"2017": [0]*100, "2018": [0]*100}

keywords_pos=['love','like','baby','happy','dream','dreaming','kiss',]

# ps = PorterStemmer()
# words=word_tokenize(keywords_pos[0])

# for word in words:
#     print(word + ":"+ ps.stem(word))


# for i in range(6,9):
#     for k in range(0,100):
#         temp=[]
#         for word in keywords_pos:
#             #here we create a temp list to store all the words from keywords_pos which appear in lyrics 
#             temp+=re.findall(word, lyrics_texts["201{}".format(i)][k])
#         #now we'd want to know how many times these words appeared in text
#         count_pos["201{}".format(i)][k]=len(temp)
# print(count_pos)            


temp=[]

for word in keywords_pos:
    temp+=re.findall(word, lyrics_texts["2016"][0])
print(temp)

In [None]:
#here we would like to count how many negative words can we see in each song lyrics

count_neg={"2016": [0]*100,"2017": [0]*100, "2018": [0]*100}

keywords_neg=['hate','crying','dying','bitch','die','lie','lying','mess','lied','fuck']


for i in range(6,9):
    for k in range(0,100):
        temp=[]
        for word in keywords_neg:
            #here we create a temp list to store all the words from keywords_neg which appear in lyrics 
            temp+=re.findall(word, lyrics_texts["201{}".format(i)][k])
        #now we'd want to know how many times these words appeared in text
        count_neg["201{}".format(i)][k]=len(temp)
print(count_neg)  

In [None]:
#here we would like to count how many repeating words can we see in each song lyrics

# Phase 2 - Learning Algoorithm & Performance Evaluation