<a href="https://colab.research.google.com/github/lucacerab/emotion-detection/blob/main/lyrics_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
# Standard library
import os
import re

# Scraping songs/artists/genres
from bs4 import BeautifulSoup
import pandas as pd
import requests
from requests.exceptions import Timeout
from tqdm import tqdm

# Get lyrics
!pip install lyricsgenius
import lyricsgenius

Collecting lyricsgenius
[?25l  Downloading https://files.pythonhosted.org/packages/0d/32/be32f6922f70fd1b9900b50b228f6585cd60a96bdf03589df738f627d388/lyricsgenius-3.0.1-py3-none-any.whl (59kB)
[K     |█████▌                          | 10kB 12.9MB/s eta 0:00:01[K     |███████████                     | 20kB 18.1MB/s eta 0:00:01[K     |████████████████▌               | 30kB 16.1MB/s eta 0:00:01[K     |██████████████████████          | 40kB 10.5MB/s eta 0:00:01[K     |███████████████████████████▋    | 51kB 7.5MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.9MB/s 
Installing collected packages: lyricsgenius
Successfully installed lyricsgenius-3.0.1


# Scrape the Billboard Year-End Hot 100 singles of 2020 Wikipedia page to get songs, artists and genres
(https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2020)

In [None]:
# Fetch the chart page and build a DataFrame with one row per charting song
# (columns: rank, song, artist).
url = 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2020'

response = requests.get(url=url)
# Fail loudly on an HTTP error instead of parsing an error page and hitting an
# obscure AttributeError below when the table cannot be found.
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'html5lib')

# Flatten the chart table into its <td> cells. The stride-3 slicing below relies
# on every row contributing exactly three cells: rank, song title, artist(s).
table = soup.find('table', class_='wikitable sortable').find_all('td')
rows = [t.text for t in table]

# Get rank, song name and artist from the table
rank = rows[0::3]
songs = [title.strip("\"") for title in rows[1::3]]  # drop the surrounding quotes
artists_raw = table[2::3]
artists = []
for cell in artists_raw:
  # Only the first linked artist of the cell is kept; its link title is used as
  # the name. Fall back to the cell text when there is no titled link.
  first_link = cell.find("a")
  artist = first_link.get('title') if first_link is not None else None
  if artist is None:
    artist = cell.text
  # Remove parenthetical disambiguators such as "(rapper)", then strip the
  # whitespace they leave behind (otherwise "Future (rapper)" becomes "Future ").
  artist = re.sub(r"\([^()]*\)", "", artist).strip()
  artists.append(artist)

dic = {'rank': rank, 'song': songs, 'artist': artists}
df = pd.DataFrame(dic)

In [None]:
# Preview the first rows of the scraped chart (rank, song, artist).
df.head()

Unnamed: 0,rank,song,artist
0,1,Blinding Lights,The Weeknd
1,2,Circles,Post Malone
2,3,The Box,Roddy Ricch
3,4,Don't Start Now,Dua Lipa
4,5,Rockstar,DaBaby


In [None]:
# Get link to wiki page of each song from the table
base_url = 'https://en.wikipedia.org'
# The first three anchors of the table are skipped, as in the original slice
# (presumably they are not song links — TODO confirm against the page markup).
all_links = soup.find('table', class_='wikitable sortable').find_all('a')[3:]

# Take the titles from df rather than the global `songs` list: the lyrics cell
# rebinds `songs`, which would break this cell when it is re-run afterwards.
# A set also makes the per-anchor membership test constant-time.
song_titles = set(df['song'])

# Map song title -> absolute URL of its Wikipedia article.
song_links = {}
for anchor in all_links:
  href = anchor.get('href')
  # Skip anchors that are not song titles or that carry no target.
  if anchor.text in song_titles and href:
    song_links[anchor.text] = base_url + href

In [None]:
# Get the song's genre from its wiki page
# Result: song title -> title attribute of the first titled link in the
# infobox genre cell. Songs whose page has no such cell are left out.
song_genres = {}

for song, link in song_links.items():
  # Use names distinct from the chart page's `html`/`soup` so those globals keep
  # pointing at the chart page and earlier cells can be re-run safely.
  song_html = requests.get(url=link).content
  song_soup = BeautifulSoup(song_html, 'html5lib')

  genre_cell = song_soup.find('td', class_='infobox-data category hlist')
  if genre_cell is None:
    # No genre cell found on this page: skip the song.
    continue

  for anchor in genre_cell.find_all('a'):
    genre = anchor.get('title')
    if genre is not None:
      # Keep only the first listed genre and stop scanning the remaining links.
      song_genres[song] = genre
      break

In [None]:
# Display the scraped song -> genre mapping.
song_genres

{'10,000 Hours': 'Country pop',
 'Adore You': 'Pop music',
 'All I Want for Christmas Is You': 'Christmas music',
 'Bad Guy': 'Electropop',
 'Be Like That': 'Country music',
 'Before You Go': 'Soft pop',
 'Blinding Lights': 'Synthwave',
 'Bluebird': 'Country music',
 'Break My Heart': 'Dance-pop',
 "Chasin' You": 'Country music',
 'Circles': 'Pop rock',
 'Come & Go': 'Emo rap',
 'Dance Monkey': 'Electropop',
 'Death Bed': 'Lo-fi hip hop',
 'Die from a Broken Heart': 'Country music',
 'Dior': 'Drill music',
 'Does to Me': 'Country music',
 "Don't Start Now": 'Nu-disco',
 'Dynamite': 'Disco',
 "Even Though I'm Leaving": 'Country music',
 'Everything I Wanted': 'Pop music',
 'Falling': 'Contemporary R&B',
 'For the Night': 'Hip hop music',
 'Go Crazy': 'Contemporary R&B',
 'Godzilla': 'Hip hop music',
 'Good as Hell': 'Pop music',
 'Got What I Got': 'Country music',
 'Hard to Forget': 'Country music',
 'Heart on Ice': 'Trap music',
 'Heartless': 'Contemporary R&B',
 'High Fashion': 'West 

In [None]:
# Add a 'genre' column by looking up each song title in song_genres.
df['genre'] = df['song'].map(song_genres)

In [None]:
# Preview the first rows, now including the 'genre' column.
df.head()

Unnamed: 0,rank,song,artist,genre
0,1,Blinding Lights,The Weeknd,Synthwave
1,2,Circles,Post Malone,Pop rock
2,3,The Box,Roddy Ricch,Hip hop
3,4,Don't Start Now,Dua Lipa,Nu-disco
4,5,Rockstar,DaBaby,Hip hop music


# Get the lyrics of the songs through the Genius API

In [None]:
# Read the Genius API token from the GENIUS_ACCESS_TOKEN environment variable so
# the real key is never saved with the notebook. The placeholder is kept as a
# fallback; replace it or set the variable (get yours at https://genius.com/developers).
api = os.environ.get('GENIUS_ACCESS_TOKEN', 'xxxxxxxx')
# Client configured to skip non-songs, exclude "(Remix)"/"(Live)" results and
# strip section headers from the returned lyrics.
genius = lyricsgenius.Genius(api, skip_non_songs=True, excluded_terms=["(Remix)", "(Live)"], remove_section_headers=True)

In [None]:
# Look up each charting song on Genius and collect its lyrics.
# Result: song title -> lyrics text. Songs that cannot be fetched are left out.
song_lyrics = {}

for title, artist in zip(df['song'], df['artist']):
  try:
    # Use a dedicated name for the result: rebinding the global `songs` list
    # here would break the earlier cells that still rely on it.
    genius_song = genius.search_song(title, artist)
    song_lyrics[title] = genius_song.lyrics
  except (AttributeError, Timeout):
    # AttributeError: the search gave no usable result, so `.lyrics` is missing.
    # Timeout: the request timed out. `Timeout` is requests.exceptions.Timeout
    # and must be imported, otherwise this clause itself raises NameError.
    # Either way the song is skipped (best effort).
    continue

Searching for "Blinding Lights" by The Weeknd...
Done.
Searching for "Circles" by Post Malone...
Done.
Searching for "The Box" by Roddy Ricch...
Done.
Searching for "Don't Start Now" by Dua Lipa...
Done.
Searching for "Rockstar" by DaBaby...
Done.
Searching for "Adore You" by Harry Styles...
Done.
Searching for "Life Is Good" by Future ...
Done.
Searching for "Memories" by Maroon 5...
Done.
Searching for "The Bones" by Maren Morris...
Done.
Searching for "Someone You Loved" by Lewis Capaldi...
Done.
Searching for "Say So" by Doja Cat...
Done.
Searching for "I Hope" by Gabby Barrett...
Done.
Searching for "Whats Poppin" by Jack Harlow...
Done.
Searching for "Dance Monkey" by Tones and I...
Done.
Searching for "Savage" by Megan Thee Stallion...
Done.
Searching for "Roxanne" by Arizona Zervas...
Done.
Searching for "Intentions" by Justin Bieber...
Done.
Searching for "Everything I Wanted" by Billie Eilish...
Done.
Searching for "Roses (Imanbek Remix)" by Saint Jhn...
Specified song does n

In [None]:
# Add a 'lyrics' column by looking up each song title in song_lyrics.
df['lyrics'] = df['song'].map(song_lyrics)

In [None]:
# Preview the first rows, now including the 'lyrics' column.
df.head()

Unnamed: 0,rank,song,artist,genre,lyrics
0,1,Blinding Lights,The Weeknd,Synthwave,Yeah\n\nI've been tryna call\nI've been on my ...
1,2,Circles,Post Malone,Pop rock,"Oh, oh, oh\nOh, oh, oh\nOh, oh, oh, oh, oh\n..."
2,3,The Box,Roddy Ricch,Hip hop,Pullin' out the coupe at the lot\nTold 'em fuc...
3,4,Don't Start Now,Dua Lipa,Nu-disco,If you don't wanna see me\n\nDid a full one-ei...
4,5,Rockstar,DaBaby,Hip hop music,"Woo, woo\nI pull up like\nHow you pull up, Bab..."


In [None]:
# Save in a csv file
# index=False (the documented boolean form) leaves the DataFrame's row index
# out of the file; the previous index=None only worked because None is falsy.
df.to_csv('/content/lyrics.csv', index=False)