In [1]:
import numpy as np
import pandas as pd
import os
from bs4 import BeautifulSoup
import requests
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Collect the path of every yearly song file plus one year label per expected row.
directory = 'data'
all_data = []
years = []
# os.listdir() returns entries in arbitrary order, so sort before slicing; otherwise the
# [5:-2] window (and the year labels derived from it) could differ between runs or platforms.
# NOTE(review): the 5 / -2 offsets presumably skip non-song files in `data` — confirm against the folder.
for file in sorted(os.listdir(directory))[5:-2]:
    filename = file.split('.')[0]
    year = filename.split('-')[-1]
    # One label per row; this hard-codes 100 songs per yearly file.
    years.extend([year]*100)
    # Build the path directly; opening the file just to read its name back is unnecessary.
    all_data.append(os.path.join(directory, file))

all_data

['data\\songs-1970.csv',
 'data\\songs-1971.csv',
 'data\\songs-1972.csv',
 'data\\songs-1973.csv',
 'data\\songs-1974.csv',
 'data\\songs-1975.csv',
 'data\\songs-1976.csv',
 'data\\songs-1977.csv',
 'data\\songs-1978.csv',
 'data\\songs-1979.csv',
 'data\\songs-1980.csv',
 'data\\songs-1981.csv',
 'data\\songs-1982.csv',
 'data\\songs-1983.csv',
 'data\\songs-1984.csv',
 'data\\songs-1985.csv',
 'data\\songs-1986.csv',
 'data\\songs-1987.csv',
 'data\\songs-1988.csv',
 'data\\songs-1989.csv',
 'data\\songs-1990.csv',
 'data\\songs-1991.csv',
 'data\\songs-1992.csv',
 'data\\songs-1993.csv',
 'data\\songs-1994.csv',
 'data\\songs-1995.csv',
 'data\\songs-1996.csv',
 'data\\songs-1998.csv',
 'data\\songs-1999.csv',
 'data\\songs-2000.csv',
 'data\\songs-2001.csv',
 'data\\songs-2002.csv',
 'data\\songs-2003.csv',
 'data\\songs-2005.csv',
 'data\\songs-2006.csv',
 'data\\songs-2007.csv',
 'data\\songs-2008.csv',
 'data\\songs-2009.csv',
 'data\\songs-2010.csv',
 'data\\songs-2011.csv',


In [None]:
# Load every yearly file once so that each frame's row count is known.
frames = [pd.read_csv(path) for path in all_data]
df = pd.concat(frames, ignore_index=True)
df = df.drop(['Unnamed: 0'], axis=1)

# Add the column for the song's year of appearance.
# The label is parsed from each file's own name and repeated once per row of that file, so rows
# and years stay aligned even when a file does not contain exactly 100 songs (assigning a flat
# list built for 100 rows per file would fail with a length mismatch in that case).
row_years = []
for path, frame in zip(all_data, frames):
    label = os.path.basename(path).split('.')[0].split('-')[-1]
    row_years.extend([label] * len(frame))
df['year'] = row_years

df.columns

In [None]:
import string
import re
# Escaped ASCII punctuation, ready to be placed inside a regex character class.
chars = re.escape(string.punctuation) # !"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~

# Spotify credentials are read from the environment after load_dotenv() has run.
load_dotenv()
cid = os.environ.get('client_id')
secret = os.environ.get('client_secret')
client_credentials_manager = SpotifyClientCredentials(
    client_id=cid,
    client_secret=secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Every song starts with an empty genre entry; get_genre_by_artist fills it in.
df = df.assign(genres='')

def get_genre_by_artist(df):
    """Look up each distinct primary artist on Spotify and store the genres on that artist's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``primary_artist`` column and an existing ``genres`` column.
        The frame is modified in place.

    Returns
    -------
    None

    Notes
    -----
    Relies on the notebook-level Spotify client ``sp`` and punctuation pattern ``chars``.
    Artists whose lookup fails or returns no hit are reported with ``print`` and their rows
    are left untouched; rows without an artist are skipped.
    """
    # Only these names have their punctuation stripped before searching.
    names_needing_cleanup = ("T'Pau", "Des'ree", "K'NAAN", "Rag'n'Bone Man")

    # dropna(): a missing artist cannot be searched and would otherwise fail on the string ops below.
    for artist in df['primary_artist'].dropna().unique():
        query_name = str(artist)
        if query_name in names_needing_cleanup:
            query_name = re.sub(r'[' + chars + ']', '', query_name)
        query_name = query_name.replace('&', 'and')
        try:
            results = sp.search(q='artist:' + query_name, type='artist')
            hits = results['artists']['items']
            if not hits:
                print('Error at', artist, '(no Spotify match)')
                continue
            # The first search hit is taken as the artist.
            genres = hits[0]['genres']
            for idx in df.index[df['primary_artist'] == artist]:
                # Store a copy per row so rows do not share one mutable list.
                df.at[idx, 'genres'] = list(genres)
        except Exception as exc:
            # Best effort: report the failure (with its cause) and carry on with the next artist.
            # Catching Exception rather than everything lets Ctrl-C still interrupt the loop.
            print('Error at', artist, '-', repr(exc))
        
# Fill the `genres` column in place, with one Spotify artist search per distinct primary artist.
get_genre_by_artist(df)

In [None]:
from unidecode import unidecode # coding=utf-8
# Characters removed from artist and song names before they are turned into URL slugs.
# Compared with re.escape(string.punctuation), this class has no hyphen entry and adds the
# curly apostrophe.
chars = r"""!"\#\$%\&'\(\)\*\+,\\./:;<=>\?@\[\\\]\^_`\{\|\}\~’"""

# Title fragments that start a suffix (featured artists, part numbers, remaster tags, ...).
# get_lyrics_by_song keeps only the part of a title that precedes the first occurrence.
stop_words = [
    "(Pt.",
    "(Part",
    "(Live)",
    "(feat.",
    "[feat.",
    "(ft.",
    "(Theme",
    "(with",
    "(From",
    "(Anniversary",
    "(Edit",
    "(from",
    "(Featuring",
    "(Extended",
    " Pt.",
    "(Strike",
    "(Part",
    "(Remaster",
    "(1",
    "(20",
    "- ",
]

def _genius_lyrics_url(artist, song):
    """Build the genius.com lyrics URL guessed from an artist name and a song title."""
    # Modify some special artist names
    if artist == 'Yusuf / Cat Stevens':
        artist = 'Cat Stevens'
    elif artist == 'P!nk':
        artist = 'Pink'
    artist_text = artist.replace('&', 'and').replace(' /', '')
    artist_slug = unidecode(re.sub(r'[' + chars + ']', '', artist_text)).replace(' ', '-')

    title = song.replace('&', 'and')
    for separator in (' / ', '...'):
        title = title.replace(separator, ' ')
    for marker in stop_words:
        if marker in title:
            head = title.split(marker)[0]
            # Drop the single separator character in front of the marker, but never a letter or
            # digit: markers such as " Pt." already include their leading space, and slicing
            # unconditionally would turn "Song Pt. 1" into "Son".
            title = head if head[-1:].isalnum() else head[:-1]
    title_slug = unidecode(re.sub(r'[' + chars + ']', '', title)).replace(' ', '-')
    if title_slug[-1:] == '-':
        title_slug = title_slug[:-1]

    return 'https://genius.com/' + artist_slug + '-' + title_slug + '-' + 'lyrics'


def get_lyrics_by_song(df):
    """Scrape lyrics from genius.com for every row and store them in ``df['lyrics']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``primary_artist`` and ``title`` columns. The frame is modified in place;
        the ``lyrics`` column is created if it does not exist yet.

    Returns
    -------
    None

    Notes
    -----
    Relies on the notebook-level names ``chars`` and ``stop_words`` (the title-marker list),
    so it has to run before any cell that rebinds them.
    Rows for which the request fails or no lyrics container is found get ``None`` and are
    printed together with the URL that was tried.
    """
    # Create the column up front as object dtype, so the first value written (possibly None)
    # does not decide the column's dtype.
    if 'lyrics' not in df.columns:
        df['lyrics'] = None

    for index, row in df.iterrows():
        # Read from `row` (label-based) rather than df.iloc[index], which would only be correct
        # when the index labels happen to equal the row positions.
        url = _genius_lyrics_url(row['primary_artist'], row['title'])

        lyrics = None
        try:
            # A timeout keeps one stalled connection from hanging the whole scrape.
            page = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print('Request failed:', url, '||||', repr(exc))
        else:
            html = BeautifulSoup(page.text, 'html.parser')
            # NOTE(review): only the first element with this exact class string is read — confirm
            # that it still matches the site's markup and holds the complete lyrics.
            lyrics_container = html.find("div", class_="Lyrics__Container-sc-1ynbvzw-6 YYrds")
            if lyrics_container is not None:
                lyrics = lyrics_container.text

        if lyrics is None:
            print(row['primary_artist']+':', row['title'], '||||', url)

        df.at[index, 'lyrics'] = lyrics
        
# Scrape lyrics for every row; songs without a lyrics container are printed with the URL that was tried.
get_lyrics_by_song(df)

In [None]:
# Inspect the frame after the genre and lyrics columns have been filled in.
df

In [None]:
# Rows for which no lyrics are stored.
df[df['lyrics'].isnull()]

In [None]:
# Persist the scraped data. The path is handed to pandas directly: a text handle opened without
# newline='' gets its line endings translated a second time on Windows, which leaves blank
# lines between the rows of the written CSV.
df.to_csv('./data/merge_with_lyrics_and_genres.csv', header=True, index=False, encoding='utf-8')

In [None]:
def convert_to_list(genres_str):
    """Turn a stringified genre list (e.g. as read back from CSV) into a list of genre names.

    Parameters
    ----------
    genres_str : str | list | tuple | object
        Usually text such as ``"['pop', 'dance pop']"``. A list or tuple is accepted as is;
        any other non-string value (e.g. NaN for a missing cell) is treated as "no genres".

    Returns
    -------
    list of str
        The individual genre names; empty when there are none.
    """
    import ast  # local import so this cell does not depend on another cell's imports

    # Already a sequence (the frame was not round-tripped through CSV): nothing to parse.
    if isinstance(genres_str, (list, tuple)):
        return list(genres_str)
    # Missing values arrive as NaN/None rather than text.
    if not isinstance(genres_str, str):
        return []

    text = genres_str.strip()
    if not text:
        return []

    # The text is the repr of a Python list, so parse it as a literal. This copes with names
    # containing an apostrophe (repr'd with double quotes) and with the empty list "[]".
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item).strip() for item in parsed]

    # Not a list literal: fall back to the quote-based split.
    arr = []
    for val in genres_str.split("',"):
        val = val.replace("['", '').replace("']", '').replace("'", '').strip()
        arr.append(val)
    return arr

# Swap each stored genre value for its list form; None entries are left as they are.
parsed_by_position = {
    position: convert_to_list(raw_value)
    for position, raw_value in enumerate(df['genres'])
    if raw_value is not None
}
for position, parsed in parsed_by_position.items():
    df.at[position, 'genres'] = parsed

In [None]:
# Tally how often each space-separated word occurs across all genre labels.
genre_els = {}
for genre_labels in df['genres']:
    for label in genre_labels:
        for token in label.split(" "):
            genre_els[token] = genre_els.get(token, 0) + 1

# Look at elements that make up the genres with the most counts
# (the sort is ascending, so the most frequent words come last).
common_genre_els = sorted(genre_els.items(), key=lambda pair: pair[1])
common_genre_els_dict = dict(common_genre_els)

common_genre_els_dict

In [None]:
### We will re-categorize the genres into: pop, rock, hip hop, dance/electronic, r&b, reggae, rap, soul, etc.
# Categories that are matched by their own name as a suffix.
new_genre_list = ['rap', 'soul', 'country', 'funk', 'folk',
                  'metal', 'blues', 'jazz', 'contemporary']
# Suffixes that map a genre label onto each broad category.
pop_genre_list = ['pop', 'pop punk', 'show tunes', 'neo mellow', 'pop argentino']
rock_genre_list = ['rock', 'punk', 'rock-and-roll', 'mellow gold', 'melancholia', 'revival']
hiphop_genre_list = ['hip hop', 'trap']
dance_electronic_genre_list = ['dance','house', 'wave', 'disco', 'edm', 'latino', 'electronic',
                         'electronica', 'tropical', 'beach music', 'grime']
r_and_b_genre_list = ['r&b', 'quiet storm']
reggae_genre_list = ['reggae', 'fusion', 'reggaeton']

# One (category, suffixes) row per broad category, replacing six copy-pasted matching loops.
# NOTE(review): 'trap' also ends with 'rap', so trap labels are tagged both 'hip hop' and 'rap' —
# confirm this is intended.
genre_suffix_table = [
    ('pop', pop_genre_list),
    ('rock', rock_genre_list),
    ('hip hop', hiphop_genre_list),
    ('dance/electronic', dance_electronic_genre_list),
    ('r&b', r_and_b_genre_list),
    ('reggae', reggae_genre_list),
] + [(name, [name]) for name in new_genre_list]


def _recategorize(entries):
    """Map genre labels to a sorted list of broad categories (``['others']`` if none match)."""
    # A missing genre cell (e.g. NaN) is not iterable; treat it as having no labels.
    if not isinstance(entries, (list, tuple)):
        entries = []
    matched = set()
    for entry in entries:
        for category, suffixes in genre_suffix_table:
            # A label belongs to a category when it ends with one of that category's suffixes.
            if category not in matched and any(entry.endswith(suffix) for suffix in suffixes):
                matched.add(category)
    return sorted(matched) if matched else ['others']


for idx, row in enumerate(df['genres']):
    df.at[idx, 'genres'] = _recategorize(row)

In [None]:
# Songs whose genre list is exactly the catch-all ['others'].
df[df['genres'].apply(lambda c: c==['others'])]

In [None]:
from unidecode import unidecode # coding=utf-8
import string
import re
import nltk
from nltk.corpus import stopwords

# Escaped ASCII punctuation for use inside a regex character class (rebinds the earlier `chars`).
chars = re.escape(string.punctuation) # !"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~
# NOTE(review): this rebinds `stop_words`, which get_lyrics_by_song reads as its list of title
# markers — running that function again after this cell would use the NLTK list instead.
stop_words = stopwords.words('english')

def clean_lyrics(lyrics_str):
    """Remove punctuation from the lyrics and mark lowercase-to-uppercase joins with ". ".

    The text has every character matched by the notebook-level ``chars`` class removed and is
    then passed through ``unidecode``. Wherever a lowercase character is directly followed by
    an uppercase one, ". " is inserted between the two.

    Parameters
    ----------
    lyrics_str : str
        Raw lyrics text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = unidecode(re.sub(r'['+chars+']', '', lyrics_str))
    pieces = []
    # Walk over adjacent character pairs; the last character is appended after the loop.
    for current, following in zip(text, text[1:]):
        pieces.append(current)
        if following.isupper() and current.islower():
            pieces.append(". ")
    pieces.append(text[-1:])
    return ''.join(pieces)

In [None]:
# Strip bracketed / parenthesised spans from every lyric, then normalise the remaining text.
for idx, lyrics in enumerate(df['lyrics']):
    # Only real strings can be cleaned; a missing lyric may be None or NaN, and NaN would slip
    # past an `is not None` check.
    if not isinstance(lyrics, str):
        continue
    try:
        # Raw string: the same pattern as before, without relying on invalid escape sequences
        # (`\(`, `\[`) in a normal string literal.
        lyrics = re.sub(r'([\(\[]).*?([\)\]])', '', lyrics)
        df.at[idx, 'lyrics'] = clean_lyrics(lyrics)
    except Exception:
        # Best effort: report the row and continue. Ctrl-C still interrupts the loop.
        print('error @', idx, '||||', lyrics)

In [None]:
# Inspect the frame after the lyrics have been cleaned.
df

In [None]:
# Persist the final frame. The path is handed to pandas directly: a text handle opened without
# newline='' gets its line endings translated a second time on Windows, which leaves blank
# lines between the rows of the written CSV.
df.to_csv('./data/songs_df.csv', header=True, index=False, encoding='utf-8')