In [1]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
filename = 'data/merge_with_lyrics_and_genres.csv'
with open (filename, 'r') as f:     
    df = pd.read_csv(filename)
    df = df.drop(columns=['id', 'all_artists'])

pd.set_option('display.max_colwidth', 100)

In [3]:
def convert_to_list(genres_str):
    arr = []
    for val in genres_str.split("',"):
        val = val.replace("['", '').replace("']", '').replace("'", '').strip()
        arr.append(val)
    return arr

# Add list-formatted genres back to the column
for idx, genre in enumerate(df['genres']):
    if genre is not None:
        df.at[idx, 'genres'] = convert_to_list(genre)  

In [4]:
genre_els = dict()
for genre in df['genres']:
    for entry in genre:
        words = entry.split(" ")
        for word in words:
            if word in genre_els:
                genre_els[word] += 1
            else:
                genre_els[word] = 1

# Look at elements that make up the genres with the most counts
common_genre_els = sorted(genre_els.items(), key=lambda item: item[1])
common_genre_els_dict = {k: v for k, v in common_genre_els}

common_genre_els_dict

{'birmingham': 1,
 'stoner': 1,
 'doom': 1,
 'anti-folk': 1,
 'weird': 1,
 'america': 1,
 'opera': 1,
 'jamaican': 1,
 'rare': 1,
 'freakbeat': 1,
 'harmony': 1,
 'jamgrass': 1,
 'of': 1,
 'gibraltar': 1,
 'louisiana': 1,
 'mexico': 1,
 'jawaiian': 1,
 'pacific': 1,
 'islands': 1,
 'sleep': 1,
 'metalcore': 1,
 'truck-driving': 1,
 'theme': 1,
 'laboratorio': 1,
 'fremantle': 1,
 'perth': 1,
 'tierra': 1,
 'caliente': 1,
 'boogie': 1,
 'mod': 1,
 'dansktop': 1,
 'no': 1,
 'dusseldorf': 1,
 'proto-techno': 1,
 'surf': 1,
 'bossa': 1,
 'nova': 1,
 'brazilian': 1,
 'samba': 1,
 'gothic': 1,
 'post-bop': 1,
 'austropop': 1,
 'musical': 1,
 'advocacy': 1,
 'neo-progressive': 1,
 'operatic': 1,
 'popping': 1,
 'flamenco': 1,
 'rumba': 1,
 'guyanese': 1,
 'bow': 1,
 'zouk': 1,
 'sacramento': 1,
 'bedroom': 1,
 'end': 1,
 'background': 1,
 'funana': 1,
 'frankfurt': 1,
 'corrido': 1,
 'cyberpunk': 1,
 'afropop': 1,
 'mande': 1,
 'mbalax': 1,
 'anime': 1,
 'vocaloid': 1,
 'oxford': 1,
 'oi': 1,

In [5]:
### We will re-categorize the genres into: pop, rock, hip hop, dance/electronic, r&b, reggae, rap, soul, etc.
new_genre_list = ['rap', 'soul', 'country', 'funk', 'folk', 
                  'metal', 'blues', 'jazz', 'contemporary']
pop_genre_list = ['pop', 'pop punk', 'show tunes', 'neo mellow', 'pop argentino']
rock_genre_list = ['rock', 'punk', 'rock-and-roll', 'mellow gold', 'melancholia', 'revival']
hiphop_genre_list = ['hip hop', 'trap']
dance_electronic_genre_list = ['dance','house', 'wave', 'disco', 'edm', 'latino', 'electronic', 
                         'electronica', 'tropical', 'beach music', 'grime']
r_and_b_genre_list = ['r&b', 'quiet storm']
reggae_genre_list = ['reggae', 'fusion', 'reggaeton']

for idx, row in enumerate(df['genres']):
    new_genre = []
    for entry in row:
        if 'pop' not in new_genre:
            for pop_genre in pop_genre_list:
                if entry[-len(pop_genre):] == pop_genre:
                    new_genre.append('pop') 
                    
        if 'rock' not in new_genre:
            for rock_genre in rock_genre_list:
                if entry[-len(rock_genre):] == rock_genre:
                    new_genre.append('rock') 
                    
        if 'hip hop' not in new_genre:
            for hiphop_genre in hiphop_genre_list:
                if entry[-len(hiphop_genre):] == hiphop_genre:
                    new_genre.append('hip hop') 
                        
        if 'dance/electronic' not in new_genre:      
            for dance_electronic_genre in dance_electronic_genre_list:
                if entry[-len(dance_electronic_genre):] == dance_electronic_genre:
                    new_genre.append('dance/electronic') 
                    
        if 'r&b' not in new_genre:      
            for r_and_b_genre in r_and_b_genre_list:
                if entry[-len(r_and_b_genre):] == r_and_b_genre:
                    new_genre.append('r&b') 
                    
        if 'reggae' not in new_genre:
            for reggae_genre in reggae_genre_list:
                if entry[-len(reggae_genre):] == reggae_genre:
                    new_genre.append('reggae') 
                    
        for genre in new_genre_list:
            if entry[-len(genre):] == genre:
                if genre not in new_genre:
                    new_genre.append(genre) 
        
    if (new_genre == []):
        df.at[idx, 'genres'] = ['others']
    else:
        df.at[idx, 'genres'] = sorted(new_genre)

In [6]:
pd.set_option('display.max_rows', None)
df[df['genres'].apply(lambda c: c==['others'])]

Unnamed: 0,title,primary_artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_s,year,genres,lyrics
46,Ride Captain Ride,Blues Image,0.547,0.655,2,-11.707,1,0.0414,0.136,0.00737,0.116,0.649,125.545,4,226,1970,[others],"[Intro]Seventy-three men sailed up from the San Francisco BayRolled off of their ship, and here'..."
48,Vehicle,The Ides Of March,0.474,0.734,10,-8.813,0,0.0426,0.0871,0.00356,0.26,0.926,101.751,4,178,1970,[others],"{Intro}[ Verse 1]Hey, well, I'm the friendly stranger in the black sedanWoncha hop inside my car..."
50,Yellow River,Christie,0.627,0.544,4,-11.436,1,0.029,0.0859,0.0,0.152,0.877,129.851,4,166,1970,[others],"{Intro}So long, boy, you can take my placeGot my papers, I got my paySo pack my bags and I'll be..."
61,Candida,The Dawn,0.783,0.596,7,-10.056,1,0.0676,0.235,0.0,0.0427,0.88,127.576,4,182,1970,[others],The stars won't come outIf they know that you're about'Cause they couldn't match the glow of you...
65,Montego Bay,Bobby Bloom,0.765,0.586,7,-10.332,1,0.0367,0.734,0.00218,0.113,0.943,105.567,4,176,1970,[others],Vernon'll meet me when the BOAC landsKeys to the MG will be in his handsAdjust to the drivin' an...
70,I Don't Believe In If Anymore,Roger Whittaker,0.325,0.397,5,-12.492,1,0.0323,0.16,0.000829,0.129,0.349,143.387,4,196,1970,[others],"Now if you load your rifle rightAnd if you fix your bayonet soAnd if you kill that man, my frien..."
80,Travelling Band,Creedence Clearwater Revival Experience,0.532,0.784,0,-7.067,0,0.0402,0.21,7e-06,0.466,0.968,156.976,4,133,1970,[others],
81,To Be Young Gifted and Black,Bob & Marcia,0.646,0.454,0,-10.821,1,0.0308,0.418,0.000532,0.353,0.922,117.125,4,189,1970,[others],"To be young, gifted and blackOh what a lovely precious dreamTo be young, gifted and blackOpen yo..."
131,Rose Garden,Lynn Anderson,0.589,0.501,0,-11.665,1,0.031,0.168,0.0,0.0741,0.969,131.064,4,176,1971,[others],"[Chorus]I beg your pardon, I never promised you a rose gardenAlong with the sunshine there's got..."
167,Do You Know What I Mean,Lee Michaels,0.788,0.861,0,-9.745,1,0.0623,0.158,1.3e-05,0.0199,0.95,109.232,4,194,1971,[others],{Intro}Been forty days since I don't know whenI just saw her with my best friendDo you know what...


In [7]:
from unidecode import unidecode # coding=utf-8
import string
import re
import nltk
from nltk.corpus import stopwords

chars = re.escape(string.punctuation) # !"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~
# chars = """!"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~"""
stop_words = stopwords.words('english')

def clean_lyrics(lyrics_str):
    lyrics_str = unidecode(re.sub(r'['+chars+']', '', lyrics_str))
    lyrics_list = list(lyrics_str)
    for idx in range(1, len(lyrics_list)):
        if lyrics_list[idx].isupper():
            if lyrics_list[idx-1].islower():
                lyrics_list[idx-1] += ". "
    return ''.join(lyrics_list)

# lyrics = df['lyrics'][4798]
# lyrics = re.sub('([\(\[]).*?([\)\]])', '', lyrics)
# test_lyrics = clean_lyrics(lyrics)
# test_lyrics

In [8]:
for idx, lyrics in enumerate(df['lyrics']):
    try:
        if lyrics is not None:
            lyrics = re.sub('([\(\[]).*?([\)\]])', '', lyrics)
            df.at[idx, 'lyrics'] = clean_lyrics(lyrics)  
    except:
        print('error @', idx, '||||', lyrics)

error @ 80 |||| nan
error @ 95 |||| nan
error @ 113 |||| nan
error @ 130 |||| nan
error @ 135 |||| nan
error @ 149 |||| nan
error @ 152 |||| nan
error @ 174 |||| nan
error @ 179 |||| nan
error @ 236 |||| nan
error @ 247 |||| nan
error @ 252 |||| nan
error @ 273 |||| nan
error @ 275 |||| nan
error @ 276 |||| nan
error @ 279 |||| nan
error @ 283 |||| nan
error @ 322 |||| nan
error @ 341 |||| nan
error @ 349 |||| nan
error @ 350 |||| nan
error @ 378 |||| nan
error @ 413 |||| nan
error @ 418 |||| nan
error @ 445 |||| nan
error @ 460 |||| nan
error @ 477 |||| nan
error @ 482 |||| nan
error @ 544 |||| nan
error @ 578 |||| nan
error @ 600 |||| nan
error @ 606 |||| nan
error @ 609 |||| nan
error @ 636 |||| nan
error @ 681 |||| nan
error @ 746 |||| nan
error @ 770 |||| nan
error @ 795 |||| nan
error @ 878 |||| nan
error @ 909 |||| nan
error @ 913 |||| nan
error @ 960 |||| nan
error @ 980 |||| nan
error @ 991 |||| nan
error @ 992 |||| nan
error @ 1071 |||| nan
error @ 1074 |||| nan
error @ 1090 

In [None]:
df

In [9]:
with open('./data/songs_final.csv', 'w', encoding='utf-8') as f:
        df.to_csv(f, header= True, index=False)