In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from py_lex import EmoLex
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter

In [3]:
# Load the tab-separated Spotify lyrics table and show its (rows, columns) shape.
lyrics_csv_path = './spotify_lyrics.csv'
data = pd.read_csv(lyrics_csv_path, sep="\t")
data.shape

(4346, 23)

In [4]:
# Remove the columns this notebook does not use.
# Reassigning (instead of `inplace=True`) together with `errors="ignore"` keeps the cell
# idempotent: re-running it after the columns are already gone no longer raises a KeyError.
unused_columns = ["album", "track_id", "duration_s", "track_popularity", "key"]
data = data.drop(columns=unused_columns, errors="ignore")

In [7]:
# Preview the first five rows (five is the default for `head`).
data.head()

Unnamed: 0,genre,artist,track_name,danceability,energy,loudness,mode,speechiness,instrumentalness,valence,tempo,time_signature,acousticness,album_release_year,track_name_clean,lyric_clean,ignore_track,char_count
0,Pop,Michael Jackson,Billie Jean,0.92,0.654,-3.051,0,0.0401,0.0158,0.847,117.046,4,0.0236,1982.0,Billie Jean,She was more like a beauty queen from a movie ...,0.0,2620.0
1,Pop,Michael Jackson,Smooth Criminal - 2012 Remaster,0.853,0.981,-3.947,1,0.0751,0.468,0.595,118.193,4,0.247,1987.0,Smooth Criminal,Aaow!\nCha!\nShoo-cha-choo-cha!\n\nAs he came ...,0.0,2981.0
2,Pop,Michael Jackson,Beat It,0.779,0.867,-3.704,0,0.0457,8e-06,0.915,138.858,4,0.0491,1982.0,Beat It,"They told him, ""Don't you ever come around her...",0.0,2250.0
3,Pop,Michael Jackson,The Way You Make Me Feel - 2012 Remaster,0.877,0.854,-4.523,1,0.147,5.5e-05,0.54,114.472,4,0.0544,1987.0,The Way You Make Me Feel,Hee-hee!\nOoh!\nG'on girl!\nAaow!\nHee!\n\nHey...,0.0,2885.0
4,Pop,Michael Jackson,Thriller,0.764,0.887,-3.726,1,0.0744,0.00011,0.72,118.427,4,0.0816,2017.0,Thriller,It's close to midnight\nSomething evil's lurki...,0.0,2733.0


In [8]:
# Build the emotion lexicon used by the summarize/annotate cells below.
lexicon_path = "lexicon_english.txt"
lexicon = EmoLex(lexicon_path)

In [9]:
# Build the stop-word list: NLTK's English stop words plus punctuation tokens,
# a couple of digits and single letters.
#
# This cell rebinds the name `stopwords` (imported above as the NLTK corpus reader) to a
# plain list, because the tokenization cell reads `stopwords` as a list. The corpus is
# therefore re-imported here under an alias so that re-running this cell still works;
# previously a second run failed with "'list' object has no attribute 'words'".
from nltk.corpus import stopwords as nltk_stopwords

additional_stopwords = [")", "(", "{", "}", "''", "1", "2", "I", "[", "]", ",", ":", "*", "-", "_", ";", "``", "/", 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
stopwords = nltk_stopwords.words("english") + additional_stopwords

In [17]:
%%time
def tokenize_lyric(lyric, stop_tokens):
    """Lower-case and tokenize one lyric, dropping stop tokens.

    Parameters
    ----------
    lyric : object
        Value of the ``lyric_clean`` column for one song.
    stop_tokens : set of str
        Tokens to discard after tokenization.

    Returns
    -------
    list of str or float
        The remaining tokens, or ``np.nan`` when ``lyric`` is not a string
        (so songs without lyrics stay marked as missing).
    """
    if not isinstance(lyric, str):
        return np.nan
    return [word for word in word_tokenize(lyric.lower()) if word not in stop_tokens]


# Set membership is O(1), which matters when filtering every token of every song.
stopset = set(stopwords)

# Only non-string lyrics are skipped. Any other failure (for example a missing NLTK
# tokenizer resource) now surfaces instead of silently leaving the whole column empty,
# which is what the previous bare `except: continue` did.
data['tokens'] = data['lyric_clean'].map(lambda lyric: tokenize_lyric(lyric, stopset))

Wall time: 35.5 s


In [18]:
# Sanity check: summarize the emotions of the song stored under index label 0.
first_song_tokens = data.loc[0, 'tokens']
summary = lexicon.summarize_doc(first_song_tokens)
summary

{'fear': 0.0,
 'joy': 0.0992063492063492,
 'positive': 0.11904761904761904,
 'trust': 0.06746031746031746,
 'anticipation': 0.051587301587301584,
 'surprise': 0.015873015873015872,
 'sadness': 0.007936507936507936,
 'negative': 0.015873015873015872,
 'anger': 0.011904761904761904,
 'disgust': 0.007936507936507936}

In [19]:
# Create one float column per emotion, initialised to 0.0.
# The tuple order fixes the order in which the columns are appended.
for emotion_column in (
    'anger',
    'anticipation',
    'disgust',
    'fear',
    'joy',
    'negative',
    'positive',
    'sadness',
    'surprise',
    'trust',
):
    data[emotion_column] = 0.0

In [20]:
# Create one integer counter column per emotion, initialised to 0.
# The tuple order fixes the order in which the columns are appended.
for quantity_column in (
    'anger_quantity',
    'anticipation_quantity',
    'disgust_quantity',
    'fear_quantity',
    'joy_quantity',
    'negative_quantity',
    'positive_quantity',
    'sadness_quantity',
    'surprise_quantity',
    'trust_quantity',
):
    data[quantity_column] = 0

In [21]:
%%time
# Fill the per-emotion score columns with the lexicon summary of each song.
#
# Songs whose `tokens` entry is missing (not a list) or empty are skipped explicitly and
# keep their initial 0.0 scores. This replaces a bare `except: continue`, which hid every
# kind of error, not just the missing-lyrics case.
# Iterating over the `tokens` column directly avoids building a full row object per song.
for index, tokens in data['tokens'].items():
    if not isinstance(tokens, list) or not tokens:
        continue
    try:
        summary = lexicon.summarize_doc([token.lower() for token in tokens])
    except ZeroDivisionError:
        # NOTE(review): the summary values are ratios, so a song without any countable
        # word presumably divides by zero inside py_lex -- confirm against the library.
        # Such songs keep their 0.0 scores, as before.
        continue
    for key, value in summary.items():
        data.at[index, key] = value

Wall time: 11.1 s


In [22]:
%%time
# Count, for every song, how many emotion labels the lexicon attaches to its tokens,
# and store the totals in the `<emotion>_quantity` columns.
quantity_emotions = (
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "negative",
    "positive",
    "sadness",
    "surprise",
    "trust",
)

for index, tokens in data["tokens"].items():
    # Songs without a token list keep their initial count of 0, matching how the
    # score-filling cell treats them.
    if not isinstance(tokens, list):
        continue

    # `annotate_doc` yields one collection of emotion labels per token; flatten them
    # and count once instead of rebuilding the Counter for every column.
    emotion_counts = Counter(
        emotion
        for token_emotions in lexicon.annotate_doc(tokens)
        for emotion in token_emotions
    )

    # A Counter returns 0 for emotions that never occurred.
    for emotion in quantity_emotions:
        data.at[index, emotion + "_quantity"] = emotion_counts[emotion]

Wall time: 10.1 s


In [23]:
# Display the frame to inspect the emotion score and *_quantity columns filled in by the cells above.
data

Unnamed: 0,genre,artist,track_name,danceability,energy,loudness,mode,speechiness,instrumentalness,valence,...,anger_quantity,anticipation_quantity,disgust_quantity,fear_quantity,joy_quantity,negative_quantity,positive_quantity,sadness_quantity,surprise_quantity,trust_quantity
0,Pop,Michael Jackson,Billie Jean,0.920,0.654,-3.051,0,0.0401,0.015800,0.847,...,3,13,2,0,25,4,30,2,4,17
1,Pop,Michael Jackson,Smooth Criminal - 2012 Remaster,0.853,0.981,-3.947,1,0.0751,0.468000,0.595,...,6,7,2,9,6,14,8,2,6,6
2,Pop,Michael Jackson,Beat It,0.779,0.867,-3.704,0,0.0457,0.000008,0.915,...,11,2,4,19,1,37,4,19,1,5
3,Pop,Michael Jackson,The Way You Make Me Feel - 2012 Remaster,0.877,0.854,-4.523,1,0.1470,0.000055,0.540,...,12,6,12,17,19,15,23,15,1,5
4,Pop,Michael Jackson,Thriller,0.764,0.887,-3.726,1,0.0744,0.000110,0.720,...,16,25,11,31,19,30,21,14,17,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4341,Punk Rock,Adam & The Ants,Digital Tenderness,0.626,0.950,-6.209,1,0.0773,0.000002,0.935,...,5,5,2,5,23,7,26,6,4,3
4342,Punk Rock,Adam & The Ants,Los Rancheros - Remastered,0.648,0.747,-5.794,0,0.0308,0.000596,0.958,...,1,5,0,0,1,0,4,0,1,2
4343,Punk Rock,Adam & The Ants,That Voodoo!,0.567,0.351,-15.010,1,0.0298,0.054100,0.648,...,0,2,0,4,4,16,4,4,2,0
4344,Punk Rock,Adam & The Ants,Whip In My Valise,0.189,0.958,-5.064,1,0.0961,0.000018,0.267,...,8,2,3,5,2,10,2,3,1,8


In [None]:
# Persist the enriched frame (tokens plus emotion columns) as CSV with the default
# settings, so the index is written as the first column.
tokens_csv_path = './spotify_lyrics_tokens.csv'
data.to_csv(tokens_csv_path)