This notebook focuses on the **text-based analysis** carried out on song lyrics and consists of two main parts:
1. Texts preprocessing with the objective of computing statistics such as the lexical diversity and lexical density
2. Emotion-based model application with the aim of getting emotion scores for any song

Some documentation:
- https://towardsdatascience.com/what-songs-tell-us-about-text-mining-with-lyrics-ca80f98b3829
- The model used is a deep neural network from **HuggingFace** (*bert-base-uncased-emotion*)
    - https://huggingface.co/bhadresh-savani/bert-base-uncased-emotion
    - https://huggingface.co/datasets/viewer/?dataset=emotion

In [236]:
import pandas as pd
import numpy as np
from collections import Counter

import re # library for regular expression operations
import string # for string operations

# Stop words:
from nltk.corpus import stopwords  
stop_words = set(stopwords.words('english'))

# Tokenization:
from nltk.tokenize import word_tokenize

# Lemmatizer:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Bigrams:
from gensim.models import Phrases

# HuggingFace:
from transformers import pipeline

In [237]:
df = pd.read_csv('../data/df_abba_charts.csv')
print(f'n_rows: {df.shape[0]}, n_columns: {df.shape[1]}')
df.head()

n_rows: 99, n_columns: 15


Unnamed: 0,id,album,release_date,n_tracks_original,id_track,track,lyrics,n_tracks,notes,uk_peak_pos,uk_woc,uk_weeks_n1,us_peak_pos,us_woc,us_weeks_n1
0,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,1,Ring Ring,I was sitting by the phone\nI was waiting all ...,14,Third at the 1973 Melodifestivalen,32.0,5.0,0.0,0.0,0.0,0.0
1,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,2,"Another Town, Another Train","Day is dawning and I must go\nYou're asleep, b...",14,,,,,,,
2,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,3,Disillusion,"Changing, moving in a circle\nI can see your f...",14,,,,,,,
3,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,4,People Need Love,"People need hope, people need loving\nPeople n...",14,First single ever,,,,,,
4,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,5,I Saw It in the Mirror,"I saw it in the mirror, I saw it in my face\nT...",14,,,,,,,


## 1. Texts preprocessing

### 1.1 Texts exploration

In [238]:
# All song lyrics end with a code such as '1EmbedShare URLCopyEmbedCopy': it has to be deleted
df.lyrics[0]

"I was sitting by the phone\nI was waiting all alone\nBaby by myself I sit and wait and wonder about you\nIt's a dark and dreary night\nSeems like nothing's going right\nWon't you tell me honey how can I go on here without you?\n\nYes I'm down and feeling blue\nAnd I don't know what to do, oh-oh\n\nRing, ring, why don't you give me a call?\nRing, ring, the happiest sound of them all\nRing, ring, I stare at the phone on the wall\nAnd I sit all alone impatiently\nWon't you please understand the need in me\nSo, ring, ring, why don't you give me a call?\nSo, ring, ring, why don't you give me a call?\n\nYou were here and now you're gone\nHey, did I do something wrong?\nI just can't believe that I could be so badly mistaken\nWas it me or was it you?\nTell me, are we really through?\nWon't you hear me cry and you will know that my heart is breaking\nPlease forgive and then forget\nOr maybe darling better yet, oh-oh\n\nRing, ring, why don't you give me a call?\nRing, ring, the happiest sound o

In [239]:
# Join all the texts to find particular patterns or words, such as s.o.s, which are then to be amended properly
all_texts = ' '.join(df.lyrics)

In [240]:
# Find patterns defined as: some letters, dot, some letters (and more)
p = re.compile("[a-z]+(?:\.[a-z]+)+")
set(p.findall(all_texts.lower()))
# Will be replaced by the string without dots

{'s.o.s'}

In [241]:
# Find patterns defined as: some letters, -, some letters (and more)
p = re.compile("[a-z]+(?:-[a-z]+)+")
set(p.findall(all_texts.lower()))
# These cases are helpful to identify bigrams

{'a-ha',
 'a-take',
 'absent-minded',
 'ah-aa',
 'ah-ah',
 'ah-ha',
 'ah-ha-ha',
 'ba-ba',
 'baa-ba-ba',
 'blown-out',
 'boome-boomerang',
 'broken-hearted',
 'bye-bye',
 'chance-chance',
 'clear-headed',
 'd-marry',
 'de-dum-de-dum-dum',
 'dead-end',
 'ding-dong',
 'dog-gone',
 'doo-doo',
 'double-cross',
 'dum-de-dum-dum',
 'eighty-nine',
 'ever-growing',
 'ever-loving',
 'false-hearted',
 'fancy-free',
 'flower-power',
 'go-round',
 'grown-up',
 'ha-ha-had',
 'half-past',
 'hum-de-hum-hum',
 'la-la',
 'la-la-la',
 'love-making',
 'merry-go-round',
 'merry-merry-go-round',
 'na-na',
 'never-ending',
 'ninety-five',
 'no-good',
 'no-one',
 'not-the-kind-of-girl-you',
 'oh-oh',
 'open-eyed',
 'open-hearted',
 'outward-bound',
 'playing-ground',
 'self-confidence',
 'shake-up',
 'sup-p-per',
 'suzy-hang-around',
 'suzy-hang-aroundembedshare',
 'tom-tom',
 'troup-p-per',
 'voulez-vous',
 'wa-wa-wa-wa-waterloo',
 'well-known',
 'well-planned',
 'worn-out'}

In [242]:
# Define a dict of patterns that have to be replaced and the related replacements
dict_replace = {
    's.o.s': 'sos',
    '\d?embedshare urlcopyembedcopy': '',
}
# Replacements function:
def replace_words(text, dict_replacements):
    for key, value in dict_replacements.items():
        text = re.sub(key, value, text)
    return text

### 1.2 Texts cleaning 

In [243]:
# See the English stop words list
print(len(stop_words))
stop_words

179


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [244]:
# Given a song lyrics, the following function performs a number of cleaning steps as for the order below:
# 1. lowercasing
# 2. replacements of the words previously identified
# 3. punctuation removal
# 4. remove multiple spaces
# 5. remove spaces at the beginning and at the end of each song lyrics
# 6. Tokenization
# 7. Stop words removal
# 8. Lemmatization
def cleaning_lyrics(text):
    text = text.lower()
    text = replace_words(text, dict_replacements=dict_replace)
    for el in string.punctuation:
        text = re.sub(f'\\{el}', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    text_tokens = word_tokenize(text)
    text_tokens_stop_words = [i for i in text_tokens if not i in stop_words]
    text_tokens_lemm = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in text_tokens_stop_words]
    return text_tokens_lemm

In [245]:
df['cleaned_lyrics'] = df.lyrics.apply(lambda x: cleaning_lyrics(x))
df

Unnamed: 0,id,album,release_date,n_tracks_original,id_track,track,lyrics,n_tracks,notes,uk_peak_pos,uk_woc,uk_weeks_n1,us_peak_pos,us_woc,us_weeks_n1,cleaned_lyrics
0,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,1,Ring Ring,I was sitting by the phone\nI was waiting all ...,14,Third at the 1973 Melodifestivalen,32.0,5.0,0.0,0.0,0.0,0.0,"[sit, phone, wait, alone, baby, sit, wait, won..."
1,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,2,"Another Town, Another Train","Day is dawning and I must go\nYou're asleep, b...",14,,,,,,,,"[day, dawn, must, go, asleep, still, sure, kno..."
2,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,3,Disillusion,"Changing, moving in a circle\nI can see your f...",14,,,,,,,,"[change, move, circle, see, face, dream, smile..."
3,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,4,People Need Love,"People need hope, people need loving\nPeople n...",14,First single ever,,,,,,,"[people, need, hope, people, need, love, peopl..."
4,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,5,I Saw It in the Mirror,"I saw it in the mirror, I saw it in my face\nT...",14,,,,,,,,"[saw, mirror, saw, face, longer, need, anyone,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,8,The Visitors by ABBA,1981-11-30,18,14,You Owe Me One,Now there's a shadow falling over our faces\nD...,16,1997 CD edition bonus tracks,,,,,,,"[shadow, fall, face, doubt, forever, hearts, s..."
95,8,The Visitors by ABBA,1981-11-30,18,15,Cassandra,Down in the street they're all singing and sho...,16,2001 CD edition bonus tracks,,,,,,,"[street, sing, shout, stay, alive, though, cit..."
96,8,The Visitors by ABBA,1981-11-30,18,16,Under Attack,"Don't know how to take it, don't know where to...",16,1997 CD edition bonus tracks,26.0,8.0,0.0,0.0,0.0,0.0,"[know, take, know, go, resistance, run, low, e..."
97,8,The Visitors by ABBA,1981-11-30,18,17,The Day Before You Came,I must have left my house at eight because I a...,16,1997 CD edition bonus tracks,32.0,6.0,0.0,0.0,0.0,0.0,"[must, leave, house, eight, always, train, cer..."


In [246]:
# Based on the previous findings, define a number of bigrams and trigrams to be created and added to the vocabulary:
bigrams = ['voulez-vous', 'mamma-mia', 'merry-go', 'self-confidence', 'santa-rosa', 'suzy-hang',
           'hasta-mañana', 'rock-n', 'king-kong', 'bang-boome', 'dancing-queen', 'ding-dong', 'double-cross',
           'sup-p', 'troup-p', 'ding-dong', 'dog-gone', 'absent-minded', 'play-grind', 'ring-ring']
trigrams = ['rock-n-roll', 'merry-go-round', 'bang-boome-boomerang', 'sup-p-per', 'troup-p-per', 'suzy-hang-around']

In [247]:
# Function to join adjacent words with a given sep
def join_from_onegrams(onegram_text, true_bigr_set=bigrams, sep='-'):
    res = []
    skip = False
    for prev, curr in zip(onegram_text[:-1], onegram_text[1:]):
        if skip:
            skip = False
            continue
        if f'{prev}-{curr}' in true_bigr_set:
            res.append(f'{prev}{sep}{curr}')
            skip = True
        else:
            res.append(prev)
    if onegram_text[1:] and not skip:
        res.append(onegram_text[-1])
    if onegram_text and not onegram_text[1:]:
        res = onegram_text
    return res

In [248]:
df['cleaned_bigrams'] = (
    df.cleaned_lyrics.apply(lambda x: join_from_onegrams(join_from_onegrams(x), true_bigr_set=trigrams))
)
df

Unnamed: 0,id,album,release_date,n_tracks_original,id_track,track,lyrics,n_tracks,notes,uk_peak_pos,uk_woc,uk_weeks_n1,us_peak_pos,us_woc,us_weeks_n1,cleaned_lyrics,cleaned_bigrams
0,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,1,Ring Ring,I was sitting by the phone\nI was waiting all ...,14,Third at the 1973 Melodifestivalen,32.0,5.0,0.0,0.0,0.0,0.0,"[sit, phone, wait, alone, baby, sit, wait, won...","[sit, phone, wait, alone, baby, sit, wait, won..."
1,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,2,"Another Town, Another Train","Day is dawning and I must go\nYou're asleep, b...",14,,,,,,,,"[day, dawn, must, go, asleep, still, sure, kno...","[day, dawn, must, go, asleep, still, sure, kno..."
2,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,3,Disillusion,"Changing, moving in a circle\nI can see your f...",14,,,,,,,,"[change, move, circle, see, face, dream, smile...","[change, move, circle, see, face, dream, smile..."
3,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,4,People Need Love,"People need hope, people need loving\nPeople n...",14,First single ever,,,,,,,"[people, need, hope, people, need, love, peopl...","[people, need, hope, people, need, love, peopl..."
4,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,5,I Saw It in the Mirror,"I saw it in the mirror, I saw it in my face\nT...",14,,,,,,,,"[saw, mirror, saw, face, longer, need, anyone,...","[saw, mirror, saw, face, longer, need, anyone,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,8,The Visitors by ABBA,1981-11-30,18,14,You Owe Me One,Now there's a shadow falling over our faces\nD...,16,1997 CD edition bonus tracks,,,,,,,"[shadow, fall, face, doubt, forever, hearts, s...","[shadow, fall, face, doubt, forever, hearts, s..."
95,8,The Visitors by ABBA,1981-11-30,18,15,Cassandra,Down in the street they're all singing and sho...,16,2001 CD edition bonus tracks,,,,,,,"[street, sing, shout, stay, alive, though, cit...","[street, sing, shout, stay, alive, though, cit..."
96,8,The Visitors by ABBA,1981-11-30,18,16,Under Attack,"Don't know how to take it, don't know where to...",16,1997 CD edition bonus tracks,26.0,8.0,0.0,0.0,0.0,0.0,"[know, take, know, go, resistance, run, low, e...","[know, take, know, go, resistance, run, low, e..."
97,8,The Visitors by ABBA,1981-11-30,18,17,The Day Before You Came,I must have left my house at eight because I a...,16,1997 CD edition bonus tracks,32.0,6.0,0.0,0.0,0.0,0.0,"[must, leave, house, eight, always, train, cer...","[must, leave, house, eight, always, train, cer..."


In [249]:
# Unique texts: get song lyrics unique words keeping the same order
df['cleaned_lyrics_unique'] = df['cleaned_lyrics'].apply(lambda x: list(pd.unique(x)))
df['cleaned_bigrams_unique'] = df['cleaned_bigrams'].apply(lambda x: list(pd.unique(x)))

### 1.3 Compute some statistics

In [250]:
# Vocabulary length:
tokens = [token for token in df.cleaned_bigrams_unique for token in token]
vocab = set(tokens)
len(vocab)

1613

In [251]:
Counter(tokens).most_common()

[('know', 61),
 ('see', 61),
 ('go', 59),
 ('like', 58),
 ('feel', 51),
 ('say', 44),
 ('think', 43),
 ('get', 41),
 ('make', 39),
 ('love', 38),
 ('take', 38),
 ('one', 38),
 ('never', 36),
 ('come', 36),
 ('look', 33),
 ('way', 32),
 ('time', 32),
 ('oh', 31),
 ('hear', 29),
 ('day', 29),
 ('find', 28),
 ('night', 27),
 ('tell', 27),
 ('dream', 26),
 ('na', 26),
 ('give', 25),
 ('could', 25),
 ('nothing', 24),
 ('yes', 24),
 ('still', 24),
 ('life', 24),
 ('live', 24),
 ('want', 24),
 ('try', 23),
 ('leave', 23),
 ('eye', 23),
 ('every', 23),
 ('let', 23),
 ('seem', 22),
 ('girl', 22),
 ('cry', 21),
 ('would', 21),
 ('cause', 21),
 ('away', 21),
 ('wait', 20),
 ('baby', 20),
 ('really', 20),
 ('better', 20),
 ('little', 20),
 ('face', 20),
 ('world', 20),
 ('play', 20),
 ('always', 19),
 ('something', 18),
 ('everything', 18),
 ('right', 17),
 ('break', 17),
 ('must', 17),
 ('lose', 17),
 ('smile', 17),
 ('gon', 17),
 ('old', 17),
 ('chance', 17),
 ('believe', 16),
 ('end', 16),
 ('m

In [252]:
# LEXICAL DIVERSITY
# Length of song lyrics in terms of unique words (bigrams included):
df['len_bigrams_unique'] = df.cleaned_bigrams_unique.apply(lambda x: len(x))
# proportional to:
# df['lexical_diversity'] = df.cleaned_bigrams_unique.apply(lambda x: len(x))/len(vocab)

In [253]:
# Top 10 songs with the highest lexical diversity 
df.iloc[df.len_bigrams_unique.sort_values(ascending=False).index].head(10)

Unnamed: 0,id,album,release_date,n_tracks_original,id_track,track,lyrics,n_tracks,notes,uk_peak_pos,uk_woc,uk_weeks_n1,us_peak_pos,us_woc,us_weeks_n1,cleaned_lyrics,cleaned_bigrams,cleaned_lyrics_unique,cleaned_bigrams_unique,len_bigrams_unique
97,8,The Visitors by ABBA,1981-11-30,18,17,The Day Before You Came,I must have left my house at eight because I a...,16,1997 CD edition bonus tracks,32.0,6.0,0.0,0.0,0.0,0.0,"[must, leave, house, eight, always, train, cer...","[must, leave, house, eight, always, train, cer...","[must, leave, house, eight, always, train, cer...","[must, leave, house, eight, always, train, cer...",123
98,8,The Visitors by ABBA,1981-11-30,18,18,From a Twinkling Star to a Passing Angel (Demo...,"Twinkle Twinkle Little Star\n\n""Ett, två... et...",16,2012 deluxe edition (The Final Album) bonus tr...,,,,,,,"[twinkle, twinkle, little, star, ett, två, ett...","[twinkle, twinkle, little, star, ett, två, ett...","[twinkle, little, star, ett, två, tre, wonder,...","[twinkle, little, star, ett, två, tre, wonder,...",109
93,8,The Visitors by ABBA,1981-11-30,18,13,I Am the City,Coming through a cloud you're looking at me fr...,16,2012 deluxe edition (The Final Album) bonus tr...,,,,,,,"[come, cloud, look, revelation, spread, eye, f...","[come, cloud, look, revelation, spread, eye, f...","[come, cloud, look, revelation, spread, eye, f...","[come, cloud, look, revelation, spread, eye, f...",104
95,8,The Visitors by ABBA,1981-11-30,18,15,Cassandra,Down in the street they're all singing and sho...,16,2001 CD edition bonus tracks,,,,,,,"[street, sing, shout, stay, alive, though, cit...","[street, sing, shout, stay, alive, though, cit...","[street, sing, shout, stay, alive, though, cit...","[street, sing, shout, stay, alive, though, cit...",100
87,8,The Visitors by ABBA,1981-11-30,18,5,I Let the Music Speak,"I'm hearing images, I'm seeing songs\nNo poet ...",16,,,,,,,,"[hear, image, see, songs, poet, ever, paint, v...","[hear, image, see, songs, poet, ever, paint, v...","[hear, image, see, songs, poet, ever, paint, v...","[hear, image, see, songs, poet, ever, paint, v...",96
77,7,Super Trouper by ABBA,1980-11-03,14,7,Our Last Summer,The summer air was soft and warm\nThe feeling ...,12,,,,,,,,"[summer, air, soft, warm, feel, right, paris, ...","[summer, air, soft, warm, feel, right, paris, ...","[summer, air, soft, warm, feel, right, paris, ...","[summer, air, soft, warm, feel, right, paris, ...",93
72,7,Super Trouper by ABBA,1980-11-03,14,2,The Winner Takes It All,I don't wanna talk\nAbout things we've gone th...,12,,1.0,10.0,2.0,8.0,26.0,0.0,"[wan, na, talk, things, go, though, hurt, hist...","[wan, na, talk, things, go, though, hurt, hist...","[wan, na, talk, things, go, though, hurt, hist...","[wan, na, talk, things, go, though, hurt, hist...",88
46,4,Arrival by ABBA,1976-10-11,10,10,Fernando,"Can you hear the drums, Fernando?\nI remember,...",11,1997 CD edition bonus track,1.0,15.0,4.0,13.0,16.0,0.0,"[hear, drum, fernando, remember, long, ago, an...","[hear, drum, fernando, remember, long, ago, an...","[hear, drum, fernando, remember, long, ago, an...","[hear, drum, fernando, remember, long, ago, an...",79
89,8,The Visitors by ABBA,1981-11-30,18,7,Two for the Price of One,He had what you might call a trivial occupatio...,16,,,,,,,,"[might, call, trivial, occupation, clean, plat...","[might, call, trivial, occupation, clean, plat...","[might, call, trivial, occupation, clean, plat...","[might, call, trivial, occupation, clean, plat...",77
83,8,The Visitors by ABBA,1981-11-30,18,1,The Visitors,I hear the doorbell ring and suddenly the pani...,16,,0.0,0.0,0.0,63.0,8.0,0.0,"[hear, doorbell, ring, suddenly, panic, take, ...","[hear, doorbell, ring, suddenly, panic, take, ...","[hear, doorbell, ring, suddenly, panic, take, ...","[hear, doorbell, ring, suddenly, panic, take, ...",77


In [254]:
# LEXICAL DENSITY
# How much words are repeated in songs on average: 
def average_repetitions(text):
    rep = Counter(text)
    rep_values = [v for k, v in rep.items()]
    return np.mean(rep_values)
df['word_repetition'] = df.cleaned_bigrams.apply(lambda x: average_repetitions(x))

In [255]:
df.len_bigrams_unique.sort_values(ascending=False)

97    123
98    109
93    104
95    100
87     96
     ... 
9      35
32     34
59     34
31     22
80     22
Name: len_bigrams_unique, Length: 99, dtype: int64

In [256]:
# Original lyrics length in terms of characters (will be useful in the next part):
df['lyrics_char_len'] = df.lyrics.apply(lambda x: len(x))

### 1.4 Additional checks

In [257]:
# Explore bigrams with Phrases of Gensim
bigram = Phrases(cleaned_text, min_count=10, threshold=10) 

In [258]:
for el in bigram.vocab.items():
    if (el[1]>10) & ('_' in el[0]):
        print(el)

('feel_blue', 12)
('ring_ring', 14)
('know_mean', 11)
('people_need', 29)
('need_love', 17)
('get_ta', 29)
('la_la', 225)
('think_could', 23)
('every_day', 12)
('embedshare_urlcopyembedcopy', 13)
('sweet_sweet', 14)
('let_go', 16)
('sweet_love', 14)
('love_bittersweet', 12)
('could_see', 15)
('kind_girl', 13)
('make_feel', 11)
('love_life', 12)
('rock_n', 22)
('n_roll', 22)
('gon_na', 81)
('merry_go', 33)
('go_round', 36)
('round_merry', 25)
('merry_merry', 30)
('give_anything', 12)
('anything_back', 12)
('back_home', 13)
('santa_rosa', 11)
('wa_wa', 12)
('feel_like', 12)
('ginny_ginny', 13)
('na_sing', 14)
('king_kong', 13)
('kong_song', 11)
('go_know', 11)
('hasta_mañana', 11)
('know_know', 17)
('mama_say', 13)
('look_do', 13)
('wan_na', 61)
('honey_honey', 20)
('move_like', 11)
('better_watch', 12)
('could_make', 11)
('come_back', 12)
('hey_hey', 18)
('hey_helen', 18)
('dum_dum', 14)
('na_na', 110)
('ah_ha', 57)
('long_long', 13)
('pick_bale', 17)
('midnight_special', 11)
('kiss_tea

In [259]:
# Word-frequency matrix
def create_frequency_dict(text):
    text_dict = {}
    for token in vocab:
        text_dict[token] = Counter(text)[token]
    return text_dict

In [260]:
all_text_dict = []
for text in df.cleaned_lyrics:
    all_text_dict.append(create_frequency_dict(text))

In [261]:
tf_matrix = pd.DataFrame(all_text_dict, index=df.track)
tf_matrix

Unnamed: 0_level_0,doom,reality,hug,worship,laws,discovery,practise,forget,movies,ding-dong,...,away,spectators,none,spell,way,pick,misconceive,cars,pay,stream
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ring Ring,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
"Another Town, Another Train",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
Disillusion,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
People Need Love,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
I Saw It in the Mirror,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
You Owe Me One,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Cassandra,0,0,0,0,0,0,0,0,0,0,...,0,0,3,0,0,0,0,0,0,0
Under Attack,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
The Day Before You Came,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


## 2. Emotion-based model application

In [170]:
# Load the model
classifier_bert = pipeline(
    "text-classification",
    model='bhadresh-savani/bert-base-uncased-emotion',
    return_all_scores=True)

In [217]:
# test
sample_text = "I love using transformers. The best part is wide range of support and its easy to use"
sample_result = classifier_bert(sample_text)
sample_result

[[{'label': 'sadness', 'score': 0.0005138238193467259},
  {'label': 'joy', 'score': 0.9972521662712097},
  {'label': 'love', 'score': 0.0007443313952535391},
  {'label': 'anger', 'score': 0.0007404953357763588},
  {'label': 'fear', 'score': 0.00032938597723841667},
  {'label': 'surprise', 'score': 0.0004197492962703109}]]

In [218]:
# Utility function called in the following one
def get_results(prediction):
    labels_scores = [list(i.values()) for i in prediction]
    labels = [el[0] for el in labels_scores]
    scores = [el[1] for el in labels_scores]
    return labels_scores, labels, scores

get_results(sample_result[0])

([['sadness', 0.0005138238193467259],
  ['joy', 0.9972521662712097],
  ['love', 0.0007443313952535391],
  ['anger', 0.0007404953357763588],
  ['fear', 0.00032938597723841667],
  ['surprise', 0.0004197492962703109]],
 ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'],
 [0.0005138238193467259,
  0.9972521662712097,
  0.0007443313952535391,
  0.0007404953357763588,
  0.00032938597723841667,
  0.0004197492962703109])

In [213]:
# Function to compute the emotion scores for all songs
def get_emotions(df, txt_col, txt_col_length, model=classifier_bert, max_length=1500):
    emotion_labels_scores, emotion_labels, emotion_scores = [], [], []
    for index, row in df.iterrows():
        if isinstance(row[txt_col], str):
            classifier_prediction = model(row[txt_col][:max_length])
            output = get_results(classifier_prediction[0])
            emotion_labels_scores.append(output[0])
            emotion_labels.append(output[1])
            emotion_scores.append(output[2])
        else:
            print(f'index: {index} - track name: {row.track} (no text)')
            emotion_labels_scores.append(np.nan)
            emotion_labels.append(np.nan)
            emotion_scores.append([np.nan]*5)
    return emotion_labels_scores, emotion_labels, emotion_scores

In [232]:
results_full = get_emotions(
    df=df, 
    txt_col='lyrics', 
    txt_col_length='lyrics_char_len')

In [264]:
results = pd.DataFrame(results_full[2], columns=results_full[1][0], index=df.track)
results

Unnamed: 0_level_0,sadness,joy,love,anger,fear,surprise
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ring Ring,0.156454,0.757705,0.056179,0.005206,0.012620,0.011835
"Another Town, Another Train",0.001646,0.000369,0.000661,0.003064,0.993147,0.001112
Disillusion,0.987690,0.008214,0.001154,0.002282,0.000451,0.000209
People Need Love,0.000551,0.001347,0.996278,0.000578,0.000553,0.000693
I Saw It in the Mirror,0.621091,0.270500,0.085397,0.019306,0.002155,0.001551
...,...,...,...,...,...,...
You Owe Me One,0.998192,0.000312,0.000340,0.000385,0.000547,0.000224
Cassandra,0.837561,0.072144,0.009362,0.075559,0.004096,0.001278
Under Attack,0.012646,0.132130,0.023197,0.351168,0.451349,0.029510
The Day Before You Came,0.007125,0.153096,0.007530,0.221696,0.072380,0.538173


In [265]:
# Add the max score and emotion
results['max_score'] = results.max(axis=1)
results['max_emotion'] = results.idxmax(axis=1)
results

Unnamed: 0_level_0,sadness,joy,love,anger,fear,surprise,max_score,max_emotion
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ring Ring,0.156454,0.757705,0.056179,0.005206,0.012620,0.011835,0.757705,joy
"Another Town, Another Train",0.001646,0.000369,0.000661,0.003064,0.993147,0.001112,0.993147,fear
Disillusion,0.987690,0.008214,0.001154,0.002282,0.000451,0.000209,0.987690,sadness
People Need Love,0.000551,0.001347,0.996278,0.000578,0.000553,0.000693,0.996278,love
I Saw It in the Mirror,0.621091,0.270500,0.085397,0.019306,0.002155,0.001551,0.621091,sadness
...,...,...,...,...,...,...,...,...
You Owe Me One,0.998192,0.000312,0.000340,0.000385,0.000547,0.000224,0.998192,sadness
Cassandra,0.837561,0.072144,0.009362,0.075559,0.004096,0.001278,0.837561,sadness
Under Attack,0.012646,0.132130,0.023197,0.351168,0.451349,0.029510,0.451349,fear
The Day Before You Came,0.007125,0.153096,0.007530,0.221696,0.072380,0.538173,0.538173,surprise


In [266]:
# Merge with the original dataset on the track id
df = df.set_index('track')
df.index.names = ['track_id']
df_texts_emotions = df.merge(results, left_index=True, right_index=True)
df_texts_emotions = df_texts_emotions.reset_index(drop=False).drop(columns='lyrics')
df_texts_emotions

Unnamed: 0,index,id,album,release_date,n_tracks_original,id_track,n_tracks,notes,uk_peak_pos,uk_woc,...,word_repetition,lyrics_char_len,sadness,joy,love,anger,fear,surprise,max_score,max_emotion
0,Ring Ring,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,1,14,Third at the 1973 Melodifestivalen,32.0,5.0,...,2.117647,1384,0.156454,0.757705,0.056179,0.005206,0.012620,0.011835,0.757705,joy
1,"Another Town, Another Train",1,Ring Ring (International Edition) by ABBA,1973-03-26,15,2,14,,,,...,2.309091,1231,0.001646,0.000369,0.000661,0.003064,0.993147,0.001112,0.993147,fear
2,Disillusion,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,3,14,,,,...,1.428571,921,0.987690,0.008214,0.001154,0.002282,0.000451,0.000209,0.987690,sadness
3,People Need Love,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,4,14,First single ever,,,...,7.510638,1935,0.000551,0.001347,0.996278,0.000578,0.000553,0.000693,0.996278,love
4,I Saw It in the Mirror,1,Ring Ring (International Edition) by ABBA,1973-03-26,15,5,14,,,,...,2.083333,792,0.621091,0.270500,0.085397,0.019306,0.002155,0.001551,0.621091,sadness
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,You Owe Me One,8,The Visitors by ABBA,1981-11-30,18,14,16,1997 CD edition bonus tracks,,,...,3.867925,1949,0.998192,0.000312,0.000340,0.000385,0.000547,0.000224,0.998192,sadness
95,Cassandra,8,The Visitors by ABBA,1981-11-30,18,15,16,2001 CD edition bonus tracks,,,...,1.890000,2005,0.837561,0.072144,0.009362,0.075559,0.004096,0.001278,0.837561,sadness
96,Under Attack,8,The Visitors by ABBA,1981-11-30,18,16,16,1997 CD edition bonus tracks,26.0,8.0,...,2.171429,1716,0.012646,0.132130,0.023197,0.351168,0.451349,0.029510,0.451349,fear
97,The Day Before You Came,8,The Visitors by ABBA,1981-11-30,18,17,16,1997 CD edition bonus tracks,32.0,6.0,...,1.382114,1726,0.007125,0.153096,0.007530,0.221696,0.072380,0.538173,0.538173,surprise


In [267]:
df_texts_emotions.to_csv('../data/df_abba_results.csv')