This notebook focuses on the **text-based analysis** of song lyrics and consists of two main parts:
1. Text preprocessing, with the objective of computing statistics such as lexical diversity and lexical density
2. Application of an emotion classification model, with the aim of getting emotion scores for each song

Some documentation:
- https://towardsdatascience.com/what-songs-tell-us-about-text-mining-with-lyrics-ca80f98b3829
- The model used is a deep neural network from **HuggingFace** (*bert-base-uncased-emotion*)
    - https://huggingface.co/bhadresh-savani/bert-base-uncased-emotion
    - https://huggingface.co/datasets/viewer/?dataset=emotion

In [None]:
import pandas as pd
import numpy as np
from collections import Counter

import re # library for regular expression operations
import string # for string operations

# Stop words:
from nltk.corpus import stopwords  
stop_words = set(stopwords.words('english'))

# Tokenization:
from nltk.tokenize import word_tokenize

# Lemmatizer:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Bigrams:
from gensim.models import Phrases

# HuggingFace:
from transformers import pipeline

In [None]:
# Read the song dataset from CSV, print its dimensions and preview the first rows
df = pd.read_csv('../data/df_abba_charts.csv')
print(f'n_rows: {df.shape[0]}, n_columns: {df.shape[1]}')
df.head()

## 1. Texts preprocessing

### 1.1 Texts exploration

In [None]:
# Inspect the raw lyrics of the first song.
# All song lyrics end with a code such as '1EmbedShare URLCopyEmbedCopy': it has to be deleted
df.lyrics[0]

In [None]:
# Join all the texts to find particular patterns or words, such as s.o.s, which are then to be amended properly.
# Songs without lyrics (missing values) are skipped: str.join only accepts strings.
all_texts = ' '.join(df.lyrics.dropna())

In [None]:
# Find patterns defined as: some letters, dot, some letters (and more), e.g. 's.o.s'
# The pattern is a raw string: '\.' inside a normal string literal is an invalid escape sequence
p = re.compile(r"[a-z]+(?:\.[a-z]+)+")
set(p.findall(all_texts.lower()))
# Will be replaced by the string without dots

In [None]:
# Find hyphenated expressions in the lowercased corpus: some letters, '-', some letters (repeated one or more times)
p = re.compile("[a-z]+(?:-[a-z]+)+")
set(p.findall(all_texts.lower()))
# These cases are helpful to identify bigrams

In [None]:
# Define a dict of patterns that have to be replaced and the related replacements.
# Keys are regular expressions (they are applied with re.sub on lowercased text), so:
# - literal dots must be escaped, otherwise 's.o.s' would also rewrite words such as 'shows';
# - the footer code may be preceded by a number with any amount of digits, hence '\d*'.
dict_replace = {
    r's\.o\.s': 'sos',
    r'\d*embedshare urlcopyembedcopy': '',
}
# Replacements function:
def replace_words(text, dict_replacements):
    """Apply a series of regex substitutions to a text.

    Parameters
    ----------
    text : str
        Text to amend.
    dict_replacements : dict
        Maps a regular-expression pattern to its replacement string.
        Substitutions are applied one after the other, in dict order,
        so a later pattern sees the output of the earlier ones.

    Returns
    -------
    str
        The text after all substitutions.
    """
    amended = text
    for pattern in dict_replacements:
        amended = re.sub(pattern, dict_replacements[pattern], amended)
    return amended

### 1.2 Texts cleaning 

In [None]:
# See the English stop words list: print how many there are, then display the whole set
print(len(stop_words))
stop_words

In [None]:
# Given a song lyrics, the following function performs a number of cleaning steps in the order below:
# 1. lowercasing
# 2. replacements of the words previously identified
# 3. punctuation removal
# 4. remove multiple spaces
# 5. remove spaces at the beginning and at the end of each song lyrics
# 6. Tokenization
# 7. Stop words removal
# 8. Lemmatization
def cleaning_lyrics(text):
    """Turn raw song lyrics into a list of cleaned, lemmatized tokens.

    Parameters
    ----------
    text : str
        Raw lyrics of one song. Any non-string value (e.g. a missing value)
        is treated as "no lyrics".

    Returns
    -------
    list of str
        Lowercased tokens without punctuation and English stop words,
        lemmatized as verbs (pos='v'). Empty list when there are no lyrics.
    """
    # Songs without lyrics have no tokens
    if not isinstance(text, str):
        return []
    text = replace_words(text.lower(), dict_replacements=dict_replace)
    # Replace every punctuation character with a space in a single pass
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(punctuation_to_space)
    # Collapse runs of whitespace and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return [wordnet_lemmatizer.lemmatize(token, pos='v') for token in tokens]

In [None]:
# Clean every song: each row of 'cleaned_lyrics' holds the list of tokens returned by cleaning_lyrics
df['cleaned_lyrics'] = df['lyrics'].apply(cleaning_lyrics)
df

In [None]:
# Based on the previous findings, define a number of bigrams and trigrams to be created and added to the vocabulary.
# Entries are written with '-' between the (already cleaned/lemmatized) words.
# Each trigram is built on top of one of the bigrams (e.g. 'rock-n' -> 'rock-n-roll').
bigrams = ['voulez-vous', 'mamma-mia', 'merry-go', 'self-confidence', 'santa-rosa', 'suzy-hang',
           'hasta-mañana', 'rock-n', 'king-kong', 'bang-boome', 'dancing-queen', 'ding-dong', 'double-cross',
           'sup-p', 'troup-p', 'dog-gone', 'absent-minded', 'play-grind', 'ring-ring']
trigrams = ['rock-n-roll', 'merry-go-round', 'bang-boome-boomerang', 'sup-p-per', 'troup-p-per', 'suzy-hang-around']

In [None]:
# Function to join adjacent words with a given sep
def join_from_onegrams(onegram_text, true_bigr_set=None, sep='-'):
    """Join adjacent tokens that form a known n-gram.

    The token list is scanned left to right; whenever two consecutive tokens,
    written as 'first-second', belong to ``true_bigr_set`` they are replaced
    by the single token 'first{sep}second' and the scan resumes after them.

    Parameters
    ----------
    onegram_text : sequence of str
        Tokens of one text.
    true_bigr_set : collection of str, optional
        Accepted n-grams, written with '-' between the two parts.
        Defaults to the notebook-level ``bigrams`` list, looked up when the
        function is called (so re-running the cell that defines ``bigrams``
        is enough to refresh it).
    sep : str, default '-'
        Separator used in the joined output token. The lookup in
        ``true_bigr_set`` always uses '-'.

    Returns
    -------
    list of str
        A new list of tokens; the input is never returned as is nor modified.
    """
    if true_bigr_set is None:
        true_bigr_set = bigrams
    tokens = list(onegram_text)
    res = []
    position = 0
    while position < len(tokens):
        has_next = position + 1 < len(tokens)
        if has_next and f'{tokens[position]}-{tokens[position + 1]}' in true_bigr_set:
            res.append(f'{tokens[position]}{sep}{tokens[position + 1]}')
            position += 2  # both tokens are consumed by the joined n-gram
        else:
            res.append(tokens[position])
            position += 1
    return res

In [None]:
# Two passes over each token list:
# - the inner call joins adjacent words listed in `bigrams` (the function's default set);
# - the outer call joins an already-joined bigram with the following word when the result
#   is listed in `trigrams` (e.g. 'rock-n' + 'roll' -> 'rock-n-roll')
df['cleaned_bigrams'] = (
    df.cleaned_lyrics.apply(lambda x: join_from_onegrams(join_from_onegrams(x), true_bigr_set=trigrams))
)
df

In [None]:
# Unique texts: get song lyrics unique words keeping the same order.
# dict.fromkeys keeps the first occurrence of each token in its original position;
# it replaces pd.unique, whose support for plain Python lists is deprecated in recent pandas.
df['cleaned_lyrics_unique'] = df['cleaned_lyrics'].apply(lambda x: list(dict.fromkeys(x)))
df['cleaned_bigrams_unique'] = df['cleaned_bigrams'].apply(lambda x: list(dict.fromkeys(x)))

### 1.3 Compute some statistics

In [None]:
# Vocabulary length: flatten the per-song lists of unique tokens, then de-duplicate across songs
tokens = [word for song_tokens in df.cleaned_bigrams_unique for word in song_tokens]
vocab = set(tokens)
len(vocab)

In [None]:
# `tokens` holds each token at most once per song (it is built from the per-song unique lists),
# so these counts are the number of songs in which each token appears
Counter(tokens).most_common()

In [None]:
# LEXICAL DIVERSITY
# Number of distinct tokens (bigrams included) found in each song
df['len_bigrams_unique'] = df['cleaned_bigrams_unique'].apply(len)
# This count is proportional to the normalised measure:
# df['lexical_diversity'] = df.cleaned_bigrams_unique.apply(lambda x: len(x))/len(vocab)

In [None]:
# Top 10 songs with the highest lexical diversity
# Sort the frame directly: feeding index labels to .iloc is only correct on a default 0..n-1 index
df.sort_values('len_bigrams_unique', ascending=False).head(10)

In [None]:
# LEXICAL DENSITY
# Average number of times a word is repeated within a song:
def average_repetitions(text):
    """Return the mean number of occurrences per distinct token.

    Parameters
    ----------
    text : iterable of hashable
        Tokens of one song.

    Returns
    -------
    numpy.float64
        Mean of the occurrence counts of the distinct tokens.
    """
    occurrences = Counter(text)
    return np.mean(list(occurrences.values()))
# One value per song, computed on the bigram-joined tokens
df['word_repetition'] = df['cleaned_bigrams'].apply(average_repetitions)

In [None]:
# Number of unique tokens per song, from the largest to the smallest
df.len_bigrams_unique.sort_values(ascending=False)

In [None]:
# Original lyrics length in terms of characters (will be useful in the next part).
# The .str accessor leaves missing lyrics as NaN, whereas len() would raise on them.
df['lyrics_char_len'] = df.lyrics.str.len()

### 1.4 Additional checks

In [None]:
# Explore bigrams with Phrases of Gensim
# Phrases expects a collection of tokenized texts: use the cleaned token lists, one per song
# (the previously referenced `cleaned_text` is not defined anywhere in this notebook)
bigram = Phrases(df.cleaned_lyrics.tolist(), min_count=10, threshold=10)

In [None]:
# Show the vocabulary entries containing '_' that were counted more than 10 times
for phrase, count in bigram.vocab.items():
    if count > 10 and '_' in phrase:
        print((phrase, count))

In [None]:
# Word-frequency matrix
def create_frequency_dict(text, vocabulary=None):
    """Count how many times each vocabulary token occurs in a text.

    Parameters
    ----------
    text : iterable of str
        Tokens of one text.
    vocabulary : iterable of str, optional
        Tokens to report. Defaults to the notebook-level ``vocab``.

    Returns
    -------
    dict
        ``{token: count}`` with one entry per vocabulary token
        (0 when the token is absent from the text).
    """
    if vocabulary is None:
        vocabulary = vocab
    # Count once per text instead of rebuilding the Counter for every vocabulary token
    counts = Counter(text)
    return {token: counts[token] for token in vocabulary}

In [None]:
# One frequency dict per song.
# Counts are taken on the bigram-joined tokens: `vocab` is built from them, so counting on the
# plain unigram lyrics would leave every joined token (e.g. 'mamma-mia') at zero.
all_text_dict = [create_frequency_dict(text) for text in df.cleaned_bigrams]

In [None]:
# Word-frequency matrix: one row per song (indexed by track), one column per vocabulary token
tf_matrix = pd.DataFrame(all_text_dict, index=df.track)
tf_matrix

## 2. Emotion-based model application

In [None]:
# Load the model
# return_all_scores=True: get the score of every emotion label for a text, not only the top one
# (the code below reads the result of a single text as prediction[0] -> list of {label, score})
classifier_bert = pipeline(
    "text-classification",
    model='bhadresh-savani/bert-base-uncased-emotion',
    return_all_scores=True)

In [None]:
# Sanity check: run the classifier on one sample sentence and inspect the raw output
sample_text = "I love using transformers. The best part is wide range of support and its easy to use"
sample_result = classifier_bert(sample_text)
sample_result

In [None]:
# Utility function called in the following one
def get_results(prediction):
    """Split the prediction of one text into labels and scores.

    Parameters
    ----------
    prediction : list of dict
        One dict per emotion, each with a 'label' and a 'score' entry.

    Returns
    -------
    tuple
        ``(labels_scores, labels, scores)`` where ``labels_scores`` is a list of
        ``[label, score]`` pairs and ``labels`` / ``scores`` are the two parallel lists.
    """
    # Read the entries by key rather than relying on the order of the dict values
    labels = [item['label'] for item in prediction]
    scores = [item['score'] for item in prediction]
    labels_scores = [[label, score] for label, score in zip(labels, scores)]
    return labels_scores, labels, scores

# Try the helper on the sample prediction (first element of the classifier output)
get_results(sample_result[0])

In [None]:
# Function to compute the emotion scores for all songs
def get_emotions(df, txt_col, txt_col_length, model=classifier_bert, max_length=1500):
    """Run the emotion classifier on the text column of every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Songs; must contain ``txt_col`` and a ``track`` column (used in the
        message printed for rows without text).
    txt_col : str
        Name of the column holding the text to classify.
    txt_col_length : str
        Not used by the function; kept so that existing calls keep working.
    model : callable
        Classifier called as ``model(text)``; its result is read as
        ``result[0]`` -> list of ``{'label', 'score'}`` dicts.
    max_length : int, default 1500
        Maximum number of characters of each text passed to the model.

    Returns
    -------
    tuple of three lists, one entry per row of ``df``
        ``(labels_scores, labels, scores)``. Rows whose text is not a string get
        NaN for the first two and a list of NaN scores as long as the label list
        of the classified rows (an empty list if no row could be classified).
    """
    emotion_labels_scores, emotion_labels, emotion_scores = [], [], []
    missing_positions = []
    n_labels = None
    for index, row in df.iterrows():
        text = row[txt_col]
        if isinstance(text, str):
            classifier_prediction = model(text[:max_length])
            labels_scores, labels, scores = get_results(classifier_prediction[0])
            if n_labels is None:
                n_labels = len(labels)
            emotion_labels_scores.append(labels_scores)
            emotion_labels.append(labels)
            emotion_scores.append(scores)
        else:
            print(f'index: {index} - track name: {row.track} (no text)')
            missing_positions.append(len(emotion_scores))
            emotion_labels_scores.append(np.nan)
            emotion_labels.append(np.nan)
            emotion_scores.append(None)  # filled below, once the number of labels is known
    # Size the NaN placeholders on the model output instead of a hard-coded label count
    for position in missing_positions:
        emotion_scores[position] = [np.nan] * (n_labels or 0)
    return emotion_labels_scores, emotion_labels, emotion_scores

In [None]:
# Score the raw lyrics of every song.
# results_full is a tuple of three lists (label/score pairs, labels, scores), one entry per song
results_full = get_emotions(
    df=df, 
    txt_col='lyrics', 
    txt_col_length='lyrics_char_len')

In [None]:
# One row per song, one column per emotion.
# Column names come from the first song that actually has a label list: songs without lyrics
# carry NaN there, so blindly reading song 0 would fail whenever the first song has no lyrics.
results = pd.DataFrame(
    results_full[2],
    columns=next(labels for labels in results_full[1] if isinstance(labels, list)),
    index=df.track)
results

In [None]:
# Add the max score and emotion
# Restrict both computations to the emotion columns, so that the summary columns added here
# never take part in the max/idxmax and the cell can be re-run safely
emotion_cols = [col for col in results.columns if col not in ('max_score', 'max_emotion')]
results['max_score'] = results[emotion_cols].max(axis=1)
results['max_emotion'] = results[emotion_cols].idxmax(axis=1)
results

In [None]:
# Attach the emotion scores to the song data; the track is exposed as 'track_id' (first column)
# and the raw lyrics are dropped.
# `results` has one row per row of `df`, in the same order, so the two frames are aligned by
# position: merging on the track index would multiply rows whenever a track name is repeated.
# `df` itself is left untouched, which keeps this cell re-runnable.
songs = df.drop(columns='lyrics').rename(columns={'track': 'track_id'})
songs = songs[['track_id'] + [col for col in songs.columns if col != 'track_id']]
df_texts_emotions = pd.concat(
    [songs.reset_index(drop=True), results.reset_index(drop=True)], axis=1)
df_texts_emotions

In [None]:
# Save the final table (song data plus emotion scores) to CSV.
# NOTE(review): with the default settings the row index is written as an extra unnamed first column - confirm this is intended
df_texts_emotions.to_csv('../data/df_abba_results.csv')