# Visualizing Years & Years with simple Data Analysis and NLP tools

https://github.com/rhnvrm/lyric-api
https://www.azlyrics.com/y/yearsyears.html

## Import Data

Some of the songs have no lyrics available and must be filtered out.

In [1]:
import pandas as pd
import urllib.parse
# Read the headerless song list into a frame with a single `songs` column.
raw = pd.read_csv("songlist.txt", header=None, names=['songs'])

# Endpoint prefix of the lyric API for the artist "years & years".
_API_PREFIX = 'http://lyric-api.herokuapp.com/api/find/years%20&%20years/'


def _lyrics_endpoint(title):
    """Return the lookup URL for *title* (lower-cased and percent-encoded)."""
    return _API_PREFIX + urllib.parse.quote(title.lower())


# One lookup URL per song, as a NumPy array.
lyrics_url = raw.songs.map(_lyrics_endpoint).values

In [2]:
import asyncio  
import aiohttp
import requests
import concurrent.futures
import json
import time
import re

## nest_asyncio is required to run an event loop inside Jupyter, which already runs a loop of its own.
## It is not needed when this code runs as a standalone Python script.
## For more details see https://markhneedham.com/blog/2019/05/10/jupyter-runtimeerror-this-event-loop-is-already-running/
import nest_asyncio
nest_asyncio.apply()

# Repetition marker such as "x3" or "x 3". The greedy first group makes the
# LAST marker on a line win, and `\b` keeps words that merely end in "x"
# (e.g. "box 3") from being mistaken for a marker.
_REPEAT_MARKER = re.compile(r"(^.*)\bx ?(\d+).*$")


def repeat_lines_multiplier(text):
    """Expand lyric lines that carry a repetition marker like ``x3``.

    Each line containing a marker is replaced by the text that precedes the
    marker, repeated as many times as the marker says. Lines without a marker
    are kept untouched. A marker of ``x0`` therefore drops the line.

    Args:
        text: Lyrics as a single string with ``\\n`` line separators.

    Returns:
        The expanded lyrics, joined again with ``\\n``.
    """
    expanded = []
    for line in text.split('\n'):
        match = _REPEAT_MARKER.search(line)
        if match is None:
            expanded.append(line)
        else:
            # group(1) is everything before the marker, group(2) the count.
            expanded.extend([match.group(1)] * int(match.group(2)))
    return '\n'.join(expanded)

async def get_lyrics(lyrics_url, timeout=30):
    """Download the lyrics behind every URL in *lyrics_url* concurrently.

    The blocking ``requests.get`` calls are handed to a thread pool and
    awaited together, so the total time is not the sum of all requests.

    Args:
        lyrics_url: Iterable of lyric-API URLs.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection raises instead of blocking forever.

    Returns:
        A list of strings in the same order as *lyrics_url*. An entry is the
        lyrics with repetition markers expanded, or ``''`` when the request
        did not return HTTP 200 or the API answered with its
        "not licensed" placeholder.
    """
    unlicensed_notice = "Unfortunately, we are not licensed to display the full lyrics for this song at the moment"

    def fetch(url):
        return requests.get(url, timeout=timeout)

    with concurrent.futures.ThreadPoolExecutor(max_workers=200) as executor:
        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(executor, fetch, url)
            for url in lyrics_url
        ]
        responses = await asyncio.gather(*futures)

    lyrics_list = []
    for response in responses:
        if response.status_code != 200:
            lyrics_list.append('')
            continue
        # Parse the body once and reuse it for both the check and the result.
        lyric = json.loads(response.text)['lyric']
        if unlicensed_notice in lyric:
            lyrics_list.append('')
        else:
            ## Expand lines marked with e.g. "x3" into n repeated lines
            lyrics_list.append(repeat_lines_multiplier(lyric))
    return lyrics_list

# Measure the download with a monotonic clock: unlike time.time(), it cannot
# jump when the system clock is adjusted.
start_time = time.perf_counter()
loop = asyncio.get_event_loop()
lyrics = loop.run_until_complete(get_lyrics(lyrics_url))
## The loop must stay open because the notebook itself is using it.
## Close it only when running this as a standalone script.
# loop.close()
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 2.7396931648254395 seconds ---


In [3]:
# Attach the downloaded lyrics as a "lyrics" column next to the song titles.
lyrics = pd.Series(lyrics, name="lyrics")
raw = pd.concat([raw, lyrics], axis=1)

"Play" is a song on which Years & Years are only the featured artist, so the URL built above finds no lyrics for it. Using the correct main artist works: http://lyric-api.herokuapp.com/api/find/jax%20jones/play. I add this one manually, as shown below.

In [4]:
# "Play" is looked up under its main artist (Jax Jones) instead.
# The timeout keeps a stalled connection from blocking the notebook forever.
play = requests.get('http://lyric-api.herokuapp.com/api/find/jax%20jones/play', timeout=30)
# Fail with a clear HTTP error rather than while parsing an error response.
play.raise_for_status()
raw.loc[raw.songs == 'Play', 'lyrics'] = repeat_lines_multiplier(json.loads(play.text)['lyric'])

Below we can see that some of the songs were still not found in the database; those will be left out of the analysis.

In [5]:
## Count the songs that came back with lyrics (True) and without (False).
## The pandas calls are chained on purpose: it keeps multi-step analyses
## readable and easy to debug. See
## https://towardsdatascience.com/the-unreasonable-effectiveness-of-method-chaining-in-pandas-15c2109e3c69
(
    raw
    .lyrics
    .apply(len)
    .gt(0)
    .value_counts()
)

True     36
False     8
Name: lyrics, dtype: int64

In [6]:
# Keep only the songs whose lyrics string is non-empty.
has_lyrics = raw.lyrics.apply(len) > 0
data = raw[has_lyrics]

## Frequentist Analysis (i.e. CountVectorizer)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Reference on stop-word lists:
# [NQY18] J. Nothman, H. Qin and R. Yurchak (2018). “Stop Word Lists in Free Open-source Software Packages”. In Proc. Workshop for NLP Open Source Software.
STOP_WORDS = ['to', 'it', 'the', 'and', 'oh', 'that', 'be', 're', 'are', 'for']

# Count every remaining word per song; X holds one row per lyric.
vectorizer = CountVectorizer(stop_words=STOP_WORDS)
X = vectorizer.fit_transform(data.lyrics)

In [8]:
# Song-by-word count table: one row per song, one column per vocabulary term.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer its replacement and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
freq = pd.DataFrame(data=X.toarray(), columns=feature_names, index=data.songs.values)
freq

Unnamed: 0,13,14,21,about,abused,accelerates,accidental,actin,admission,admit,...,wrong,wrote,ya,yeah,year,years,yesterday,you,young,your
Foundation,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,9,0,4
Real,0,0,0,0,0,0,0,0,0,0,...,2,0,0,1,0,0,0,55,0,0
Shine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,38,0,4
Take Shelter,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,33,0,2
Worship,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,43,0,8
Eyes Shut,0,0,0,0,0,0,0,0,0,0,...,2,0,0,2,0,0,0,10,0,3
Ties,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,44,0,2
King,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,9,0,5
Desire,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,26,0,5
Gold,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,7,0,0


In [132]:
from plotly import graph_objs as go
import plotly.offline as py
import numpy as np
py.init_notebook_mode(connected=True)

# The ten words with the highest total count over all songs, and for each of
# them the number of songs in which it occurs at least once.
top_words = freq.sum().nlargest(10)
x = top_words.index
freq_top_words = freq.loc[:, top_words.index.values].gt(0).sum()

# Bar labels read "<songs containing the word>/<number of songs>".
song_total = str(len(data))
labels = [str(count) + '/' + song_total for count in np.round(freq_top_words.values, 2)]

# All three series are plotted in reversed order.
bars = [
    go.Bar(
        name='# words',
        orientation='h',
        x=top_words[::-1],
        y=x[::-1],
        text=labels[::-1],
        textposition='auto',
    )
]
layout = go.Layout(
    title='Top 10 Words and the # of musics ocurrences',
    xaxis={'title': "# words occurences"},
)
fig = go.Figure(data=bars, layout=layout)

py.iplot(fig)

In [10]:
from plotly import graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

# For every word (column), the percentage of its total count that each song
# (row) contributes: each column of `freq` is divided by its own sum.
word_share = freq.div(freq.sum(axis=0), axis='columns').mul(100)

heatmap = [go.Heatmap(z=word_share, x=freq.columns, y=freq.index)]
fig = go.Figure(heatmap)
py.iplot(fig)

We then preprocess the lyrics by turning them to lower case and removing common punctuation. Then we split the lyrics into a vector of words and calculate the maximum length over the corpus. In my case the longest lyrics in the corpus is composed of 1159 words. We use this value to pad all the self-similarity matrices so that the output matrices have the same dimensions.

https://www.youtube.com/watch?v=HzzmqUoQobc

A self-similarity matrix is a matrix of correlations between every pair of elements of a vector. In our case the input vector is a lyric, and each word is treated as both a column and a row, in order. The diagonal of this matrix is always marked, as every word correlates with itself, but if a word appears elsewhere in the lyric the corresponding row and column are also marked. The output matrix is sparse and symmetric, and is padded in the lower-right corner to match the dimensions maxvalue x maxvalue.

The intuition behind using self-similarity matrices is that they capture the structure and sequence of the lyrics and point out repetitive blocks in the songs. These repetitive blocks might help determine the genre, as pop music tends to repeat the refrain many times with a lower variety of words than rap, for example.

In [11]:
# Tokenise every lyric: punctuation -> spaces, lower-case, split on whitespace.
# The pattern is a raw string and `regex=True` is explicit because pandas 2.0
# changed the default of `Series.str.replace` to a literal (non-regex) match.
preprocessed = (
    data.loc[:, "lyrics"]
    .str.replace(r'[\(\),:.!?]', ' ', regex=True)
    .str.lower()
    .str.split()
)
# Word count of the longest lyric (the size the matrices would be padded to).
maxvalue = preprocessed.apply(len).max()

matrices = []
for words in preprocessed:
    words = np.array(words)
    # Compute the self-similarity matrix: cell (r, c) is 1 when word r equals
    # word c. Plain broadcasting replaces `np.core.defchararray.equal`, which
    # lives in the deprecated `np.core` namespace.
    similarity = (words[:, np.newaxis] == words[np.newaxis, :]).astype(int)
    # Padding so that every input has the same size is currently disabled:
    #     pad = maxvalue - len(words)
    #     similarity = np.pad(similarity, ((0, pad), (0, pad)), mode='constant')
    matrices.append(similarity)

# The matrices have different sizes, so they are stored in an explicit object
# array; letting np.array() build a ragged array raises in NumPy >= 1.24.
mymatrix = np.empty(len(matrices), dtype=object)
for position, similarity in enumerate(matrices):
    mymatrix[position] = similarity
# With padding enabled the result could instead be reshaped to
# (len(data), maxvalue, maxvalue).

In [150]:
from plotly import graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

def plot_song_index(i):
    """Plot the self-similarity matrix of the song at position *i*.

    Args:
        i: Positional index used to look the song up in `mymatrix`,
            `preprocessed` and `data`.

    Returns:
        The plotly figure, which is also displayed inline.
    """
    words = preprocessed.values[i]
    heatmap = [
        go.Heatmap(
            z=mymatrix[i],
            # One copy of the word list per matrix row, so the hover text of a
            # cell is the word of its column. The row count follows the song
            # length; the previous hard-coded 171 rows can only match a lyric
            # of exactly 171 words.
            text=np.array([words] * len(words)),
            colorscale=[[0.0, "rgb(0,0,0)"], [1.0, "rgb(255,255,255)"]],
            showscale=False,
            hoverinfo='text'
        )
    ]
    layout = go.Layout(
        title=data.songs.values[i],
        font=dict(size=18, color='#000000'),
        width=900,
        height=900,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        showlegend=False,
        xaxis=dict(visible=False),
        # Reversed y axis so the first word is drawn in the top row.
        yaxis=dict(visible=False, autorange='reversed'),
        margin=go.layout.Margin(
            l=0,
            r=0,
            b=10,
            t=50,
            pad=2
        )
    )
    fig = go.Figure(data=heatmap, layout=layout)
    py.iplot(fig)
    return fig
    
# Plot the song at position 33 and keep the returned figure in `fig`.
fig = plot_song_index(33)

In [149]:
# Print the figure as an HTML fragment: no full-page wrapper (full_html=False)
# and without bundling plotly.js (include_plotlyjs=False).
print(fig.to_html(include_plotlyjs=False, full_html=False))

<div>
        
        
            <div id="13187444-b324-4538-9389-db185e9716bb" class="plotly-graph-div" style="height:100%; width:100%;"></div>
            <script type="text/javascript">
                
                    window.PLOTLYENV=window.PLOTLYENV || {};
                    
                if (document.getElementById("13187444-b324-4538-9389-db185e9716bb")) {
                    Plotly.newPlot(
                        '13187444-b324-4538-9389-db185e9716bb',
                        {"font": {"color": "#000000", "size": 18}, "margin": {"b": 10, "l": 0, "pad": 2, "r": 0, "t": 50}, "paper_bgcolor": "rgba(0,0,0,0)", "plot_bgcolor": "rgba(0,0,0,0)", "showlegend": false, "template": {"data": {"bar": [{"error_x": {"color": "#2a3f5f"}, "error_y": {"color": "#2a3f5f"}, "marker": {"line": {"color": "#E5ECF6", "width": 0.5}}, "type": "bar"}], "barpolar": [{"marker": {"line": {"color": "#E5ECF6", "width": 0.5}}, "type": "barpolar"}], "carpet": [{"aaxis": {"endlinecolor": "#2a3f5f", 

# Emotion Analysis

In [13]:
filepath = "NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
# Tab-separated (word, emotion, association) rows; the first 45 lines of the
# file are skipped.
emolex_df = pd.read_csv(filepath, names=["word", "emotion", "association"], skiprows=45, sep='\t')
# Number of words associated with each emotion. Only `association` is summed:
# since pandas 2.0 a bare groupby-sum no longer drops the text column `word`
# and would concatenate its strings instead.
emolex_df.loc[emolex_df.association == 1, :].groupby('emotion')[['association']].sum()

Unnamed: 0_level_0,association
emotion,Unnamed: 1_level_1
anger,1247
anticipation,839
disgust,1058
fear,1476
joy,689
negative,3324
positive,2312
sadness,1191
surprise,534
trust,1231


In [14]:
def my_feelings(x):
    """Total the emotion-associated word counts of one song.

    Args:
        x: One row of `freq`, i.e. the word counts of a single song indexed
            by word.

    Returns:
        A JSON string produced with ``orient='records'`` that holds, per
        emotion, the summed counts of the song's words associated with that
        emotion in `emolex_df`. The caller strips the enclosing brackets
        before parsing it.
    """
    j = (
        x
        .to_frame()
        # Keep only the words that actually occur in this song.
        .loc[x > 0, :]
        # Move the words out of the index into a column named 'index'.
        .reset_index()
        # Join with the lexicon rows whose association is 1, matching the
        # song's words against the lexicon's `word` column.
        .merge(
            emolex_df.loc[emolex_df.association == 1, :],
            left_on='index',
            right_on='word'
        )
        # Positional mask over the five merged columns: keeps the 2nd (the
        # word count) and the 4th (`emotion`).
        .loc[:, [False, True, False, True, False]]
        .groupby('emotion')
        # Transpose so that the emotions become the columns.
        .sum().T
        .reset_index(drop=True)
        .to_json(orient='records')
    )
    return j
# Song-by-emotion table: run `my_feelings` on every row of `freq`, strip the
# first and last character of the returned JSON (the enclosing brackets) so it
# parses as a single object, and fill emotions missing for a song with 0.
feelings = pd.DataFrame.from_records(freq.apply(lambda x : json.loads(my_feelings(x)[1:-1]), axis=1).values, index=freq.index).fillna(0)
feelings

Unnamed: 0,anger,anticipation,disgust,joy,negative,positive,sadness,surprise,trust,fear
Foundation,1.0,3.0,1.0,5.0,1,12,3.0,1.0,5,0.0
Real,2.0,6.0,3.0,16.0,11,25,6.0,3.0,10,5.0
Shine,6.0,10.0,0.0,11.0,12,26,6.0,6.0,12,13.0
Take Shelter,0.0,10.0,0.0,2.0,10,17,1.0,1.0,15,0.0
Worship,12.0,11.0,0.0,9.0,14,27,9.0,1.0,8,14.0
Eyes Shut,7.0,2.0,1.0,0.0,11,5,7.0,0.0,3,14.0
Ties,4.0,8.0,3.0,7.0,8,14,1.0,2.0,9,3.0
King,6.0,2.0,2.0,1.0,9,13,3.0,4.0,8,6.0
Desire,8.0,7.0,7.0,14.0,12,17,9.0,8.0,9,10.0
Gold,3.0,3.0,2.0,1.0,11,14,5.0,4.0,3,4.0


In [159]:
from plotly import graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objects as go

# Every column of `feelings` is plotted. This overrides the narrower
# eight-emotion list: 'anger', 'anticipation', 'disgust', 'joy', 'sadness',
# 'surprise', 'trust', 'fear'.
categories = feelings.columns

# Legend name and positional row range of each album.
album_rows = [
    ('Communion', slice(None, 17)),
    ('Palo Santo', slice(17, 33)),
]

fig = go.Figure()
for album_name, rows in album_rows:
    # Emotion totals of the album, expressed as a percentage of their sum.
    totals = feelings[rows].loc[:, categories].sum()
    fig.add_trace(go.Scatterpolar(
        r=(totals / totals.sum()) * 100,
        theta=categories,
        fill='toself',
        name=album_name,
    ))

fig.update_layout(
    title="Album comparison",
    polar={'radialaxis': {'visible': True}},
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=True,
)

fig.show()

In [163]:
# `clip` is otherwise only imported in a later cell; importing it here keeps
# the notebook working when it is run from top to bottom.
import pyperclip as clip

# Copy the figure's HTML fragment (no page wrapper, no bundled plotly.js).
clip.copy(fig.to_html(include_plotlyjs=False, full_html=False))

In [157]:
import pyperclip as clip

In [162]:
from plotly import graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objects as go

# Every column of `feelings` is plotted. This overrides the narrower
# eight-emotion list: 'anger', 'anticipation', 'disgust', 'joy', 'sadness',
# 'surprise', 'trust', 'fear'.
categories = feelings.columns


def my_radar(x):
    """Return the radar trace of one song.

    Args:
        x: One row of `feelings`; its name is used as the trace name.

    Returns:
        A ``go.Scatterpolar`` trace whose radii are the row's values divided
        by the row total.
    """
    share = x / x.sum()
    return go.Scatterpolar(r=share, theta=categories, fill='toself', name=x.name)


# One trace per song, all drawn on the same polar chart.
fig = go.Figure()
song_traces = feelings.loc[:, categories].apply(my_radar, axis=1).values.tolist()
fig.add_traces(song_traces)

fig.update_layout(
    title="Songs comparison",
    polar={'radialaxis': {'visible': True}},
    width=800,
    height=800,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=True,
)

fig.show()

In [111]:
# Summary statistics of every column of the song-by-emotion table.
feelings.describe()

Unnamed: 0,anger,anticipation,disgust,joy,negative,positive,sadness,surprise,trust,fear
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,5.25,8.444444,2.888889,9.972222,12.0,18.861111,6.305556,5.0,10.694444,8.583333
std,5.061761,8.337332,3.04985,8.907416,10.074012,11.774434,5.922449,6.936446,10.539231,9.607661
min,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,1.0,0.0
25%,2.0,3.0,1.0,3.0,5.0,10.0,2.75,1.0,4.0,3.0
50%,3.5,7.0,2.0,7.5,11.0,16.5,5.0,3.0,9.0,5.0
75%,7.0,10.25,4.0,15.25,14.0,26.25,8.25,5.25,12.0,10.25
max,22.0,36.0,16.0,34.0,55.0,51.0,30.0,33.0,51.0,44.0


In [None]:
# TODO: t-SNE or word cloud to group representative words by the sentiment that drives them

In [91]:
from sklearn.cluster import KMeans
import numpy as np
# Cluster the songs on these eight emotion columns only; the 'negative' and
# 'positive' columns are left out.
emotion_columns = ['anger', 'anticipation', 'disgust', 'joy', 'sadness', 'surprise', 'trust', 'fear']
X = feelings[emotion_columns].values

kmeans = KMeans(n_clusters=8, random_state=0)
kmeans.fit(X)
kmeans.labels_

array([0, 5, 5, 0, 5, 0, 0, 0, 5, 0, 0, 0, 0, 0, 6, 3, 5, 1, 3, 5, 7, 5,
       3, 5, 5, 5, 2, 0, 2, 4, 0, 3, 0, 0, 0, 0], dtype=int32)

In [49]:
from sklearn.manifold import TSNE
# Two-dimensional t-SNE embedding of the feature matrix X.
tsne = TSNE(n_components=2)
X_embedded = tsne.fit_transform(X)

In [92]:
from sklearn.decomposition import PCA
# Project the feature matrix X onto two PCA components. This rebinds
# `X_embedded`, replacing whatever an earlier cell stored under that name.
pca = PCA(n_components=2)
X_embedded = pca.fit_transform(X)

In [106]:
import plotly.express as px
# Scatter the songs in the two-dimensional embedding, coloured by the cluster
# label assigned by `kmeans`.
fig = go.Figure(data=go.Scatter(
    x=X_embedded[:, 0],
    y=X_embedded[:, 1],
    mode='markers',
    marker=dict(
        size=16,
        color=kmeans.labels_,  # set color equal to a variable
        colorscale='viridis',  # one of plotly colorscales
        showscale=False
    ),
    text=feelings.index
))

fig.update_layout(
    # The labels come from KMeans, not from a k-nearest-neighbours model, so
    # the title names K-Means rather than KNN.
    title="PCA + K-Means Clustering",
    width=1000,
    height=600,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(showticklabels=False),
    yaxis=dict(showticklabels=False),
    showlegend=False
)
fig.show()