# Visualizing Years & Years with simple Data Analysis and NLP tools

https://github.com/rhnvrm/lyric-api
https://www.azlyrics.com/y/yearsyears.html

## Import Data

Some of the songs don't have lyrics available and must be filtered out.

In [1]:
import pandas as pd
import urllib.parse
# Every song is looked up under the "years & years" artist path of the lyric API.
LYRICS_API_PREFIX = 'http://lyric-api.herokuapp.com/api/find/years%20&%20years/'


def _lyrics_endpoint(title):
    """Return the lyric-API URL for one song title (lower-cased, URL-quoted)."""
    return LYRICS_API_PREFIX + urllib.parse.quote(title.lower())


# songlist.txt has no header row: one song title per line, loaded as column `songs`.
raw = pd.read_csv("songlist.txt", header=None, names=['songs'])
lyrics_url = raw.songs.apply(_lyrics_endpoint).values

In [2]:
import asyncio  
import aiohttp
import requests
import concurrent.futures
import json
import time
import re

## Nest_Asyncio is necessary to run loops on jupyter. If running on separated python script it's not needed
## For more see https://markhneedham.com/blog/2019/05/10/jupyter-runtimeerror-this-event-loop-is-already-running/
import nest_asyncio
nest_asyncio.apply()

# A repeat marker is an ``x`` that does not continue a word, optionally followed
# by one space, then a number ("x3", "x 3", "(x3)"). The look-behind stops words
# that merely end in "x" ("Max 3 times") from being read as markers.
_REPEAT_MARKER = re.compile(r"^(.*)(?<!\w)x ?(\d+).*$")


def repeat_lines_multiplier(text):
    """Expand "x<N>" repeat markers into N literal copies of the line.

    Each line of ``text`` is inspected on its own. When a line carries a
    repeat marker, the text before the marker is emitted N times and the
    marker and anything after it are dropped. Lines without a marker are
    kept unchanged.

    Parameters
    ----------
    text : str
        Lyrics with lines separated by ``'\\n'``.

    Returns
    -------
    str
        The lyrics with every marked line expanded, joined by ``'\\n'``.
        A marker of ``x0`` removes the line altogether.
    """
    expanded = []
    for line in text.split('\n'):
        marker = _REPEAT_MARKER.search(line)
        if marker is None:
            expanded.append(line)
        else:
            # group(1) is the lyric text, group(2) the repetition count.
            expanded.extend([marker.group(1)] * int(marker.group(2)))
    return '\n'.join(expanded)

async def get_lyrics(lyrics_url):
    """Download and clean the lyrics behind every URL in ``lyrics_url``.

    The blocking ``requests.get`` calls are fanned out over a thread pool and
    awaited together, so the downloads overlap instead of running one by one.

    Parameters
    ----------
    lyrics_url : iterable of str
        Lyric-API URLs, one per song.

    Returns
    -------
    list of str
        One entry per URL, in the same order. The entry is the lyric text
        with "x<N>" repeat markers expanded, or ``''`` when the lyric could
        not be obtained (request failure, non-200 status, malformed payload,
        or the API's "not licensed" placeholder).
    """
    import functools

    unlicensed_notice = "Unfortunately, we are not licensed to display the full lyrics for this song at the moment"

    def clean_lyric(response):
        """Turn one gathered result (response or exception) into lyric text."""
        if isinstance(response, BaseException):
            # A network-level failure only costs that one song; anything else
            # (cancellation, programming errors) is re-raised untouched.
            if isinstance(response, requests.RequestException):
                return ''
            raise response
        if response.status_code != 200:
            return ''
        try:
            # Parse the payload once and reuse it for both checks below.
            lyric = json.loads(response.text)['lyric']
        except (ValueError, KeyError, TypeError):
            return ''
        if not isinstance(lyric, str) or unlicensed_notice in lyric:
            return ''
        # Expand "x3"-style markers so repeated lines are counted n times.
        return repeat_lines_multiplier(lyric)

    # Inside a coroutine the running loop is the right one to schedule on.
    loop = asyncio.get_running_loop()
    with concurrent.futures.ThreadPoolExecutor(max_workers=200) as executor:
        futures = [
            loop.run_in_executor(
                executor,
                # A timeout keeps one stalled connection from hanging the batch.
                functools.partial(requests.get, url, timeout=30),
            )
            for url in lyrics_url
        ]
        # return_exceptions=True: a failed download must not discard the rest.
        responses = await asyncio.gather(*futures, return_exceptions=True)
    return [clean_lyric(response) for response in responses]

# Time the whole download: every lyric is fetched by one get_lyrics() call
# driven to completion on the current event loop.
start_time = time.time()
loop = asyncio.get_event_loop()
lyrics = loop.run_until_complete(get_lyrics(lyrics_url))
## The loop must stay open because the notebook itself is running on one.
## Only close it when this code runs as a standalone script.
# loop.close()
print("--- %s seconds ---" % (time.time() - start_time))

--- 2.415391206741333 seconds ---


In [3]:
# Wrap the downloaded lyrics in a named Series and attach it to the song list
# as a new `lyrics` column (rows are matched on the index).
lyrics = pd.Series(lyrics, name="lyrics")
raw = pd.concat([raw, lyrics], axis=1)

I know that Play is a song on which Years & Years are only the featured artist, so the URL built above doesn't find any lyrics. If we use the correct main artist we get http://lyric-api.herokuapp.com/api/find/jax%20jones/play, which I can add as seen below.

In [4]:
# "Play" is filed under Jax Jones in the lyric API, so fetch it from that
# artist's path and patch it into the row for that song.
# The timeout and status check make a dead or failing endpoint surface as a
# clear HTTP error instead of hanging or raising an opaque KeyError below.
play = requests.get('http://lyric-api.herokuapp.com/api/find/jax%20jones/play', timeout=30)
play.raise_for_status()
raw.loc[raw.songs == 'Play', 'lyrics'] = repeat_lines_multiplier(json.loads(play.text)['lyric'])

Below we can still see that some of the songs weren't found in the database. We will disregard those.

In [5]:
## As you can see, I'm a huge fan of method chaining in pandas. Since reading
## this post https://towardsdatascience.com/the-unreasonable-effectiveness-of-method-chaining-in-pandas-15c2109e3c69
## I've used it for most of my analyses; it helps a lot when debugging and makes complex pipelines easy to build.
##
## Count how many songs have non-empty lyrics (True) versus empty ones (False).
(
    raw
    .lyrics
    .apply(lambda x : len(x) > 0)
    .value_counts()
)

True     35
False     9
Name: lyrics, dtype: int64

In [6]:
# Keep only the songs whose lyrics are a non-empty string.
has_lyrics = raw.lyrics.map(len) > 0
data = raw[has_lyrics]

## Frequentist Analysis (i.e. CountVectorizer)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Background on stop-word lists:
# [NQY18]	J. Nothman, H. Qin and R. Yurchak (2018). “Stop Word Lists in Free Open-source Software Packages”. In Proc. Workshop for NLP Open Source Software.
# A short hand-picked list is used here (presumably instead of a stock list — confirm).
stop_words = ['to', 'it', 'the', 'and', 'oh', 'that', 'be', 're', 'are', 'for']
vectorizer = CountVectorizer(stop_words=stop_words)
# X is the song-by-word count matrix: one row per lyric in `data`.
X = vectorizer.fit_transform(data.lyrics)

In [8]:
# Vocabulary in column order. get_feature_names() was removed in
# scikit-learn 1.2; fall back to it only on releases that predate
# get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense song-by-word count table: rows are song titles, columns are words.
freq = pd.DataFrame(data=X.toarray(), columns=feature_names, index=data.songs.values)
freq

Unnamed: 0,13,14,21,about,abused,accelerates,accidental,actin,admission,admit,...,wrong,wrote,ya,yeah,year,years,yesterday,you,young,your
Foundation,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,9,0,4
Real,0,0,0,0,0,0,0,0,0,0,...,2,0,0,1,0,0,0,55,0,0
Shine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,38,0,4
Take Shelter,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,33,0,2
Worship,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,43,0,8
Eyes Shut,0,0,0,0,0,0,0,0,0,0,...,2,0,0,2,0,0,0,10,0,3
Ties,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,44,0,2
King,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,9,0,5
Desire,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,26,0,5
Gold,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,7,0,0


In [9]:
from plotly import graph_objs as go
import plotly.offline as py
import numpy as np
py.init_notebook_mode(connected=True)

# The ten words with the highest total count over all songs.
top_words = freq.sum().nlargest(10)
x = top_words.index
# For each of those words, the number of songs it appears in at least once.
freq_top_words = freq.loc[:, top_words.index.values].gt(0).sum()

# Bar labels of the form "<songs containing the word>/<songs analysed>".
song_total = str(len(data))
bar_labels = ['/'.join((str(n_songs), song_total)) for n_songs in np.round(freq_top_words.values, 2)]

# Everything is reversed so the most frequent word ends up at the top of the
# horizontal bar chart.
bars = [
    go.Bar(
        name='# words',
        y=x[::-1],
        x=top_words[::-1],
        text=bar_labels[::-1],
        textposition='auto',
        orientation='h',
    )
]
layout = go.Layout(
    title='Top 10 Words and the # of musics ocurrences',
    xaxis=dict(title="# words occurences"),
)
fig = go.Figure(data=bars, layout=layout)

py.iplot(fig)

In [10]:
from plotly import graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

# For every word (column), the percentage of its total occurrences that falls
# in each song (row).
word_share = freq.div(freq.sum(axis=0), axis='columns') * 100

heatmap = [go.Heatmap(z=word_share, x=freq.columns, y=freq.index)]
fig = go.Figure(heatmap)
py.iplot(fig)

We then preprocess the lyrics, turning them to lower case and removing common punctuation. Then we split the lyrics into a vector of words and calculate the maximum length found in the corpus. In my case the longest lyric in the corpus is composed of 1159 words. We use this value to pad all the self-similarity matrices so that the output matrices have the same dimensions.

https://www.youtube.com/watch?v=HzzmqUoQobc

A self-similarity matrix is a matrix of correlations across the elements of a vector. In our case the input vector is a lyric, and each word is treated as both a column and a row, in order. The diagonal of this matrix is always marked, as every word correlates with itself, but if a word appears elsewhere in the lyric the corresponding row and column are also marked. The output matrix is sparse and symmetric, and is padded in the lower right corner to match the dimensions maxvalue x maxvalue.

The intuition behind using self-similarity matrices is that they capture the structure and sequence of the lyrics and point out repetitive blocks in the songs. These repetitive blocks might help determine the genre, as pop music readily repeats the refrain many times with a lower variance of words than rap, for example.

In [11]:
# Tokenise every lyric: replace common punctuation with spaces, lower-case,
# split on whitespace. regex=True is required — pandas >= 2.0 otherwise treats
# the character class as a literal string and strips nothing.
preprocessed = (
    data.loc[:, "lyrics"]
    .str.replace(r'[\(\),:.!?]', ' ', regex=True)
    .str.lower()
    .str.split()
)
# Word count of the longest lyric (the size the matrices would be padded to).
maxvalue = preprocessed.apply(len).max()

matrices = []
for words in preprocessed:
    words = np.array(words)
    # Self-similarity matrix: entry [i, j] is 1 when word i equals word j.
    g = (words[:, np.newaxis] == words[np.newaxis, :]) * 1
    # Padding every matrix to (maxvalue, maxvalue) is currently disabled, so
    # each matrix keeps the size of its own lyric:
#     g = np.pad(g, ((0, maxvalue - len(words)),) * 2, mode='constant')
    matrices.append(g)

# The matrices differ in size, so keep them in a 1-D object array filled one
# element at a time; np.array() on ragged input raises on NumPy >= 1.24.
mymatrix = np.empty(len(matrices), dtype=object)
for position, matrix in enumerate(matrices):
    mymatrix[position] = matrix
# mymatrix = mymatrix.reshape((len(data), maxvalue, maxvalue))

In [12]:
from plotly import graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

def plot_song_index(i):
    """Plot the self-similarity matrix of the ``i``-th song as a heatmap.

    Parameters
    ----------
    i : int
        Zero-based position of the song among the songs that have lyrics
        (the same position is used for ``mymatrix``, ``preprocessed`` and
        ``data``).
    """
    # ``preprocessed`` and ``data`` keep the index labels of the unfiltered
    # song list, so they must be addressed by position (.iloc) to stay in
    # step with ``mymatrix``, which is purely positional.
    words = preprocessed.iloc[i]
    heatmap = [
        go.Heatmap(
            z=mymatrix[i],
            # One row of hover labels per matrix row, so hovering a cell
            # shows the word of its column.
            text=np.array([words] * len(words)),
            colorscale=[[0.0, "rgb(0,0,0)"], [1.0, "rgb(255,255,255)"]],
            showscale=False,
            hoverinfo='text'
        )
    ]
    layout = go.Layout(
        title=data.songs.iloc[i],
        font=dict(size=18, color='#000000'),
        width=900,
        height=900,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        showlegend=False,
        xaxis=dict(visible=False),
        # Reversed y axis puts the first word at the top-left corner.
        yaxis=dict(visible=False, autorange='reversed'),
        margin=go.layout.Margin(
            l=0,
            r=0,
            b=10,
            t=50,
            pad=2
        )
    )
    fig = go.Figure(data=heatmap, layout=layout)
    py.iplot(fig)


plot_song_index(3)

# Emotion Analysis

In [11]:
# NRC word-emotion lexicon: tab-separated (word, emotion, association) rows.
# The first 45 lines are skipped (presumably the file's preamble — confirm).
filepath = "NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
emolex_df = pd.read_csv(filepath,  names=["word", "emotion", "association"], skiprows=45, sep='\t')
# Number of words associated with each emotion. Only the numeric column is
# summed: on pandas >= 2.0 an unrestricted sum() would also concatenate the
# `word` strings.
emolex_df.loc[emolex_df.association == 1, :].groupby('emotion')[['association']].sum()

Unnamed: 0_level_0,association
emotion,Unnamed: 1_level_1
anger,1247
anticipation,839
disgust,1058
fear,1476
joy,689
negative,3324
positive,2312
sadness,1191
surprise,534
trust,1231


In [12]:
def my_feelings(x):
    """Summarise one song's word counts as emotion totals.

    Parameters
    ----------
    x : pandas.Series
        Word counts of a single song, indexed by word (one row of ``freq``).

    Returns
    -------
    str
        JSON text of a one-element list holding an object that maps each
        emotion to the summed count of the song's words associated with it.
        Emotions with no matching word are absent from the object.
    """
    # Words that occur in the song, as explicit (word, count) columns so the
    # merge and the aggregation never depend on incidental column positions.
    present = x[x > 0].rename('count').rename_axis('word').reset_index()
    j = (
        present
        .merge(emolex_df.loc[emolex_df.association == 1, :], on='word')
        .groupby('emotion')['count']
        .sum()
        .to_frame().T
        .reset_index(drop=True)
        .to_json(orient='records')
    )
    return j


# One emotion dict per song; emotions a song never touches become 0.
feelings = pd.DataFrame.from_records(
    freq.apply(lambda x: json.loads(my_feelings(x))[0], axis=1).values,
    index=freq.index,
).fillna(0)
feelings

Unnamed: 0,anger,anticipation,disgust,joy,negative,positive,sadness,surprise,trust,fear
Foundation,1.0,3.0,1.0,5.0,1,12,3.0,1.0,5,0.0
Real,2.0,6.0,3.0,16.0,11,25,6.0,3.0,10,5.0
Shine,6.0,10.0,0.0,11.0,12,26,6.0,6.0,12,13.0
Take Shelter,0.0,10.0,0.0,2.0,10,17,1.0,1.0,15,0.0
Worship,12.0,11.0,0.0,9.0,14,27,9.0,1.0,8,14.0
Eyes Shut,7.0,2.0,1.0,0.0,11,5,7.0,0.0,3,14.0
Ties,4.0,8.0,3.0,7.0,8,14,1.0,2.0,9,3.0
King,6.0,2.0,2.0,1.0,9,13,3.0,4.0,8,6.0
Desire,8.0,7.0,7.0,14.0,12,17,9.0,8.0,9,10.0
Gold,3.0,3.0,2.0,1.0,11,14,5.0,4.0,3,4.0


In [13]:
from plotly import graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objects as go

# Every column of `feelings` is plotted. An explicit eight-emotion list
# (without 'negative'/'positive') was assigned first in the original cell but
# was overwritten immediately, so it never took effect.
categories = feelings.columns


def _share_of_total(rows):
    """Return each category's percentage of all counts in ``rows``."""
    totals = rows.loc[:, categories].sum()
    return (totals / totals.sum()) * 100


# Albums are taken as contiguous row ranges of `feelings`.
albums = (
    ('Communion', feelings[:17]),
    ('Palo Santo', feelings[17:33]),
)

fig = go.Figure()
for album_name, album_rows in albums:
    fig.add_trace(go.Scatterpolar(
        r=_share_of_total(album_rows),
        theta=categories,
        fill='toself',
        name=album_name,
    ))

fig.update_layout(
    title="Album comparison",
    polar=dict(radialaxis=dict(visible=True)),
    width=900,
    height=900,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=True,
)

fig.show()

In [14]:
from plotly import graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objects as go

# Every column of `feelings` is plotted. An explicit eight-emotion list was
# assigned first in the original cell but was overwritten immediately.
categories = feelings.columns

fig = go.Figure()


def my_radar(x):
    """Build the radar trace of one song.

    ``x`` is one row of ``feelings``; the radii are the row's counts divided
    by the row total, and the trace is named after the row label.
    """
    song_total = x.sum()
    return go.Scatterpolar(
        r=x / song_total,
        theta=categories,
        fill='toself',
        name=x.name,
    )


# One trace per song, in the row order of `feelings`.
song_traces = [my_radar(row) for _, row in feelings.loc[:, categories].iterrows()]
fig.add_traces(song_traces)

fig.update_layout(
    title="Album comparison",
    polar=dict(radialaxis=dict(visible=True)),
    width=900,
    height=900,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=True,
)

fig.show()

In [19]:
# Show the emotion table with the song title as a regular `songname` column
# instead of the index.
feelings.loc[:, categories].reset_index().rename(columns = {'index' : 'songname'})

Unnamed: 0,songname,anger,anticipation,disgust,joy,negative,positive,sadness,surprise,trust,fear
0,Foundation,1.0,3.0,1.0,5.0,1,12,3.0,1.0,5,0.0
1,Real,2.0,6.0,3.0,16.0,11,25,6.0,3.0,10,5.0
2,Shine,6.0,10.0,0.0,11.0,12,26,6.0,6.0,12,13.0
3,Take Shelter,0.0,10.0,0.0,2.0,10,17,1.0,1.0,15,0.0
4,Worship,12.0,11.0,0.0,9.0,14,27,9.0,1.0,8,14.0
5,Eyes Shut,7.0,2.0,1.0,0.0,11,5,7.0,0.0,3,14.0
6,Ties,4.0,8.0,3.0,7.0,8,14,1.0,2.0,9,3.0
7,King,6.0,2.0,2.0,1.0,9,13,3.0,4.0,8,6.0
8,Desire,8.0,7.0,7.0,14.0,12,17,9.0,8.0,9,10.0
9,Gold,3.0,3.0,2.0,1.0,11,14,5.0,4.0,3,4.0


In [33]:
import plotly.graph_objects as go

# Minimal Plotly animation example: a figure with fixed axes, a "Play" button
# wired to the `animate` method, and four frames that each draw a two-point
# line. The last frame also changes the title.
fig = go.Figure(
    data=[],
    layout=go.Layout(
        # Fixed ranges so the axes do not rescale between frames.
        xaxis=dict(range=[0, 5], autorange=False),
        yaxis=dict(range=[0, 5], autorange=False),
        title="Start Title",
        updatemenus=[dict(
            type="buttons",
            buttons=[dict(label="Play",
                          method="animate",
                          args=[None])])]
    ),
    frames=[go.Frame(data=[go.Scatter(x=[0, 1], y=[0, 1])]),
        go.Frame(data=[go.Scatter(x=[1, 2], y=[1, 2])]),
            go.Frame(data=[go.Scatter(x=[1, 4], y=[1, 4])]),
            go.Frame(data=[go.Scatter(x=[3, 4], y=[3, 4])],
                     layout=go.Layout(title_text="End Title"))]
)

fig.show()

In [40]:
# Emotion counts of a single song ("Foundation"), as a Series.
feelings.loc['Foundation', categories]

anger            1.0
anticipation     3.0
disgust          1.0
joy              5.0
negative         1.0
positive        12.0
sadness          3.0
surprise         1.0
trust            5.0
fear             0.0
Name: Foundation, dtype: float64

In [43]:
from plotly import graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objects as go

# Every column of `feelings` is plotted. An explicit eight-emotion list was
# assigned first in the original cell but was overwritten immediately.
categories = feelings.columns

fig = go.Figure()

def my_radar(x):
    """Build one animation frame holding the radar trace of a single song.

    ``x`` is one row of ``feelings``; the radii are the row's counts divided
    by the row total, and the trace is named after the row label.
    """
    return go.Frame(data=[go.Scatterpolar(
          r=x/x.sum(),
          theta=categories,
          fill='toself',
          name=x.name
    )])

# One frame per song, in the row order of `feelings`.
frames = feelings.loc[:, categories].apply(my_radar, axis=1).values.tolist()
# Initial trace shown before the animation starts: the song "Foundation".
fig.add_traces(go.Scatterpolar(
          r=feelings.loc['Foundation', categories]/feelings.loc['Foundation', categories].sum(),
          theta=categories,
          fill='toself'
    ))
fig.frames = frames
fig.update_layout(
    title="Album comparison",
  polar=dict(
    radialaxis=dict(
      visible=True,
      # Fixed radial range so the axis does not rescale between frames.
      range=[0, 0.3]
    )),
    width=900,
    height=900,
    updatemenus=[{"buttons": [
            {
                # redraw must be True here: only cartesian scatter traces can
                # be animated without a full redraw, so with redraw=False the
                # polar trace would not repaint from frame to frame.
                "args": [None, {"frame": {"duration": 500, "redraw": True},
                                "fromcurrent": True, "transition": {"duration": 300,
                                                                    "easing": "quadratic-in-out"}}],
                "label": "Play",
                "method": "animate"
            },
            {
                # Animating to [None] in immediate mode interrupts playback.
                "args": [[None], {"frame": {"duration": 0, "redraw": False},
                                  "mode": "immediate",
                                  "transition": {"duration": 0}}],
                "label": "Pause",
                "method": "animate"
            }
        ]}],
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
  showlegend=True
)

fig.show()

In [None]:
# `px` is not imported anywhere else in the notebook; without this import the
# cell fails with a NameError.
import plotly.express as px

# Placeholder call with no data.
px.scatter_polar()

In [None]:
# TODO: t-SNE or word cloud to group the representative words by the emotion that drives them

### Label Encoders

We encode the mood labels with the two most popular encoding techniques: one-hot encoding and label encoding.

#### One Hot-Encode

In [18]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras.utils import np_utils
# NOTE(review): `df` is not defined anywhere in the visible notebook (the
# recorded output of this cell is a NameError); it presumably was a frame with
# `genre` and `lyrics` columns from an earlier version — confirm before reuse.
#
# Map each genre label to an integer class id.
encoder = LabelEncoder()
encoder.fit(df.genre)
encoded_Y = encoder.transform(df.genre)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

Using TensorFlow backend.


NameError: name 'df' is not defined

#### Label Encoder

In [None]:
#http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
from sklearn import preprocessing
# Integer-encode the genre labels; `le` is kept so predictions can be mapped
# back to label names with inverse_transform.
# NOTE(review): `df` is not defined in the visible notebook — confirm its source.
le = preprocessing.LabelEncoder()
le.fit(df.genre)
# le.classes_
labeled_y = le.transform(df.genre) 
# le.inverse_transform(le.transform(df.Mood))

## Neural Networks

In this section we import Keras, to be used on top of TensorFlow, and define and train a few models. The architecture of these models is purely empirical. We take the self-similarity matrices as input and pass them through convolutional and max-pooling layers. The idea of the first model is to observe different hyperparameters on the convolutional layers. The second model is based on VGG-16.

In [None]:
import tensorflow as tf
import keras
from keras import layers
from keras.layers import Dense, Activation,Conv2D,MaxPooling2D,Flatten,Dropout, Input
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Model
from keras import regularizers

from keras.callbacks import TensorBoard
from time import time

import h5py

from IPython.display import SVG, display
from keras.utils.vis_utils import model_to_dot

In [None]:
#Based on https://github.com/fchollet/keras/blob/master/examples/babi_rnn.py
#Kernel_size outputs
#https://adeshpande3.github.io/A-Beginner%27s-Guide-To-Understanding-Convolutional-Neural-Networks-Part-2/
#https://keras.io/layers/convolutional/#conv2d
def model1():
    """Build the first CNN over (maxvalue, maxvalue, 1) self-similarity inputs.

    Three convolution + max-pooling stages are followed by a 100-unit sigmoid
    dense layer, 20% dropout and a softmax output with one unit per distinct
    value of ``df.genre``.

    Returns
    -------
    keras.models.Model
        The uncompiled model.
    """
    # (layer-name suffix, filters, kernel size, strides, pool size) per stage.
    stages = (
        ("1", 50, (9, 9), (2, 2), (5, 5)),
        ("2", 30, (5, 5), (2, 2), (3, 3)),
        ("3", 30, (3, 3), (1, 1), (1, 1)),
    )

    input_layer = Input(shape=(maxvalue, maxvalue, 1), name="input_layer", dtype='float32')
    network = input_layer
    for suffix, n_filters, kernel, stride, pool in stages:
        network = Conv2D(
            filters=n_filters,
            kernel_size=kernel,
            strides=stride,
            kernel_initializer='glorot_uniform',
            name="conv" + suffix,
        )(network)
        network = MaxPooling2D(pool_size=pool, name="max_pool" + suffix)(network)

    # Classification head.
    network = Flatten()(network)
    network = Dense(100, activation='sigmoid', name="dense")(network)
    network = Dropout(0.2)(network)
    n_classes = len(df.genre.unique())
    network = Dense(n_classes, activation='softmax', name="mood_output")(network)
    return Model(input_layer, network)


model = model1()
model.summary()

In [None]:
# Drop the reference to the current model.
# NOTE(review): the next cell calls model.compile(); run top to bottom, that
# fails with a NameError unless the model-building cell is run again first.
del model

In [None]:
# Train the first CNN on the self-similarity matrices against the one-hot
# genre labels, holding out 5% of the samples for validation.
# NOTE(review): the cell just above runs `del model`, so `model` must be
# rebuilt before this cell; `mymatrix` is also expected to match the model's
# (maxvalue, maxvalue, 1) input shape — confirm.
# model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(mymatrix, dummy_y, batch_size=20, epochs=5, validation_split=0.05)
# model.save('./models/model'+str(i+1)+'_'+colorName(j)+'_'+str(epoch)+'.h5')
# epoch += 10

In [None]:
def build_model2():
    """Build the VGG-style CNN over (maxvalue, maxvalue, 1) inputs.

    Seven blocks of 3x3 convolutions (two convolutions in blocks 0-1, three
    in blocks 2-6), each closed by 2x2 max pooling, feed three 200-unit
    sigmoid dense layers with 20% dropout and a softmax output with one unit
    per distinct value of ``df.genre``.

    Returns
    -------
    keras.models.Model
        The uncompiled model.
    """
    input_layer = Input(shape=(maxvalue, maxvalue, 1),
        name="input_layer",
        dtype='float32')
    network = input_layer

    # Number of stacked convolutions in each block, indexed by block number.
    convs_per_block = (2, 2, 3, 3, 3, 3, 3)
    for block, n_convs in enumerate(convs_per_block):
        for conv in range(1, n_convs + 1):
            network = Conv2D(filters=30,
                kernel_size=(3,3),
                kernel_initializer='glorot_uniform',
                name="conv_{}_{}".format(block, conv))(network)
        network = MaxPooling2D(pool_size=(2,2), name="max_pool_{}".format(block))(network)

    # Classification head.
    network = Flatten()(network)
    for dense in range(1, 4):
        network = Dense(200, activation='sigmoid', name="dense{}".format(dense))(network)
        network = Dropout(0.2)(network)
    network = Dense(len(df.genre.unique()), activation='softmax', name="mood_output")(network)
    model = Model(input_layer, network)
    return model


# The builder has its own name so that assigning the result to `model2` does
# not overwrite the function; the cell can therefore be re-run.
model2 = build_model2()
# model2.summary()

In [None]:
# The network ends in a softmax and is trained on one-hot targets, so use
# categorical cross-entropy, as the first model does. Mean squared error is
# poorly matched to softmax outputs.
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# tensorboard = TensorBoard(log_dir="logs/lyricsCorrelation2")
model2.fit(mymatrix, dummy_y, batch_size=3, epochs=5, validation_split=0.05)
# model.save('./models/model'+str(i+1)+'_'+colorName(j)+'_'+str(epoch)+'.h5')
# epoch += 10

It's easy to observe that neither model learned much, as the accuracy of both is close to a random guess (given that there are 4 available moods).

## word2vec

We then observe the accuracy of the model if, instead of using the self-similarity matrices, we use a doc2vec model in which each lyric is interpreted as a document.

In [None]:
#https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb
#https://radimrehurek.com/gensim/models/doc2vec.html
#https://medium.com/scaleabout/a-gentle-introduction-to-doc2vec-db3e8c0cce5e
import gensim
from gensim.models import Doc2Vec
# model = Doc2Vec(matrices, vector_size=100, window=8, min_count=5, workers=4)
# One tagged document per lyric: tokens from gensim's simple_preprocess,
# tagged with the song's genre.
word_tagged = [
    gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(df.loc[i, "lyrics"]),
        tags=[df.loc[i, "genre"]],
    )
    for i in df.index
]

# vector_size is 50 because the later cells reshape the inferred vectors to
# (len(df), 50, 1, 1) and the dense model takes a (50, 1, 1) input; with the
# previous value of 20 that reshape cannot succeed.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=5)
model.build_vocab(word_tagged)

In [None]:
# Train the doc2vec model on the tagged lyrics and report the wall time
# (IPython `%time` magic). Example count and epochs come from the model itself.
%time model.train(word_tagged, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Infer one document vector per lyric. infer_vector() expects a list of
# tokens, not a raw string, so each lyric is tokenised with the same
# simple_preprocess that was used to build the training documents.
infered_vectors = [
    model.infer_vector(gensim.utils.simple_preprocess(df.loc[i, "lyrics"]))
    for i in df.index
]

Then the output of the doc2vec is used as input for a fully connected network

In [None]:
#Based on https://github.com/fchollet/keras/blob/master/examples/babi_rnn.py
#Kernel_size outputs
#https://adeshpande3.github.io/A-Beginner%27s-Guide-To-Understanding-Convolutional-Neural-Networks-Part-2/
def build_model3():
    """Build the fully connected classifier over doc2vec vectors.

    The (50, 1, 1) input is flattened and passed through three 200-unit
    sigmoid dense layers with 20% dropout, ending in a softmax output with
    one unit per distinct value of ``df.genre``.

    Returns
    -------
    keras.models.Model
        The uncompiled model.
    """
    input_layer = Input(shape=(50, 1, 1),
        name="input_layer",
        dtype='float32')
    network = Flatten()(input_layer)
    for dense in range(1, 4):
        network = Dense(200, activation='sigmoid', name="dense{}".format(dense))(network)
        network = Dropout(0.2)(network)
    network = Dense(len(df.genre.unique()), activation='softmax', name="mood_output")(network)
    model = Model(input_layer, network)
    return model


# The builder has its own name so that assigning the result to `model3` does
# not overwrite the function; the cell can therefore be re-run.
model3 = build_model3()
# model3.summary()

In [None]:
# Stack the inferred vectors and add two singleton axes so each sample has
# the (50, 1, 1) shape the dense model takes as input.
my_infered_vector = np.reshape(np.array(infered_vectors), (len(df), 50, 1, 1))

In [None]:
# The network ends in a softmax and is trained on one-hot targets, so use
# categorical cross-entropy, as the first model does. Mean squared error is
# poorly matched to softmax outputs.
model3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model3.fit(my_infered_vector, dummy_y, batch_size=500, epochs=100, validation_split=0.05)

Then again, the model didn't present good results, as they are also only a little better than a random guess.

## SVM
We finally try an SVM to observe its accuracy in classifying our doc2vec vectors.

In [None]:
from sklearn import svm
# Fit a support-vector classifier (default settings) on the doc2vec vectors
# against the integer-encoded genre labels.
clf = svm.SVC()
clf.fit(np.array(infered_vectors), labeled_y) 

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the SVM, with predictions mapped back to genre names.
# NOTE(review): the predictions are made on the same vectors the classifier
# was fitted on, so this is training accuracy, not a held-out estimate.
accuracy_score(df.genre, le.inverse_transform(clf.predict(infered_vectors)))

This gives us a final result a little better than that of the networks.