In [1]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import numpy as np

In [2]:
import pandas as pd
df = pd.read_csv('processed_lyrics.csv')


subset = df.sample(n=1000)

subset

Unnamed: 0,SName,Lyric,Genre,Artist,lyric_length
76674,Naked,You got a girl. That doesn't look a thing like...,Pop,Enrique Iglesias,443
15016,Chemical Prisoner,I walk a fine line between coping and insanity...,Rock,Falling In Reverse,200
82431,Broken,I wear the red shoes with the holes. And to re...,Pop,Katy Perry,381
92219,Look What You Made Me Do,I don't like your little games. Don't like you...,Pop,Taylor Swift,517
1955,Something's Gotta Give,I woke up in a strangers bed. With pins and ne...,Rock,All Time Low,231
...,...,...,...,...,...
33411,Nude,Don't get any big ideas. They're not gonna hap...,Rock,Radiohead,64
81697,The London Bridge Song,"Joni's lighthearted take on ""London Bridge"". w...",Pop,Joni Mitchell,358
40963,Pilgrim,"I am just a pilgrim on this road, boys. This a...",Rock,Steve Earle,92
58966,Camera Ready,Its the crazy mane. wit the crazy change. got ...,Hip Hop,Gucci Mane,437


In [3]:
label_encoder = preprocessing.LabelEncoder()
labels = label_encoder.fit_transform(subset.Genre.values)
np.unique(labels)

array([0, 1, 2])

In [4]:
xtrain, xtest, ytrain, ytest = train_test_split(subset.Lyric.values, 
                                                labels, 
                                                stratify=labels, 
                                                random_state=42, 
                                                test_size=0.1, shuffle=True)

In [5]:
embeddings_index = {}
f = open('glove/glove.6B.100d.txt',encoding='utf8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

400001it [00:07, 54549.03it/s]

Found 400001 word vectors.





## Defining hyperparameters

In [6]:
print('hello')

hello


In [7]:
VOCABULARY_SIZE = 2000
MAX_LENGTH = 60

In [8]:
tokenizer = Tokenizer(num_words=VOCABULARY_SIZE)
tokenizer.fit_on_texts(list(xtrain) + list(xtest))

Turning tokens into lists of sequences

In [9]:
xtrain_sequence = tokenizer.texts_to_sequences(xtrain)
xtest_sequence = tokenizer.texts_to_sequences(xtest)

In [10]:
xtrain_padding = sequence.pad_sequences(xtrain_sequence, maxlen=MAX_LENGTH)
xtest_padding = sequence.pad_sequences(xtest_sequence, maxlen=MAX_LENGTH)
word_index = tokenizer.word_index




In [11]:
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

100%|████████████████████████████████████████████████████████| 13284/13284 [00:00<00:00, 546417.84it/s]


## LSTM model with glove embeddings and two dense layers.

We could change dropout percentage to further avoid/less avoid overfitting

In [14]:
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                    100,
                    weights=[embedding_matrix],
                    input_length=MAX_LENGTH,
                    trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(100, dropout=0.3, recurrent_dropout=0.3)))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))
model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

2022-01-03 20:29:59.958872: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### binarize the labels for the neural network

In [15]:
ytrain_encode = np_utils.to_categorical(ytrain)
ytest_encode = np_utils.to_categorical(ytest)

In [16]:
history = model.fit(xtrain_padding, 
                    y=ytrain_encode, 
                    batch_size=512, 
                    epochs=1, 
                    verbose=1, 
                    validation_data=(xtest_padding, ytest_encode))



In [None]:
def graph_plots(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history[‘val_’+string])
    plt.xlabel(“Epochs”)
    plt.ylabel(string)
    plt.legend([string, ‘val_’+string])
    plt.show()

graph_plots(history, “accuracy”)
graph_plots(history, “loss”)