In [17]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import numpy as np

In [18]:
import pandas as pd


def _sample_genre(genre_rows):
    """Return 3333 rows drawn from one genre's rows with a fixed seed."""
    return genre_rows.sample(3333, random_state=42)


# Read the lyrics table, then draw the same number of rows from every genre
# so that all classes are equally represented.
df = pd.read_csv('preprocessed_lyrics.csv')
df = df.groupby('Genre', group_keys=False).apply(_sample_genre)




In [19]:
# Encode the genre names as integer class ids; the fitted encoder is kept in
# `label_encoder`.
label_encoder = preprocessing.LabelEncoder()
labels = label_encoder.fit_transform(df['Genre'].values)
# Display the distinct class ids that were produced.
np.unique(labels)

array([0, 1, 2])

In [20]:
# Hold out 10% of the lyrics for testing; the split is shuffled with a fixed
# seed and stratified on the genre labels.
xtrain, xtest, ytrain, ytest = train_test_split(
    df['Lyric'].values,
    labels,
    test_size=0.1,
    shuffle=True,
    random_state=42,
    stratify=labels,
)

In [21]:
# Load the GloVe vectors into a dict: embeddings_index[word] is a float32
# array built from every number that follows the word on its line.
embeddings_index = {}
# The context manager closes the file even when a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # Blank line: there is no word to index, so skip it instead of
            # failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

400001it [00:10, 38733.62it/s]

Found 400001 word vectors.





## Defining hyperparameters

In [38]:
# VOCABULARY_SIZE is passed to the tokenizer as `num_words`;
# MAX_LENGTH is the `maxlen` every sequence is padded to.
VOCABULARY_SIZE, MAX_LENGTH = 30000, 500

In [39]:
# The tokenizer's vocabulary is fitted on the train and test lyrics together.
tokenizer = Tokenizer(num_words=VOCABULARY_SIZE)
tokenizer.fit_on_texts([*xtrain, *xtest])

Turning each lyric into a sequence of integer token ids

In [40]:
# Convert both splits with the same fitted tokenizer (train first, then test).
xtrain_sequence, xtest_sequence = (
    tokenizer.texts_to_sequences(texts) for texts in (xtrain, xtest)
)

In [41]:
# Bring every sequence to exactly MAX_LENGTH entries.
xtrain_padding, xtest_padding = (
    sequence.pad_sequences(seqs, maxlen=MAX_LENGTH)
    for seqs in (xtrain_sequence, xtest_sequence)
)
print(xtest_padding.shape)
# word -> integer mapping learned by the tokenizer; used to build the
# embedding matrix.
word_index = tokenizer.word_index


(1000, 500)


In [42]:
# One row per value in word_index plus one extra row; rows for words that
# have no entry in embeddings_index stay all-zero.
# NOTE(review): the extra row is presumably for index 0, which Keras reserves
# for padding - confirm against the Tokenizer docs.
n_rows = len(word_index) + 1
embedding_matrix = np.zeros((n_rows, 100))
print(embedding_matrix.shape)
for token, row in tqdm(word_index.items()):
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]

100%|██████████| 29633/29633 [00:00<00:00, 508330.31it/s]

(29634, 100)





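- `embedding_matrix` holds the same GloVe vectors as `embeddings_index`, but in matrix form: row `i` is the vector of the word whose `word_index` value is `i` (about 29,600 rows; rows for words without a GloVe vector stay all-zero).
- `word_index` maps every word that appears in the lyrics to an integer.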

## Bidirectional LSTM model with GloVe embeddings and two hidden dense layers

We could raise the dropout rates to reduce overfitting further, or lower them if the model underfits.

In [None]:
# Layer stack, listed in the order the data flows through it.
model = Sequential([
    # Embedding initialised from embedding_matrix and kept frozen
    # (trainable=False).
    Embedding(
        len(word_index) + 1,
        100,
        weights=[embedding_matrix],
        input_length=MAX_LENGTH,
        trainable=False,
    ),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(100, dropout=0.3, recurrent_dropout=0.3)),
    # Two hidden dense layers, each followed by dropout.
    Dense(1024, activation='relu'),
    Dropout(0.8),
    Dense(1024, activation='relu'),
    Dropout(0.8),
    # Three output units with softmax, one per class id.
    Dense(3),
    Activation('softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)












### One-hot encode the labels for the neural network

In [None]:
# NOTE(review): every other cell reads the genre from the 'Genre' column;
# confirm that the CSV really has a 'label' column.
y = list(df['label'])
x = list(df['Lyric'])

# Fit a second label encoder on the raw label list.
le = preprocessing.LabelEncoder().fit(y)

def encode(le, labels):
    """Turn raw labels into one-hot rows.

    Args:
        le: fitted encoder whose ``transform`` maps labels to integer ids.
        labels: the raw labels to encode.

    Returns:
        The result of ``np_utils.to_categorical`` applied to the integer ids.
    """
    enc = le.transform(labels)
    # Use the imported `np_utils` helper: the bare name `keras` is never
    # imported in this notebook, so `keras.utils...` raised NameError.
    return np_utils.to_categorical(enc)

def decode(le, one_hot):
    """Map rows of per-class scores back to the original labels.

    Args:
        le: fitted encoder whose ``inverse_transform`` maps integer ids back
            to labels.
        one_hot: 2-D array-like with one row per sample; the position of the
            largest value in each row is taken as the class id.

    Returns:
        Whatever ``le.inverse_transform`` returns for the class ids.
    """
    # The debug print of the whole input was removed: it flooded the cell
    # output on every call.
    dec = np.argmax(one_hot, axis=1)
    return le.inverse_transform(dec)


# The texts are not transformed here: x_enc is another name for the same list.
x_enc = x
# y_enc is the result of encode() applied to the raw labels.
y_enc = encode(le, y)

In [None]:
# One-hot encode the integer class ids of both splits (train first, then test).
ytrain_encode, ytest_encode = (
    np_utils.to_categorical(split_labels) for split_labels in (ytrain, ytest)
)

In [None]:
# Train the compiled model and predict on the held-out test sequences.
#
# The earlier version of this cell did not parse (both calls were missing
# their closing parenthesis) and wrapped everything in a TF1 session using
# `tf` and `K`, which are never imported in this notebook. The model is
# trained here directly, without an explicit session.
history = model.fit(xtrain_padding,
                    y=ytrain_encode,
                    batch_size=512,
                    epochs=1,
                    verbose=1,
                    # Hold out the last 10% of the training arrays so that
                    # `history` also records the `val_*` metrics that the
                    # plotting cell reads.
                    validation_split=0.1)
# `x_test` was never defined; the padded test sequences are `xtest_padding`.
predicts = model.predict(xtest_padding, batch_size=16)

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

# `predicts` comes from model.predict; the position of the largest value in
# each row is taken as the predicted class id. The true class ids of the test
# split are in `ytest` (`y_test` and `y_preds` were never defined).
y_preds = np.argmax(predicts, axis=1)

print(metrics.confusion_matrix(ytest, y_preds))

print(metrics.classification_report(ytest, y_preds))

# The message used to say "ELMO", but the model evaluated here is the
# bidirectional LSTM on GloVe embeddings.
print("Accuracy of the GloVe BiLSTM is:", accuracy_score(ytest, y_preds))

In [None]:
def graph_plots(history, string):
    """Plot one metric from a training history, one point per epoch.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value lists (the object returned by ``model.fit``).
        string: name of the metric to plot, e.g. "accuracy" or "loss".

    Raises:
        KeyError: if ``string`` is not a recorded metric.
    """
    recorded = history.history
    if string not in recorded:
        raise KeyError(
            "metric %r not found in history; available: %s"
            % (string, sorted(recorded))
        )

    plt.plot(recorded[string])
    legend = [string]

    # The validation curve is drawn only when the history contains it.
    val_key = 'val_' + string
    if val_key in recorded:
        plt.plot(recorded[val_key])
        legend.append(val_key)

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(legend)
    plt.show()


# Plain ASCII quotes: the typographic quotes used before made this cell a
# SyntaxError.
graph_plots(history, "accuracy")
graph_plots(history, "loss")