## Imports

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix

## Read/Transform Data

In [None]:
# Load the pre-cleaned lyrics dataset (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='cleaned-data.csv')

In [None]:
# Shuffle the rows and renumber the index from 0.
# A fixed seed keeps the row order identical between runs; without it the
# seeded train/test split performed later would still see differently ordered
# data on every run.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
df.head()

In [None]:
# Keep only songs whose genre is exactly "Pop" or exactly "Rap".
# Multi-genre entries (labels joined with "; ") can never equal either label,
# so one exact membership test excludes them as well. isin() is False for
# missing values, so rows without a genre are dropped rather than raising.
indexNames = df.index[~df['Genres'].isin(["Pop", "Rap"])].tolist()  # labels of rows to discard
df = df.drop(indexNames)

In [None]:
# Pull the model input (lyrics text) and the target (genre label) out of the
# frame as NumPy arrays.
x, y = (np.array(df[column]) for column in ('Lyric', 'Genres'))

In [None]:
# Number of distinct genre labels remaining in the frame.
len({genre for genre in df['Genres']})

In [None]:
# Bar chart of how many songs fall into each genre.
df['Genres'].value_counts().plot(kind='bar')

In [None]:
# --- Lyrics -> fixed-length integer sequences -------------------------------
# Word-level tokenizer for the lyrics. It is kept under its own name so the
# fitted vocabulary is still available later (e.g. to encode new lyrics).
lyric_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=200000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, split=' ', char_level=False, oov_token=None,
)

# Build the word index from the lyrics.
lyric_tokenizer.fit_on_texts(x)

# Replace each lyric with its list of integer word ids.
x = lyric_tokenizer.texts_to_sequences(x)

# Pad or truncate every sequence to exactly 250 ids.
x = pad_sequences(x, maxlen=250)

# --- Genre labels -> integer class ids --------------------------------------
# A second tokenizer is used purely as a label encoder for the genre strings.
label_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=100000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, split=' ', char_level=False, oov_token=None,
)

# Build the label index from the genre strings.
label_tokenizer.fit_on_texts(y)

# Each genre becomes a one-element list holding its id.
y = label_tokenizer.texts_to_sequences(y)

# Tokenizer ids start at 1; shift to zero-based class ids so they suit a
# sigmoid / binary cross-entropy output (0/1 when only two genres remain).
y = np.array([label_ids[0] - 1 for label_ids in y])

# Backward compatibility: `tokenizer` has always ended this cell bound to the
# label tokenizer, so keep that binding for any later code that relies on it.
tokenizer = label_tokenizer

In [None]:
def LSTM_Model(vocab_size=200000, features=500, input_length=250, learning_rate=0.00001, dropout=0, lstm_units=200):
    """Build and compile a binary text classifier: Embedding -> LSTM -> Dense.

    Args:
        vocab_size: Size of the embedding vocabulary (number of distinct
            token ids the Embedding layer accepts).
        features: Dimension of each embedding vector.
        input_length: Length of the input sequences, passed through to the
            Embedding layer.
        learning_rate: Learning rate for the Adam optimizer.
        dropout: Dropout rate passed to the LSTM layer.
        lstm_units: Number of units in the LSTM layer. Defaults to 200.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output, using
        binary cross-entropy loss and reporting accuracy.
    """
    model = Sequential()
    # Embedding layer: token ids -> dense vectors of size `features`.
    # NOTE(review): newer Keras releases deprecate Embedding's `input_length`
    # argument - confirm against the installed version before upgrading.
    model.add(Embedding(vocab_size, features, input_length=input_length))
    # Long Short Term Memory layer
    model.add(LSTM(lstm_units, dropout=dropout))
    # Output layer: one sigmoid unit for the two-class decision.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(
        loss=keras.losses.binary_crossentropy,
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return model

In [None]:
# Hold out 20% of the samples for validation, using a fixed seed for the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Build the model with its default hyper-parameters and train it, validating
# on the held-out split after every epoch.
model = LSTM_Model()
res = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=20,
    batch_size=64,
)