In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv1D, Dense, Embedding,GlobalMaxPooling1D, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix



In [2]:
# Load the dataset; later cells read its 'Lyric' and 'Genres' columns.
DATA_PATH = 'cleaned-data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Keep only the songs tagged with exactly one genre. Multi-genre entries are a
# single string joined by "; ", so "contains the separator" is the same test as
# `len(Genres.split("; ")) > 1`.
# A vectorised `.str.contains` replaces the slow row-by-row `iterrows()` loop:
# `regex=False` matches the separator literally, and `na=False` means a missing
# genre is simply not counted as multi-genre instead of raising.
has_multiple_genres = df['Genres'].str.contains("; ", regex=False, na=False)
indexNames = df.index[has_multiple_genres]
df = df.drop(indexNames)

In [None]:
# Pull the raw model input (lyrics text) and target (genre name) columns out
# of the frame as NumPy arrays.
x, y = (np.array(df[column]) for column in ('Lyric', 'Genres'))

In [None]:
# Bar chart of how many songs carry each genre label.
genre_counts = df['Genres'].value_counts()
genre_counts.plot.bar()

# Despite its name, `loss_weights` holds the number of distinct genres; it is
# handed to CNN_Model when the model is built.
distinct_genres = set(df["Genres"])
loss_weights = len(distinct_genres)
print(distinct_genres)

In [None]:
# Both encodings below read straight from `df` rather than from the current
# values of `x`/`y`, so the cell produces the same result when it is re-run
# (it overwrites `x` and `y` with their encoded forms).

# Every lyric is padded to this many token ids; it must match the
# `input_length` the model is built with (CNN_Model defaults to 250).
MAX_SEQUENCE_LENGTH = 250

# --- Lyrics -> padded integer sequences ------------------------------------
# Word-level tokenizer for the lyrics, limited to the most frequent words.
# It keeps the name `tokenizer` and is no longer overwritten by the label
# tokenizer, so the lyrics vocabulary stays available (e.g. to encode new text).
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=200000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, split=' ', char_level=False, oov_token=None,
    #document_count=0, **kwargs
)
lyric_texts = np.array(df['Lyric'])

# Build the lyrics vocabulary.
tokenizer.fit_on_texts(lyric_texts)

# Turn each lyric into a sequence of integer word ids.
x = tokenizer.texts_to_sequences(lyric_texts)

# Pad the sequences to a common length of MAX_SEQUENCE_LENGTH (250) integers.
x = pad_sequences(x, maxlen = MAX_SEQUENCE_LENGTH)

# --- Genres -> two-column one-hot labels -----------------------------------
# A second, separate tokenizer is fitted on the genre strings only.
label_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=100000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, split=' ', char_level=False, oov_token=None,
    #document_count=0, **kwargs
)
genre_texts = np.array(df['Genres'])

# Build the genre vocabulary; Keras gives id 1 to the most frequent token.
label_tokenizer.fit_on_texts(genre_texts)

# Turn each genre string into a sequence of integer token ids.
genre_sequences = label_tokenizer.texts_to_sequences(genre_texts)

# Binary one-hot target: [1, 0] when the genre's first token is the most
# frequent one (id 1), [0, 1] for every other genre.
# NOTE(review): a genre made up solely of filtered characters yields an empty
# sequence and `temp[0]` raises IndexError -- confirm no such genre exists.
y = np.array([[1, 0] if temp[0]==1 else [0, 1] for temp in genre_sequences])

In [None]:
def CNN_Model(loss_weights, vocab_size=120000, features=50, input_length=250, learning_rate=0.001, kernel_size=8):
    """Build and compile the 1-D convolutional text classifier.

    Architecture: Embedding -> Conv1D(128 filters, ReLU) -> GlobalMaxPooling1D
    -> Dense(128, ReLU) -> Dropout(0.25) -> Dense(2, sigmoid).

    Args:
        loss_weights: Forwarded unchanged to ``model.compile(loss_weights=...)``.
        vocab_size: Number of rows in the embedding table (``input_dim``).
        features: Width of each embedding vector.
        input_length: Number of token ids per input sequence.
        learning_rate: Learning rate given to the Adam optimizer.
        kernel_size: Width of the convolution window.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, accuracy
        metric).
    """
    layer_stack = [
        # Token ids -> dense vectors.
        Embedding(vocab_size, features, input_length=input_length),
        # Convolution over the sequence axis.
        Conv1D(128, kernel_size, strides=1, activation='relu'),
        # Collapse the sequence axis by keeping each filter's maximum.
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(.25),
        # Output layer: one sigmoid unit per class.
        Dense(2, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)

    adam = keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(
        loss=keras.losses.binary_crossentropy,
        optimizer=adam,
        metrics=['accuracy'],
        loss_weights=loss_weights,
    )
    return network

In [None]:
# Hold out 20% of the songs for validation; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Size the embedding from the data instead of relying on CNN_Model's defaults.
# The lyrics tokenizer can emit ids up to its num_words cap (200000), which is
# above the default vocab_size of 120000; an id outside the embedding table is
# an invalid lookup. The table needs (largest id + 1) rows.
vocab_size = int(x.max()) + 1
model = CNN_Model(loss_weights, vocab_size=vocab_size, input_length=x.shape[1])
print(y_test.shape)
print(y_train.shape)

# Train for 20 epochs, scoring the held-out split after every epoch.
res = model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=20,batch_size=64)


In [None]:
def plot_accuracy_comparison(accs, title, legend):
    """Draw one accuracy-per-epoch curve for every series in `accs` on one figure.

    Args:
        accs: Sequence of per-epoch value sequences. The length of the first
            series sets the x-axis, so every series is expected to have that
            same length.
        title: Text shown above the plot.
        legend: Curve labels, in the same order as `accs`.
    """
    n_epochs = len(accs[0])
    epoch_axis = range(1, n_epochs + 1)  # epochs are numbered starting at 1

    plt.figure(figsize=(10, 5))
    for series in accs:
        plt.plot(epoch_axis, series)

    # One tick per epoch.
    plt.xticks(epoch_axis)
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title(title)
    plt.legend(legend)
    plt.show()


In [None]:
def plot_loss_comparison(losses, title, legend):
    """Draw one loss-per-epoch curve for every series in `losses` on one figure.

    Args:
        losses: Sequence of per-epoch value sequences. The length of the first
            series sets the x-axis, so every series is expected to have that
            same length.
        title: Text shown above the plot.
        legend: Curve labels, in the same order as `losses`.
    """
    n_epochs = len(losses[0])
    epoch_axis = range(1, n_epochs + 1)  # epochs are numbered starting at 1

    plt.figure(figsize=(10, 5))
    for series in losses:
        plt.plot(epoch_axis, series)

    # One tick per epoch.
    plt.xticks(epoch_axis)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title(title)
    plt.legend(legend)
    plt.show()

In [None]:
def plot_confusion_matrix(y_test, y_pred):
    """Show the confusion matrix of true vs. predicted classes as an image.

    Args:
        y_test: Ground-truth classes.
        y_pred: Predicted classes.

    Either argument may be a 1-D sequence of class labels or a 2-D array with
    one column per class (one-hot targets or per-class scores). Multi-column
    input is reduced to class indices with argmax over the last axis, because
    sklearn's ``confusion_matrix`` does not accept one-hot input.
    """
    def _class_ids(values):
        # Only collapse genuine multi-column input; 1-D labels (and single
        # column arrays) are passed through untouched.
        arr = np.asarray(values)
        if arr.ndim > 1 and arr.shape[-1] > 1:
            return arr.argmax(axis=-1)
        return values

    plt.matshow(confusion_matrix(_class_ids(y_test), _class_ids(y_pred)))
    # sklearn puts the true classes on the rows (y-axis) and the predicted
    # classes on the columns (x-axis).
    plt.xlabel("Predicted Category", fontsize=14)
    plt.ylabel("True Category", fontsize=14)
    plt.title("Confusion Matrix", fontsize=14)
    plt.show()

In [None]:
# Training vs. validation accuracy per epoch, as recorded by model.fit.
history = res.history
plot_accuracy_comparison(
    [history["accuracy"], history["val_accuracy"]],
    "Training/Validation Accuracy Comparison",
    ["Training Accuracy", "Validation Accuracy"],
)


In [None]:
# Training vs. validation loss per epoch. Uses the loss plotter and loss legend
# labels so the y-axis reads "Loss" (the accuracy helper labelled these curves
# as accuracy).
plot_loss_comparison([res.history["loss"], res.history["val_loss"]],
                     "Training/Validation Loss Comparison",
                     ["Training Loss", "Validation Loss"])

In [None]:
# Put predictions and ground truth in the same format before comparing them:
# the model emits one score per class and y_test is one-hot, so argmax over the
# last axis turns both into class indices (0 or 1).
y_pred = np.argmax(model.predict(x_test), axis=-1)
y_true = np.argmax(y_test, axis=-1)
print(y_pred)
print(y_true)

# Per-class precision/recall/F1, followed by the confusion matrix.
print(classification_report(y_true, y_pred))
plot_confusion_matrix(y_true, y_pred)