In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

def _sample_genre(group):
    """Return a seeded random sample of 5000 rows from one genre group."""
    # No `replace` argument is passed, so this relies on pandas' default
    # sampling mode; presumably each genre needs at least 5000 rows — confirm.
    return group.sample(5000, random_state=42)


# Load the raw lyrics table, then keep 5000 randomly chosen rows per genre.
df = pd.read_csv('unprocessed_lyrics.csv')
df = df.groupby('Genre', group_keys=False).apply(_sample_genre)


In [None]:
# Display the sampled frame (the rendering is truncated to the first and last rows).
df

Unnamed: 0,SName,Lyric,Genre,Artist,lyric_length
2157,Family Affair,Refrain:. Let's get it crunkupon. We gon' have...,Hip Hop,Mary J. Blige,567
3187,Ransom (ft. Lil' Wayne),"Ransom,. . Yeah,. its Drizzy Baby. you already...",Hip Hop,Drake,967
9164,Close To Me,"T.O.S.. (50 Cent). Unstoppable, incredible, im...",Hip Hop,G-Unit,557
2145,Zone,"uhh, yea. uh uh uh. alright, well alright. . i...",Hip Hop,Drake,400
6442,"Why You Up In Here (feat. Ludacris, Git Fresh ...",Flo-Rida. Gucci!. Bird!. I done bought all thi...,Hip Hop,Flo Rida,524
...,...,...,...,...,...
31738,What do You Need?,What do you need from me tonight?. I feel you ...,Rock,Goo Goo Dolls,197
31425,Rebel Heart,"(R. Stewart, J. Golub, C. Kentis, C. Rojas). I...",Rock,Rod Stewart,412
35407,Before The Dawn,Meet me after dark again and I'll hold you. I ...,Rock,Evanescence,114
35281,Spanish is the Loving Tongue,"Broke my heart, lost my soul. Adios,mi cora so...",Rock,Bob Dylan,126


In [None]:

# Assign every genre an integer id, numbered in order of first appearance.
possible_labels = df.Genre.unique()
label_dict = {genre: index for index, genre in enumerate(possible_labels)}

# Series.map performs a plain dictionary lookup and yields an integer column.
# Series.replace was used here before; swapping strings for ints that way
# depends on pandas silently downcasting the object column, a behaviour that
# recent pandas versions deprecate.
df['label'] = df.Genre.map(label_dict)

In [None]:
import numpy as np
# Alternative genre encoding via scikit-learn: fit the encoder on the Genre
# column, then transform the same values into integer codes.
# NOTE(review): `labels` is not read by any later cell shown here (the
# train/test split uses df.label) — confirm whether this cell is still needed.
label_encoder = preprocessing.LabelEncoder().fit(df.Genre.values)
labels = label_encoder.transform(df.Genre.values)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the sampled rows as a test set (shuffled with a fixed seed).
xtrain, xtest, ytrain, ytest = train_test_split(
    df['Lyric'].values,
    df['label'].values,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [None]:
# Inspect the raw training lyrics (array of strings).
xtrain

array(["When you won't go along. Some will feel like you can't find your way. Heaven knows. Heaven knows I've seen it all before. Now don't get me wrong. I've seen many a life gone astray. When someone goes. All your bows won't make it any better so let it go. CHORUS. Shut up and take it like a man. You need us to get a life. For your own good we'll take you by the hand. 'Cause you need a little more. . Maybe no one will say it. Maybe no one's aware it goes on. All I know. All I know it's like going off the deep end. Could you make a decision?. Could you think for yourself and go on?. Everyone knows. Everyone knows you've got to fit into the mainstream so save your woes. . CHORUS. Close your eyes. Look away now. Make believe while you can. So you'd best go along. Don't let on if you can't find your way. Many more. Many more do it for you. Because heaven knows. Heaven knows it's like playing with fire. Your life's a throw. . CHORUS.",
       "East New York!! oh god!!. Yeah, got that gan

In [None]:
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf
# Text-to-integer-id layer: vocabulary capped at 20000 entries, output
# sequences fixed at length 500.
vectorizer = TextVectorization(
    output_sequence_length=500,
    max_tokens=20000,
)
# Build the vocabulary from the training lyrics only, fed in batches of 128.
text_ds = tf.data.Dataset.from_tensor_slices(xtrain).batch(128)
vectorizer.adapt(text_ds)

In [None]:
# Peek at the first five vocabulary entries.
vectorizer.get_vocabulary()[:5]


['', '[UNK]', 'the', 'i', 'you']

In [None]:
# Sanity check: vectorize one sample sentence and show its first six ids.
output = vectorizer([["the cat sat on the mat"]])
output.numpy()[0][:6]

array([   2, 1290, 1726,   14,    2, 7317])

In [None]:
# Map each vocabulary token to its position in the vocabulary list.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [None]:
# Look the same six words up by hand in word_index to compare with the
# ids returned by the vectorizer.
test = "the cat sat on the mat".split()
[word_index[token] for token in test]

[2, 1290, 1726, 14, 2, 7317]

In [None]:
# One-time setup (uncomment to run): download and unpack the GloVe 6B archive.
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip


In [None]:
from tqdm import tqdm

# Parse the GloVe text file into a {word: vector} lookup. Each line is split
# once into the word and the remaining space-separated coefficients.
embedding_index = {}
# The context manager closes the file handle once parsing finishes; the
# handle was previously opened and never closed.
with open('glove.6B.300d.txt', encoding='utf8') as f:
    for line in tqdm(f):
        word, coefs = line.split(maxsplit=1)
        # "f" is the dtype code passed to numpy for the parsed coefficients.
        coefs = np.fromstring(coefs, "f", sep=" ")
        embedding_index[word] = coefs

print("Found %s word vectors." % len(embedding_index))

400000it [00:19, 20950.44it/s]

Found 400000 word vectors.





In [None]:
# The matrix has two more rows than the vocabulary has entries; the loop
# below only writes rows whose index appears in word_index.
num_tokens = len(voc) + 2
embedding_dim = 300
hits = 0
misses = 0

# Build the embedding matrix: row i receives the GloVe vector of the token
# whose id is i. Rows start as zeros, so any token missing from the GloVe
# index (this includes the "padding" and "OOV" entries) stays all-zero.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 17108 words (2892 misses)


In [None]:
from tensorflow.keras.layers import Embedding
import keras
# Embedding layer initialised from the GloVe matrix. trainable=True leaves the
# embedding weights trainable rather than frozen.
# The initializer is taken from tf.keras, the same namespace the Embedding
# class is imported from, instead of the standalone `keras` package, so the
# layer and its initializer cannot come from two different installations.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=True,
)


In [None]:
from tensorflow.keras import layers
from keras.layers import Input, Flatten

# Feed-forward classifier over the flattened sequence of word embeddings.
# All pieces are taken from tf.keras (the namespace the embedding layer comes
# from) rather than mixing in the standalone `keras` package.
# The input length (500) must equal the vectorizer's output_sequence_length.
int_sequences_input = tf.keras.Input(shape=(500,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
flat_emb = tf.keras.layers.Flatten()(embedded_sequences)

x = tf.keras.layers.Dense(1024, activation='relu')(flat_emb)
# One softmax unit per genre; the width follows label_dict instead of a
# hard-coded class count.
preds = tf.keras.layers.Dense(
    len(label_dict), activation='softmax', name='outputs'
)(x)
model = tf.keras.Model(int_sequences_input, preds)
model.summary()


Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 500)]             0         
                                                                 
 embedding_3 (Embedding)     (None, 500, 300)          6000600   
                                                                 
 flatten_3 (Flatten)         (None, 150000)            0         
                                                                 
 dense_7 (Dense)             (None, 1024)              153601024 
                                                                 
 outputs (Dense)             (None, 3)                 3075      
                                                                 
Total params: 159,604,699
Trainable params: 159,604,699
Non-trainable params: 0
_________________________________________________________________


In [None]:
# One-hot encode the integer genre ids.
# to_categorical is taken from tf.keras.utils: the `keras.utils.np_utils`
# module imported here previously cannot be imported from recent Keras
# releases.
# NOTE(review): the fit call in this notebook trains on the integer labels
# with sparse_categorical_crossentropy, so these arrays appear unused —
# confirm whether this cell is still needed.
ytrain_encode = tf.keras.utils.to_categorical(ytrain)
ytest_encode = tf.keras.utils.to_categorical(ytest)

In [None]:
def _to_token_ids(texts):
    """Run the fitted vectorizer over `texts` and return the ids as a NumPy array."""
    # Each text is wrapped in its own one-element row before vectorizing.
    column = np.array([[text] for text in texts])
    return vectorizer(column).numpy()


x_train = _to_token_ids(xtrain)
x_test = _to_token_ids(xtest)

# Integer class ids, copied into fresh arrays.
y_train = np.array(ytrain)
y_test = np.array(ytest)

In [None]:
# Shape check: one row per training lyric, one column per token position.
x_train.shape

(13500, 500)

In [None]:
# Targets are integer class ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)
# NOTE(review): (x_test, y_test) is passed as validation data here and is also
# the set the predictions below are made on — confirm this is intended.
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=10,
    batch_size=128,
)
# Model outputs for every held-out lyric.
y_pred = model.predict(x_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


There is a large gap between training accuracy and test accuracy, which suggests the model may be overfitting.

In [None]:
from sklearn.metrics import classification_report

# Pick the highest-scoring class per row, then print per-class metrics.
y_pred_bool = y_pred.argmax(axis=1)
print(classification_report(y_test, y_pred_bool))

              precision    recall  f1-score   support

           0       0.87      0.79      0.83       516
           1       0.57      0.59      0.58       503
           2       0.63      0.67      0.65       481

    accuracy                           0.68      1500
   macro avg       0.69      0.68      0.68      1500
weighted avg       0.69      0.68      0.69      1500

