In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

df = pd.read_csv('preprocessed_lyrics.csv')
df = df.groupby('Genre', group_keys=False).apply(lambda s: s.sample(5000, random_state=42)) #
df = df[df['Lyric'].notna()]


In [2]:
df

Unnamed: 0,SName,Lyric,Genre,Artist,lyric_length_processed
2157,Family Affair,refrain let crunkupon gon fun dancery ya open ...,Hip Hop,Mary J. Blige,244
3187,Ransom (ft. Lil' Wayne),ransom yeah Drizzy Baby know time high time sm...,Hip Hop,Drake,426
9164,Close To Me,Cent unstoppable incredible impeccable unit wh...,Hip Hop,G-Unit,242
2145,Zone,uhh yea uh uh uh alright alright love bump bad...,Hip Hop,Drake,164
6442,"Why You Up In Here (feat. Ludacris, Git Fresh ...",Flo Rida Gucci bird buy ciroc lil mama jock st...,Hip Hop,Flo Rida,199
...,...,...,...,...,...
31738,What do You Need?,need tonight feel look right pretend right may...,Rock,Goo Goo Dolls,67
31425,Rebel Heart,Stewart Golub Kentis Rojas pick quarter brothe...,Rock,Rod Stewart,181
35407,Before The Dawn,meet dark hold maybe tonight fly far away lose...,Rock,Evanescence,43
35281,Spanish is the Loving Tongue,break heart lose soul Adios mi cora sole spani...,Rock,Bob Dylan,65


In [3]:

possible_labels = df.Genre.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

df['label'] = df.Genre.replace(label_dict)

In [4]:
import numpy as np
label_encoder = preprocessing.LabelEncoder()
labels = label_encoder.fit_transform(df.Genre.values)


In [5]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(df.Lyric.values, 
                                                df.label.values, 
                                                random_state=42, 
                                                test_size=0.1, 
                                                shuffle=True)

In [6]:
xtrain

array(['crazy relieve time beg sweet relief blessing sky die tired eye loose sleep come tonight hand miracle hand miracle leave hand miracle way let away survive go alright lucky alive vision blind search way right sight hand miracle hand miracle leave hand miracle way let away hand miracle hand miracle leave hand miracle way let away',
       'East New York oh god yeah gangsta gangsta gully gully yeah big business Joe crack don Terror Squad baby BX boro holdin death nothin realer hear uh huh verse like prove somethin everytime stop block set shop try somethin talkin kilo pound fuck desert eagle shit spit round tell scar neck spar good Joey boombay ay hit hard leave sharp right know bother retarded man ya know squadron like let die slow death probably collectin food deadin ya crew tell ya truth stoppin like lil lease b street man poppin street shit d watchin shift east glock fifth leave chump frame right standin daughter slaughter maim pay ransom chorus t e r r o r squad nigga right ni

In [7]:
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=500)
text_ds = tf.data.Dataset.from_tensor_slices(xtrain).batch(128)
vectorizer.adapt(text_ds)

In [8]:
vectorizer.get_vocabulary()[:5]


['', '[UNK]', 'know', 'like', 'love']

In [9]:
output = vectorizer([["the cat sat on the mat"]])
output.numpy()[0, :6]

array([10275,   609,     1,     1, 10275,  5409])

In [10]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [12]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove*.zip


In [13]:
from tqdm import tqdm

embedding_index = {}
f = open('glove.6B.300d.txt',encoding='utf8')
for line in tqdm(f):
  word, coefs = line.split(maxsplit=1)
  coefs = np.fromstring(coefs, "f", sep=" ")
  embedding_index[word] = coefs

print("Found %s word vectors." % len(embedding_index))

400000it [00:17, 22255.33it/s]

Found 400000 word vectors.





In [14]:
num_tokens = len(voc) + 2
embedding_dim = 300
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 16627 words (3373 misses)


In [15]:
from tensorflow.keras.layers import Embedding
import keras
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=True,
)


In [16]:
from tensorflow.keras import layers
from keras.layers import Input, Flatten

int_sequences_input = keras.Input(shape=(500,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
flat_emb = Flatten()(embedded_sequences)

#x = layers.Conv1D(128, 5, activation="relu")(flat_emb)
#x = layers.GlobalMaxPooling1D()(x)
x = tf.keras.layers.Dense(1024, activation='relu')(flat_emb)
preds = tf.keras.layers.Dense(3, activation='softmax', name='outputs')(x)
model = keras.Model(int_sequences_input, preds)
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 500)]             0         
                                                                 
 embedding (Embedding)       (None, 500, 300)          6000600   
                                                                 
 flatten (Flatten)           (None, 150000)            0         
                                                                 
 dense (Dense)               (None, 1024)              153601024 
                                                                 
 outputs (Dense)             (None, 3)                 3075      
                                                                 
Total params: 159,604,699
Trainable params: 159,604,699
Non-trainable params: 0
_________________________________________________________________


In [17]:
from keras.utils import np_utils

ytrain_encode = np_utils.to_categorical(ytrain)
ytest_encode = np_utils.to_categorical(ytest)

In [18]:
x_train = vectorizer(np.array([[s] for s in xtrain])).numpy()
x_test = vectorizer(np.array([[s] for s in xtest])).numpy()

y_train = np.array(ytrain)
y_test = np.array(ytest)

In [19]:
x_train.shape

(13499, 500)

In [20]:
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["acc"]
)
model.fit(x_train, y_train, batch_size=128, epochs=2,validation_data=(x_test, y_test))
y_pred=model.predict(x_test)

Epoch 1/2
Epoch 2/2


Large diff between acc on training vs testing (might be overfit)?

In [21]:
from sklearn.metrics import classification_report

y_pred_bool = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_bool))

              precision    recall  f1-score   support

           0       0.88      0.83      0.85       498
           1       0.64      0.40      0.49       499
           2       0.59      0.84      0.69       503

    accuracy                           0.69      1500
   macro avg       0.70      0.69      0.68      1500
weighted avg       0.70      0.69      0.68      1500

