## **Restart the kernel if an error shows up, since reusing variables will cause problems here**

In [1]:
import re  # regex used to strip non-letter characters from lyrics

import numpy as np
import pandas as pd  # dataset loading
import nltk
from nltk.stem import WordNetLemmatizer

# Make sure the NLTK resources used below are available locally.
for resource in ('wordnet', 'stopwords'):
    nltk.download(resource)

from nltk.corpus import stopwords  # stopword list used when cleaning lyrics

# Shared lemmatizer instance used by the preprocessing cell.
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/nicktehrany/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nicktehrany/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the song dataset and discard the columns that are not used below.
df = pd.read_csv("dataset.csv").drop(columns=['Singer', 'Date', 'Tags'])

In [3]:
def get_index(word, word_list):
    """Return the position of the first entry in ``word_list`` equal to ``word``.

    Args:
        word: value to look for.
        word_list: iterable of candidate values, scanned left to right.

    Returns:
        The zero-based index of the first match, or ``None`` when no entry
        compares equal to ``word``.
    """
    matches = (position for position, candidate in enumerate(word_list) if word == candidate)
    return next(matches, None)

## Creating a word list of all possible words from all Song Lyrics

In [4]:
# Clean the first N_SONGS lyrics and collect the vocabulary.
#
# Per song: replace every non-letter with a space, lowercase, split on
# whitespace, drop English stopwords, and lemmatize the remaining words as verbs.
# Results:
#   corpus / lyrics - list of cleaned lyric strings (each word followed by a space)
#   word_list       - every distinct cleaned word, in first-seen order
#   df['Lyrics']    - rows 0..N_SONGS-1 overwritten with the cleaned string
N_SONGS = 1000  # number of rows (labels 0..N_SONGS-1) that get preprocessed

lyrics = df['Lyrics']
sw = stopwords.words("english")
stopword_set = set(sw)  # built once instead of once per word
corpus = []
word_list = []
seen_words = set()  # mirrors word_list for O(1) membership checks
for i in range(0, N_SONGS):
    text = re.sub('[^a-zA-Z]', ' ', str(lyrics[i]))  # removes special characters
    text = text.lower().split()  # lowercases everything, then splits into words
    text = [wordnet_lemmatizer.lemmatize(word, pos="v") for word in text if word not in stopword_set]
    for word in text:
        if word not in seen_words:
            seen_words.add(word)
            word_list.append(word)
    # Same format as before: every word is followed by a single space.
    formatted_text = "".join(word + " " for word in text)
    corpus.append(formatted_text)
    # Single .loc write: chained `df['Lyrics'][i] = ...` may assign to a temporary copy.
    df.loc[i, 'Lyrics'] = formatted_text
lyrics = corpus

### Function that creates a multi-hot (word-presence) vector for the given text from the word_list

In [5]:
from tensorflow.keras.utils import to_categorical

def text_vectorizer(text, word_list):
    """Encode ``text`` as a binary word-presence vector over ``word_list``.

    Args:
        text: whitespace-separated string of words.
        word_list: vocabulary; position ``k`` of the result corresponds to
            ``word_list[k]``. If a word appears more than once in the
            vocabulary, its first position is used.

    Returns:
        A 1-D float ``numpy`` array of length ``len(word_list)`` holding 1 at
        the position of every vocabulary word that occurs in ``text`` and 0
        elsewhere. Words that are not in ``word_list`` are ignored.
    """
    # Word -> first index, built in one pass so each lookup is O(1)
    # instead of a linear scan of the vocabulary per word.
    index_of = {}
    for position, vocab_word in enumerate(word_list):
        index_of.setdefault(vocab_word, position)

    word_vector = np.zeros(len(word_list))
    for w in text.split():
        position = index_of.get(w)
        # Skip unknown words: indexing with a missing (None) index would
        # broadcast the assignment over the whole vector.
        if position is not None:
            word_vector[position] = 1
    return word_vector

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])



## Creating one-hot vector for genres

In [6]:
from tensorflow.keras.utils import to_categorical
# Class index for each genre label as it is stored in the 'Genre' column.
GENRE_TO_INDEX = {
    "['Pop']": 0,
    "['Rock']": 1,
    "['Hip-Hop/Rap']": 2,
    "['Country']": 3,
    "['R&B/Soul']": 4,
    "['Metal']": 5,
    "['Alternative/Indie']": 6,
    "['Folk']": 7,
}

# Labels missing from the table fall back to index 0 (the same slot as Pop).
genres = [GENRE_TO_INDEX.get(df['Genre'][row], 0) for row in range(0, 1000)]

# One row per song, one column per class.
genres = to_categorical(genres, 8)

## Replace all the lyrics with their respective vectors of word occurrences, to be used as the input layer for the neural network

In [7]:
# Input matrix: one row per song, one column per vocabulary word.
n_rows = 1000
vocab_size = len(word_list)
all_lyrics = np.zeros(shape=(n_rows, vocab_size))
for row in range(n_rows):
    cleaned_lyric = str(df['Lyrics'][row])
    all_lyrics[row] = text_vectorizer(cleaned_lyric, word_list)
print(all_lyrics.shape)

(1000, 12926)


## Neural network: model definition and training

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

print(len(word_list))

# Feed-forward classifier: word-presence input -> 128 ReLU units -> 8-way softmax.
model = Sequential([
    Dense(128, input_shape=(len(word_list),)),  # first dense layer, 128 hidden units
    Activation('relu'),                         # activation layer
    Dense(8),                                   # second dense layer, one unit per genre class
    Activation('softmax'),                      # output class probabilities
])

model.summary()

12926
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1654656   
_________________________________________________________________
activation (Activation)      (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 1032      
_________________________________________________________________
activation_1 (Activation)    (None, 8)                 0         
Total params: 1,655,688
Trainable params: 1,655,688
Non-trainable params: 0
_________________________________________________________________


In [9]:
from tensorflow.keras.optimizers import SGD, Adam

# Adam with a step size of 0.001. The keyword is `learning_rate`; the older
# `lr` alias is deprecated and is not accepted by newer Keras releases.
optimizer = Adam(learning_rate=0.001)
# Categorical cross-entropy matches the one-hot genre targets; accuracy is reported per epoch.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [10]:
from sklearn.model_selection import train_test_split
print(genres.shape)

# 85/15 shuffled train/test split. A fixed random_state makes the split
# identical on every run; weight initialisation and batch order inside
# `fit` are not seeded here, so training metrics can still vary.
lyrics_train, lyrics_test, genre_train, genre_test = train_test_split(
    all_lyrics,
    genres,
    train_size=0.85,
    test_size=0.15,
    shuffle=True,
    random_state=42,
)
# NOTE(review): lyrics_test / genre_test are not used in the cells shown here.
model.fit(lyrics_train, genre_train, epochs=5, batch_size=32);

(1000, 8)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
