In [1]:
# Libraries for data loading and text cleaning
import re  # regular expressions, used for removing special characters

import nltk
import numpy as np
import pandas as pd  # for loading data
from nltk.corpus import stopwords  # for removing stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the lemmatizer and the stop-word list
nltk.download('wordnet')
nltk.download('stopwords')

wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/nicktehrany/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nicktehrany/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the dataset and discard the columns that are not used afterwards
# (the rest of the notebook only reads 'Lyrics' and 'Genre').
df = pd.read_csv("dataset.csv").drop(columns=['Singer', 'Date', 'Tags'])

In [3]:
def get_index(word, word_list):
    """Return the position of the first entry of ``word_list`` equal to ``word``.

    Returns ``None`` when no entry matches.
    """
    matching_positions = (
        position
        for position, candidate in enumerate(word_list)
        if word == candidate
    )
    return next(matching_positions, None)

## Creating a word list of all possible words from all Song Lyrics

In [4]:
# Normalise every lyric (letters only, lower-case, stop words removed, verbs
# lemmatised) and collect the vocabulary in order of first appearance.
stop_words = set(stopwords.words("english"))  # built once: O(1) membership tests
word_list = []        # vocabulary, ordered by first appearance
seen_words = set()    # same words as word_list, used for O(1) duplicate checks
corpus = []           # cleaned lyrics, one string per row of df

for raw_lyric in df['Lyrics']:
    letters_only = re.sub('[^a-zA-Z]', ' ', str(raw_lyric))  # removes special characters
    tokens = [
        wordnet_lemmatizer.lemmatize(token, pos="v")
        for token in letters_only.lower().split()
        if token not in stop_words
    ]
    for token in tokens:
        if token not in seen_words:
            seen_words.add(token)
            word_list.append(token)
    # Every word is followed by one space (including the last one), which is
    # the exact text format this cell has always produced.
    corpus.append("".join(token + " " for token in tokens))

# Replace the column in a single assignment. The chained form
# df['Lyrics'][i] = ... may write to a temporary copy instead of df.
df['Lyrics'] = corpus
lyrics = corpus

#Cleaning variables to save some memory
del corpus, stop_words, seen_words

### Function that creates a one-hot vector for the given text from the word_list

In [5]:
from tensorflow.keras.utils import to_categorical

def one_hot(text, word_list, word_index=None):
    """Encode ``text`` as a binary bag-of-words vector over ``word_list``.

    Args:
        text: string of whitespace-separated words.
        word_list: vocabulary; entry ``i`` of the result corresponds to
            ``word_list[i]``.
        word_index: optional precomputed ``{word: position}`` mapping for
            ``word_list``. Supplying it avoids rebuilding the mapping on every
            call when many texts are encoded against the same vocabulary.

    Returns:
        1-D float NumPy array of length ``len(word_list)`` with 1 at the
        position of every vocabulary word that occurs in ``text`` and 0
        everywhere else.
    """
    if word_index is None:
        word_index = {}
        for position, word in enumerate(word_list):
            # setdefault keeps the first position if a word is listed twice
            word_index.setdefault(word, position)

    word_vector = np.zeros(len(word_list))
    for w in text.split():
        position = word_index.get(w)
        # Words outside the vocabulary are skipped. Indexing the array with
        # None (NumPy's newaxis) would select, and overwrite, every entry.
        if position is not None:
            word_vector[position] = 1
    return word_vector

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])



## Creating one-hot vector for genres

In [6]:
from tensorflow.keras.utils import to_categorical
# Integer class id for each genre label. The 'Genre' column stores each label
# as the text of a one-element list, e.g. "['Pop']".
genre_ids = {
    "['Pop']": 0,
    "['Rock']": 1,
    "['Hip-Hop/Rap']": 2,
    "['Country']": 3,
    "['R&B/Soul']": 4,
    "['Metal']": 5,
    "['Alternative/Indie']": 6,
    "['Folk']": 7,
}

# Labels that are not in the table fall back to class 0 (the same id as Pop).
genres = [genre_ids.get(label, 0) for label in df['Genre']]

# One row per song, one column per genre
genres = to_categorical(genres, 8)

## Replace all the lyrics with their respective vectors of word occurrences, to be used as the input layer for the neural network

In [7]:
# Feature matrix: one row per song, one column per vocabulary word.
# float32 halves the memory of this matrix compared with NumPy's default
# float64; the stored values are only 0 and 1, so nothing is lost.
all_lyrics = np.zeros(shape=(len(df), len(word_list)), dtype=np.float32)

# enumerate() supplies the positional row number, so rows are filled correctly
# even when df's index labels are not exactly 0..len(df)-1.
for row, lyric in enumerate(df['Lyrics']):
    all_lyrics[row] = one_hot(str(lyric), word_list)
print(all_lyrics.shape)

#Cleaning variables to save some memory
del df

(14400, 40677)


## **Don't rerun the cell below, as this will result in new test/train sets. Can rerun all other cells except this one. Probably also don't want to rerun all cells above since they always do the exact same thing**

In [8]:
from sklearn.model_selection import train_test_split
import random

# Fixed seed: re-running this cell (or the whole notebook) reproduces the same
# 85% / 15% train/test split instead of drawing a new random one each time.
SPLIT_SEED = 42

lyrics_train, lyrics_test, genre_train, genre_test = train_test_split(
    all_lyrics,
    genres,
    train_size=0.85,
    test_size=0.15,
    shuffle=True,
    random_state=SPLIT_SEED,
)

#Cleaning Lists, since they take up almost 5GB
del all_lyrics, genres

# Neural Network stuff 
## **Modify and run only the cells below to change the neural network**

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout

# Fully connected classifier over the bag-of-words vector:
# 128 ReLU units -> dropout -> 20 linear units -> 8-way softmax.
model = Sequential([
    Dense(units=128, input_shape=(len(word_list),)),
    Activation('relu'),       # activation layer
    Dropout(0.8),             # drops 80% of the activations during training
    Dense(20),
    Dense(8),
    Activation('softmax'),    # output class probabilities
])

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 128)               5206784   
_________________________________________________________________
activation_4 (Activation)    (None, 128)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 20)                2580      
_________________________________________________________________
dense_8 (Dense)              (None, 8)                 168       
_________________________________________________________________
activation_5 (Activation)    (None, 8)                 0         
Total params: 5,209,532
Trainable params: 5,209,532
Non-trainable params: 0
____________________________________________

In [16]:
from tensorflow.keras.optimizers import SGD, Adam

# `learning_rate` is the supported name of this argument; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=0.001)
# One-hot targets with a softmax output, hence categorical cross-entropy
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [17]:
# Train the dense network; one tenth of the training data is held out for
# validation. The trailing ';' suppresses the returned History repr.
model.fit(
    lyrics_train,
    genre_train,
    batch_size=16,
    epochs=10,
    validation_split=1/10,
);

Train on 11016 samples, validate on 1224 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Testing the Neural Network's performance + Confusion Matrix

In [18]:
# Loss and accuracy of the dense network on the held-out test set
results = model.evaluate(x=lyrics_test, y=genre_test, batch_size=16)



In [19]:
def one_hot_reverse(one_hot_labels=None):
    """Convert one-hot encoded labels back to integer class ids.

    Args:
        one_hot_labels: 2-D array-like with one row per sample and one column
            per class. Defaults to the notebook-level ``genre_test``.

    Returns:
        List of ints, one per row, holding the column index of the largest
        entry of that row. Exactly one id is produced per row, so the result
        always lines up with per-sample predictions.
    """
    if one_hot_labels is None:
        one_hot_labels = genre_test
    labels = np.asarray(one_hot_labels)
    if labels.size == 0:
        # argmax over axis 1 is undefined for an empty input
        return []
    return np.argmax(labels, axis=1).tolist()

In [20]:
from sklearn.metrics import confusion_matrix

# Sequential.predict_classes() is no longer available in newer TensorFlow
# releases; taking the argmax of the softmax output gives the same class ids.
y_pred = np.argmax(model.predict(lyrics_test), axis=-1)
# Rows are the true genres, columns the predicted genres
cm = confusion_matrix(one_hot_reverse(), y_pred)
print(cm)

[[ 87  53  13  23  30  12  22  24]
 [ 30 121   0  19   6  25  26  19]
 [  3   2 207   0  23   1  10   3]
 [  9  36   0 189   7   2  14  32]
 [ 16  23  25  16 150   2  43  11]
 [  6  60   2   4   1 169  16  22]
 [  7  34  21   6  30  14 122  30]
 [  8  47   0  52   2  11  25 137]]


In [21]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Genre names in class-id order (0 = Pop ... 7 = Folk)
genre_list = ['Pop','Rock', 'Hip-Hop/Rap', 'Country', 'R&B/Soul', 'Metal', 'Alternative/Indie', 'Folk']

df_cm = pd.DataFrame(cm, index = genre_list, columns = genre_list)
print(df_cm)

fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(df_cm, annot=True, linewidth=0.5, fmt='g', cmap="BuPu", ax=ax)
# Pin the y-limits to the full extent of the grid (first row at the top)
# instead of nudging the current limits by 0.5, so the figure does not depend
# on which limits the installed matplotlib chose.
ax.set_ylim(len(genre_list), 0)
# cm was built as confusion_matrix(true labels, predictions)
ax.set_xlabel("Predicted genre")
ax.set_ylabel("True genre")
ax.set_title("Confusion matrix");

                   Pop  Rock  Hip-Hop/Rap  Country  R&B/Soul  Metal  \
Pop                 87    53           13       23        30     12   
Rock                30   121            0       19         6     25   
Hip-Hop/Rap          3     2          207        0        23      1   
Country              9    36            0      189         7      2   
R&B/Soul            16    23           25       16       150      2   
Metal                6    60            2        4         1    169   
Alternative/Indie    7    34           21        6        30     14   
Folk                 8    47            0       52         2     11   

                   Alternative/Indie  Folk  
Pop                               22    24  
Rock                              26    19  
Hip-Hop/Rap                       10     3  
Country                           14    32  
R&B/Soul                          43    11  
Metal                             16    22  
Alternative/Indie                122    30  


(8.0, 0.0)

## **Don't rerun the cell below, as this will result in adding dimensions to the sets, which will not work! Can rerun all other cells except this one.**

In [22]:
# Insert an axis of length 1 at position 2, giving the arrays the
# (samples, vocabulary size, 1) shape the Conv1D input layer is declared with.
cnn_lyrics_train = np.expand_dims(lyrics_train, axis=2)
cnn_lyrics_test = np.expand_dims(lyrics_test, axis=2)

## Convolutional Neural Network

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Conv1D, MaxPool2D, Dropout, Flatten

# Convolutional classifier: two width-1 Conv1D layers, then a small dense head.
model = Sequential([
    Conv1D(16, kernel_size=1, activation='relu', input_shape=(len(word_list), 1)),
    Conv1D(16, kernel_size=1, activation='relu'),
    Dropout(0.25),                    # drop 25% of the previous layer's outputs during training
    Flatten(),                        # flatten before the fully connected layers
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(8, activation='softmax'),   # last layer: 8 class nodes, one per genre
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_2 (Conv1D)            (None, 40677, 16)         32        
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 40677, 16)         272       
_________________________________________________________________
dropout_5 (Dropout)          (None, 40677, 16)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 650832)            0         
_________________________________________________________________
dense_11 (Dense)             (None, 32)                20826656  
_________________________________________________________________
dropout_6 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 8)                

In [26]:
from tensorflow.keras.optimizers import Adam

# Adam with its default settings; one-hot targets, hence categorical cross-entropy
optimizer = Adam()
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs in batches of 32 samples, holding out one sixth of the
# training data for validation
model.fit(
    cnn_lyrics_train,
    genre_train,
    batch_size=32,
    epochs=5,
    validation_split=1/6,
)

Train on 10200 samples, validate on 2040 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f91f012c668>

In [27]:
# Loss and accuracy of the convolutional network on the held-out test set
results = model.evaluate(x=cnn_lyrics_test, y=genre_test, batch_size=16)

