- i) Removal of stop words, punctuation, URLs, etc.
- ii) Slang lookup
- iii) Spelling correction
- iv) Escaping HTML entities.
- v) Lemmatisation
- vi) Stemming

In [10]:
#library imports
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

In [11]:
import pandas as pd
# Read the lyrics table from a CSV file in the working directory.
df = pd.read_csv('processed_lyrics.csv')


In [12]:
# Work on a random 1000-row sample; no random_state is passed, so the rows
# drawn can differ from run to run.
subset = df.sample(n=1000)

In [13]:
# Display the sampled rows.
subset

Unnamed: 0,SName,Lyric,Genre,Artist,lyric_length
45730,On Most Surfaces (Inuit),The frost hits me in the eye. and wakes me. th...,Rock,The Gathering,104
19500,The Eleventh Hour,trace the shape of my heart. 'til it becomes m...,Rock,Jars Of Clay,127
14894,Honorable Mention,i served out my detention. and in the end i go...,Rock,Fall Out Boy,203
50383,Springhill Mining Disaster,In the town of Springhill Nova Scotia. Down in...,Rock,U2,201
79448,Maybe Tomorrow,I don't know how many stars there are. Up in t...,Pop,Jackson 5,189
...,...,...,...,...,...
16607,Providence,d'you think the end of the world is coming?. t...,Rock,Godspeed You! Black Emperor,80
45586,Stand In Line,Ten men stand in line. At the gates of the cem...,Rock,The Flaming Lips,81
6354,The Hills Of Mexico,'Twas in the town of griffin. In the year of s...,Rock,Bob Dylan,143
38458,This Song Saved My Life,I wanna start by letting you know this. Becaus...,Rock,Simple Plan,317


In [14]:
# Tokenization: the spaCy pipeline is loaded once; only its tokenizer is used.
tok = spacy.load("en_core_web_lg")


def tokenize(text):
    """Split *text* into a list of lower-cased token strings.

    Runs of non-ASCII characters are replaced by a space, the text is
    lower-cased, every punctuation character, digit, carriage return, tab
    and newline is replaced by a space, and the result is passed through
    the spaCy tokenizer.
    """
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", text)
    # Character class covering punctuation, digits and \r, \t, \n.
    strip_pattern = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    cleaned = strip_pattern.sub(" ", ascii_only.lower())
    pieces = []
    for token in tok.tokenizer(cleaned):
        pieces.append(token.text)
    return pieces

In [15]:
# Count the number of occurrences of each token across the sampled lyrics.
counts = Counter()
for lyric in subset['Lyric']:
    counts.update(tokenize(lyric))

In [16]:
# Build the vocabulary: index 0 is the empty (padding) entry, index 1 is the
# unknown-word entry, and every counted token gets the next free index.
words = ["", "UNK"]
vocab2index = {"": 0, "UNK": 1}
for position, word in enumerate(counts, start=len(words)):
    vocab2index[word] = position
    words.append(word)

In [17]:
def encode_sentence(text, vocab2index, N=70):
    """Encode *text* as a fixed-length array of vocabulary indices.

    The text is tokenized, each token is mapped through ``vocab2index``
    (tokens not in the mapping get the ``"UNK"`` index), and the result is
    truncated to ``N`` entries or right-padded with zeros up to ``N``.

    Returns a tuple ``(ids, length)`` where ``ids`` is an integer array of
    size ``N`` and ``length`` is the number of real (non-padding) entries,
    at most ``N``.
    """
    ids = []
    for word in tokenize(text):
        ids.append(vocab2index.get(word, vocab2index["UNK"]))
    ids = np.array(ids)

    kept = min(N, len(ids))
    padded = np.zeros(N, dtype=int)
    padded[:kept] = ids[:kept]
    return padded, kept

In [18]:
# Store each lyric as a 2-element object array: [padded token ids, true length].
# The two elements have different shapes, so dtype=object must be given
# explicitly: without it NumPy emits a ragged-array deprecation warning on
# older versions and raises ValueError on newer ones.
subset['encoded'] = subset['Lyric'].apply(
    lambda x: np.array(encode_sentence(x, vocab2index), dtype=object)
)
subset.head()

  subset['encoded'] = subset['Lyric'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))


Unnamed: 0,SName,Lyric,Genre,Artist,lyric_length,encoded
45730,On Most Surfaces (Inuit),The frost hits me in the eye. and wakes me. th...,Rock,The Gathering,104,"[[2, 3, 4, 5, 6, 2, 7, 8, 9, 10, 5, 8, 11, 12,..."
19500,The Eleventh Hour,trace the shape of my heart. 'til it becomes m...,Rock,Jars Of Clay,127,"[[58, 2, 59, 23, 31, 60, 61, 62, 29, 63, 64, 6..."
14894,Honorable Mention,i served out my detention. and in the end i go...,Rock,Fall Out Boy,203,"[[15, 116, 55, 31, 117, 8, 9, 6, 2, 118, 15, 1..."
50383,Springhill Mining Disaster,In the town of Springhill Nova Scotia. Down in...,Rock,U2,201,"[[6, 2, 163, 23, 164, 165, 166, 8, 39, 6, 2, 1..."
79448,Maybe Tomorrow,I don't know how many stars there are. Up in t...,Pop,Jackson 5,189,"[[15, 180, 85, 207, 208, 209, 210, 170, 12, 8,..."


In [19]:
# Give every distinct genre in the full table a consecutive integer id,
# in order of first appearance.
possible_labels = df.Genre.unique()

label_dict = {genre: position for position, genre in enumerate(possible_labels)}
label_dict

# Numeric target column for the sampled rows.
subset['label'] = subset.Genre.replace(label_dict)


In [23]:
from sklearn.model_selection import train_test_split

# Split the encoded lyrics (not the row ids): LyricsDataset reads X[idx][0]
# as the padded token ids and X[idx][1] as the sequence length, so each X
# element must be the [ids, length] pair stored in the 'encoded' column.
# The split is stratified on the label so both parts keep the genre mix.
X_train, X_valid, y_train, y_valid = train_test_split(
    list(subset['encoded']),
    subset.label.values,
    test_size=0.2,
    random_state=42,
    stratify=subset.label.values,
)

In [37]:
class LyricsDataset(Dataset):
    """Dataset over encoded lyrics and their labels.

    ``X`` is an indexable collection whose items are pairs: item ``[0]`` is a
    NumPy array of token ids and item ``[1]`` is the sequence length.
    ``Y`` is an indexable collection of labels of the same size.
    """

    def __init__(self, X, Y):
        self.X = X
        self.y = Y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(token_ids, label, length)`` for sample ``idx``.

        ``token_ids`` is an int32 tensor built from the stored id array.
        """
        ids, length = self.X[idx][0], self.X[idx][1]
        return torch.from_numpy(ids.astype(np.int32)), self.y[idx], length

In [25]:
# Wrap the training and validation splits in datasets.
train_ds = LyricsDataset(X_train, y_train)
valid_ds = LyricsDataset(X_valid, y_valid)

# Training

In [26]:
def train_model(model, epochs=10, lr=0.001):
    """Train *model* with Adam and cross-entropy loss.

    Batches come from the module-level ``train_dl``; after every epoch the
    model is evaluated on the module-level ``val_dl``. Only parameters with
    ``requires_grad`` set are optimized. Metrics are printed on epochs whose
    index satisfies ``i % 5 == 1``.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr)

    for epoch in range(epochs):
        model.train()
        running_loss, seen = 0.0, 0
        for x, y, l in train_dl:
            inputs, targets = x.long(), y.long()
            logits = model(inputs, l)
            optimizer.zero_grad()
            batch_loss = F.cross_entropy(logits, targets)
            batch_loss.backward()
            optimizer.step()
            # Weight the batch mean by batch size so the epoch figure is a
            # per-sample average.
            batch_count = targets.shape[0]
            running_loss += batch_loss.item() * batch_count
            seen += batch_count

        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if epoch % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (running_loss / seen, val_loss, val_acc, val_rmse))

def validation_metrics(model, valid_dl):
    """Evaluate *model* on every batch of *valid_dl*.

    Each batch is a triple ``(x, y, l)``: inputs, integer labels and
    sequence lengths. The model is switched to eval mode and gradients are
    disabled for the whole pass.

    Returns ``(loss, accuracy, rmse)``: the per-sample mean cross-entropy
    loss, the fraction of correct predictions (a 0-dim tensor), and the
    batch-size-weighted mean of the per-batch RMSE between predicted and
    true class indices.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long()
            y = y.long()
            y_hat = model(x, l)
            loss = F.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]
            batch_count = y.shape[0]
            correct += (pred == y).float().sum()
            total += batch_count
            sum_loss += loss.item() * batch_count
            # RMSE computed directly on the tensors instead of handing torch
            # tensors to scikit-learn.
            batch_rmse = torch.sqrt(((pred - y).float() ** 2).mean()).item()
            sum_rmse += batch_rmse * batch_count
    return sum_loss/total, correct/total, sum_rmse/total

In [27]:
# batch_size is larger than the 1000-row sample, so each loader yields all
# of its data in a single batch.
batch_size = 5000
# Vocabulary size, including the padding and UNK entries held in `words`.
vocab_size = len(words)
# Only the training loader is shuffled.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [31]:
def load_glove_vectors(glove_file="glove/glove.6B.50d.txt"):
    """Load GloVe word vectors from a text file.

    Each non-blank line holds a word followed by whitespace-separated float
    components. The file is read as UTF-8 regardless of the platform's
    default encoding, and blank lines are skipped.

    Returns a dict mapping each word to a NumPy array of its components.
    """
    word_vectors = {}
    with open(glove_file, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            word_vectors[fields[0]] = np.array([float(x) for x in fields[1:]])
    return word_vectors

In [32]:
def get_emb_matrix(word_vecs, word_counts, emb_size = 50):
    """Create an embedding matrix from pretrained word vectors.

    Row 0 is an all-zero vector for the padding entry ``""`` and row 1 is a
    random vector for ``"UNK"``. Each word in ``word_counts`` (in iteration
    order) gets the next row: its vector from ``word_vecs`` when present,
    otherwise a random vector drawn uniformly from [-0.25, 0.25).

    Args:
        word_vecs: mapping from word to a vector of ``emb_size`` components.
        word_counts: iterable of distinct vocabulary words.
        emb_size: number of components per vector.

    Returns:
        ``(W, vocab, vocab_to_idx)``: the float32 matrix, an array of the
        words in row order, and a dict from word to row index. The dict
        includes the padding entry ``""`` so that it agrees with ``vocab``.
    """
    vocab = ["", "UNK"]
    vocab_to_idx = {"": 0, "UNK": 1}
    # np.zeros already leaves row 0 (padding) as the zero vector.
    W = np.zeros((len(word_counts) + 2, emb_size), dtype="float32")
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)  # vector for unknown words
    for i, word in enumerate(word_counts, start=2):
        vector = word_vecs.get(word)
        if vector is None:
            # No pretrained vector for this word: fall back to a random one.
            vector = np.random.uniform(-0.25, 0.25, emb_size)
        W[i] = vector
        vocab_to_idx[word] = i
        vocab.append(word)
    return W, np.array(vocab), vocab_to_idx

In [33]:
# Load the GloVe vectors from the default path and build the embedding
# matrix for the counted tokens. Note that this rebinds `vocab2index`.
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)

In [34]:
class LSTM_glove_vecs(torch.nn.Module):
    """LSTM classifier over frozen pretrained word embeddings.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        glove_weights: float32 NumPy array of shape
            ``(vocab_size, embedding_dim)`` copied into the embedding table.
        num_classes: number of output logits (defaults to 5).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights,
                 num_classes=5):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False  # freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, l):
        """Return class logits for a batch of padded token-id sequences.

        Args:
            x: integer tensor of token ids, shape ``(batch, seq_len)``.
            l: true (unpadded) length of each sequence, or ``None`` to run
               the LSTM over the full padded length.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        if l is not None:
            # Pack the batch so the LSTM stops at each sequence's last real
            # token; otherwise the final hidden state is taken after the
            # padding steps. Lengths must live on the CPU and be >= 1, so a
            # zero-length sequence is treated as a single padding step.
            lengths = torch.as_tensor(l, dtype=torch.int64).cpu()
            lengths = lengths.clamp(min=1, max=x.size(1))
            x = pack_padded_sequence(x, lengths, batch_first=True,
                                     enforce_sorted=False)
        _, (ht, _) = self.lstm(x)
        return self.linear(ht[-1])

In [35]:
# 50-dimensional embeddings and a 50-unit LSTM hidden state, initialised
# from the pretrained weight matrix.
model = LSTM_glove_vecs(vocab_size, 50, 50, pretrained_weights)

In [None]:
# First training run: 30 epochs at learning rate 0.1.
train_model(model, epochs=30, lr=0.1)

In [None]:
# Second run on the same model: 30 more epochs at learning rate 0.05.
# train_model creates a new optimizer on every call.
train_model(model, epochs=30, lr=0.05)