# CS Project 175 Phase 2

In [1]:
!pip install gensim
!pip install torch
!pip install torchmetrics

Collecting gensim
  Downloading https://files.pythonhosted.org/packages/02/d7/ca19d4dac1722d0f21ce10e0a8551fc2c2f093263d288639370565445ca6/gensim-4.2.0-cp38-cp38-win_amd64.whl (24.0MB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/3e/07/36678c6ff0dfa6cf445d0e00bf4f013de3b86ec1a2e8bfd1e5df69b2d91d/smart_open-6.2.0-py3-none-any.whl (58kB)
Collecting Cython==0.29.28 (from gensim)
  Downloading https://files.pythonhosted.org/packages/9f/79/311cfbca90332ab37ef8ea08f1af3266f20a9a0e7a1d652842db832226bb/Cython-0.29.28-py2.py3-none-any.whl (983kB)
Installing collected packages: smart-open, Cython, gensim
Successfully installed Cython-0.29.28 gensim-4.2.0 smart-open-6.2.0


You should consider upgrading via the 'python -m pip install --upgrade pip' command.




You should consider upgrading via the 'python -m pip install --upgrade pip' command.


Collecting torchmetrics
  Downloading https://files.pythonhosted.org/packages/08/b7/f1e49be0e076c8ec981f1d4cea1f32da2bd754eaeaf6ed74d5add3f840b4/torchmetrics-0.10.3-py3-none-any.whl (529kB)
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.10.3


You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model 
from sklearn import metrics 

## Building the Pipeline

In [3]:
# Seed shared by the train/test splits, the scikit-learn models and the
# NumPy / PyTorch seeding calls in this notebook.
RANDOM_STATE = 42

def gather_data():
    '''
    Loads the dataset from '../data/data.csv'.
    Returns as (lyrics, genres), the 'lyrics' and 'genre' columns of the file.
    '''
    frame = pd.read_csv('../data/data.csv')
    lyrics = frame['lyrics']
    genres = frame['genre']
    return lyrics, genres

def vectorize_labels(labels, classes=None):
    '''
    Converts labels into integer class indexes.

    With no `classes`, the classes are discovered from `labels` in order of
    first appearance. With `classes`, each label is mapped to its position in
    `classes`; labels that are not listed are encoded as -1.

    Returns as (indexes, classes)
    '''
    if classes is not None:
        codes = pd.Categorical(labels, categories=classes).codes
        return codes, classes
    return pd.factorize(labels)


# PHASE 1 START ----------------------------------------------------------------
def features_bow(data):
    '''
    Builds bag-of-words count features (unigrams and bigrams) from the documents.
    English stop words are removed and terms that appear in fewer than 1% of
    the documents are dropped.

    Returns as (count matrix, fitted vectorizer)
    '''
    bow_options = dict(stop_words='english', min_df=0.01, ngram_range=(1, 2))
    vectorizer = CountVectorizer(**bow_options)
    counts = vectorizer.fit_transform(data.to_list())
    return counts, vectorizer

def train_model_logistic(X, Y):
    '''
    Fits an L2-regularized multinomial logistic regression with balanced
    class weights on (X, Y) and returns the fitted classifier.
    '''
    logistic_options = dict(
        penalty='l2',
        multi_class='multinomial',
        class_weight='balanced',
        random_state=RANDOM_STATE,
        fit_intercept=True,
    )
    fitted = linear_model.LogisticRegression(**logistic_options)
    fitted.fit(X, Y)
    return fitted

def evaluate_model_sklearn(model, X_train, Y_train, X_test, Y_test):
    '''
    Prints train/test accuracy and the test AUC of a fitted scikit-learn classifier.

    model: fitted classifier exposing `score` and `predict_proba`.
    X_train, Y_train: training features and integer labels.
    X_test, Y_test: held-out features and integer labels.

    Returns as (train_accuracy, test_accuracy), both fractions in [0, 1].
    '''
    train_accuracy = model.score(X_train, Y_train)
    print('\nTraining:')
    print(' accuracy:',format( 100*train_accuracy , '.2f') ) 

    # Compute and print accuracy on the test data
    print('\nTesting: ')
    test_accuracy = model.score(X_test, Y_test)
    print(' accuracy:', format( 100*test_accuracy , '.2f') )

    # Compute and print AUC on the test data
    class_probabilities = model.predict_proba(X_test)
    if class_probabilities.shape[1] == 2:
        # Binary case: roc_auc_score expects the scores of the positive class only.
        test_auc_score = metrics.roc_auc_score(Y_test, class_probabilities[:, 1])
    else:
        # Multiclass case: a single probability column is rejected by roc_auc_score,
        # so pass the full matrix and average the one-vs-rest AUCs. `labels` pins the
        # column order to the classifier's own class order.
        test_auc_score = metrics.roc_auc_score(
            Y_test,
            class_probabilities,
            multi_class='ovr',
            labels=getattr(model, 'classes_', None),
        )
    print(' AUC value:', format( 100*test_auc_score , '.2f') )
    return train_accuracy, test_accuracy

def sample_incorrect_predictions(predictions, probabilities, actuals, classes, titles, lyrics):
    '''
    Prints up to 10 randomly chosen misclassified songs for error analysis.

    predictions: predicted class indexes, one per song.
    probabilities: per-song class probabilities, indexed as [song][class].
    actuals: true class indexes, one per song.
    classes: class names, indexed by class index.
    titles, lyrics: song titles and lyric strings, indexed like `predictions`.

    Sampling is seeded with RANDOM_STATE and done with replacement, so the
    same song can be printed more than once. Prints a notice and returns
    without sampling when there are no misclassified songs.
    '''
    np.random.seed(RANDOM_STATE)
    NUM_EXAMPLES = 10
    # The set of mistakes does not change between draws, so compute it once.
    wrong_indexes = np.where(predictions != actuals)[0]
    if len(wrong_indexes) == 0:
        # np.random.choice raises ValueError on an empty array.
        print('No incorrect predictions to sample.')
        return
    for _ in range(NUM_EXAMPLES):
        i = np.random.choice(wrong_indexes)
        print("Song Title:", titles[i])
        print('Predicted:', classes[predictions[i]], 'Actual:', classes[actuals[i]])
        print('Probability:', probabilities[i][predictions[i]])
        print("Lyrics: ")
        print('"' + lyrics[i][:100] + '..."')
        print()

# PHASE 1 END ------------------------------------------------------------------

# Phase 2 is separated into different blocks

## Executing the Pipeline

### MLP on BOW

Let's start with the MLP classifier on BOW, then we'll try RNN on embeddings

In [4]:
# Phase 2 pipeline: load the lyrics (model inputs) and genre labels from the CSV
inputs, labels = gather_data()

In [5]:
# Convert the genre labels to integer class indexes (Y); `classes` holds the
# distinct genres so an index can be mapped back to its name
Y, classes = vectorize_labels(labels)

In [6]:
# Let's try MLP on BOW first: build the bag-of-words count matrix from all lyrics,
# then hold out 20% of the songs for testing (seeded split)
X_bow, vectorizer = features_bow(inputs)
X_bow_train, X_bow_test, Y_bow_train, Y_bow_test = train_test_split(X_bow, Y, test_size=0.2, random_state=RANDOM_STATE)

In [7]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

def train_model_mlp(X, Y, solver='lbfgs', **kwargs):
    '''
    Fits a scikit-learn MLPClassifier on (X, Y) and returns it.

    solver: optimizer name handed to MLPClassifier.
    kwargs: any further MLPClassifier options (e.g. hidden_layer_sizes).
    '''
    # fit() hands back the estimator itself, so build-and-fit can be one expression.
    return MLPClassifier(solver=solver, random_state=RANDOM_STATE, **kwargs).fit(X, Y)
    

In [8]:
# Baseline MLP: default 'lbfgs' solver and MLPClassifier's default hidden layers
model_nn = train_model_mlp(X_bow_train, Y_bow_train)

In [9]:
# Evaluate the baseline MLP on BOW: prints train/test accuracy and test AUC;
# the returned (train_accuracy, test_accuracy) pair is shown as the cell output
evaluate_model_sklearn(model_nn, X_bow_train, Y_bow_train, X_bow_test, Y_bow_test)


Training:
 accuracy: 99.89

Testing: 
 accuracy: 85.75
 AUC value: 90.47


(0.9988590107314667, 0.8575481006413419)

In [10]:
# Axis search over the MLP solver: train one model per solver (all other
# settings left at their defaults) and record (train, test) accuracy in `table`
table = {}
params = ['lbfgs', 'sgd', 'adam']
for param in params:
    model = train_model_mlp(X_bow_train, Y_bow_train, solver=param)
    train, test = evaluate_model_sklearn(model, X_bow_train, Y_bow_train, X_bow_test, Y_bow_test)
    table[param] = (train, test)


Training:
 accuracy: 99.89

Testing: 
 accuracy: 85.75
 AUC value: 90.47





Training:
 accuracy: 96.86

Testing: 
 accuracy: 86.93
 AUC value: 92.02

Training:
 accuracy: 99.85

Testing: 
 accuracy: 87.36
 AUC value: 92.64


### DNN on BOW

In [11]:
# Notes kept from earlier experiments. Each line appears to list accuracy
# figure(s) in percent, the hidden_layer_sizes tried and the wall-clock training
# time -- TODO confirm which number is train vs. test accuracy.
# NOTE(review): the ~60% figures do not match the output saved with this cell,
# so they presumably come from a different dataset or label set.
# 60.18 (100, 50, 50, 50, 50) 
# 60.21 (100, 100, 100, 100, 100) 3x longer to train
# 79.61 60.32 (200, 200) 21m 49.5s
# 76.09 60.66 (200, 100, 100, 100) 69m 21.4s
# 84.28 60.76 (500, 100, 100, 100) 183m 23.2s
# 89.32 61 (750, 250) 48m (on 6-core)
# 89.76 61.85 82.00 (1000, 500) 57 min (on 6-core)
# Deeper network with four hidden layers, trained with the default 'lbfgs' solver
model_dnn = train_model_mlp(X_bow_train, Y_bow_train, hidden_layer_sizes=(500, 100, 100, 100))
evaluate_model_sklearn(model_dnn, X_bow_train, Y_bow_train, X_bow_test, Y_bow_test)


Training:
 accuracy: 99.89

Testing: 
 accuracy: 87.09
 AUC value: 90.95


(0.9988898482792649, 0.8708682782437099)

In [12]:
# Axis search over the hidden-layer architecture with the 'sgd' solver:
# `table` maps each hidden_layer_sizes tuple to its (train, test) accuracy.
# Note this overwrites the `table` built by the solver search above.
table = {}
params = [(256, 256, 256), (512, 512), (128, 128, 128, 128), (512, 256, 128)]
for param in params:
    model = train_model_mlp(X_bow_train, Y_bow_train, solver='sgd', hidden_layer_sizes=param)
    train, test = evaluate_model_sklearn(model, X_bow_train, Y_bow_train, X_bow_test, Y_bow_test)
    table[param] = (train, test)


Training:
 accuracy: 99.82

Testing: 
 accuracy: 85.08
 AUC value: 90.29





Training:
 accuracy: 99.82

Testing: 
 accuracy: 86.74
 AUC value: 91.66

Training:
 accuracy: 99.85

Testing: 
 accuracy: 85.01
 AUC value: 89.70

Training:
 accuracy: 99.82

Testing: 
 accuracy: 86.27
 AUC value: 91.03


In [13]:
# Display the (train, test) accuracy recorded for each architecture
table

{(256, 256, 256): (0.9982422597755026, 0.8507646768623581),
 (512, 512): (0.9982114222277044, 0.8674148988653182),
 (128, 128, 128, 128): (0.9984889601578882, 0.8501480019733596),
 (512, 256, 128): (0.9982422597755026, 0.8627281697089294)}

### RNN on Embeddings

Now we'll try an RNN on word embeddings.

In [14]:
# Raw lyric strings as a plain Python list; the RNN code splits and embeds them itself
X_str = inputs.tolist()

In [15]:

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import os

# Cache file for the trained word vectors; delete it to force retraining.
WORD_VECTORS_PATH = "word2vec.wordvectors"

if not os.path.exists(WORD_VECTORS_PATH):
    print("Did not find pre-trained embeddings, training now...")
    # Each song is tokenized by whitespace. vector_size=100 has to match the
    # input_size the RNN is built with. The Word2Vec model gets its own name so
    # it is not confused with the classifiers bound to `model` in earlier cells.
    w2v_model = Word2Vec(sentences=inputs.str.split(), vector_size=100, window=5, min_count=1, workers=4)
    word_vectors = w2v_model.wv
    word_vectors.save(WORD_VECTORS_PATH)
else:
    print("Found pre-trained embeddings, loading now...")
    # mmap='r' memory-maps the stored arrays read-only instead of loading them fully.
    word_vectors = KeyedVectors.load(WORD_VECTORS_PATH, mmap='r')

Did not find pre-trained embeddings, training now...


In [16]:
# Split the raw lyric strings into training and test sets (80/20), using the
# same test_size and seed as the BOW split
X_vec_train, X_vec_test, Y_vec_train, Y_vec_test = train_test_split(X_str, Y, test_size = 0.2, random_state=RANDOM_STATE)

In [17]:

def word2vec(tokens):
    '''
    Looks up the embedding of every token that is present in `word_vectors`.
    Tokens without an embedding are skipped.

    Returns a NumPy array with one row per known token, in input order.
    '''
    known_vectors = []
    for word in tokens:
        if word in word_vectors:
            known_vectors.append(word_vectors[word])
    return np.array(known_vectors)

import torch
import torch.nn as nn
from torchmetrics import R2Score
import torch.multiprocessing as mp
torch.manual_seed(RANDOM_STATE)
import gc

# RNN code inspired from HW2
class RNN(nn.Module):
    '''
    Minimal recurrent classifier that reads one word embedding per step.

    At each step the current embedding is concatenated with the previous hidden
    state; one linear layer maps that to the next hidden state and another maps
    it to log-probabilities over the output classes. No activation is applied
    to the hidden state.
    '''

    def __init__(self, input_size, output_size, hidden_size):
        '''
        input_size: length of one word embedding.
        output_size: number of classes.
        hidden_size: length of the hidden state.
        '''
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.hidden_layer = nn.Linear(input_size + hidden_size, hidden_size) # create hidden layer
        self.output_layer = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1) # finish with a log softmax

    def forward(self, input_, hidden):
        '''
        Runs one time step.

        input_: tensor of shape (1, input_size), one word embedding.
        hidden: tensor of shape (1, hidden_size), the previous hidden state.

        Returns as (output, hidden): log-probabilities of shape (1, output_size)
        and the next hidden state of shape (1, hidden_size).
        '''
        combined = torch.cat((input_, hidden), 1) # concatenate the input and hidden layers
        output = self.softmax(self.output_layer(combined)) # log-probabilities over classes
        hidden = self.hidden_layer(combined) # compute the next hidden state

        return output, hidden

    def init_hidden(self):
        '''Returns the all-zero initial hidden state of shape (1, hidden_size).'''
        return torch.zeros(1, self.hidden_size)

    def score_old(self, X, Y):
        '''
        Legacy scorer: predicts every song in X, stores the results in
        `self.predictions` / `self.actuals` and returns the R2 score between them.

        The earlier version started one process per song targeting a
        `score_song` method that does not exist; the songs are now scored
        sequentially in this process so the stored tensors are actually filled.
        '''
        self.predictions = torch.zeros((len(Y),), dtype=torch.long)
        self.actuals = torch.zeros((len(Y),), dtype=torch.long)
        for i in range(len(X)):
            self.predictions[i] = self.predict(X[i]).item()
            self.actuals[i] = int(Y[i])
            if i % 1000 == 0:
                print(i, '/', len(X))
        r2 = R2Score()
        # R2 is a regression metric; cast the class indexes to float for it.
        return r2(self.predictions.float(), self.actuals.float())

    def predict(self, song):
        '''
        Predicts the class index for one song.

        song: lyric string; it is split on whitespace and embedded with word2vec.

        Returns a long tensor of shape (1, 1) holding the most likely class index.
        '''
        song_embedding = word2vec(song.split())
        if len(song_embedding) == 0:
            # No token has an embedding: feed a single all-zero step so a
            # prediction is still produced instead of failing on an unset `output`.
            song_tensor = torch.zeros(1, 1, self.input_size)
        else:
            song_tensor = torch.tensor(np.expand_dims(song_embedding, axis=1), dtype=torch.float)
        hidden = self.init_hidden()
        # Inference only: skip autograd bookkeeping for the whole sequence.
        with torch.no_grad():
            for step in song_tensor:
                output, hidden = self.forward(step, hidden)
        return output.topk(1)[1]

    def score(self, X, Y):
        '''
        Computes classification accuracy over the songs in X.

        X: sequence of lyric strings.
        Y: sequence of true class indexes, aligned with X.

        Returns the fraction of correct predictions as a Python float
        (0.0 when X is empty). Prints progress every 100 songs.
        '''
        n_songs = len(X)
        if n_songs == 0:
            return 0.0
        n_correct = 0
        for i in range(n_songs):
            prediction = self.predict(X[i])
            # Compare plain integers so the running total stays a Python int
            # rather than turning into a (1, 1) tensor.
            n_correct += int(prediction.item() == Y[i])
            if i % 100 == 0:
                print(i, '/', n_songs)
        return n_correct / n_songs


def train_model_rnn(X, Y, n_iters=100000, lr=1e-4, input_size=100, output_size=7,
                    hidden_size=512, seed=RANDOM_STATE):
    '''
    Trains an RNN genre classifier with one randomly drawn song per step.

    X: sequence of lyric strings.
    Y: integer class indexes aligned with X.
    n_iters: number of training steps (songs drawn with replacement).
    lr: Adam learning rate.
    input_size: word-embedding length; must match the vectors used by word2vec.
    output_size: number of classes the network can predict.
    hidden_size: length of the recurrent hidden state.
    seed: seed for the song sampler, so the training order is repeatable.

    Returns the trained RNN. Prints progress every 1000 steps.
    '''
    model = RNN(input_size=input_size, output_size=output_size, hidden_size=hidden_size)

    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Private generator: sampling no longer depends on (or disturbs) the global NumPy state.
    rng = np.random.default_rng(seed)
    for step in range(1, n_iters + 1):
        r_i = int(rng.integers(0, len(X)))
        song_embedding = word2vec(X[r_i].split())
        if len(song_embedding) == 0:
            # Nothing to feed the network for this song; the step is skipped.
            continue
        song_tensor = torch.tensor(np.expand_dims(song_embedding, axis=1), dtype=torch.float)
        genre_tensor = torch.tensor([int(Y[r_i])], dtype=torch.long)

        optimizer.zero_grad()
        hidden = model.init_hidden()
        for i in range(song_tensor.size()[0]):
            output, hidden = model(song_tensor[i], hidden)
        # Only the output after the last word is scored against the genre.
        loss = criterion(output, genre_tensor)
        loss.backward()
        optimizer.step()
        if step % 1000 == 0:
            print(f'{step}/{n_iters}')
    return model

# Train the RNN on the training split of raw lyric strings.
# NOTE(review): the recorded run of this cell was interrupted (KeyboardInterrupt)
# after about 7000 of 100000 iterations.
model_rnn = train_model_rnn(X_vec_train, Y_vec_train)

  from .autonotebook import tqdm as notebook_tqdm


1000/100000
2000/100000
3000/100000
4000/100000
5000/100000
6000/100000
7000/100000


KeyboardInterrupt: 

## Evaluating the Model

In [None]:

def evaluate_model_rnn(model, X_train, Y_train, X_test, Y_test):
    '''
    Prints train/test accuracy of a model exposing `score(X, Y)`.

    The scores are converted with float(), so a model that returns a
    single-element tensor is reported the same way as one returning a number.

    Returns as (train_accuracy, test_accuracy), matching evaluate_model_sklearn.
    '''
    train_accuracy = float(model.score(X_train, Y_train))
    print('\nTraining:')
    print(' accuracy:',format( 100*train_accuracy , '.2f') ) 

    # Compute and print accuracy on the test data
    print('\nTesting: ')
    test_accuracy = float(model.score(X_test, Y_test))
    print(' accuracy:', format( 100*test_accuracy , '.2f') )
    return train_accuracy, test_accuracy

# Report RNN accuracy on the training and test splits; every song is scored
# one at a time, so this cell takes a long time on the full training set
evaluate_model_rnn(model_rnn, X_vec_train, Y_vec_train, X_vec_test, Y_vec_test)

0 / 236193
100 / 236193
200 / 236193
300 / 236193


KeyboardInterrupt: 

In [None]:
# Number of positions where the stored predictions equal the stored labels,
# divided by the size of the training split.
# NOTE(review): `predictions` / `actuals` are only assigned inside RNN.score_old,
# which no visible cell calls -- this appears to rely on leftover kernel state;
# confirm before trusting the number.
torch.sum(model_rnn.predictions == model_rnn.actuals) / len(X_vec_train)

tensor(0.1117)

# Comparing the Models

Comparing the results of Logistic Regression on BOW, Neural Network on BOW, Deep Neural Network on BOW, and RNN on Word Embeddings: