# CS Project 175 Phase 2

In [None]:
!pip install gensim
!pip install torch
!pip install torchmetrics

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model 
from sklearn import metrics 

## Building the Pipeline

In [None]:
# Seed value passed to the random_state / seed arguments used in this notebook
# (scikit-learn models, train_test_split, NumPy and PyTorch seeding).
RANDOM_STATE = 42

def gather_data():
    '''
    Reads the dataset from ./data/data.csv.
    Returns as (lyrics, genre), the two columns with those names.
    '''
    frame = pd.read_csv('./data/data.csv')
    lyrics_column = frame['lyrics']
    genre_column = frame['genre']
    return lyrics_column, genre_column

def vectorize_labels(labels, classes=None):
    '''
    Encodes the labels as integer class indexes.
    Without classes, the indexes follow the order in which each distinct
    label first appears. With classes, the indexes are positions in classes
    and a label that is not listed there is encoded as -1.
    Returns as (indexes, labels)
    '''
    if classes is not None:
        encoded = pd.Categorical(labels, categories=classes)
        return encoded.codes, classes
    return pd.factorize(labels)


# PHASE 1 START ----------------------------------------------------------------
def features_bow(data):
    '''
    Builds bag-of-words count features from the texts in data.
    English stop words are dropped, unigrams and bigrams are counted, and
    terms found in fewer than 1% of the documents are ignored.
    Returns as (X, vectorizer), where vectorizer is the fitted CountVectorizer.
    '''
    bow = CountVectorizer(
        stop_words='english',
        min_df=0.01,
        ngram_range=(1, 2),
    )
    counts = bow.fit_transform(data.to_list())
    return counts, bow

def train_model_logistic(X, Y):
    '''
    Fits a multinomial logistic regression (L2 penalty, balanced class
    weights, seeded with RANDOM_STATE) on X and Y.
    Returns the fitted classifier.
    '''
    settings = {
        'penalty': 'l2',
        'multi_class': 'multinomial',
        'class_weight': 'balanced',
        'random_state': RANDOM_STATE,
        'fit_intercept': True,
    }
    logistic = linear_model.LogisticRegression(**settings)
    logistic.fit(X, Y)
    return logistic

def evaluate_model_sklearn(model, X_train, Y_train, X_test, Y_test):
    '''
    Prints accuracy on the training data, then accuracy and AUC on the test
    data. All values are shown as percentages with two decimals.
    The model must provide score(X, Y) and predict_proba(X).
    '''
    def as_percent(fraction):
        # Scale to a percentage and keep two decimals
        return format(100*fraction, '.2f')

    # Accuracy on the training data
    accuracy_on_train = model.score(X_train, Y_train)
    print('\nTraining:')
    print(' accuracy:', as_percent(accuracy_on_train))

    # Accuracy on the test data
    print('\nTesting: ')
    accuracy_on_test = model.score(X_test, Y_test)
    print(' accuracy:', as_percent(accuracy_on_test))

    # One-vs-one multi-class AUC on the test data, from the predicted class probabilities
    auc_on_test = metrics.roc_auc_score(
        Y_test,
        model.predict_proba(X_test),
        multi_class='ovo',
    )
    print(' AUC value:', as_percent(auc_on_test))

def sample_incorrect_predictions(predictions, probabilities, actuals, classes, titles, lyrics, num_examples=10):
    '''
    Prints randomly chosen examples that were classified incorrectly.
    For each example: the song title, predicted and actual class, the
    probability given to the predicted class, and the first 100 characters
    of the lyrics.

    predictions / actuals: class indexes, one per song
    probabilities: per-song class probabilities, indexed [song][class]
    classes: class names, indexed by class index
    titles / lyrics: song titles and lyrics, indexed like predictions
    num_examples: how many examples to print (default 10)

    Examples are drawn with replacement, so the same song can appear more
    than once. The NumPy seed is reset to RANDOM_STATE on every call, so the
    same examples are drawn each time for the same inputs.
    '''
    np.random.seed(RANDOM_STATE)
    # The set of mistakes does not change between draws, so locate it once.
    incorrect = np.where(np.asarray(predictions) != np.asarray(actuals))[0]
    if len(incorrect) == 0:
        # np.random.choice raises ValueError on an empty array.
        print('No incorrect predictions to sample.')
        return
    for _ in range(num_examples):
        i = np.random.choice(incorrect)
        print("Song Title:", titles[i])
        print('Predicted:', classes[predictions[i]], 'Actual:', classes[actuals[i]])
        print('Probability:', probabilities[i][predictions[i]])
        print("Lyrics: ")
        print('"' + lyrics[i][:100] + '..."')
        print()

# PHASE 1 END ------------------------------------------------------------------

# PHASE 2 START ----------------------------------------------------------------
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

def train_model_mlp(X, Y, hidden_layer_sizes=(100,), max_iter=200):
    '''
    Fits a multi-layer perceptron classifier on X and Y using the lbfgs
    solver, seeded with RANDOM_STATE.
    hidden_layer_sizes and max_iter are passed through to MLPClassifier.
    Returns the fitted classifier.
    '''
    mlp = MLPClassifier(
        solver='lbfgs',
        hidden_layer_sizes=hidden_layer_sizes,
        max_iter=max_iter,
        random_state=RANDOM_STATE,
    )
    mlp.fit(X, Y)
    return mlp
    
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import os

def features_word2vec(data):
    '''
    Converts each text in data to a sequence of word2vec vectors.
    The embeddings are loaded from the file "word2vec.wordvectors" when it
    exists; otherwise they are trained on data (200 dimensions) and saved
    to that file.
    Returns a list with one array per text, holding the vectors of the
    words that are present in the embeddings (other words are skipped).
    '''
    word_lists = data.str.split()
    embeddings_found = os.path.exists("word2vec.wordvectors")
    if embeddings_found:
        print("Found pre-trained embeddings, loading now...")
        word_vectors = KeyedVectors.load("word2vec.wordvectors", mmap='r')
    else:
        print("Did not find pre-trained embeddings, training now...")
        trained = Word2Vec(sentences=word_lists, vector_size=200, window=5, min_count=1, workers=4)
        word_vectors = trained.wv
        word_vectors.save("word2vec.wordvectors")
    print("Vectorizing features...")
    total = len(word_lists)
    sequences = []
    for position, words in enumerate(word_lists):
        known = [word_vectors[word] for word in words if word in word_vectors]
        sequences.append(np.array(known))
        # Progress report every 1000 texts
        if position % 1000 == 0:
            print(position, "/", total)
    return sequences

def clean_vectors(X, Y):
    '''
    Removes inputs that have fewer than 10 vectors, along with the labels
    at the same positions.
    Returns as (X, Y) with X as a list and Y as a NumPy array.
    '''
    kept_inputs = []
    dropped_positions = []
    for position, vectors in enumerate(X):
        if len(vectors) < 10:
            dropped_positions.append(position)
        else:
            kept_inputs.append(vectors)
    return kept_inputs, np.delete(Y, dropped_positions)

import torch
import torch.nn as nn
from torchmetrics import R2Score
import torch.multiprocessing as mp
# Seed PyTorch's random number generator with the shared seed.
# NOTE(review): NumPy's generator is only seeded inside sample_incorrect_predictions,
# so the np.random.randint draws in train_model_rnn are not covered by this call.
torch.manual_seed(RANDOM_STATE)
    
# RNN code inspired by HW2
class RNN(nn.Module):
    '''
    Recurrent classifier that reads a song one word vector at a time.
    At every step the input vector is concatenated with the previous hidden
    state and fed to two linear layers: one produces the next hidden state,
    the other produces log-softmax class scores. The scores after the last
    step are the prediction for the song.

    Songs are assumed to be 2-D arrays with one row per word vector
    (train_model_rnn sizes the input layer with len(X[0][0])).
    '''

    def __init__(self, input_size, output_size, hidden_size, n_layers):
        super().__init__()
        self.hidden_size = hidden_size
        # Kept so the inference methods can size their buffers without a hard-coded class count.
        self.output_size = output_size
        # NOTE: n_layers is stored but not used; the network has a single recurrent layer.
        self.n_layers = n_layers

        self.hidden_layer = nn.Linear(input_size + hidden_size, hidden_size)  # next hidden state
        self.output_layer = nn.Linear(input_size + hidden_size, output_size)  # class scores
        self.softmax = nn.LogSoftmax(dim=1)  # finish with a log softmax

    def forward(self, input_, hidden):
        '''
        Runs one time step.
        input_ is (1, input_size) and hidden is (1, hidden_size).
        Returns as (output, hidden): the log-probabilities of each class and
        the next hidden state.
        '''
        combined = torch.cat((input_, hidden), 1)  # concatenate the input and hidden state
        output = self.softmax(self.output_layer(combined))
        hidden = self.hidden_layer(combined)
        return output, hidden

    def init_hidden(self):
        '''Returns the all-zero hidden state used at the start of a song.'''
        return torch.zeros(1, self.hidden_size)

    def _song_probabilities(self, song):
        '''
        Feeds a whole song through the network without tracking gradients.
        Returns the class probabilities after the last step, as a 1-D tensor.
        Raises ValueError when the song has no word vectors.
        '''
        song_tensor = torch.tensor(np.expand_dims(song, axis=1), dtype=torch.float)
        n_steps = song_tensor.size()[0]
        if n_steps == 0:
            raise ValueError('cannot run the RNN on a song with no word vectors')
        hidden = self.init_hidden()
        with torch.no_grad():
            for step in range(n_steps):
                output, hidden = self.forward(song_tensor[step], hidden)
            # output holds log-probabilities; softmax turns them back into probabilities
            probabilities = nn.Softmax(dim=1)(output)
        return probabilities[0]

    def predict_proba(self, X):
        '''
        Returns a (len(X), output_size) tensor whose row k holds the class
        probabilities predicted for song X[k]. Prints progress every 1000 songs.
        '''
        probabilities = torch.zeros((len(X), self.output_size))
        # The song index and the time-step index are kept separate so each
        # song is written to its own row.
        for song_index, song in enumerate(X):
            probabilities[song_index] = self._song_probabilities(song)
            if song_index % 1000 == 0:
                print(song_index, "/", len(X))
        return probabilities

    def score_song(self, song, genre, i):
        '''
        Stores the predicted probabilities for song in self.predictions[i]
        and the one-hot encoding of genre in self.actuals[i].
        Both buffers must already exist (score creates them).
        '''
        self.predictions[i] = self._song_probabilities(song)
        self.actuals[i] = torch.nn.functional.one_hot(
            torch.tensor(genre, dtype=torch.long), num_classes=self.output_size
        )

    def score(self, X, Y):
        '''
        Returns the R2 score between the predicted class probabilities and
        the one-hot encoded labels. Prints progress every 100 songs.
        The probabilities and one-hot labels are kept in self.predictions
        and self.actuals.

        NOTE(review): the evaluation helpers in this file print this value
        under the heading "accuracy" although it is an R2 score -- confirm
        which metric is intended.
        '''
        self.predictions = torch.zeros((len(Y), self.output_size))
        self.actuals = torch.zeros((len(Y), self.output_size))
        # Songs are scored in this process: a separate process per song cannot
        # write its results back into these tensors.
        for i, (song, genre) in enumerate(zip(X, Y)):
            self.score_song(song, genre, i)
            if i % 100 == 0:
                print(i, '/', len(X))
        r2 = R2Score()
        return r2(self.predictions, self.actuals)

def train_model_rnn(X, Y, n_iters=2, lr=1e-4, hidden_size=100, output_size=7):
    '''
    Trains an RNN classifier with one randomly chosen song per update step.

    X: list of songs, each a 2-D array with one row per word vector
    Y: class index of each song
    n_iters: number of update steps (default 2)
    lr: learning rate for the Adam optimizer (default 1e-4)
    hidden_size: size of the RNN hidden state (default 100)
    output_size: number of classes (default 7)

    The defaults reproduce the previously hard-coded settings.
    Returns the trained model.
    '''
    model = RNN(input_size=len(X[0][0]), output_size=output_size, hidden_size=hidden_size, n_layers=2)

    # The model outputs log-probabilities, which is what NLLLoss expects.
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for step in range(1, n_iters + 1):
        optimizer.zero_grad()
        hidden = model.init_hidden()
        # Pick one training song at random for this update.
        r_i = np.random.randint(0, len(X))
        song_tensor = torch.tensor(np.expand_dims(X[r_i], axis=1), dtype=torch.float)
        genre_tensor = torch.tensor(np.expand_dims(Y[r_i], axis=0), dtype=torch.long)
        # Feed the song one word vector at a time; only the final output is scored.
        for t in range(song_tensor.size()[0]):
            output, hidden = model.forward(song_tensor[t], hidden)
        loss = criterion(output, genre_tensor)
        loss.backward()
        optimizer.step()
        if step % 1000 == 0:
            print(f'{step}/{n_iters}')
    return model

def evaluate_model_rnn(model, X_train, Y_train, X_test, Y_test):
    '''
    Prints the value of model.score on the training data and on the test
    data. Each value is multiplied by 100 and shown with two decimals under
    the label "accuracy".
    '''
    # Score on the training data
    training_score = model.score(X_train, Y_train)
    print('\nTraining:')
    print(f' accuracy: {100*training_score:.2f}')

    # Score on the test data
    print('\nTesting: ')
    testing_score = model.score(X_test, Y_test)
    print(f' accuracy: {100*testing_score:.2f}')

# PHASE 2 END ------------------------------------------------------------------

## Executing the Pipeline

### MLP on BOW

Let's start with the MLP classifier on bag-of-words (BOW) features; then we'll try an RNN on word embeddings.

In [None]:
# Phase 2 pipeline
# Load the lyrics (inputs) and their genre labels from the dataset
inputs, labels = gather_data()

In [None]:
# Number of lyric entries loaded
len(inputs)

In [None]:
# convert to classes: Y holds one integer class index per song,
# classes holds the distinct genre labels those indexes refer to
Y, classes = vectorize_labels(labels)

In [None]:
# Let's try MLP on BOW first
# Build the bag-of-words features, then hold out 20% of the songs for testing
X_bow, vectorizer = features_bow(inputs)
X_bow_train, X_bow_test, Y_bow_train, Y_bow_test = train_test_split(X_bow, Y, test_size=0.2, random_state=RANDOM_STATE)

In [None]:
# Train an MLP with the defaults of train_model_mlp (a single hidden layer of 100 units)
model_nn = train_model_mlp(X_bow_train, Y_bow_train)

In [None]:
# evaluate MLP on bow: prints training accuracy, test accuracy and test AUC
evaluate_model_sklearn(model_nn, X_bow_train, Y_bow_train, X_bow_test, Y_bow_test)

### DNN on BOW

In [None]:
# Results noted from earlier runs with different hidden_layer_sizes.
# NOTE(review): the columns are not labelled; they look like
# [train accuracy] [test accuracy] [test AUC] (hidden_layer_sizes) training time -- TODO confirm.
# 60.18 (100, 50, 50, 50, 50) 
# 60.21 (100, 100, 100, 100, 100) 3x longer to train
# 79.61 60.32 (200, 200) 21m 49.5s
# 76.09 60.66 (200, 100, 100, 100) 69m 21.4s
# 84.28 60.76 (500, 100, 100, 100) 183m 23.2s
# 89.32 61 (750, 250) 48m (on 6-core)
# 89.76 61.85 82.00 (1000, 500) 57 min (on 6-core)
# Train and evaluate a deeper MLP; this cell currently uses the (500, 100, 100, 100) configuration.
model_dnn = train_model_mlp(X_bow_train, Y_bow_train, hidden_layer_sizes=(500, 100, 100, 100))
evaluate_model_sklearn(model_dnn, X_bow_train, Y_bow_train, X_bow_test, Y_bow_test)

In [None]:
# Grid Search
# Cross-validated search over two hidden_layer_sizes candidates for the MLP.
# Note that the search is fitted on the full BOW data (X_bow, Y), not only on the training split.
from sklearn.model_selection import GridSearchCV

# Candidate values for each MLPClassifier parameter being searched
parameters = {
    'hidden_layer_sizes':[(500, 500), (500, 250, 250)]
}
mlp = MLPClassifier(solver='lbfgs', random_state=RANDOM_STATE)
# n_jobs=4 lets the search run up to 4 fits in parallel
clf = GridSearchCV(mlp, parameters, n_jobs=4)
clf.fit(X_bow, Y)

In [None]:
# Best parameter combination found by the grid search and its cross-validated score
print(clf.best_params_)
print(clf.best_score_)

### RNN on Embeddings

Now we'll try an RNN on word embeddings.

In [None]:
# takes a long time to run!
# Turn each song's lyrics into a sequence of word2vec vectors
# (the embeddings are trained on first use and loaded from disk afterwards)
X_raw = features_word2vec(inputs)

In [None]:
# Drop songs with fewer than 10 word vectors, together with their labels
X_vec, Y_vec = clean_vectors(X_raw, Y)

In [None]:
# split into training and test (80% / 20%)
X_vec_train, X_vec_test, Y_vec_train, Y_vec_test = train_test_split(X_vec, Y_vec, test_size = 0.2, random_state=RANDOM_STATE)

In [None]:
# train model
# NOTE(review): with its default settings train_model_rnn performs only 2 update steps -- confirm this is intended.
model_rnn = train_model_rnn(X_vec_train, Y_vec_train)

## Evaluating the Model

In [None]:
# NOTE(review): this uses the sklearn evaluation helper on the RNN, so besides model_rnn.score it also
# calls model_rnn.predict_proba and computes AUC. evaluate_model_rnn (defined above) reports only the
# score -- confirm which helper is intended here.
evaluate_model_sklearn(model_rnn, X_vec_train, Y_vec_train, X_vec_test, Y_vec_test)

# Comparing the Models

Comparing the results of Logistic Regression on BOW, Neural Network on BOW, Deep Neural Network on BOW, and RNN on Word Embeddings: