# CS Project 175 Phase 2

In [None]:
!pip install gensim

In [1]:
import pandas as pd
import numpy as np

from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model 
from sklearn import metrics 
from sklearn import preprocessing

import seaborn

## Building the Pipeline

In [5]:
# Seed passed as `random_state` to LogisticRegression and train_test_split in this notebook.
RANDOM_STATE = 42

def gather_data():
    '''
    Loads the dataset from ./data/data.csv.
    Returns as (lyrics, genre), the two columns of that file with those names.
    '''
    frame = pd.read_csv('./data/data.csv')
    lyrics, genres = frame['lyrics'], frame['genre']
    return lyrics, genres

def vectorize_labels(labels, classes=None):
    '''
    Encodes the labels as integer codes.

    When `classes` is given, codes are positions within `classes`;
    otherwise the distinct labels are discovered with pd.factorize.
    Returns as (indexes, labels)
    '''
    if classes is not None:
        # fixed vocabulary supplied by the caller: hand it back unchanged
        codes = pd.Categorical(labels, categories=classes).codes
        return codes, classes
    codes, uniques = pd.factorize(labels)
    return codes, uniques


# PHASE 1 START ----------------------------------------------------------------
def features_bow(data):
    '''
    Builds bag-of-words count features (unigrams and bigrams) from the documents.

    The vectorizer drops English stop words and uses min_df=0.01.
    Returns as (X, vectorizer), where vectorizer is the fitted CountVectorizer.
    '''
    bow = CountVectorizer(ngram_range=(1, 2), min_df=0.01, stop_words='english')
    documents = data.to_list()
    return bow.fit_transform(documents), bow

def train_model_logistic(X, Y):
    '''
    Fits a multinomial, L2-penalised logistic regression with balanced class weights.
    Returns the fitted classifier.
    '''
    settings = dict(
        penalty='l2',
        multi_class='multinomial',
        class_weight='balanced',
        random_state=RANDOM_STATE,
        fit_intercept=True,
    )
    model = linear_model.LogisticRegression(**settings)
    model.fit(X, Y)
    return model

def evaluate_model_logistic(model, X_train, Y_train, X_test, Y_test):
    '''
    Prints accuracy on the training data, then accuracy and one-vs-one
    ROC AUC on the test data. All values are shown as percentages with
    two decimals. Returns nothing.
    '''
    # Training accuracy
    train_accuracy = model.score(X_train, Y_train)
    print('\nTraining:')
    print(' accuracy:', f'{100*train_accuracy:.2f}')

    # Test accuracy
    print('\nTesting: ')
    test_accuracy = model.score(X_test, Y_test)
    print(' accuracy:', f'{100*test_accuracy:.2f}')

    # Test AUC, computed from the predicted class probabilities
    test_auc_score = metrics.roc_auc_score(
        Y_test,
        model.predict_proba(X_test),
        multi_class='ovo',
    )
    print(' AUC value:', f'{100*test_auc_score:.2f}')
# PHASE 1 END ------------------------------------------------------------------

# PHASE 2 START ----------------------------------------------------------------
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import os

def features_word2vec(data, cache_path="word2vec.wordvectors"):
    '''
    Converts every document into a list of Word2Vec word vectors.

    Embeddings are loaded from `cache_path` when that file exists; otherwise a
    Word2Vec model is trained on `data` and its vectors are saved there.
    NOTE: an existing cache file is reused as-is, whatever data it was trained
    on. Delete the file to force retraining.

    data: pandas Series of raw text, one document per row. Missing entries
        are treated as empty documents so the result stays aligned
        row-for-row with the input.
    cache_path: file used to store / load the trained word vectors.

    Returns a list with one entry per row of `data`. Each entry is the list of
    vectors for that row's whitespace-separated tokens that are present in the
    embedding vocabulary (tokens without a vector are skipped).
    '''
    # Without fillna, missing rows stay NaN after .str.split() and the
    # per-word loop below fails with "'float' object is not iterable".
    word_lists = data.fillna("").str.split()
    if not os.path.exists(cache_path):
        print("Did not find pre-trained embeddings, training now...")
        model = Word2Vec(sentences=word_lists, vector_size=200, window=5, min_count=1, workers=4)
        word_vectors = model.wv
        word_vectors.save(cache_path)
    else:
        print("Found pre-trained embeddings, loading now...")
        word_vectors = KeyedVectors.load(cache_path, mmap='r')
    print("Vectorizing features...")
    total = len(word_lists)
    res = []
    for i, word_list in enumerate(word_lists):
        res.append([word_vectors[word] for word in word_list if word in word_vectors])
        # lightweight progress report every 1000 documents
        if i % 1000 == 0:
            print(i, "/", total)
    return res

def train_model_rnn(X, Y):
    '''
    Placeholder for the Phase 2 RNN training step.
    Not implemented yet: ignores its arguments and returns None.
    '''
    return None

def evaluate_model_rnn(model, X_train, Y_train, X_test, Y_test):
    '''
    Placeholder for the Phase 2 RNN evaluation step.
    Not implemented yet: ignores its arguments and returns None.
    '''
    return None

# PHASE 2 END ------------------------------------------------------------------

## Executing the Pipeline

In [6]:
# Phase 2 pipeline
# Load the 'lyrics' column as the inputs and the 'genre' column as the labels.
inputs, labels = gather_data()

In [7]:
# convert to features
# Y_raw holds one integer code per label; classes holds the distinct labels from pd.factorize.
Y_raw, classes = vectorize_labels(labels)
# takes a long time to run!
# Prints progress every 1000 rows; trains embeddings unless word2vec.wordvectors already exists.
X_raw = features_word2vec(inputs)

0 / 295288
1000 / 295288


In [3]:
# remove inputs with too few vectors
# Positions of samples with fewer than 10 word vectors.
small_inputs = [i for i, x in enumerate(X_raw) if len(x) < 10]
# Set copy of those positions for fast membership tests in the filter below.
small_inputs_set = set(small_inputs)
X = [v for i, v in enumerate(X_raw) if i not in small_inputs_set]
# Drop the same positions from the labels so X and Y stay aligned.
Y = np.delete(Y_raw, small_inputs)

NameError: name 'X_raw' is not defined

In [None]:
# split into training and test
# 20% of the samples are held out for testing; the split is seeded with RANDOM_STATE.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=RANDOM_STATE)

In [None]:
# train model
# NOTE: train_model_rnn is currently a stub (its body is `pass`), so `model` is None here.
model = train_model_rnn(X_train, Y_train)

## Evaluating the Model

In [None]:
# NOTE: evaluate_model_rnn is currently a stub (its body is `pass`), so this call does nothing yet.
evaluate_model_rnn(model, X_train, Y_train, X_test, Y_test)

# Comparing the 2 Models