# CS Project 175 Phase 3

In [10]:
import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model 
from sklearn import metrics 
from nltk import word_tokenize


In [11]:
# Phase 1 Code
# Fixed seed passed as `random_state` to the train/test split and to the
# scikit-learn estimators in this notebook.
RANDOM_STATE = 42

def gather_data(path='./data/data.csv'):
    '''
    Loads the lyrics dataset from a CSV file.

    path: location of the CSV to read. Defaults to './data/data.csv', the
          file this notebook has always used, so existing calls without
          arguments behave as before. The file must contain the columns
          'lyrics' and 'genre'.

    Returns as (lyrics, genres), the two columns of the loaded frame.
    Indexing a missing column raises KeyError (pandas behaviour).
    '''
    data = pd.read_csv(path)
    return data['lyrics'], data['genre']

def vectorize_labels(labels, classes=None):
    '''
    Encodes labels as integer codes.

    With an explicit `classes` sequence, each label is coded by its position
    in `classes` (via pd.Categorical); pandas codes labels that are not in
    `classes` as -1. The `classes` argument is handed back unchanged.

    Without `classes`, the result of pd.factorize(labels) is returned as-is.

    Returns as (indexes, labels)
    '''
    if classes is not None:
        encoded = pd.Categorical(labels, categories=classes)
        return encoded.codes, classes
    return pd.factorize(labels)

# PHASE 1 START ----------------------------------------------------------------
def features_bow(data):
    '''
    Builds bag-of-words count features.

    data: object exposing `.to_list()` (presumably the pandas Series of
          lyrics; verify against the caller).

    A new CountVectorizer is fitted on the documents with English stop words
    removed, unigrams and bigrams (ngram_range=(1, 2)) and min_df=0.01.

    Returns as (X, vectorizer): the matrix produced by fit_transform and the
    fitted vectorizer itself.
    '''
    corpus = data.to_list()
    bow = CountVectorizer(ngram_range=(1, 2), min_df=0.01, stop_words='english')
    return bow.fit_transform(corpus), bow

def train_model_logistic(X, Y):
    '''
    Fits a logistic-regression classifier on (X, Y) and returns it.

    The estimator is configured with an L2 penalty, the multinomial
    formulation, balanced class weights, an intercept term and the
    notebook-wide RANDOM_STATE seed.

    NOTE(review): recent scikit-learn releases deprecate the `multi_class`
    argument; confirm against the installed version before upgrading.
    '''
    settings = dict(
        penalty='l2',
        multi_class='multinomial',
        class_weight='balanced',
        random_state=RANDOM_STATE,
        fit_intercept=True,
    )
    logistic = linear_model.LogisticRegression(**settings)
    logistic.fit(X, Y)
    return logistic

def evaluate_model_sklearn(model, X_train, Y_train, X_test, Y_test):
    '''
    Prints train/test accuracy and test AUC for a fitted classifier.

    model: must provide `score(X, Y)` and `predict_proba(X)`.

    Every printed figure is the score multiplied by 100 and formatted with
    two decimals. The AUC comes from metrics.roc_auc_score on the predicted
    class probabilities with multi_class='ovo'.

    Returns as (train_accuracy, test_accuracy), the unscaled values returned
    by model.score. The AUC is printed but not returned.
    '''
    def as_percent(score):
        # Scale by 100 and render with two decimals.
        return format(100*score, '.2f')

    # Accuracy on the training data
    train_accuracy = model.score(X_train, Y_train)
    print('\nTraining:')
    print(' accuracy:', as_percent(train_accuracy))

    # Accuracy on the test data
    print('\nTesting: ')
    test_accuracy = model.score(X_test, Y_test)
    print(' accuracy:', as_percent(test_accuracy))

    # AUC on the test data, from the predicted class probabilities
    test_auc_score = metrics.roc_auc_score(
        Y_test,
        model.predict_proba(X_test),
        multi_class='ovo',
    )
    print(' AUC value:', as_percent(test_auc_score))
    return train_accuracy, test_accuracy

# PHASE 1 END ------------------------------------------------------------------

# Phase 3: doc2vec document embeddings (built in the cells below)

In [12]:
# Load the 'lyrics' column (model inputs) and the 'genre' column (targets).
inputs, labels = gather_data()

In [13]:
# convert to classes
# Convert the genre labels to integer codes. No class list is passed, so
# pd.factorize picks the codes and `classes` holds the unique labels it found.
Y, classes = vectorize_labels(labels)

In [15]:
# Tokenize every lyric with NLTK's word_tokenize; the token sequences are
# what the Doc2Vec cells below train on and infer from.
inputs_tokens = inputs.map(word_tokenize)

In [16]:
# Hold out 20% of the tokenized lyrics (and their label codes) for testing;
# the fixed seed keeps the split the same between runs.
inputs_train, inputs_test, Y_train, Y_test = train_test_split(
    inputs_tokens,
    Y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [17]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk import word_tokenize

# Wrap each training document for Doc2Vec, tagged with its position in
# inputs_train so that doc2vec.dv[i] can later be read back in the same order.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(inputs_train)]

In [28]:
# Train Doc2Vec on the tagged training documents once and save it; later runs
# load the saved model instead of retraining.
# NOTE(review): the cache is keyed only by file name, so a model saved from a
# different dataset or split is loaded without complaint. Delete the file to
# force retraining.
doc2vec_model_name = "doc2vec_model"
if not os.path.exists(doc2vec_model_name):
    doc2vec = Doc2Vec(
        documents,
        vector_size=1024,
        window=5,
        min_count=1,
        workers=4,
    )
    doc2vec.save(doc2vec_model_name)
else:
    doc2vec = Doc2Vec.load(doc2vec_model_name)

In [29]:
# Training features: the vector stored for each training document, read back
# by its integer tag (the position it had when `documents` was built).
X_train = [doc2vec.dv[tag] for tag in range(len(doc2vec.dv))]

# The model may have been loaded from disk rather than trained in this
# session. If it was trained on a different split, the vectors no longer line
# up with Y_train, so stop here with a clear message before inferring anything.
assert len(X_train) == len(Y_train), (
    f"doc2vec model holds {len(X_train)} document vectors but there are "
    f"{len(Y_train)} training labels; delete the cached doc2vec model file "
    "and retrain"
)

# Test features: the model never saw these documents, so infer their vectors.
X_test = [doc2vec.infer_vector(tokens) for tokens in inputs_test]

In [None]:
# Logistic-regression baseline on the doc2vec features. Unlike
# train_model_logistic, no class_weight is passed here.
logistic_model = linear_model.LogisticRegression(
    penalty='l2',
    multi_class='multinomial',
    random_state=RANDOM_STATE,
    fit_intercept=True,
)
logistic_model.fit(X_train, Y_train)

NameError: name 'linear_model' is not defined

In [7]:
# Print train/test accuracy and test AUC for the logistic-regression model;
# the returned (train_accuracy, test_accuracy) pair is the cell's output.
evaluate_model_sklearn(logistic_model, X_train, Y_train, X_test, Y_test)

NameError: name 'logistic_model' is not defined

In [30]:
from sklearn.neural_network import MLPClassifier

# MLP with a single hidden layer of 200 units, trained with the SGD solver on
# the same doc2vec features.
mlp_model = MLPClassifier(
    solver='sgd',
    hidden_layer_sizes=(200,),
    random_state=RANDOM_STATE,
)
mlp_model.fit(X_train, Y_train)



In [32]:
# Print train/test accuracy and test AUC for the MLP; the returned
# (train_accuracy, test_accuracy) pair is the cell's output.
evaluate_model_sklearn(mlp_model, X_train, Y_train, X_test, Y_test)


Training:
 accuracy: 72.70

Testing: 
 accuracy: 56.48
 AUC value: 82.23


(0.7270075773610465, 0.5648345694063464)