# Phase 3 Improvements

Attempting to improve the models from Phase 2.

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import linear_model 
from sklearn import metrics 

## Building the Pipeline

In [2]:
# Shared seed: passed as random_state to the classifiers and to train_test_split,
# and used to seed NumPy when sampling misclassified songs.
RANDOM_STATE = 42

def gather_data(path='./data/data.csv'):
    '''
    Loads the song dataset from a CSV file.
    path: location of the CSV; defaults to the project data file.
    Returns as (lyrics, genre), the 'lyrics' and 'genre' columns of the file.
    Raises KeyError if either column is missing.
    '''
    data = pd.read_csv(path)
    return data['lyrics'], data['genre']

def vectorize_labels(labels, classes=None):
    '''
    Encodes the labels as integer class indexes.
    When classes is given, each label is mapped to its position in classes
    (labels not present in classes get the code -1); otherwise the classes
    are discovered from the labels in order of first appearance.
    Returns as (indexes, classes)
    '''
    if classes is not None:
        categorical = pd.Categorical(labels, categories=classes)
        return categorical.codes, classes
    indexes, discovered = pd.factorize(labels)
    return indexes, discovered


# PHASE 1 START ----------------------------------------------------------------
def features_bow(data, min_df=0.01):
    '''
    Builds bag-of-words count features from the documents in data.
    data must provide .to_list() (e.g. a pandas Series of lyrics).
    min_df is forwarded to CountVectorizer as its min_df setting.
    Returns as (document-term matrix, fitted vectorizer)
    '''
    bow = CountVectorizer(
        ngram_range=(1, 2),  # count both unigrams and bigrams
        stop_words='english',
        min_df=min_df,
    )
    features = bow.fit_transform(data.to_list())
    return features, bow

def train_model_logistic(X, Y):
    '''
    Fits an L2-penalised, class-balanced multinomial logistic regression
    on (X, Y), seeded with RANDOM_STATE.
    Returns the fitted classifier.
    '''
    # NOTE(review): recent scikit-learn releases deprecate the multi_class
    # argument -- confirm against the installed version.
    settings = dict(
        penalty='l2',
        multi_class='multinomial',
        class_weight='balanced',
        random_state=RANDOM_STATE,
        fit_intercept=True,
    )
    model = linear_model.LogisticRegression(**settings)
    model.fit(X, Y)
    return model

def evaluate_model_sklearn(model, X_train, Y_train, X_test, Y_test):
    '''
    Prints the training accuracy, the test accuracy and the one-vs-one
    test AUC of a fitted model, each as a percentage with two decimals.
    model must provide score() and predict_proba().
    Returns as (train accuracy, test accuracy), unscaled as given by model.score
    '''
    def as_percent(score):
        return format(100 * score, '.2f')

    # Accuracy on the data the model was fitted on
    train_accuracy = model.score(X_train, Y_train)
    print('\nTraining:')
    print(' accuracy:', as_percent(train_accuracy))

    # Accuracy on the held-out data
    print('\nTesting: ')
    test_accuracy = model.score(X_test, Y_test)
    print(' accuracy:', as_percent(test_accuracy))

    # AUC on the held-out data, from the predicted class probabilities
    test_probabilities = model.predict_proba(X_test)
    test_auc = metrics.roc_auc_score(Y_test, test_probabilities, multi_class='ovo')
    print(' AUC value:', as_percent(test_auc))

    return train_accuracy, test_accuracy

def sample_incorrect_predictions(predictions, probabilities, actuals, classes, titles, lyrics,
                                 num_examples=10, seed=None):
    '''
    Prints a random sample of misclassified songs for manual inspection.

    predictions:   predicted class indexes, one per song
    probabilities: per-song class probabilities, indexed as [song][class]
    actuals:       true class indexes, aligned with predictions
    classes:       class names, indexed by class index
    titles:        song titles, aligned by position with predictions
    lyrics:        song lyrics, aligned by position with predictions
    num_examples:  maximum number of songs to print
    seed:          seed for the sampling; defaults to RANDOM_STATE

    Each misclassified song is printed at most once. If nothing was
    misclassified a short notice is printed instead of raising.
    '''
    def item_at(values, position):
        # Positional lookup: pandas objects coming out of a shuffled split keep
        # their original index labels, so plain values[position] would look up
        # the wrong row (or raise KeyError).
        return values.iloc[position] if hasattr(values, 'iloc') else values[position]

    predictions = np.asarray(predictions)
    actuals = np.asarray(actuals)

    # Positions of the misclassified songs, computed once rather than on
    # every loop iteration.
    incorrect = np.flatnonzero(predictions != actuals)
    if incorrect.size == 0:
        print('No incorrect predictions to sample.')
        return

    # A private generator keeps the sample reproducible without reseeding
    # NumPy's global random state as a side effect.
    rng = np.random.RandomState(RANDOM_STATE if seed is None else seed)
    # Without replacement, so the same song is never shown twice.
    chosen = rng.choice(incorrect, size=min(num_examples, incorrect.size), replace=False)

    for i in chosen:
        print("Song Title:", item_at(titles, i))
        print('Predicted:', classes[predictions[i]], 'Actual:', classes[actuals[i]])
        print('Probability:', probabilities[i][predictions[i]])
        print("Lyrics: ")
        print('"' + item_at(lyrics, i)[:100] + '..."')
        print()

# PHASE 1 END ------------------------------------------------------------------

# Phase 2 separated into different blocks

## Executing the Pipeline

### MLP on BOW

Let's start with the MLP classifier on BOW, then we'll try RNN on embeddings

In [3]:
# Phase 2 pipeline
# Load the lyrics (model inputs) and their genre labels from the dataset CSV.
inputs, labels = gather_data()

In [4]:
# convert to classes
# No classes are passed, so vectorize_labels factorizes the labels: Y gets one
# integer code per song and classes the distinct genres, so classes[code]
# recovers the genre name.
Y, classes = vectorize_labels(labels)

In [None]:
# Let's try MLP on BOW first
# NOTE(review): the vectorizer is fitted on the full corpus before the split, so
# the held-out lyrics also influence which n-grams end up in the vocabulary.
X_bow, vectorizer = features_bow(inputs, min_df=0.001)
# Hold out 20% of the songs for testing; the split is seeded for reproducibility.
X_bow_train, X_bow_test, Y_bow_train, Y_bow_test = train_test_split(X_bow, Y, test_size=0.2, random_state=RANDOM_STATE)

In [5]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

def train_model_mlp(X, Y, solver='lbfgs', **kwargs):
    '''
    Fits an MLPClassifier on (X, Y), seeded with RANDOM_STATE.
    solver and any extra keyword arguments are forwarded to MLPClassifier.
    Returns the fitted classifier.
    '''
    mlp = MLPClassifier(solver=solver, random_state=RANDOM_STATE, **kwargs)
    # fit() hands back the estimator itself, so it can be returned directly.
    return mlp.fit(X, Y)
    

In [None]:
# Sweep the min_df setting of the bag-of-words vectorizer and record, for each
# value, the (train accuracy, test accuracy) of an SGD-trained MLP.
table = {}
params = [0.001, 0.005, 0.02]
for param in params:
    # Features are rebuilt for every setting because min_df changes the vocabulary.
    X_bow, vectorizer = features_bow(inputs, min_df=param)
    # Same test_size and seed for every setting.
    X_bow_train, X_bow_test, Y_bow_train, Y_bow_test = train_test_split(X_bow, Y, test_size=0.2, random_state=RANDOM_STATE)
    model = train_model_mlp(X_bow_train, Y_bow_train, solver='sgd')
    train, test = evaluate_model_sklearn(model, X_bow_train, Y_bow_train, X_bow_test, Y_bow_test)
    table[param] = (train, test)




Training:
 accuracy: 94.71

Testing: 
 accuracy: 62.72
 AUC value: 82.19





Training:
 accuracy: 85.92

Testing: 
 accuracy: 60.81
 AUC value: 81.49





Training:
 accuracy: 68.33

Testing: 
 accuracy: 58.62
 AUC value: 81.83


In [None]:
# Print the vocabulary size produced by each min_df setting.
params = [0.001, 0.005, 0.02]
for param in params:
    X_bow, vectorizer = features_bow(inputs, min_df=param)
    # NOTE(review): the split result is not used in this cell; only the
    # vocabulary size below is reported.
    X_bow_train, X_bow_test, Y_bow_train, Y_bow_test = train_test_split(X_bow, Y, test_size=0.2, random_state=RANDOM_STATE)
    print(len(vectorizer.vocabulary_))

10678
2216
553


In [7]:
# Compare the three learning_rate settings of an SGD-trained MLP (with early
# stopping) on min_df=0.001 bag-of-words features.
# NOTE(review): rebinding `table` here discards the min_df results collected
# in the earlier sweep cell.
table = {}
X_bow, vectorizer = features_bow(inputs, min_df=0.001)
X_bow_train, X_bow_test, Y_bow_train, Y_bow_test = train_test_split(X_bow, Y, test_size=0.2, random_state=RANDOM_STATE)

params = ['constant', 'invscaling', 'adaptive']
for param in params:
    model = train_model_mlp(X_bow_train, Y_bow_train, solver='sgd', early_stopping=True, learning_rate=param)
    train, test = evaluate_model_sklearn(model, X_bow_train, Y_bow_train, X_bow_test, Y_bow_test)
    # Keyed by the learning_rate setting: (train accuracy, test accuracy).
    table[param] = (train, test)


Training:
 accuracy: 80.08

Testing: 
 accuracy: 62.83
 AUC value: 83.80

Training:
 accuracy: 55.33

Testing: 
 accuracy: 54.44
 AUC value: 75.55

Training:
 accuracy: 84.98

Testing: 
 accuracy: 63.28
 AUC value: 83.61


In [None]:
# Show the (train accuracy, test accuracy) pairs collected in `table`.
# NOTE(review): `table` is rebuilt by more than one experiment cell, so what is
# displayed depends on which of them ran last; the saved output is keyed by
# min_df values.
table

{0.001: (0.9470771784091824, 0.6271571068095988),
 0.005: (0.8592083592655159, 0.6081220681129231),
 0.02: (0.6833479400320924, 0.5862419346644312)}