# Multi-genre classification of heavy metal lyrics - Naive Bayes models

This notebook demonstrates the tuning of hyperparameters for a multi-label classification model using Naive Bayes classifiers as the base model for a binary relevance meta-model. See [the parent notebook](./song-lyrics-multi-genre-bow.ipynb) for an in-depth walkthrough of the general problem and the classification framework.

# Table of contents

1. [Imports](#imports)

1. [Hyperparameters](#hyperparameters)

1. [Evaluation metrics](#metrics)

1. [Tuning](#tuning)

    * [Multinomial Naive Bayes](#multinomialnb)

    * [Complement Multinomial Naive Bayes](#complementnb)

    * [Bernoulli Naive Bayes](#bernoullinb)

<a id='imports'></a>
# Imports

In [1]:
import itertools
import warnings
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE

# local imports
from multilabel import BinaryRelevance, MultiLabelClassification
from nlp import get_stopwords

Using TensorFlow backend.


<a id='data'></a>
# Data

In [2]:
# Read the song table. The 'lyrics' column holds the input text; every column
# left in the frame after removing it is treated as a label column.
df = pd.read_csv('songs-ml-10pct.csv')
lyrics = df.pop('lyrics')  # pop() also removes the column from df
X = lyrics.values
y = df.values
genres = df.columns
# A single print call with a newline separator emits the three summary lines.
print(
    f"number of songs: {X.shape[0]}",
    f"number of labels: {y.shape[1]}",
    f"labels: {list(genres)}",
    sep="\n",
)

number of songs: 60964
number of labels: 5
labels: ['black', 'death', 'heavy', 'power', 'thrash']


In [3]:
# Stop words handed to every vectorizer in the tuning cells below.
stop_words = get_stopwords()
n_stop_words = len(stop_words)
print(n_stop_words)

9412


<a id='hyperparameters'></a>
# Hyperparameters

<a id='metrics'></a>
# Evaluation metrics

<a id='tuning'></a>
# Tuning

<a id='multinomialnb'></a>
### Multinomial Naive Bayes

In [4]:
def get_param_sets(grid):
    """Expand a hyperparameter grid into a list of parameter dictionaries.

    Parameters
    ----------
    grid : dict
        Maps each parameter name to an iterable of candidate values.

    Returns
    -------
    list of dict
        One dictionary per combination of values (the Cartesian product of
        all candidate lists). Combinations are ordered with the first key
        varying slowest and the last key varying fastest. An empty grid
        yields a single empty dictionary; a parameter with no candidate
        values yields an empty list.

    Notes
    -----
    ``grid`` is only read, never modified, so the same dictionary can be
    expanded repeatedly and the function does not depend on any global name.
    """
    names = list(grid)
    # itertools.product iterates its last argument fastest, which gives the
    # "first key slowest" ordering described above.
    return [
        dict(zip(names, combination))
        for combination in itertools.product(*(grid[name] for name in names))
    ]

def cross_validation(pipeline):
    """Cross-validate `pipeline` as a binary relevance model and report on it.

    The pipeline is wrapped in a `BinaryRelevance` model over the notebook's
    `genres` and cross-validated on the notebook-level `X` and `y` with
    `n_splits=3`. The function then prints the result's report, prints the
    ROC AUC scores followed by their mean and standard deviation, and plots
    the ROC curve. Nothing is returned.

    Parameters
    ----------
    pipeline
        The estimator pipeline passed to `BinaryRelevance` as its base model.
    """
    classification = BinaryRelevance(pipeline, genres).cross_validate(X, y, n_splits=3)
    classification.print_report()
    # NOTE(review): presumably one ROC AUC score per genre - confirm against
    # the `multilabel` module.
    auc_scores = classification.roc_auc_score()
    print(auc_scores)
    summary_template = "AUC ROC score = {:.2f} +/- {:.2f}"
    print(summary_template.format(auc_scores.mean(), auc_scores.std()))
    classification.plot_roc_curve()

In [None]:
# Grid for the multinomial model. Only the vectorizer and the oversampler are
# varied here; MultinomialNB is built with its default arguments (alpha and
# fit_prior are not part of this grid).
param_grid = {
    'vectorizer': [CountVectorizer, TfidfVectorizer],
    'oversampler': [RandomOverSampler, SMOTE],
}
results = []
param_sets = get_param_sets(param_grid)
for i, params in enumerate(param_sets):
    print(i)  # index of the parameter set currently being evaluated
    # UserWarnings raised while building and cross-validating the pipeline
    # are silenced for the duration of this block only.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        steps = [
            ('vectorizer', params['vectorizer'](stop_words=stop_words)),
            ('oversampler', params['oversampler'](random_state=0)),
            ('multinomialnb', MultinomialNB()),
        ]
        pipeline = Pipeline(steps)
        br = BinaryRelevance(pipeline, genres)
        mlc = br.cross_validate(X, y, n_splits=3)
        # Keep each parameter set next to its cross-validation result.
        results.append((params, mlc))

0
1


<a id='complementnb'></a>
### Complement multinomial Naive Bayes

In [None]:
# Grid for the complement model: vectorizer and oversampler classes plus the
# classifier's alpha and fit_prior arguments.
param_grid = {
    'vectorizer': [CountVectorizer, TfidfVectorizer],
    'oversampler': [RandomOverSampler, SMOTE],
    'alpha': [0.1, 0.5, 1.0],
    'fit_prior': [True, False],
}
for params in get_param_sets(param_grid):
    print(params)
    # UserWarnings raised while building and evaluating the pipeline are
    # silenced for the duration of this block only.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        vectorizer = params['vectorizer'](stop_words=stop_words)
        oversampler = params['oversampler'](random_state=0)
        classifier = ComplementNB(alpha=params['alpha'], fit_prior=params['fit_prior'])
        cross_validation(
            Pipeline(
                [
                    ('vectorizer', vectorizer),
                    ('oversampler', oversampler),
                    ('complementnb', classifier),
                ]
            )
        )

{'vectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'oversampler': <class 'imblearn.over_sampling._random_over_sampler.RandomOverSampler'>, 'alpha': 0.1, 'fit_prior': True}


<a id='bernoullinb'></a>
### Bernoulli Naive Bayes

In [None]:
# Grid for the Bernoulli model: vectorizer and oversampler classes plus the
# classifier's alpha and fit_prior arguments.
param_grid = {
    'vectorizer': [CountVectorizer, TfidfVectorizer],
    'oversampler': [RandomOverSampler, SMOTE],
    'alpha': [0.1, 0.5, 1.0],
    'fit_prior': [True, False],
}
for params in get_param_sets(param_grid):
    print(params)
    # UserWarnings raised while building and evaluating the pipeline are
    # silenced for the duration of this block only.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        vectorizer = params['vectorizer'](stop_words=stop_words)
        oversampler = params['oversampler'](random_state=0)
        classifier = BernoulliNB(alpha=params['alpha'], fit_prior=params['fit_prior'])
        cross_validation(
            Pipeline(
                [
                    ('vectorizer', vectorizer),
                    ('oversampler', oversampler),
                    ('bernoullinb', classifier),
                ]
            )
        )

{'vectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'oversampler': <class 'imblearn.over_sampling._random_over_sampler.RandomOverSampler'>, 'alpha': 0.1, 'fit_prior': True}
