In [5]:
# train and evaluate all NB classifiers, one after another
def process_all_naive_bayes_classifiers (data_tuple):
    """Run every single-classifier Naive Bayes routine on the same split.

    Args:
        data_tuple: The split handed unchanged to each routine; every routine
            unpacks it as ``(data_train, data_test, target_train, target_test)``.

    The routines run in the order Gaussian, Multinomial, Bernoulli, Complement,
    and each prints its own report. Nothing is returned.
    """
    runners = (
        process_naive_bayes_GNB,
        process_naive_bayes_MNB,
        process_naive_bayes_BNB,
        process_naive_bayes_CNB,
    )
    for run_classifier in runners:
        run_classifier(data_tuple)

In [4]:
# methods for training a single NB classifier

#Import the Naive Bayes models (Gaussian, Multinomial, Complement, Bernoulli)
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

def process_naive_bayes_GNB(data_tuple):
    """Fit a Gaussian Naive Bayes model and print its scores on the test data.

    Args:
        data_tuple: Four-element sequence ``(data_train, data_test,
            target_train, target_test)``. ``target_train`` must expose
            ``.values.ravel()``, which is used to flatten it before fitting.

    Prints a header, the accuracy and the F2 score (``beta=2.0``,
    ``average='binary'``) of the predictions for ``data_test``, followed by
    blank lines. Returns ``None``.
    """
    features_train, features_test, labels_train, labels_test = data_tuple

    # fit() hands back the fitted estimator, so construct and train in one step
    classifier = GaussianNB().fit(features_train, labels_train.values.ravel())
    labels_predicted = classifier.predict(features_test)

    # Report: each score is computed right before it is printed
    print("Gaussian Naive Bayes:")
    accuracy = metrics.accuracy_score(labels_test, labels_predicted)
    print("Accuracy:", accuracy)
    f2_score = metrics.fbeta_score(labels_test, labels_predicted, beta=2.0, average='binary')
    print("F2-Score:", f2_score)
    print("\n")

def process_naive_bayes_MNB(data_tuple):
    """Fit a Multinomial Naive Bayes model and print its scores on the test data.

    Args:
        data_tuple: Four-element sequence ``(data_train, data_test,
            target_train, target_test)``. ``target_train`` must expose
            ``.values.ravel()``, which is used to flatten it before fitting.

    Prints a header, the accuracy and the F2 score (``beta=2.0``,
    ``average='binary'``) of the predictions for ``data_test``, followed by
    blank lines. Returns ``None``.
    """
    train_X, test_X, train_y, test_y = data_tuple

    # Build the multinomial model and train it on the flattened targets
    mnb = MultinomialNB()
    flat_train_y = train_y.values.ravel()
    mnb.fit(train_X, flat_train_y)

    # Predictions for the held-out rows
    predicted_y = mnb.predict(test_X)

    # Report (the header text is kept exactly as the notebook prints it)
    print("Multinominal Naive Bayes:")
    print("Accuracy:", metrics.accuracy_score(test_y, predicted_y))
    print(
        "F2-Score:",
        metrics.fbeta_score(test_y, predicted_y, beta=2.0, average='binary'),
    )
    print("\n")

def process_naive_bayes_CNB(data_tuple):
    """Fit a Complement Naive Bayes model and print its scores on the test data.

    Args:
        data_tuple: Four-element sequence ``(data_train, data_test,
            target_train, target_test)``. ``target_train`` must expose
            ``.values.ravel()``, which is used to flatten it before fitting.

    Prints a header, the accuracy and the F2 score (``beta=2.0``,
    ``average='binary'``) of the predictions for ``data_test``, followed by
    blank lines. Returns ``None``.
    """
    x_fit, x_eval, y_fit, y_eval = data_tuple

    # Train the complement model, then predict the evaluation rows
    cnb = ComplementNB().fit(x_fit, y_fit.values.ravel())
    y_hat = cnb.predict(x_eval)

    # Report: each score is computed right before it is printed
    print("Complement Naive Bayes:")
    acc = metrics.accuracy_score(y_eval, y_hat)
    print("Accuracy:", acc)
    f2 = metrics.fbeta_score(y_eval, y_hat, beta=2.0, average='binary')
    print("F2-Score:", f2)
    print("\n")

def process_naive_bayes_BNB(data_tuple):
    """Fit a Bernoulli Naive Bayes model and print its scores on the test data.

    Args:
        data_tuple: Four-element sequence ``(data_train, data_test,
            target_train, target_test)``. ``target_train`` must expose
            ``.values.ravel()``, which is used to flatten it before fitting.

    Prints a header, the accuracy and the F2 score (``beta=2.0``,
    ``average='binary'``) of the predictions for ``data_test``, followed by
    blank lines. Returns ``None``.
    """
    inputs_train, inputs_test, outputs_train, outputs_test = data_tuple

    # Build the Bernoulli model and train it on the flattened targets
    bnb = BernoulliNB()
    bnb.fit(inputs_train, outputs_train.values.ravel())

    # Predictions for the held-out rows
    outputs_predicted = bnb.predict(inputs_test)

    # Report: each score is computed right before it is printed
    print("Bernoulli Naive Bayes:")
    accuracy_value = metrics.accuracy_score(outputs_test, outputs_predicted)
    print("Accuracy:", accuracy_value)
    f2_value = metrics.fbeta_score(
        outputs_test, outputs_predicted, beta=2.0, average='binary'
    )
    print("F2-Score:", f2_value)
    print("\n")

In [17]:
# methods for evaluating best performing classifiers

def process_naive_bayes_MNB_with_extra_split(data_tuple):
    """Fit Multinomial Naive Bayes on a six-way split and print test scores.

    Args:
        data_tuple: Six-element sequence ``(data_train, data_validation,
            data_test, target_train, target_validation, target_test)``.
            ``target_train`` must expose ``.values.ravel()``.

    The validation parts are unpacked but not used: the model is trained on
    the training part and scored on the test part. Prints a header, the
    accuracy and the F2 score (``beta=2.0``, ``average='binary'``), followed
    by blank lines. Returns ``None``.
    """
    (features_train, _features_validation, features_test,
     labels_train, _labels_validation, labels_test) = data_tuple

    # fit() hands back the fitted estimator, so construct and train in one step
    mnb = MultinomialNB().fit(features_train, labels_train.values.ravel())
    labels_predicted = mnb.predict(features_test)

    # Report (the header text is kept exactly as the notebook prints it)
    print("Multinominal Naive Bayes:")
    accuracy = metrics.accuracy_score(labels_test, labels_predicted)
    print("Accuracy:", accuracy)
    f2_score = metrics.fbeta_score(labels_test, labels_predicted, beta=2.0, average='binary')
    print("F2-Score:", f2_score)
    print("\n")

def process_naive_bayes_BNB_with_extra_split(data_tuple):
    """Fit Bernoulli Naive Bayes on a six-way split and print test scores.

    Args:
        data_tuple: Six-element sequence ``(data_train, data_validation,
            data_test, target_train, target_validation, target_test)``.
            ``target_train`` must expose ``.values.ravel()``.

    The validation parts are unpacked but not used: the model is trained on
    the training part and scored on the test part. Prints a header, the
    accuracy and the F2 score (``beta=2.0``, ``average='binary'``), followed
    by blank lines. Returns ``None``.
    """
    train_X, _validation_X, test_X, train_y, _validation_y, test_y = data_tuple

    # Build the Bernoulli model and train it on the flattened targets
    bnb = BernoulliNB()
    flat_train_y = train_y.values.ravel()
    bnb.fit(train_X, flat_train_y)

    # Predictions for the test rows
    predicted_y = bnb.predict(test_X)

    # Report: each score is computed right before it is printed
    print("Bernoulli Naive Bayes:")
    print("Accuracy:", metrics.accuracy_score(test_y, predicted_y))
    print(
        "F2-Score:",
        metrics.fbeta_score(test_y, predicted_y, beta=2.0, average='binary'),
    )
    print("\n")

def process_naive_bayes_CNB_with_extra_split(data_tuple):
    """Fit Complement Naive Bayes on a six-way split and print test scores.

    Args:
        data_tuple: Six-element sequence ``(data_train, data_validation,
            data_test, target_train, target_validation, target_test)``.
            ``target_train`` must expose ``.values.ravel()``.

    The validation parts are unpacked but not used: the model is trained on
    the training part and scored on the test part. Prints a header, the
    accuracy and the F2 score (``beta=2.0``, ``average='binary'``), followed
    by blank lines. Returns ``None``.
    """
    x_fit, _x_validation, x_eval, y_fit, _y_validation, y_eval = data_tuple

    # Create the Complement classifier and train it in one step
    complement_model = ComplementNB().fit(x_fit, y_fit.values.ravel())

    # Predict the response for the test rows
    y_hat = complement_model.predict(x_eval)

    # Report: each score is computed right before it is printed
    print("Complement Naive Bayes:")
    acc = metrics.accuracy_score(y_eval, y_hat)
    print("Accuracy:", acc)
    f2 = metrics.fbeta_score(y_eval, y_hat, beta=2.0, average='binary')
    print("F2-Score:", f2)
    print("\n")

# One-Hot Encoded

In [7]:
from preprocessing.preprocessing_label_encoding import *
from preprocessing.preprocessing_one_hot_encoding import *

# Load a four-part train/test split (one-hot encoded, going by the loader's name;
# the loader comes from the star imports above) and rebind the notebook-level names.
data_train, data_test, target_train, target_test = get_preprocessed_brfss_dataset_one_hot_encoded_train_test_split()

# Fit and report all four Naive Bayes variants (GNB, MNB, BNB, CNB) on this split.
process_all_naive_bayes_classifiers((data_train, data_test, target_train, target_test))

Gaussian Naive Bayes:
Accuracy: 0.5649447543255054
F2-Score: 0.5527952409923612


Multinominal Naive Bayes:
Accuracy: 0.8411553629161189
F2-Score: 0.4553975522883693


Bernoulli Naive Bayes:
Accuracy: 0.8120936511884037
F2-Score: 0.5314358434693228


Complement Naive Bayes:
Accuracy: 0.7322449994670835
F2-Score: 0.5854774037256704




### Oversampled

In [8]:
# Rebind the notebook-level split names to the oversampled variant
# (presumably only the training part is resampled - confirm in the preprocessing module).
data_train, data_test, target_train, target_test = get_preprocessed_brfss_dataset_one_hot_encoded_train_test_split_oversampled()

# Fit and report all four Naive Bayes variants (GNB, MNB, BNB, CNB) on this split.
process_all_naive_bayes_classifiers((data_train, data_test, target_train, target_test))

Gaussian Naive Bayes:
Accuracy: 0.5386897360287064
F2-Score: 0.5458325116676528


Multinominal Naive Bayes:
Accuracy: 0.7326713326464632
F2-Score: 0.5852108849224595


Bernoulli Naive Bayes:
Accuracy: 0.7263651543681388
F2-Score: 0.5876454900663621


Complement Naive Bayes:
Accuracy: 0.7326713326464632
F2-Score: 0.5852108849224595




### Undersampled

In [9]:
# NOTE(review): this cell sits under the "One-Hot Encoded" section, whose other
# cells use the loaders without `all_columns` in their name, yet it calls the
# `all_columns` undersampled loader - the very same call as the cell under
# "All Columns -> Undersampled". Confirm whether the non-`all_columns`
# undersampled loader was intended here; as written, both sections evaluate
# the same loader.
data_train, data_test, target_train, target_test = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns_train_test_split_undersampled()

# Fit and report all four Naive Bayes variants (GNB, MNB, BNB, CNB) on this split.
process_all_naive_bayes_classifiers((data_train, data_test, target_train, target_test))

Gaussian Naive Bayes:
Accuracy: 0.7040359540981277
F2-Score: 0.5772555484078482


Multinominal Naive Bayes:
Accuracy: 0.7458698973247593
F2-Score: 0.5927413357132232


Bernoulli Naive Bayes:
Accuracy: 0.7476818133371229
F2-Score: 0.5877577181925008


Complement Naive Bayes:
Accuracy: 0.7458698973247593
F2-Score: 0.5927413357132232




## All Columns

In [10]:
# Rebind the notebook-level split names to the `all_columns` one-hot variant
# (what "all columns" includes is defined by the preprocessing module, not shown here).
data_train, data_test, target_train, target_test = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns_train_test_split()

# Fit and report all four Naive Bayes variants (GNB, MNB, BNB, CNB) on this split.
process_all_naive_bayes_classifiers((data_train, data_test, target_train, target_test))

Gaussian Naive Bayes:
Accuracy: 0.5368422922513945
F2-Score: 0.5441953589129221


Multinominal Naive Bayes:
Accuracy: 0.8303193946068853
F2-Score: 0.5092415062504793


Bernoulli Naive Bayes:
Accuracy: 0.8042953067822504
F2-Score: 0.5466915816204303


Complement Naive Bayes:
Accuracy: 0.7458521334422852
F2-Score: 0.5931339225016995




### Oversampled

In [11]:
# Rebind the notebook-level split names to the oversampled `all_columns` variant.
# (A stray `data_train.head()` used to follow this line; it was not the cell's
# last expression, so its result was discarded and never displayed.)
data_train, data_test, target_train, target_test = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns_train_test_split_oversampled()

# Fit and report all four Naive Bayes variants (GNB, MNB, BNB, CNB) on this split.
process_all_naive_bayes_classifiers((data_train, data_test, target_train, target_test))

Gaussian Naive Bayes:
Accuracy: 0.5163427718762212
F2-Score: 0.5372419116228211


Multinominal Naive Bayes:
Accuracy: 0.7459231889721818
F2-Score: 0.5931843290558341


Bernoulli Naive Bayes:
Accuracy: 0.7480015632216577
F2-Score: 0.5887191539365452


Complement Naive Bayes:
Accuracy: 0.7459231889721818
F2-Score: 0.5931843290558341




### Undersampled

In [12]:
# Rebind the notebook-level split names to the undersampled `all_columns` variant.
# (A stray `data_train.head()` used to follow this line; it was not the cell's
# last expression, so its result was discarded and never displayed.)
data_train, data_test, target_train, target_test = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns_train_test_split_undersampled()

# Fit and report all four Naive Bayes variants (GNB, MNB, BNB, CNB) on this split.
process_all_naive_bayes_classifiers((data_train, data_test, target_train, target_test))

Gaussian Naive Bayes:
Accuracy: 0.5508224677585533
F2-Score: 0.5489154861585072


Multinominal Naive Bayes:
Accuracy: 0.7457988417948627
F2-Score: 0.5932581405722297


Bernoulli Naive Bayes:
Accuracy: 0.7476640494546488
F2-Score: 0.5886435465960537


Complement Naive Bayes:
Accuracy: 0.7457988417948627
F2-Score: 0.5932581405722297




# Label Encoding

In [13]:
# Rebind the notebook-level split names to the label-encoded variant
# (encoding as suggested by the loader's name; the loader itself is not shown here).
data_train, data_test, target_train, target_test = get_preprocessed_brfss_dataset_label_encoded_train_test_split()

# Fit and report all four Naive Bayes variants (GNB, MNB, BNB, CNB) on this split.
process_all_naive_bayes_classifiers((data_train, data_test, target_train, target_test))

Gaussian Naive Bayes:
Accuracy: 0.8115607347141791
F2-Score: 0.4402773158689145


Multinominal Naive Bayes:
Accuracy: 0.8620812164706718
F2-Score: 0.24444581015886668


Bernoulli Naive Bayes:
Accuracy: 0.8721178100685686
F2-Score: 0.0


Complement Naive Bayes:
Accuracy: 0.7222794613990834
F2-Score: 0.5736873693578095




### Oversampled

In [14]:
# Rebind the notebook-level split names to the oversampled label-encoded variant.
data_train, data_test, target_train, target_test = get_preprocessed_brfss_dataset_label_encoded_train_test_split_oversampled()

# Fit and report all four Naive Bayes variants (GNB, MNB, BNB, CNB) on this split.
process_all_naive_bayes_classifiers((data_train, data_test, target_train, target_test))

Gaussian Naive Bayes:
Accuracy: 0.7840622446441894
F2-Score: 0.5155099833023682


Multinominal Naive Bayes:
Accuracy: 0.7224215724588766
F2-Score: 0.5739418400082791


Bernoulli Naive Bayes:
Accuracy: 0.6728781042384624
F2-Score: 0.3963196635120925


Complement Naive Bayes:
Accuracy: 0.7224215724588766
F2-Score: 0.5739418400082791




### Undersampled

In [15]:
# Rebind the notebook-level split names to the undersampled label-encoded variant.
data_train, data_test, target_train, target_test = get_preprocessed_brfss_dataset_label_encoded_train_test_split_undersampled()

# Fit and report all four Naive Bayes variants (GNB, MNB, BNB, CNB) on this split.
process_all_naive_bayes_classifiers((data_train, data_test, target_train, target_test))

Gaussian Naive Bayes:
Accuracy: 0.784595161118414
F2-Score: 0.5163407919813651


Multinominal Naive Bayes:
Accuracy: 0.722688030695989
F2-Score: 0.5734818987656366


Bernoulli Naive Bayes:
Accuracy: 0.6425018652076598
F2-Score: 0.4102219907036384


Complement Naive Bayes:
Accuracy: 0.722688030695989
F2-Score: 0.5734818987656366




# Evaluating Best Performing Model from Train/Validation Split with Train/Validation/Test Split

## One-hot Encoding - All Columns Undersampled

In [19]:
# With the positional flags (False, True) the loader's result is unpacked into six
# parts (train / validation / test for features and targets) instead of four.
# NOTE(review): the meaning of each flag is not visible here - confirm the parameter
# names in the preprocessing module and consider passing them as keywords.
data_train, data_validation, data_test, target_train, target_validation, target_test = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns_train_test_split_undersampled(False, True)

# Both routines train on the training part and score on the test part;
# the validation parts are passed along but not used by them.
process_naive_bayes_MNB_with_extra_split((data_train, data_validation, data_test, target_train, target_validation, target_test))
process_naive_bayes_CNB_with_extra_split((data_train, data_validation, data_test, target_train, target_validation, target_test))

Multinominal Naive Bayes:
Accuracy: 0.7467625899280576
F2-Score: 0.5887370194255497


Complement Naive Bayes:
Accuracy: 0.7467625899280576
F2-Score: 0.5887370194255497




## One-hot Encoding - All Columns Oversampled

In [20]:
# With the positional flags (False, True) the loader's result is unpacked into six
# parts (train / validation / test for features and targets) instead of four.
# NOTE(review): the meaning of each flag is not visible here - confirm the parameter
# names in the preprocessing module and consider passing them as keywords.
data_train, data_validation, data_test, target_train, target_validation, target_test = get_preprocessed_brfss_dataset_one_hot_encoded_all_columns_train_test_split_oversampled(False, True)

# Both routines train on the training part and score on the test part;
# the validation parts are passed along but not used by them.
process_naive_bayes_MNB_with_extra_split((data_train, data_validation, data_test, target_train, target_validation, target_test))
process_naive_bayes_CNB_with_extra_split((data_train, data_validation, data_test, target_train, target_validation, target_test))

Multinominal Naive Bayes:
Accuracy: 0.7468869348965272
F2-Score: 0.5884987841815623


Complement Naive Bayes:
Accuracy: 0.7468869348965272
F2-Score: 0.5884987841815623




# Extras

In [None]:
import seaborn as sns

from preprocessing.preprocessing_one_hot_encoding import *

# Same loader as the first "One-Hot Encoded" cell; only data_train is used below,
# but all four notebook-level split names are rebound.
data_train, data_test, target_train, target_test = get_preprocessed_brfss_dataset_one_hot_encoded_train_test_split()

# calculate the pairwise correlations between the columns of the training features
correlations = data_train.corr()

# plot the heatmap, labelling both axes with the column names
# (annot=True writes the coefficient into every cell)
sns.heatmap(correlations, xticklabels=correlations.columns, yticklabels=correlations.columns, annot=True)

# plot the clustermap of the same matrix; as the cell's last expression,
# its return value becomes the cell output
sns.clustermap(correlations, xticklabels=correlations.columns, yticklabels=correlations.columns, annot=True)