In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from transformers import BertModel, BertTokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer

2023-04-14 14:48:54.641896: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
# Stanford IMDB dataset - Downloaded from:
# https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
df = pd.read_csv('IMDBDataset.csv')
# Our own test data - screenshot of data in report
oxymoron_df = pd.read_csv('Oxymorons.csv')

In [8]:
# Due to our kernel crashing, we had to reduce the test and training size for our model
oxymoron_size = 20
train_size = 10000
test_size = 2000
total_size = oxymoron_size + test_size + train_size

data = df[0:total_size].to_dict(orient='records')

data_text, data_labels = list(zip(*map(lambda d: (d['review'], d['sentiment']), data)))

vectorizer = CountVectorizer()

data_vectorized = vectorizer.fit_transform(data_text)

data_boolean_labels = np.array(data_labels) == 'positive'
data_binary_labels = data_boolean_labels.astype(int)

oxymoron_x = data_vectorized[0:oxymoron_size]
oxymoron_y = data_binary_labels[0:oxymoron_size]

train_vectorized = data_vectorized[oxymoron_size:train_size]
test_vectorized = data_vectorized[test_size:total_size]

train_y = data_binary_labels[oxymoron_size:train_size]
test_y = data_binary_labels[test_size:total_size]

print(train_vectorized.shape, train_y.shape, test_vectorized.shape, test_y.shape)


(9980, 57055) (9980,) (10020, 57055) (10020,)


In [9]:
def dataframeToVectorizedData(data):
    
    data = data.to_dict(orient='records')
    
    data_text, data_labels = list(zip(*map(lambda d: (d['review'], d['sentiment']), data)))
    
    vectorizer = CountVectorizer()

    data_vectorized = vectorizer.fit_transform(data_text)

    data_boolean_labels = np.array(data_labels) == 'positive'
    data_binary_labels = data_boolean_labels.astype(int)
    
    data_y = data_binary_labels
    
    return data_vectorized, data_y

In [10]:
class GaussianNaiveBayes:
    def __init__(self):
        return
    
    def fit(self, x, y):
        N, D = x.shape
        C = np.max(y) + 1
        # one parameter for each feature conditioned on each class
        mu, sigma = np.zeros((C,D)), np.zeros((C,D))
        Nc = np.zeros(C) # number of instances in class c
        # calculate MLE for the mean and std
        for c in range(C):
            x_c = x[y == c]                
            Nc[c] = x_c.shape[0]                     
            mu[c,:] = np.mean(x_c,0)                 
            sigma[c,:] = np.std(x_c, 0)               
        self.mu = mu                                  # C x D
        self.sigma = sigma                            # C x D
        # Laplace Smoothing
        self.pi = (Nc+1)/(N+C)
        return self
    
    def logsumexp(self, Z):                                            
        Zmax = np.max(Z,axis=0)[None,:]                              
        log_sum_exp = Zmax + np.log(np.sum(np.exp(Z - Zmax), axis=0))
        return log_sum_exp
    
    def predict(self, xt):
        Nt, D = xt.shape
        log_prior = np.log(self.pi)[:, None]
        EPSILON = 1e-8
        sigma = self.sigma[:, None, :] + EPSILON  # add epsilon to sigma to avoid division by zero or negative values in the logarithm
        log_likelihood = -.5 * np.log(2*np.pi) - np.log(sigma) - .5 * (((xt[None, :, :] - self.mu[:, None, :])/sigma)**2)
        log_likelihood = np.sum(log_likelihood, axis=2)
        log_posterior = log_prior + log_likelihood
        posterior = np.exp(log_posterior - self.logsumexp(log_posterior))
        return posterior.T 
    
    def dataframeToVectorizedData(data):
        
        data = data.to_dict(orient='records')

        data_text, data_labels = list(zip(*map(lambda d: (d['review'], d['sentiment']), data)))

        vectorizer = CountVectorizer()

        data_vectorized = vectorizer.fit_transform(data_text)

        data_boolean_labels = np.array(data_labels) == 'positive'
        data_binary_labels = data_boolean_labels.astype(int)

        data_y = data_binary_labels

        return data_vectorized, data_y

    

In [11]:
model = GaussianNaiveBayes()
model.fit(train_vectorized.toarray(), train_y)

<__main__.GaussianNaiveBayes at 0x7f97344cb640>

In [12]:
# Normal test data
y_prob = model.predict(test_vectorized.toarray())
y_pred = np.argmax(y_prob, 1)
accuracy = np.sum(y_pred == test_y)/y_pred.shape[0]

In [13]:
accuracy

0.8631736526946108

In [14]:
# Oxymoron test data
oxy_y_prob = model.predict(oxymoron_x.toarray())
oxy_y_pred = np.argmax(oxy_y_prob, 1)
oxy_accuracy = np.sum(oxy_y_pred == oxymoron_y)/oxy_y_pred.shape[0]

In [15]:
oxy_accuracy

0.45