<a href="https://colab.research.google.com/github/nabeel-gulzar/binary_sentiment_analysis_imdb/blob/main/sentiment_analysis_imdb_nb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install gdown
!gdown --id '1HKHlbOmzsOHcjQ7msreZWAtYOTmkha7_'
!unzip IMDB_Dataset.zip

In [None]:
import glob
import re
import numpy as np
from collections import Counter
from time import time

In [None]:
# Class sub-directory names; a label's position in this list is used as its integer class id.
labels = ['neg', 'pos']

# Dataset locations, relative to the working directory.
train_dir, test_dir = 'Dataset/train/', 'Dataset/test/'
stop_words_path = 'Dataset/stop_words.txt'

# Regex character class of punctuation marks and digits that are stripped from reviews.
garbage_words_re = r"[();:',.\/@#$%*+!`^_0-9><?\"\-]*"

In [None]:
class Time(object):
    """Context manager that prints the wall-clock time spent inside a ``with`` block.

    Usage::

        with Time("Training") as timer:
            ...
        timer.elapsed  # seconds spent inside the block

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, description):
        # Text inserted into the printed "Time for ..." message.
        self.description = description
        # Timestamp taken on entry; None until the block is entered.
        self.start_time = None
        # Seconds spent inside the block; None until the block exits.
        self.elapsed = None

    def __enter__(self):
        self.start_time = time()
        # Return the timer so ``with Time(...) as t`` binds a usable object.
        return self

    def __exit__(self, et, ev, tb):
        self.elapsed = time() - self.start_time
        print("Time for {}: {:.4f}".format(self.description, self.elapsed))
        # Implicitly returns None, so an exception raised in the block still propagates.

In [None]:
# Read the auxiliary stop-word list: every newline-separated entry of the file
# becomes one member of the set (fast membership tests during preprocessing).
with open(stop_words_path, 'r') as stop_words_file:
  stop_words = set(stop_words_file.read().split('\n'))

In [None]:
def preprocess_review(review):
  """Lower-case a review, drop stop words, then strip punctuation and digits.

  Stop-word filtering is applied to whitespace-separated tokens *before* the
  punctuation regex runs, so a token such as "the," is not treated as the
  stop word "the".

  Args:
    review: Raw review text.

  Returns:
    The cleaned review as a single space-joined string.
  """
  kept_tokens = []
  for token in review.lower().split():
    if token in stop_words:
      continue
    kept_tokens.append(token)
  joined = ' '.join(kept_tokens)
  # remove every character matched by the module-level punctuation/digit pattern
  return re.sub(garbage_words_re, '', joined)

In [None]:
def get_documents(directory, labels):
  """Load and preprocess every ``*.txt`` review stored under ``directory``.

  Args:
    directory: Path prefix (including the trailing separator) that contains one
      sub-directory per label.
    labels: Sub-directory names; the position of a label in this sequence is
      recorded as the class id of every file found in that sub-directory.

  Returns:
    A tuple ``(features, sentiments)`` of NumPy arrays holding the preprocessed
    review text and the class index of each file, in matching order.
  """
  sentiments = []
  features = []
  # enumerate pairs each label with its class index
  # (the call was previously misspelled, which raised NameError)
  for i, label in enumerate(labels):
    # glob returns paths in arbitrary order; sorting keeps the document order reproducible
    for filename in sorted(glob.glob(directory + label + '/*.txt', recursive=False)):
      with open(filename, 'r') as file:
        review = preprocess_review(file.read())
      features.append(review)
      sentiments.append(i)
  return np.array(features), np.array(sentiments)

In [None]:
def create_vocabulary(documents):
  """Collect the distinct whitespace-separated tokens of all documents.

  Args:
    documents: Iterable of preprocessed document strings.

  Returns:
    A list of the unique tokens, sorted so that the word-to-index mapping is
    the same on every run.
  """
  vocabulary = set()
  for document in documents:
    # set.update adds all tokens at once, without building a throwaway list
    vocabulary.update(document.split())
  return sorted(vocabulary)

In [None]:
# Load and preprocess the training reviews; Time prints how long the block took.
with Time("Getting Documents"):
  train_documents, train_sentiments = get_documents(train_dir, labels)

Time for Getting Documents: 5.0821


In [None]:
# Build the vocabulary from the training documents only.
with Time("Creating Vocabulary"):
  vocabulary = create_vocabulary(train_documents)

Time for Creating Vocabulary: 1.6153


In [None]:
def train_naive_bayes(documents, actual_labels, classes, vocabulary):
  """Estimate multinomial naive Bayes parameters with add-one (Laplace) smoothing.

  Args:
    documents: NumPy array of preprocessed document strings.
    actual_labels: NumPy array of class indices, aligned with ``documents``.
    classes: Sequence of class names; only its length is used, the class ids
      are the positions ``0 .. len(classes) - 1``.
    vocabulary: Sequence of words; column ``i`` of the returned likelihoods
      belongs to ``vocabulary[i]``.

  Returns:
    A tuple ``(log_prior, log_likelihood)``: a vector with the log prior of
    each class and an array with one row of log word likelihoods per class.
  """
  n_documents = np.size(documents, axis=0)
  vocabulary_size = len(vocabulary)
  log_prior_vector = np.zeros([len(classes)])
  likelihood_rows = []
  for class_index, _ in enumerate(classes):
    # documents whose label equals the current class id
    in_class = documents[actual_labels == class_index]
    # log prior = log(share of the documents that belong to this class)
    log_prior_vector[class_index] = np.log(np.size(in_class, axis=0) / n_documents)
    # occurrences of every token across all documents of this class
    token_counts = Counter()
    for document in in_class:
      token_counts.update(document.split())
    # raw counts laid out in vocabulary order (0 for words absent from the class)
    counts = np.array([token_counts[word] for word in vocabulary], dtype=float)
    # Laplace smoothing: (count + 1) / (total count + vocabulary size)
    likelihood_rows.append((counts + 1) / (counts.sum() + vocabulary_size))
  return log_prior_vector, np.log(likelihood_rows)

In [None]:
# Fit the from-scratch naive Bayes model on the training documents.
with Time("Training"):
  log_prior, log_likelihood = train_naive_bayes(train_documents, train_sentiments, labels, vocabulary)

Time for Training: 2.5074


# Testing

In [None]:
# Load and preprocess the held-out test reviews with the same pipeline as the training set.
with Time("Getting Test Documents"):
  test_documents, test_sentiments = get_documents(test_dir, labels)

Time for Getting Test Documents: 4.8380


In [None]:
def test_naive_bayes(documents, log_prior, log_likelihood, classes, vocabulary):
  total_documents_count = np.size(documents, axis=0)
  max_a_posteriori = np.zeros(total_documents_count)
  for doc_index, document in enumerate(documents):
    posteriori = np.zeros(len(classes))
    for class_index in range(len(classes)):
      likelihood = log_prior[class_index]
      for word in document.split():
        likelihood += log_likelihood[class_index].get(word, 0)
      posteriori[class_index] = likelihood
    max_a_posteriori[doc_index] = np.argmax(posteriori)
  return max_a_posteriori

In [None]:
# test_naive_bayes looks words up with .get(), so each class's row of
# log-likelihoods is first turned into a {word: log-likelihood} dictionary.
with Time("Converting Likelihoods to Dictionary"):
  # one dictionary per label (rather than a hard-coded pair), so the cell also
  # works when `labels` holds more than two classes
  log_likelihood_dict = [
      dict(zip(vocabulary, log_likelihood[class_index]))
      for class_index in range(len(labels))
  ]
with Time("Predicting on Test"):
  predicted_labels = test_naive_bayes(test_documents, log_prior, log_likelihood_dict, labels, vocabulary)

Time for Converting Likelihoods to Dictionary: 0.1161
Time for Predicting on Test: 4.8782


In [None]:
with Time("Computing Performance Metrics"):
  total_documents = len(test_sentiments)
  # NOTE: despite its name, true_positives counts every correct prediction
  # (for both classes), not only correctly predicted positives
  true_positives = np.count_nonzero(predicted_labels == test_sentiments)
  # accuracy = correct predictions / all test documents
  accuracy_score = true_positives/total_documents

Time for Computing Performance Metrics: 0.0081


Accuracy

In [None]:
# Report the test accuracy of the from-scratch classifier, rounded to four decimals.
print("Accuracy of naive bayes (implemented from scratch) on test data is {:.4f}".format(accuracy_score))

Accuracy of naive bayes (implemented from scratch) on test data is 0.8258


# Part 2: Naive Bayes with scikit-learn

In [None]:
#imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pandas as pd

In [None]:
# creating the feature space, i.e. the bag-of-words vocabulary, from the training documents
with Time("Creating Vocabulary"):
  vectorizer = CountVectorizer().fit(train_documents)
# converting train data (movie reviews) into bag-of-words features;
# fit and transform are kept separate so each step is timed on its own
with Time("Transforming Training Doc to Features"):
  train_features = vectorizer.transform(train_documents)

Time for Creating Vocabulary: 3.7877
Time for Transforming Training Doc to Features: 3.4732


In [None]:
# training scikit-learn's multinomial naive bayes (default parameters) on the training features
with Time("Training multinomial model"):
  model = MultinomialNB().fit(train_features, train_sentiments)

Time for Training multinomial model: 0.0349


In [None]:
# converting test data (movie reviews) into bag-of-words features using only the train vocabulary
with Time("Converting Testing Doc to Features"):
  test_features = vectorizer.transform(test_documents)
# predicting sentiment of test data
with Time("Getting Prediction of test"):
  predicted_sentiments = model.predict(test_features)
# computing accuracy using sklearn.metrics
# (this rebinds accuracy_score, which previously held the from-scratch model's accuracy)
with Time("Computing Performance Metrics"):
  accuracy_score = metrics.accuracy_score(test_sentiments, predicted_sentiments)
  # computing confusion matrix using sklearn.metrics; labels=[0,1] fixes the class order of rows/columns
  confusion_matrix = metrics.confusion_matrix(test_sentiments, predicted_sentiments, labels=[0,1])

Time for Converting Testing Doc to Features: 3.3941
Time for Getting Prediction of test: 0.0248
Time for Computing Performance Metrics: 0.0210


Accuracy for test data

In [None]:
# Report the test accuracy of the scikit-learn model, rounded to four decimals.
print("Accuracy for test data using sklearn is {:.4f}".format(accuracy_score))

Accuracy for test data using sklearn is 0.8256


Confusion Matrix for test data

In [None]:
# Wrap the confusion matrix in a labelled DataFrame for display.
# Per sklearn's convention, rows are the true classes and columns the predicted classes.
# NOTE(review): the variable name is misspelled ("consfusion"); left unchanged here.
consfusion_dataframe = pd.DataFrame(data=confusion_matrix, index=["Negative", "Positive"], columns=["Negative", "Positive"])
consfusion_dataframe

Unnamed: 0,Negative,Positive
Negative,11002,1498
Positive,2861,9639
