Import all needed libraries

In [1]:
import re
import nltk
import math
import os
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

Tokenizing function:

This function takes a list of texts, tokenizes each one, removes stopwords and single-letter tokens, and stems the remaining tokens.

In [2]:
def tokenize_filtering(list_of_contexts):
    """Tokenize, filter and stem every document in ``list_of_contexts``.

    Each document is lower-cased and split with a regex that keeps numbers
    (including thousands separators and decimals, e.g. ``1,000.5``) as single
    tokens and otherwise matches runs of word characters. Stopwords (from the
    module-level ``stop_words`` set) and single alphabetic characters are
    dropped; the surviving tokens are stemmed with NLTK's Porter stemmer.

    The input list is not modified.

    Parameters
    ----------
    list_of_contexts : iterable of str
        Raw document texts.

    Returns
    -------
    list of list of str
        One list of stemmed tokens per input document, in input order.
    """
    # Built once: neither the stemmer nor the pattern depends on the document.
    stemmer = nltk.stem.PorterStemmer()
    token_pattern = re.compile(r'\d+(?:,\d+)*(?:\.\d+)?|\w+')

    all_tokens = []         # a list to save all of tokens of all documents
    for doc in list_of_contexts:
        tokens = token_pattern.findall(doc.lower())
        # Filter on the raw (unstemmed) token, then stem what is kept.
        # Single-character tokens are removed only when alphabetic, so
        # one-digit numbers survive.
        all_tokens.append([
            stemmer.stem(token)
            for token in tokens
            if not ((len(token) < 2 and token.isalpha()) or token in stop_words)
        ])

    return all_tokens       # a 2D list: the tokens of each document

# Fetch the NLTK resources this notebook downloads (no-op when already up to date)
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# English stopwords as a set, used by tokenize_filtering for membership tests
stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Reading dataset

In [3]:
def dataset_proccessing(dataset):
    """Read every file listed in ``dataset`` and return its processed tokens.

    Parameters
    ----------
    dataset : iterable of str
        Paths of the text files to load. Files are decoded as ``cp437``.

    Returns
    -------
    list of list of str
        The result of ``tokenize_filtering`` applied to the file contents,
        one token list per file, in the order the paths were given.
    """
    raw_texts = []
    for file_path in dataset:
        with open(file_path, 'r', encoding='cp437') as handle:
            raw_texts.append(handle.read())

    return tokenize_filtering(raw_texts)

Collect the paths of all files in a directory

In [4]:
def get_files_addresses(path):
    """Return the paths of all entries directly inside the directory ``path``.

    Parameters
    ----------
    path : str
        Directory to list.

    Returns
    -------
    list of str
        ``path`` joined with each entry name, sorted by name so that the
        order is reproducible between runs and platforms.

    Raises
    ------
    OSError
        If ``path`` does not exist or cannot be listed.
    """
    # List the directory directly instead of os.chdir()-ing into it, so the
    # process working directory is left untouched. os.path.join supplies the
    # platform's separator and avoids the invalid "\{" escape in the f-string.
    return [os.path.join(path, entry) for entry in sorted(os.listdir(path))]

Find unique terms in given documents

In [6]:
def find_unique_terms(pos_tokens, neg_tokens):
    """Count term frequencies per class and list the unique terms.

    Parameters
    ----------
    pos_tokens, neg_tokens : iterable of iterable of str
        Token lists of the positive and negative documents.

    Returns
    -------
    vocab : dict
        Maps each term to a two-element list ``[TF in positive, TF in negative]``.
    voc_list : list of str
        Every unique term in order of first appearance (all positive
        documents are scanned before the negative ones).
    """
    vocab = {}
    voc_list = []

    # class_idx 0 is the positive-count slot, 1 is the negative-count slot.
    for class_idx, corpus in enumerate((pos_tokens, neg_tokens)):
        for tokens in corpus:
            for term in tokens:
                counts = vocab.get(term)
                if counts is None:
                    # first sighting of the term: register it with zero counts
                    counts = vocab[term] = [0, 0]
                    voc_list.append(term)
                counts[class_idx] += 1

    return vocab, voc_list

Main code

In [5]:
# Locations of the four dataset splits (train/test x positive/negative).
# NOTE(review): these are absolute, machine-specific Windows paths; the notebook
# only runs elsewhere after they are edited. Consider deriving them from a single
# configurable base directory.
path_train_pos = "C:\\Users\\ASC\\OneDrive\\Desktop\\temp\\aclImdb_v1\\aclImdb\\train\\pos" 
path_train_neg = "C:\\Users\\ASC\\OneDrive\\Desktop\\temp\\aclImdb_v1\\aclImdb\\train\\neg" 
path_test_pos = "C:\\Users\\ASC\\OneDrive\\Desktop\\temp\\aclImdb_v1\\aclImdb\\test\\pos" 
path_test_neg = "C:\\Users\\ASC\\OneDrive\\Desktop\\temp\\aclImdb_v1\\aclImdb\\test\\neg" 

# Collect the path of every file in each split directory.
list_of_train_pos_address = get_files_addresses(path_train_pos)  # list of address of positive training docs
list_of_train_neg_address = get_files_addresses(path_train_neg)  # list of address of negative training docs
list_of_test_pos_address = get_files_addresses(path_test_pos)  # list of address of positive test docs
list_of_test_neg_address = get_files_addresses(path_test_neg)  # list of address of negative test docs

# get the token list of each class
# (reads every training file and runs tokenization, stopword filtering and stemming;
# only the training splits are processed in this cell)
pos_tokens = dataset_proccessing(list_of_train_pos_address)
neg_tokens = dataset_proccessing(list_of_train_neg_address)

In [7]:
# Build the vocabulary from the training tokens: `vocab` maps each term to its
# [positive TF, negative TF] counts and `voc_list` holds the unique terms.
vocab, voc_list = find_unique_terms(pos_tokens, neg_tokens)  # find the vocabulary of dataset

Naive Bayes Classifier

Training

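For every term $w \in V$ and class $c$, the smoothed likelihood is

$$P(w \mid c)=\frac{TF_{w,c}+\alpha}{L_{c}+B}, \qquad B=\alpha\,|V|$$

where $TF_{w,c}$ is the frequency of $w$ in the documents of class $c$, $L_{c}$ is the total number of tokens in class $c$, and $\alpha$ is the smoothing parameter.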

In [None]:
def train_naive_bayes(pos_tokens, neg_tokens, vocab, alpha=0.01):
    """Compute the smoothed per-class likelihood of every vocabulary term.

    For each term ``w`` and class ``c`` the value is
    ``(TF[w, c] + alpha) / (L_c + B)`` with ``B = alpha * len(vocab)``, where
    ``L_c`` is the total number of tokens in the documents of class ``c``.

    Parameters
    ----------
    pos_tokens, neg_tokens : iterable of sequences
        Token lists of the positive and negative training documents; only
        their lengths are used here.
    vocab : dict
        Maps each term to ``[TF in positive, TF in negative]``.
    alpha : float, optional
        Additive smoothing constant (default ``0.01``).

    Returns
    -------
    dict
        Maps each term to ``[likelihood in positive, likelihood in negative]``.
    """
    smoothing_mass = alpha * len(vocab)

    # Denominators: token count of the class plus the total smoothing mass.
    pos_denominator = sum(len(doc) for doc in pos_tokens) + smoothing_mass
    neg_denominator = sum(len(doc) for doc in neg_tokens) + smoothing_mass

    return {
        term: [
            (counts[0] + alpha) / pos_denominator,
            (counts[1] + alpha) / neg_denominator,
        ]
        for term, counts in vocab.items()
    }

In [None]:
# Estimate the smoothed per-term likelihoods for both classes from the training
# tokens, using the function's default alpha.
train_matrix = train_naive_bayes(pos_tokens, neg_tokens, vocab)    # train the dataset with naive bayes