# Import libraries
- ### Note that f1 score and accuracy score is imported only to sanity check

In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# For tokenization
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')

# For word lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

# For performance evaluation
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niketan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/niketan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/niketan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Importing the dataset

In [2]:
# Reading the csv file
full_dataset = pd.read_csv("imdb_master.csv", encoding="ISO-8859-1")

- Used ISO-8859-1 encoding

# Data Exploration

In [3]:
# printing the dataframe with 5 entries
full_dataset.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [4]:
# getting the information on the dataset
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  100000 non-null  int64 
 1   type        100000 non-null  object
 2   review      100000 non-null  object
 3   label       100000 non-null  object
 4   file        100000 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


- From the above exploration we can say that we will need review and label and type as it tells about type of data (but still it is not clear)
- We can also confirm from info that there is no null object

## Further exploration for how many values of type and label

In [5]:
# finding type of values in "type" column
full_dataset.type.value_counts()

train    75000
test     25000
Name: type, dtype: int64

In [6]:
# finding type of values in "label" column
full_dataset.label.value_counts()

unsup    50000
neg      25000
pos      25000
Name: label, dtype: int64

### From above we can say that 
- `type` - `train` and `test` as type of data
- `label` - `unsup`, `neg` and `pos`

# Data Preprocessing
- Remove the unwanted data
    - dropping the columns - `Unnamed and file`
    - dropping the rows that has label= `unsup`

In [7]:
# taking only the useful columns
full_dataset = full_dataset.iloc[:, 1:-1]

In [8]:
# dropping the rows with label = unsup
full_dataset = full_dataset[full_dataset.label != "unsup"]

In [9]:
full_dataset

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg
...,...,...,...
49995,train,"Seeing as the vote average was pretty low, and...",pos
49996,train,"The plot had some wretched, unbelievable twist...",pos
49997,train,I am amazed at how this movie(and most others ...,pos
49998,train,A Christmas Together actually came before my t...,pos


# Text Preprocessing
- Data Cleaning - Removiing punctuation symbols and removing numbers and converting text to lowercase
- Tokenization - Essentially basic of splitting the data with space
- Stop Words Removal
- Lemmatization

## Data Cleaning
### 1. Removing punctuation 
### 2. Removing numbers
### 3. Removing html tags
### 4. Removing urls
### 6. Converting the text to lower case

In [10]:
def remove_punctuation(review):
    return re.sub(r'[^\w\s]', "", review)

def remove_numbers(review):
    return re.sub(r'[\d+]', "", review)

def remove_html_tags(review):
    """
        This takes care of the html tags as well as &nsbm similar characters
        which are not specifically enclosed in html tags
    """  
    c_rule = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(c_rule,'', review)

def remove_urls(review):
    return re.sub('https://.*', '', review)


def convert_to_lowercase(review):
    return review.lower()

In [11]:
full_dataset

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg
...,...,...,...
49995,train,"Seeing as the vote average was pretty low, and...",pos
49996,train,"The plot had some wretched, unbelievable twist...",pos
49997,train,I am amazed at how this movie(and most others ...,pos
49998,train,A Christmas Together actually came before my t...,pos


In [12]:
%%time
# Applying data cleaning
full_dataset["review"] = full_dataset.review.apply(remove_punctuation)
full_dataset["review"] = full_dataset.review.apply(remove_numbers)
full_dataset["review"] = full_dataset.review.apply(remove_html_tags)
full_dataset["review"] = full_dataset.review.apply(remove_urls)

full_dataset["review"] = full_dataset.review.apply(convert_to_lowercase)

CPU times: user 1.91 s, sys: 28.2 ms, total: 1.94 s
Wall time: 1.94 s


In [13]:
full_dataset

Unnamed: 0,type,review,label
0,test,once again mr costner has dragged out a movie ...,neg
1,test,this is an example of why the majority of acti...,neg
2,test,first of all i hate those moronic rappers who ...,neg
3,test,not even the beatles could write songs everyon...,neg
4,test,brass pictures movies is not a fitting word fo...,neg
...,...,...,...
49995,train,seeing as the vote average was pretty low and ...,pos
49996,train,the plot had some wretched unbelievable twists...,pos
49997,train,i am amazed at how this movieand most others h...,pos
49998,train,a christmas together actually came before my t...,pos


## Tokenization and stop words removal

In [14]:
# getting the stopwords list
stopwords_list = set(stopwords.words("english"))
def apply_tokenization_and_remove_stopwords(review):
    # Applying tokenization
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(review)
    tokens = [token.strip() for token in tokens]
    # applying removal of stopwords
    review_no_stopwords = [word for word in tokens if word not in stopwords_list]
    return " ".join(review_no_stopwords)

In [15]:
# Removing stop words from the text
full_dataset["review"] = full_dataset.review.apply(apply_tokenization_and_remove_stopwords)

## Lemmatization

In [16]:
# Converting the text to root word - eg. low, lower, lowest converted to low
def apply_lemmatization(review):
    lemmatized_review = []
    lemmatizer = WordNetLemmatizer()
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(review)
    tokens = [token.strip() for token in tokens]
    for w in tokens:
        lemmatized_review.append(lemmatizer.lemmatize(w))
    return " ".join(lemmatized_review)

In [17]:
# Applying lemmatization
full_dataset["review"] = full_dataset.review.apply(apply_lemmatization)

In [18]:
full_dataset

Unnamed: 0,type,review,label
0,test,mr costner dragged movie far longer necessary ...,neg
1,test,example majority action film generic boring th...,neg
2,test,first hate moronic rapper couldnt act gun pres...,neg
3,test,even beatles could write song everyone liked a...,neg
4,test,brass picture movie fitting word really somewh...,neg
...,...,...,...
49995,train,seeing vote average pretty low fact clerk vide...,pos
49996,train,plot wretched unbelievable twist however chemi...,pos
49997,train,amazed movieand others average star lower crap...,pos
49998,train,christmas together actually came time ive rais...,pos


## Removing duplicates from the dataset

In [19]:
# checking the shape before removing duplicates
full_dataset.shape

(50000, 3)

In [20]:
# dropping duplicates
full_dataset = full_dataset.drop_duplicates()

In [21]:
# checking the shape after removing duplicates
full_dataset.shape

(49700, 3)

## Separating the data into training dataset and test dataset

In [22]:
train_dataset = full_dataset[full_dataset.type == "train"]
test_dataset = full_dataset[full_dataset.type == "test"]

In [23]:
# checking rows and columns of train dataset
train_dataset.shape

(24902, 3)

In [24]:
# checking rows and columns of test dataset
test_dataset.shape

(24798, 3)

## Converting the label pos neg values to 1 and 0 respectively

In [25]:
train_dataset["label"] = np.where(train_dataset["label"] == "pos", 1, 0)
test_dataset["label"] = np.where(test_dataset["label"] == "pos", 1, 0)

In [26]:
# Taking the list of independent variable for training set
X = train_dataset.review.to_list()
# Taking the list of independent variable for test set
X_test = test_dataset.review.to_list()

# Choosing the target variable
y = train_dataset.label.to_list()

# actual y values
y_test = test_dataset.label.to_list()

# Implementing Naive Bayes Classifier 
- ## Since we are finding the binary classification i.e the sentimental analysis,We will be using multinomial Naive Bayes to get values in respect with classes (0 and 1)

- **NOTE** First uncomment the two lines suggested in the predict method of CustomMultinomialNB class. Later when sanity check is done, comment it again and run the code

In [27]:
class CustomMultinomialNB:
    
    def __init__(self, k=1):
        self.k = k
        self.neg_prior_probability = 0
        self.pos_prior_probability = 0
        self.total_words_pos_class = 0
        self.total_words_neg_class = 0
        self.cond_prob_word_with_class = {}
        self.vocab = []
        
        
    def prior_probabilities(self, X, y):
        
        # Calculate the total number of documents (in this case reviews)
        total_number_of_documents = len(X)
        
        # Calculate the number of each class in all the documents
        num_of_pos_class_in_docs = num_of_neg_class_in_docs = 0
        
        for i in y:
            # if i is 1
            if i:
                num_of_pos_class_in_docs += 1
            else:
                num_of_neg_class_in_docs += 1
        
        # Calculating prior probability for each class
        pos_prior_prob = num_of_pos_class_in_docs / total_number_of_documents
        neg_prior_prob = num_of_neg_class_in_docs / total_number_of_documents
        
        return neg_prior_prob, pos_prior_prob
    
    
    def get_count_words_with_class(self, X, y):
        # {word: [count of this word in neg class, count of this word in pos class]}
        counts = {}
        for idx, review in enumerate(X):
            current_sentiment = y[idx]
            for w in review.split():
                if w not in counts:
                    counts[w] = [0,0]
                counts[w][current_sentiment] += 1
        
        return counts
    
    
    def generate_vocabulary(self, word_dict):
        return [w for w in word_dict]
    
    def get_total_words_each_class(self, word_dict):
        pos_count = neg_count = 0
        for key, val in word_dict.items():
            neg_count += val[0]
            pos_count += val[1]
        
        return neg_count, pos_count
    
    # maps the likelihood of each word with class
    def get_conditional_probabilities_with_class(self, counts):
        word_prob_dict = {}
        # for each word return word, p(w | 0), p(w | 1)
        for word in counts:
            prob_word_neg = (counts[word][0] + self.k) / (self.total_words_neg_class + (len(self.vocab)*self.k))
            prob_word_pos = (counts[word][1] + self.k) / (self.total_words_pos_class + (len(self.vocab)*self.k))
            word_prob_dict.update(
                {
                    word: [prob_word_neg, prob_word_pos]
                }
            )
        return word_prob_dict
     
        
    def fit(self, X, y):
        # Step 1: Get the prior probabilities
        self.neg_prior_probability, self.pos_prior_probability = self.prior_probabilities(X, y)
        
        # Step 2: count words with each class
        count_words_with_class = self.get_count_words_with_class(X, y)
        
        # Step 3: Generating vocab
        self.vocab = self.generate_vocabulary(count_words_with_class)
        
        # Step 4: Count total words in each class
        self.total_words_neg_class, self.total_words_pos_class = self.get_total_words_each_class(count_words_with_class)
        
        # Step 3: Counting Likelihood of each word and generating a dict
        self.cond_prob_word_with_class = self.get_conditional_probabilities_with_class(count_words_with_class)
        
    
    def predict(self, test):
        y_pred = []
        
        # Iterate through each word
        for document in test:
            log_prob_neg = log_prob_pos = 0
            
            tokens = document.split()
            # if word in conditional probability dict then add the log of probabilities
            for word in tokens:
                if word in self.cond_prob_word_with_class:
                    prob_word_neg, prob_word_pos = self.cond_prob_word_with_class[word]
                    log_prob_neg += np.log(prob_word_neg)
                    log_prob_pos += np.log(prob_word_pos)
    
            # Converting the log values back to original values using np.exp method
            # getting the predicted probabilities for each class 
            neg_pred = self.neg_prior_probability * np.exp(log_prob_neg)
            pos_pred = self.pos_prior_probability * np.exp(log_prob_pos)
            
            # Uncomment below code when you want to run sanity check for example 1
            # print(f"Probability for chinese prediction : {pos_pred}")
            # print(f"Probability for japanese prediction : {neg_pred}")
            
            
            if neg_pred >= pos_pred:
                y_pred.append(0)
            else:
                y_pred.append(1)
        return y_pred

# Sanity Check with two Examples

# Instantiating the instance of Multinomial NB class

In [28]:
nb = CustomMultinomialNB(k=1)

# Training the model on some examples dataset 
- This is done to see if the classifier predicts the correct probabilities

In [29]:
X_example = [
    "Chinese Beijing Chinese", 
    "Chinese Chinese Shanghai",
    "Chinese Macao",
    "Tokyo Japan Chinese"
]
y_example = [1, 1, 1, 0]
X_test_example = ["Chinese Chinese Chinese Tokyo Japan"]

- **NOTE** For this example the classes are chinese = 1 and japanese = 0
- Also the Probability of predicting chinese is 0.00030121377997263015
- and Probability of predicting japanese is 0.00013548070246744218

In [30]:
%%time
# training the model
nb.fit(X_example, y_example)

CPU times: user 18 µs, sys: 1 µs, total: 19 µs
Wall time: 19.1 µs


## Predicting the class for example

In [31]:
y_pred_example = nb.predict(X_test_example)
if y_pred_example[0]:
    print("Predicted Class: Chinese")
else:
    print("Predicted Class: Japanese")

Predicted Class: Chinese


- ### Looking at the probabilities and the prediction we can say that the sanity check works correctly

# Training the model on real training dataset
- **NOTE** GO back to the class MultinomialNB and comment the two lines in predict method

In [32]:
# Creating the instance 
nb = CustomMultinomialNB()

In [33]:
%%time
# Training the model
nb.fit(X, y)

CPU times: user 619 ms, sys: 9.28 ms, total: 628 ms
Wall time: 627 ms


# Predicting the model on test dataset

In [34]:
%%time
# Predicting on the test set
y_pred = nb.predict(X_test)

CPU times: user 2.41 s, sys: 3.71 ms, total: 2.41 s
Wall time: 2.41 s


# Performance Metrics to evaluate performance of the model

In [35]:
# Get the accuracy and f1 score from scratch
def get_accuracy_and_f1_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score_scratch = ((2 * precision * recall) / (precision + recall))
    accuracy_score_scratch = (tp + tn) / (tp+tn+fp+fn)
    return f1_score_scratch, accuracy_score_scratch


In [36]:
f1_score_scratch, accuracy_score_scratch = get_accuracy_and_f1_score(y_test, y_pred)
print(f"F1 Score: {f1_score_scratch}")
print(f"Accuracy Score: {accuracy_score_scratch}")

F1 Score: 0.552688530048277
Accuracy Score: 0.674933462375998


### Sanity Check with sklearn metrics methods `f1_score` and `accuracy_score`

In [37]:
# Get the accuracy and f1 score from sklearn
print(f"F1 Score: {f1_score(y_test, y_pred)}")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")

F1 Score: 0.552688530048277
Accuracy Score: 0.674933462375998


- **Note**: For checking if the f1 score and accuracy score is correct or not I applied sanity check with above code cell using the sklearn library 
- As we can see that both the f1 score and accuracy using both methods gives us the same score. That means the sanity check is correct

- From the above code cell we can say that f1 score and accuracy score is decent.