In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load imdb_master.csv into a DataFrame; the file is decoded as ISO-8859-1.
full_dataset = pd.read_csv("imdb_master.csv", encoding="ISO-8859-1")

In [None]:
# Show the frame exactly as loaded, before any columns or rows are dropped.
full_dataset

In [None]:
# Keep only the columns the rest of the notebook reads ("type", "review", "label").
# Selecting by name instead of the positional slice iloc[:, 1:-1] makes this cell
# idempotent: the positional slice strips one more column from each side every
# time the cell is re-run.
full_dataset = full_dataset[["type", "review", "label"]]

In [None]:
# Show the frame after the column selection above.
full_dataset

In [None]:
# Drop the rows whose label is "unsup". The explicit .copy() makes the result an
# independent frame, so the column assignments in the following cells do not
# write into a slice of the original (SettingWithCopyWarning).
full_dataset = full_dataset[full_dataset["label"] != "unsup"].copy()

In [None]:
# Show the frame after the "unsup" rows have been removed.
full_dataset

In [None]:
import re
def remove_punctuation(review):
    """Return ``review`` with every character that is neither a word character
    (letters, digits, underscore) nor whitespace deleted."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub("", review)

def convert_to_lowercase(review):
    """Return a lower-cased copy of ``review``."""
    lowered = review.lower()
    return lowered

def remove_numbers(review):
    """Return ``review`` with every digit removed.

    Only digits are deleted. The previous pattern ``[\\d+]`` was a character
    class, so it also stripped literal ``+`` characters from the text.
    """
    return re.sub(r'\d+', "", review)

In [None]:
# Strip punctuation from every review text.
full_dataset["review"] = full_dataset["review"].apply(remove_punctuation)

In [None]:
# Lower-case every review text.
full_dataset["review"] = full_dataset["review"].apply(convert_to_lowercase)

In [None]:
# Remove numeric characters from every review text.
full_dataset["review"] = full_dataset["review"].apply(remove_numbers)

In [None]:
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus that stopwords.words("english") reads in the next cell.
nltk.download('stopwords')

In [None]:
stopwords_list = stopwords.words("english")
# Set view of the stopwords: constant-time membership tests instead of scanning
# the whole list once per token.
_stopwords_lookup = frozenset(stopwords_list)
# One tokenizer reused for every review instead of building a new one per call.
_stopword_tokenizer = ToktokTokenizer()


def apply_tokenization_and_remove_stopwords(review):
    """Tokenize ``review`` and drop English stopwords.

    params
    ------
    review: str.
        Text of a single review.

    returns
    -------
    str: the surviving tokens, each stripped of surrounding whitespace and
    joined with single spaces.
    """
    tokens = (token.strip() for token in _stopword_tokenizer.tokenize(review))
    return " ".join(token for token in tokens if token not in _stopwords_lookup)
    

In [None]:
# Tokenize every review and drop English stopwords.
full_dataset["review"] = full_dataset["review"].apply(apply_tokenization_and_remove_stopwords)

In [None]:
from nltk.stem import WordNetLemmatizer
# NLTK resources downloaded ahead of the WordNetLemmatizer use in apply_lemmatization below.
nltk.download('wordnet')
nltk.download('omw-1.4')
def apply_lemmatization(review):
    """Lemmatize each token of ``review`` and re-join the result.

    The text is split with a ToktokTokenizer, every token is stripped of
    surrounding whitespace and passed to ``WordNetLemmatizer.lemmatize``
    (called without a part-of-speech argument), and the lemmas are joined
    with single spaces.
    """
    wordnet = WordNetLemmatizer()
    words = (piece.strip() for piece in ToktokTokenizer().tokenize(review))
    return " ".join(wordnet.lemmatize(word) for word in words)

In [None]:
# Lemmatize every review text.
full_dataset["review"] = full_dataset["review"].apply(apply_lemmatization)

In [None]:
class MultiNB:
    """
    Multinomial Naive Bayes classifier for count features with additive
    (``alpha``) smoothing.

    ``fit`` sets: classes_, n_classes_, n_samples, n_features, class_priors_,
    dist, uniques, N_yi and N_y. ``predict`` additionally stores the per-class
    posterior of every scored row in the ``predict_proba`` attribute.
    """

    # Rows scored per matrix product in predict(); bounds the size of temporaries.
    _PREDICT_BATCH_ROWS = 1024

    def __init__(self,alpha=1):
        # alpha: additive smoothing constant added to every feature count.
        self.alpha = alpha

    def _prior(self):
        """
        Calculates prior for each unique class in y. P(y)

        Also stores the per-class sample counts in ``self.dist``.
        """
        _, self.dist = np.unique(self.y, return_counts=True)
        return self.dist / self.n_samples

    def fit(self, X, y):
        """
        Calculates the following things-
            class_priors_ is list of priors for each y.
            N_yi: 2D array. Contains for each class in y, the number of time each feature i appears under y.
            N_y: 1D array. Contains for each class in y, the number of all features appear under y.

        Rows of N_yi / N_y follow the order of ``classes_`` (sorted unique
        labels), so labels do not have to be the integers 0..n_classes-1.

        params
        ------
        X: 2D array. shape(n_samples, n_features)
            Multinomial data
        y: 1D array. shape(n_samples,). Class label of every row of X.
        """
        X = np.asarray(X)
        self.y = np.asarray(y)
        self.n_samples, self.n_features = X.shape
        self.classes_ = np.unique(self.y)
        self.n_classes_ = self.classes_.shape[0]
        self.class_priors_ = self._prior()

        # distinct values in each feature. Not used by prediction; kept because
        # it has always been part of the fitted state.
        self.uniques = [np.unique(X[:, j]) for j in range(self.n_features)]

        self.N_yi = np.zeros((self.n_classes_, self.n_features)) # feature count
        self.N_y = np.zeros((self.n_classes_)) # total count
        for row, label in enumerate(self.classes_):
            # Index by class position (row), not by label value, and sum all
            # columns of the class's rows in one vectorized reduction.
            members = (self.y == label)
            self.N_yi[row] = np.sum(X, axis=0, where=members[:, None])
            self.N_y[row] = self.N_yi[row].sum()

    def _theta(self, x_i, i, h):
        """
        Calculates P(xi | y) = ((N_yi + alpha) / (N_y + alpha * n_features)) ** x_i.

        Works on raw probabilities and therefore underflows for long
        documents; ``predict`` uses log-probabilities instead.

        params
        ------
        x_i: int.
            value (count) of feature i in the sample

        i: int.
            feature index.

        h: int.
            class index, i.e. position of the class in ``classes_``

        returns
        -------
        theta_yi: P(xi | y)
        """
        numerator = self.N_yi[h, i] + self.alpha
        denominator = self.N_y[h] + (self.alpha * self.n_features)
        return (numerator / denominator)**x_i

    def _likelyhood(self, x, h):
        """
        Calculates P(E|H) = P(E1|H) * P(E2|H) .. * P(En|H).

        Kept for callers that want the raw product; ``predict`` no longer
        uses it because the product underflows to 0 for long documents.

        params
        ------
        x: array. shape(n_features,)
            a row of data.
        h: int.
            class index, i.e. position of the class in ``classes_``
        """
        return np.prod([self._theta(x[i], i, h) for i in range(x.shape[0])])

    def _feature_log_prob(self):
        """Log of the smoothed P(feature | class); shape (n_classes, n_features)."""
        smoothed = self.N_yi + self.alpha
        totals = self.N_y + (self.alpha * self.n_features)
        # log(0) = -inf can only occur when alpha == 0; predict() handles it.
        with np.errstate(divide="ignore", invalid="ignore"):
            return np.log(smoothed / totals[:, None])

    def predict(self, X):
        """
        Predicts the most probable class for every row of X.

        The posterior is evaluated in log space,
            log P(y) + sum_i x_i * log P(x_i | y),
        and normalized with a max-shifted softmax, so long documents no
        longer underflow to 0/0 = NaN. The normalized posteriors are stored
        in ``self.predict_proba`` with shape (n_rows, n_classes).

        params
        ------
        X: 2D array. shape(n_rows, n_features)
            Must have the same number of features as the data passed to fit.

        returns
        -------
        1D array of predicted labels, taken from ``classes_``.

        raises
        ------
        ValueError: if X does not have ``n_features`` columns.
        """
        X = np.asarray(X)
        samples, features = X.shape
        if features != self.n_features:
            raise ValueError(
                "X has %d features, but MultiNB was fitted with %d features"
                % (features, self.n_features)
            )

        log_theta = self._feature_log_prob()
        # Features with zero probability under a class (alpha == 0 only) are
        # handled separately so that 0 * -inf never produces NaN.
        impossible = ~np.isfinite(log_theta)
        has_impossible = bool(impossible.any())
        impossible_weights = impossible.T.astype(float)
        finite_log_theta = np.where(impossible, 0.0, log_theta)
        with np.errstate(divide="ignore"):
            log_prior = np.log(self.class_priors_)

        self.predict_proba = np.zeros((samples, self.n_classes_))
        for start in range(0, samples, self._PREDICT_BATCH_ROWS):
            batch = X[start:start + self._PREDICT_BATCH_ROWS]
            joint = batch @ finite_log_theta.T + log_prior # log P(y) P(X|y)
            if has_impossible:
                # A row that contains a zero-probability feature cannot
                # belong to that class.
                joint[((batch > 0) @ impossible_weights) > 0] = -np.inf
            with np.errstate(invalid="ignore"):
                # Subtracting the row maximum keeps exp() in range.
                weights = np.exp(joint - joint.max(axis=1, keepdims=True))
                self.predict_proba[start:start + batch.shape[0]] = (
                    weights / weights.sum(axis=1, keepdims=True)
                )

        indices = np.argmax(self.predict_proba,axis=1)
        return self.classes_[indices]

In [None]:
# Separate the rows by the value of their "type" column.
train_dataset = full_dataset.loc[full_dataset["type"] == "train"]
test_dataset = full_dataset.loc[full_dataset["type"] == "test"]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer

In [None]:
# Learn the vocabulary and the label encoding from the training rows only.
cv = CountVectorizer()
X = cv.fit_transform(train_dataset['review']).toarray()
lb = LabelBinarizer()
y = lb.fit_transform(train_dataset['label']).ravel()
print(X.shape,y.shape)

# Re-use the fitted vectorizer/encoder for the test rows. Calling fit_transform
# here would rebuild the vocabulary from the test reviews, so the columns of
# X_test would no longer correspond to the features the model was trained on.
X_test = cv.transform(test_dataset['review']).toarray()
y_test = lb.transform(test_dataset['label']).ravel()
print(X_test.shape,y_test.shape)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the training matrix for validation; random_state is fixed at 42.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
%%time
# Fit the hand-written classifier on the training split (default alpha=1).
me = MultiNB()
me.fit(X_train, y_train)

In [None]:
%%time
# Predicted labels for the validation split.
yhat = me.predict(X_val)

In [None]:
# Accuracy of the validation predictions against the true validation labels.
print(accuracy_score(y_val, yhat))

# Next: score the same fitted model on the test set.

In [None]:
%%time
# Predicted labels for the test set, using the model fitted on X_train.
y_pred = me.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
def find_f1_score(y_true, y_pred):
    """Per-class F1 score derived from the confusion matrix.

    params
    ------
    y_true: 1D array of true labels.
    y_pred: 1D array of predicted labels.

    returns
    -------
    1D float array with one F1 value per class, in the row order of
    ``confusion_matrix``. A class whose precision, recall or F1 is undefined
    (0/0, e.g. the class is never predicted) scores 0.0 instead of NaN.
    """
    cm = confusion_matrix(y_true, y_pred)
    true_positive = cm.diagonal().astype(float)
    true_positive_and_false_negative = cm.sum(axis=1)  # row sums
    true_positive_and_false_positive = cm.sum(axis=0)  # column sums

    def _safe_ratio(numerator, denominator):
        # Element-wise division that leaves 0.0 wherever the denominator is 0.
        ratio = np.zeros_like(numerator, dtype=float)
        np.divide(numerator, denominator, out=ratio, where=denominator != 0)
        return ratio

    precision = _safe_ratio(true_positive, true_positive_and_false_positive)
    recall = _safe_ratio(true_positive, true_positive_and_false_negative)
    return _safe_ratio(2 * precision * recall, precision + recall)

# Per-class F1 of the test-set predictions.
print(find_f1_score(y_test, y_pred))