In [9]:
#Please note that the first code cell took almost 10 minutes to run on my computer, I suppose it was because my computer 
#is old.
import os
import csv
import pandas as pd
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup  
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn import metrics , model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics import classification_report

#this function processes all the text files and converts into a csv file with two fields: review and score. A score of
#-1 is given to a negative review and a score of 1 is given to a positive review
def convertToCSV(reviews, data_path_neg, data_path_pos):
    with open(reviews, 'w') as f:
        neg_rev_path = os.listdir(data_path_neg)
        pos_rev_path = os.listdir(data_path_pos)
        output = csv.writer(f)
        output.writerow(['review', 'score'])

        for each in neg_rev_path:
            with open(data_path_neg + '/' + each, 'rb') as txt_f:
                output.writerow([txt_f.read(), -1])
                txt_f.close()
        for each in pos_rev_path:
            with open(data_path_pos + '/' + each, 'rb') as txt_f:
                output.writerow([txt_f.read(), 1])
                txt_f.close()


        f.close()
    return reviews

#the given data is split into training and testing data sets
def splitData(train_reviews, test_reviews):
    df_train = pd.read_csv(train_reviews)
    df_train.head()
    #print(df_train['review'][2500])
    df_test = pd.read_csv(test_reviews)
    df_test.head()
    #print(df_test['review'][2500])
    X_train = df_train.iloc[:,:-1]
    y_train = df_train.iloc[:,-1]
    X_test = df_test.iloc[:,:-1]
    y_test = df_test.iloc[:,-1]
    return X_train, y_train, X_test, y_test

#This is a function which takes in the raw movie review as argument and spits out a clean sequence of words by removing html
#tags, stop words, any non letter characters, and converts to lower case.
def data_cleaning(movie_review):
    soup = BeautifulSoup(movie_review)
    text = soup.get_text()
    #print(text)
    text = re.sub('[^a-z\s]'," ", text.lower())
    clean_words = text.split()
    stop_words_set = set(stopwords.words("english"))
    stop_words_set.add('b')
    #print(stop_words_set)
    clean_words = [w for w in clean_words if not w in stop_words_set]
    return " ".join(clean_words)

#this function applies data cleaning to each and every row of the data and converts into a list
def processedReviews(splittedData):
    processed_reviews = []
    for i in range(splittedData['review'].size):
        processed_reviews.append(data_cleaning(splittedData["review"][i]))
    return processed_reviews

#this is a function to create Bag of words model
def bowVectorizer(processed_train_reviews, processed_test_reviews):
    vectorizer_bow = CountVectorizer(analyzer = "word", max_features = 5000)
    train_featVec_bow = vectorizer_bow.fit_transform(processed_train_reviews)
    train_featVec_bow = train_featVec_bow.toarray()
    #type(train_featVec_bow)
    #print(train_feature_vectors[2500])
    test_featVec_bow = vectorizer_bow.transform(processed_test_reviews)
    test_featVec_bow = test_featVec_bow.toarray()
    return train_featVec_bow, test_featVec_bow

#this is a function for Tfidf
def tfidfVectorizer(processed_train_reviews, processed_test_reviews):
    vectorizer_tfidf = TfidfVectorizer(min_df = 5,max_df = 0.8,sublinear_tf = True,use_idf = True)
    train_featVec_tfidf = vectorizer_tfidf.fit_transform(processed_train_reviews)
    test_featVec_tfidf = vectorizer_tfidf.transform(processed_test_reviews)
    return train_featVec_tfidf, test_featVec_tfidf

#this function gives the accuracy of Naive Bayes classifier
def naiveBayes(train_featVec, test_featVec):
    model = MultinomialNB()
    model.fit(train_featVec, y_train)
    y_pred = model.predict(test_featVec)
    return y_pred
    
#this function gives the accuracy of SVM classifier
def svm(train_featVec, test_featVec):
    model = LinearSVC()
    model.fit(train_featVec, y_train)
    y_pred = model.predict(test_featVec)
    return y_pred
    
#this function gives the accuracy of K Nearest Neighbour classifier
def knn(train_featVec, test_featVec):
    model = KNeighborsClassifier(n_neighbors = 3)
    model.fit(train_featVec_tfidf, y_train)
    y_pred = model.predict(test_featVec_tfidf)
    return y_pred
    
train_reviews = convertToCSV('train_movie_reviews.csv', 'aclImdb/train/neg', 'aclImdb/train/pos')
test_reviews = convertToCSV('test_movie_reviews.csv', 'aclImdb/test/neg', 'aclImdb/test/pos')

X_train, y_train, X_test, y_test = splitData(train_reviews, test_reviews)

processed_train_reviews = processedReviews(X_train)
processed_test_reviews = processedReviews(X_test)

train_featVec_bow, test_featVec_bow = bowVectorizer(processed_train_reviews, processed_test_reviews)
train_featVec_tfidf, test_featVec_tfidf = tfidfVectorizer(processed_train_reviews, processed_test_reviews)

nb_y_pred = naiveBayes(train_featVec_bow, test_featVec_bow)
#print("one")
svm_y_pred = svm(train_featVec_tfidf, test_featVec_tfidf)
#print("two")
knn_y_pred = knn(train_featVec_tfidf, test_featVec_tfidf)
#print("three")

In [10]:
#We will now evaluate the performance of all the three classifiers using the following methods

#First is the simple one, which is accuracy and it just says how often the classifier is correct
nb_accuracy = metrics.accuracy_score(y_test, nb_y_pred)
print('Accuracy of Multinomial Naive Bayes: {:.2f}'.format(nb_accuracy))

svm_accuracy = metrics.accuracy_score(y_test, svm_y_pred)
print('Accuracy of SVM: {:.2f}'.format(svm_accuracy))

knn_accuracy = metrics.accuracy_score(y_test, knn_y_pred)
print('Accuracy of KNN: {:.2f}'.format(knn_accuracy))

Accuracy of Multinomial Naive Bayes: 0.84
Accuracy of SVM: 0.87
Accuracy of KNN: 0.67


In [11]:
#Second method is the confusion matrix
nb_cm = confusion_matrix(y_test, nb_y_pred)
print("Confusion matrxi of Multinomial Naive Bayes is ",nb_cm)
print("Accuracy of confusion matrix for Multinomial Naive Bayes is ", (nb_cm[0,0]+nb_cm[1,1])/np.sum(nb_cm))
print("\n")
svm_cm = confusion_matrix(y_test, svm_y_pred)
print("Confusion matrxi of SVM is ",svm_cm)
print("Accuracy of confusion matrix for SVM is ", (svm_cm[0,0]+svm_cm[1,1])/np.sum(svm_cm))
print("\n")
knn_cm = confusion_matrix(y_test, knn_y_pred)
print("Confusion matrxi of KNN is ",knn_cm)
print("Accuracy of confusion matrix for KNN is ", (knn_cm[0,0]+knn_cm[1,1])/np.sum(knn_cm))

Confusion matrxi of Multinomial Naive Bayes is  [[10828  1672]
 [ 2346 10154]]
Accuracy of confusion matrix for Multinomial Naive Bayes is  0.83928


Confusion matrxi of SVM is  [[11041  1459]
 [ 1705 10795]]
Accuracy of confusion matrix for SVM is  0.87344


Confusion matrxi of KNN is  [[8606 3894]
 [4242 8258]]
Accuracy of confusion matrix for KNN is  0.67456


In [12]:
#Third is Precision, recall and f1-score
print("Report of Multinomial Naive Bayes is ")
print(classification_report(y_test, nb_y_pred))
print("\n")
print("Report of SVM is ")
print(classification_report(y_test, svm_y_pred))
print("\n")
print("Report of KNN is ")
print(classification_report(y_test, knn_y_pred))

Report of Multinomial Naive Bayes is 
              precision    recall  f1-score   support

          -1       0.82      0.87      0.84     12500
           1       0.86      0.81      0.83     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



Report of SVM is 
              precision    recall  f1-score   support

          -1       0.87      0.88      0.87     12500
           1       0.88      0.86      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



Report of KNN is 
              precision    recall  f1-score   support

          -1       0.67      0.69      0.68     12500
           1       0.68      0.66      0.67     12500

    accuracy                           0.67     25000
   macro avg       0.67      0.67      0.67     25