## IMDB Movie Review Sentiment Analysis

In [9]:
from bs4 import BeautifulSoup
import re, string, unicodedata

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def regular_preprocess(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

def advanced_preprocess(text):
    return text

def preprocess(data, advanced=False):
    refined_data=[]

    if advanced:
        for data_point in data:
            refined_data.append(advanced_preprocess(data_point))
    else:
        for data_point in data:
            refined_data.append(regular_preprocess(data_point))

    return refined_data


In [20]:
# copy contents of all files in both folders into a list
import glob
import os
from preprocessing import *
from sklearn.model_selection import train_test_split

train_data = []
test_data = []

# train data
train_neg = glob.glob(os.path.join(os.getcwd(), "Dataset/train/neg", "*.txt"))
for f_path in train_neg:
    with open(f_path) as f:
        train_data.append(f.read())

train_pos = glob.glob(os.path.join(os.getcwd(), "Dataset/train/pos", "*.txt"))
for f_path in train_pos:
    with open(f_path) as f:
        train_data.append(f.read())
# print(train_data[0])
# print(preprocess(train_data[0]))

# test data
test_files = glob.glob(os.path.join(os.getcwd(), "Dataset/test", "*.txt"))
for f_path in test_files:
    with open(f_path) as f:
        test_data.append(f.read())

# targets: first 12500 are pos, next 12500 are neg
targets = [1 if i<12500 else 0 for i in range(25000)]

train_data_clean = preprocess(train_data, advanced=False)
test_data_clean = preprocess(test_data, advanced=False)

# splitting the data
X_train, X_validation, y_train, y_validation = train_test_split(train_data, targets, train_size=0.8, test_size=0.2)
# print(train_data_clean[0])
# print(test_data_clean[0])
# print(y_train[0])

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics

# Bag of Words vectorization
cv = CountVectorizer().fit(X_train)
X_train_counts = cv.transform(X_train)
X_validation_counts = cv.transform(X_validation)

# tfidf
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_validation_tfidf = tfidf_transformer.transform(X_validation_counts)

# normalization
normalizer_tranformer = Normalizer().fit(X=X_train_tfidf)
X_train_normalized = normalizer_tranformer.transform(X_train_tfidf)
X_validation_normalized = normalizer_tranformer.transform(X_validation_tfidf)

In [34]:
from sklearn.naive_bayes import MultinomialNB

clf_NB = MultinomialNB().fit(X_train_normalized, y_train)
y_pred = clf_NB.predict(X_validation_normalized)

print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.84      0.87      2523
           1       0.85      0.89      0.87      2477

   micro avg       0.87      0.87      0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000

Accuracy % =  0.868


In [36]:
from sklearn.linear_model import LogisticRegression

clf_LR = LogisticRegression().fit(X_train_normalized, y_train)
y_pred = clf_LR.predict(X_validation_normalized)

print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.90      0.89      2523
           1       0.90      0.88      0.89      2477

   micro avg       0.89      0.89      0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

Accuracy % =  0.89


In [37]:
from sklearn.tree import DecisionTreeClassifier

clf_DT = DecisionTreeClassifier().fit(X_train_normalized, y_train)
y_pred = clf_DT.predict(X_validation_normalized)

print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.70      0.70      2523
           1       0.70      0.70      0.70      2477

   micro avg       0.70      0.70      0.70      5000
   macro avg       0.70      0.70      0.70      5000
weighted avg       0.70      0.70      0.70      5000

Accuracy % =  0.7018


In [39]:
from sklearn.svm import LinearSVC

clf_SVM = LinearSVC().fit(X_train_normalized, y_train)
y_pred = clf_SVM.predict(X_validation_normalized)

print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.91      0.90      2523
           1       0.90      0.90      0.90      2477

   micro avg       0.90      0.90      0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000

Accuracy % =  0.901
