## IMDB Movie Review Sentiment Analysis

In [1]:
import re, string, unicodedata
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer

def remove_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_links_characters(text):
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub('\[[^]]*\]', '', text)
    return text

def regular_preprocess(text):
    text = remove_html(text)
    text = remove_links_characters(text)
    return text

def remove_stopwords(docs, stopwords):
    docs_ref = []
    for doc in docs:
        word_list = doc.lower().split()
        word_list_ref = [word for word in word_list if word not in stopwords]
        word_str_ref = ' '.join(word_list_ref)
        docs_ref.append(word_str_ref)
    return docs_ref

def stem_words(docs):
    stemmer = PorterStemmer()
    stems = []
    for doc in docs:
        word_list = doc.lower().split()
        for word in word_list:
            stem = stemmer.stem(word)
            stems.append(stem)
        stems_str = ' '.join(stems)
        stems.append(stems_str)
    return stems_str

def preprocess(data):
    refined_data = []
    for dp in data:
        refined_data.append(regular_preprocess(dp))        
    return refined_data

In [27]:
import glob
import os
import csv
import numpy as np
from preprocessing import *
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from scipy.stats import uniform
from sklearn.model_selection import GridSearchCV

In [3]:
# copy contents of all files in both folders into a list
train_data = []
test_data = []

# train data
train_neg = glob.glob(os.path.join(os.getcwd(), "Dataset/train/neg", "*.txt"))
for f_path in train_neg:
    with open(f_path) as f:
        train_data.append(f.read())

train_pos = glob.glob(os.path.join(os.getcwd(), "Dataset/train/pos", "*.txt"))
for f_path in train_pos:
    with open(f_path) as f:
        train_data.append(f.read())

def sort_nicely(l):
# Sort the given list in the way that humans expect.
    convert = lambda text: int(text) if text.isdigit() else text
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
    l.sort(key=alphanum_key)

test_files = glob.glob(os.path.join(os.getcwd(), "Dataset/test", "*.txt"))
sort_nicely(test_files)
test_files_ids = [int(re.sub("[^0-9]","", item)) for item in test_files]

for f_path in test_files:
    with open(f_path) as f:
        test_data.append(f.read())

In [46]:
# targets: first 12500 are pos, next 12500 are neg
targets = [0 if i<12500 else 1 for i in range(25000)]
# print(train_data[0])
train_data_clean = preprocess(train_data)
test_data_clean = preprocess(test_data)
# print(train_data_clean[0])

# Using hold-out validation
X_train, X_validation, y_train, y_validation = train_test_split(train_data_clean, targets, train_size=0.8, test_size=0.2, random_state=1)

# Using k-fold (in the Grid Search)
# X_train = train_data_clean
# y_train = targets

def display_results(y_val, y_pred):
    print(metrics.classification_report(y_val, y_pred))
    print("Accuracy % = ", metrics.accuracy_score(y_val, y_pred))

### Finding best parameters using Grid Search (the general flow)

In [17]:
# SOMEpclf = Pipeline(['vect', CountVectorizer(...), ...])


# params = {
#     'vect__max_df': (0.5, 0.75, 1.0),
#     'vect__max_features': (None, 5000, 10000, 50000),
#     'vect__ngram_range': ((1, 1), (1, 2),(2,2)),  # unigrams or bigrams
#     'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1', 'l2'),
#     'clf__max_iter': (5,),
#     'clf__alpha': (0.00001, 0.000001),
#     'clf__penalty': ('l2', 'elasticnet'),
#     'clf__max_iter': (10, 50, 80),
# }


# find the best parameters for both the feature extraction and the classifier

# grid_search = GridSearchCV(MODEL_NAMEpclf, params, cv=5,
#                            n_jobs=-1, verbose=10)

# print("Performing grid search...")
# print("pipeline:", [name for name, _ in MODEL_NAMEpclf.steps])
# print("parameters:")
# print(params)
# grid_search.fit(X_train, y_train)
# print()

# print("Best score: %0.3f" % grid_search.best_score_)
# print("Best parameters set:")
# best_parameters = grid_search.best_estimator_.get_params()
# for param_name in sorted(params.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))

## SGD

In [47]:
# params chosen by the grid search algorithm
SGDpclf = Pipeline([
    ('vect', CountVectorizer(analyzer='word', max_df=1.0, min_df=0,ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', SGDClassifier(alpha=1e-5, epsilon=0.1, random_state=1)),
])

SGDpclf.fit(X_train, y_train)
y_pred_sgd_val = SGDpclf.predict(X_validation)

y_pred_sgd_test = SGDpclf.predict(test_data_clean)
print(len(y_pred_sgd_test))

with open('submissionSGD.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(("Id", "Category"))
    writer.writerows(zip(test_files_ids, y_pred_sgd_test))

display_results(y_validation, y_pred_sgd_val)

25000


## Linear SVM

In [133]:
SVMpclf = Pipeline([
    ('vect', CountVectorizer(max_df=0.5, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', LinearSVC(C=0.5)),
])

SVMpclf.fit(X_train, y_train)
y_pred_svm_val = SVMpclf.predict(X_validation)

y_pred_svm_test = SVMpclf.predict(test_data_clean)
with open('submissionLSVM.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(("Id", "Category"))
    writer.writerows(zip(test_files_ids, y_pred_svm_test))

display_results(y_validation, y_pred_svm_val)

              precision    recall  f1-score   support

           0       0.92      0.90      0.91      2517
           1       0.90      0.92      0.91      2483

   micro avg       0.91      0.91      0.91      5000
   macro avg       0.91      0.91      0.91      5000
weighted avg       0.91      0.91      0.91      5000

Accuracy % =  0.9108


## Logistic

In [134]:
LRpclf = Pipeline([
    ('vect', CountVectorizer(binary=True, max_df=0.5, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('clf', LogisticRegression()),
])

LRpclf.fit(X_train, y_train)
y_pred_lr_val = LRpclf.predict(X_validation)

y_pred_lr_test = LRpclf.predict(test_data_clean)
with open('submissionLR.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(("Id", "Category"))
    writer.writerows(zip(test_files_ids, y_pred_lr_test))

display_results(y_validation, y_pred_lr_val)

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      2517
           1       0.89      0.91      0.90      2483

   micro avg       0.90      0.90      0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000

Accuracy % =  0.8966


## Using Major Vote

In [135]:
import pandas as pd
preds_val = pd.DataFrame()
preds_val['pred_val_1'] = y_pred_sgd_val
preds_val['pred_val_2'] = y_pred_svm_val
preds_val['pred_val_3'] = y_pred_lr_val

preds_val['maj_vote_val'] = preds_val.mode(axis = 1)

In [116]:
# preds_val

In [136]:
display_results(y_validation, preds_val['maj_vote_val'])

              precision    recall  f1-score   support

           0       0.92      0.90      0.91      2517
           1       0.90      0.92      0.91      2483

   micro avg       0.91      0.91      0.91      5000
   macro avg       0.91      0.91      0.91      5000
weighted avg       0.91      0.91      0.91      5000

Accuracy % =  0.9096


In [137]:
preds = pd.DataFrame()
preds['pred1'] = y_pred_sgd_test
preds['pred2'] = y_pred_svm_test
preds['pred3'] = y_pred_lr_test

preds['maj_vote'] = preds.mode(axis = 1)

In [140]:
preds

Unnamed: 0,pred1,pred2,pred3,maj_vote
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,1,1,1,1
5,0,0,0,0
6,0,0,0,0
7,0,0,0,0
8,0,0,0,0
9,0,0,0,0


In [141]:
with open('submission.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(("Id", "Category"))
    writer.writerows(zip(test_files_ids, preds['maj_vote']))

## Cross Validation of SVM and LR

## Naive Bayes Implementation