## IMDB Movie Review Sentiment Analysis

In [1]:
import re, string, unicodedata
# import nltk
# nltk.download('stopwords')
# import contractions
# import inflect
from bs4 import BeautifulSoup
# from nltk import word_tokenize
# from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

def remove_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_links_characters(text):
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub('\[[^]]*\]', '', text)
    return text

def regular_preprocess(text):
    text = remove_html(text)
    text = remove_links_characters(text)
    # text = replace_contractions(text)
    return text

def remove_stopwords(docs, stopwords):
    docs_ref = []
    for doc in docs:
        word_list = doc.lower().split()
        word_list_ref = [word for word in word_list if word not in stopwords]
        word_str_ref = ' '.join(word_list_ref)
        docs_ref.append(word_str_ref)
    return docs_ref

def stem_words(docs):
    stemmer = PorterStemmer()
    stems = []
    for doc in docs:
        word_list = doc.lower().split()
        for word in word_list:
            stem = stemmer.stem(word)
            stems.append(stem)
        stems_str = ' '.join(stems)
        stems.append(stems_str)
    return stems

def preprocess(data):
    refined_data = []
    for dp in data:
        refined_data.append(regular_preprocess(dp))        
    return refined_data

In [6]:
# copy contents of all files in both folders into a list
import glob
import os
import csv
import numpy as np
from preprocessing import *
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as randint
from scipy.stats import uniform
from sklearn.model_selection import GridSearchCV
# copy contents of all files in both folders into a list
train_data = []
test_data = []

# train data
train_neg = glob.glob(os.path.join(os.getcwd(), "Dataset/train/neg", "*.txt"))
for f_path in train_neg:
    with open(f_path) as f:
        train_data.append(f.read())

train_pos = glob.glob(os.path.join(os.getcwd(), "Dataset/train/pos", "*.txt"))
for f_path in train_pos:
    with open(f_path) as f:
        train_data.append(f.read())
# print(train_data[0])
# print(preprocess(train_data[0]))

# test data
def sort_nicely(l):
# Sort the given list in the way that humans expect.
    convert = lambda text: int(text) if text.isdigit() else text
    alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
    l.sort(key=alphanum_key)

test_files = glob.glob(os.path.join(os.getcwd(), "Dataset/test", "*.txt"))
sort_nicely(test_files)
test_files_ids = [int(re.sub("[^0-9]","", item)) for item in test_files]

for f_path in test_files:
    with open(f_path) as f:
        test_data.append(f.read())

# targets: first 12500 are pos, next 12500 are neg
targets = [0 if i<12500 else 1 for i in range(25000)]

with open('english') as f:
    stopwords = f.read().splitlines()
    
print(test_data[0])
train_data_clean = preprocess(train_data)
# train_data_clean = remove_stopwords(train_data_clean, stopwords)
# train_data_clean = stem_words(train_data_clean)
test_data_clean = preprocess(test_data)
# test_data_clean = remove_stopwords(test_data_clean, stopwords)
# test_data_clean = stem_words(test_data_clean)
print(test_data_clean[0])


# splitting the data
X_train, X_validation, y_train, y_validation = train_test_split(train_data_clean, targets, train_size=0.8, test_size=0.2, random_state=1)

pclf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
#     ('best', TruncatedSVD(n_components=10000)),
    # ('clf', LogisticRegression()),
    ('clf', SGDClassifier()),
])

params = {
#     'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 10000),
    'vect__ngram_range': ((1, 1), (1, 2),(2,2)),  # unigrams or bigrams
    'clf__max_iter': (5,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
#     'clf__max_iter': (10, 50, 80),
}

# def report(results, n_top=3):
#     for i in range(1, n_top + 1):
#         candidates = np.flatnonzero(results['rank_test_score'] == i)
#         for candidate in candidates:
#             print("Model with rank: {0}".format(i))
#             print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
#                   results['mean_test_score'][candidate],
#                   results['std_test_score'][candidate]))
#             print("Parameters: {0}".format(results['params'][candidate]))
#             print("")
            
grid_search = GridSearchCV(pclf, params, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
y1_pred = grid_search.predict(X_validation)
y_pred = grid_search.predict(test_data_clean)

# random_search = RandomizedSearchCV(pclf, param_distributions = params, cv=2, verbose = 10, random_state = 1, n_iter = 1)
# random_search.fit(X_train, y_train)

# report(random_search.cv_results_)

# y_pred = random_search.predict(X_validation)

# def display_results(y_val, y_pred):
#     print(metrics.classification_report(y_val, y_pred))
#     print("Accuracy % = ", metrics.accuracy_score(y_val, y_pred))

# pclf.fit(X_train, y_train)
# y_pred = pclf.predict(X_validation)
# print(y_pred)

with open('submission.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(("Id", "Category"))
    writer.writerows(zip(test_files_ids, y_pred))

# display_results(y_validation, y_pred)
print(metrics.accuracy_score(y_validation, y1_pred))

Thinking that it could only get better was the worst assumption I ever made....<br /><br />Drivvle does not describe this movie appropriately enough!<br /><br />Not only is the plot thin, but I get more emotional acting from my pet fish!<br /><br />It was a shame to see Pete Postlethwaite, whom I respect as an actor trying to do the best with the little he had to work with...<br /><br />I think that a cardboard cut out of Stephen Baldwin would have done a better job , and in fact have been more animate.<br /><br />Avoid at all costs! This could really be hazardous to your health!
Thinking that it could only get better was the worst assumption I ever made....Drivvle does not describe this movie appropriately enough!Not only is the plot thin, but I get more emotional acting from my pet fish!It was a shame to see Pete Postlethwaite, whom I respect as an actor trying to do the best with the little he had to work with...I think that a cardboard cut out of Stephen Baldwin would have done a b

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 41.6min finished


0.9052


In [None]:
# def advanced_preprocess(text):
# #     words = nltk.word_tokenize(text)
#     words = replace_numbers(text)
#     words = remove_non_ascii(words)
#     words = to_lowercase(words)
#     words = remove_stopwords(words)
#     words = stem_words(words)
#     return words

# def replace_contractions(text):
#     """Replace contractions in string of text"""
#     return contractions.fix(text)
#
# def stem_words(words):
#     """Stem words in list of tokenized words"""
#     stemmer = PorterStemmer()
#     stems = []
#     for word in words:
#         stem = stemmer.stem(word)
#         stems.append(stem)
#     return stems
#
# def replace_numbers(words):
#     """Replace all interger occurrences in list of tokenized words with textual representation"""
#     p = inflect.engine()
#     new_words = []
#     for word in words:
#         if word.isdigit():
#             new_word = p.number_to_words(word)
#             new_words.append(new_word)
#         else:
#             new_words.append(word)
#     return new_words
#
# def lemmatize_verbs(words):
#     """Lemmatize verbs in list of tokenized words"""
#     lemmatizer = WordNetLemmatizer()
#     lemmas = []
#     for word in words:
#         lemma = lemmatizer.lemmatize(word, pos='v')
#         lemmas.append(lemma)
#     return lemmas
#
# def to_lowercase(words):
#     """Convert all characters to lowercase from list of tokenized words"""
#     new_words = []
#     for word in words:
#         new_word = word.lower()
#         new_words.append(new_word)
#     return new_words
#
# def remove_non_ascii(words):
#     """Remove non-ASCII characters from list of tokenized words"""
#     new_words = []
#     for word in words:
#         new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
#         new_words.append(new_word)
#     return new_words

### Only CountVectorizer(binary=True)

In [57]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words vectorization
cv = CountVectorizer(binary=True).fit(X_train)
X_train_counts = cv.transform(X_train)
X_validation_counts = cv.transform(X_validation)

In [58]:
from sklearn.naive_bayes import MultinomialNB

clf_NB = MultinomialNB().fit(X_train_counts, y_train)
y_pred = clf_NB.predict(X_validation_counts)

# X_train_normalized = np.array(X_train_normalized)

# print(X_train_normalized.shape)
print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.83      0.85      2488
           1       0.84      0.88      0.86      2512

   micro avg       0.86      0.86      0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000

Accuracy % =  0.8556


In [59]:
from sklearn.linear_model import LogisticRegression

clf_LR = LogisticRegression().fit(X_train_counts, y_train)
y_pred = clf_LR.predict(X_validation_counts)

print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))



              precision    recall  f1-score   support

           0       0.87      0.89      0.88      2488
           1       0.89      0.87      0.88      2512

   micro avg       0.88      0.88      0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000

Accuracy % =  0.8814


In [None]:
# from sklearn.tree import DecisionTreeClassifier

# clf_DT = DecisionTreeClassifier().fit(X_train_counts, y_train)
# y_pred = clf_DT.predict(X_validation_counts)

# print(metrics.classification_report(y_validation, y_pred))
# print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

In [60]:
from sklearn.svm import LinearSVC

clf_SVM = LinearSVC().fit(X_train_counts, y_train)
y_pred = clf_SVM.predict(X_validation_counts)

print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.87      0.87      2488
           1       0.87      0.86      0.87      2512

   micro avg       0.87      0.87      0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000

Accuracy % =  0.8656




### CountVectorizer(binary=True) with tf-idf

In [61]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics

# Bag of Words vectorization
cv = CountVectorizer(binary=True).fit(X_train)
X_train_counts = cv.transform(X_train)
X_validation_counts = cv.transform(X_validation)

# tfidf
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_validation_tfidf = tfidf_transformer.transform(X_validation_counts)

# normalization
normalizer_tranformer = Normalizer().fit(X=X_train_tfidf)
X_train_normalized = normalizer_tranformer.transform(X_train_tfidf)
X_validation_normalized = normalizer_tranformer.transform(X_validation_tfidf)

In [62]:
clf_NB = MultinomialNB().fit(X_train_normalized, y_train)
y_pred = clf_NB.predict(X_validation_normalized)

# X_train_normalized = np.array(X_train_normalized)

print(X_train_normalized.shape)
print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

(20000, 68499)
              precision    recall  f1-score   support

           0       0.88      0.85      0.86      2488
           1       0.85      0.89      0.87      2512

   micro avg       0.87      0.87      0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000

Accuracy % =  0.8654


In [63]:
from sklearn.linear_model import LogisticRegression

clf_LR = LogisticRegression().fit(X_train_normalized, y_train)
y_pred = clf_LR.predict(X_validation_normalized)

print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))



              precision    recall  f1-score   support

           0       0.88      0.91      0.90      2488
           1       0.91      0.88      0.89      2512

   micro avg       0.89      0.89      0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

Accuracy % =  0.8944


In [64]:
# from sklearn.tree import DecisionTreeClassifier

# clf_DT = DecisionTreeClassifier().fit(X_train_normalized, y_train)
# y_pred = clf_DT.predict(X_validation_normalized)

# print(metrics.classification_report(y_validation, y_pred))
# print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

In [65]:
from sklearn.svm import LinearSVC

clf_SVM = LinearSVC().fit(X_train_normalized, y_train)
y_pred = clf_SVM.predict(X_validation_normalized)

print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.90      0.90      2488
           1       0.90      0.88      0.89      2512

   micro avg       0.89      0.89      0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

Accuracy % =  0.8946


### Only CountVectorizer()

In [66]:
# Bag of Words vectorization
cv = CountVectorizer().fit(X_train)
X_train_counts = cv.transform(X_train)
X_validation_counts = cv.transform(X_validation)

In [67]:
clf_NB = MultinomialNB().fit(X_train_counts, y_train)
y_pred = clf_NB.predict(X_validation_counts)

# X_train_normalized = np.array(X_train_normalized)

# print(X_train_normalized.shape)
print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.81      0.84      2488
           1       0.83      0.88      0.85      2512

   micro avg       0.85      0.85      0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000

Accuracy % =  0.846


In [68]:
clf_LR = LogisticRegression().fit(X_train_counts, y_train)
y_pred = clf_LR.predict(X_validation_counts)

print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))



              precision    recall  f1-score   support

           0       0.88      0.90      0.89      2488
           1       0.90      0.88      0.89      2512

   micro avg       0.89      0.89      0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

Accuracy % =  0.8896


In [69]:
# clf_DT = DecisionTreeClassifier().fit(X_train_counts, y_train)
# y_pred = clf_DT.predict(X_validation_counts)

# print(metrics.classification_report(y_validation, y_pred))
# print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

In [70]:
clf_SVM = LinearSVC().fit(X_train_counts, y_train)
y_pred = clf_SVM.predict(X_validation_counts)

print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      2488
           1       0.88      0.86      0.87      2512

   micro avg       0.87      0.87      0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000

Accuracy % =  0.8686




### CountVectorizer() with tf-idf

In [32]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words vectorization
cv = CountVectorizer().fit(X_train)
X_train_counts = cv.transform(X_train)
X_validation_counts = cv.transform(X_validation)

# tfidf
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_validation_tfidf = tfidf_transformer.transform(X_validation_counts)

# normalization
normalizer_tranformer = Normalizer().fit(X=X_train_tfidf)
X_train_normalized = normalizer_tranformer.transform(X_train_tfidf)
X_validation_normalized = normalizer_tranformer.transform(X_validation_tfidf)

In [34]:
from sklearn.naive_bayes import MultinomialNB
clf_NB = MultinomialNB().fit(X_train_normalized, y_train)
y_pred = clf_NB.predict(X_validation_normalized)

# X_train_normalized = np.array(X_train_normalized)

print(X_train_normalized.shape)
print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

(20000, 68354)
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      2517
           1       0.88      0.84      0.86      2483

   micro avg       0.87      0.87      0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000

Accuracy % =  0.866


In [35]:
from sklearn.linear_model import LogisticRegression
clf_LR = LogisticRegression().fit(X_train_normalized, y_train)
y_pred = clf_LR.predict(X_validation_normalized)

print(metrics.classification_report(y_validation, y_pred))
print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))



              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2517
           1       0.88      0.90      0.89      2483

   micro avg       0.89      0.89      0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

Accuracy % =  0.8944


In [75]:
# clf_DT = DecisionTreeClassifier().fit(X_train_normalized, y_train)
# y_pred = clf_DT.predict(X_validation_normalized)

# print(metrics.classification_report(y_validation, y_pred))
# print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

In [41]:
from sklearn.svm import LinearSVC
for c in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.6, 0.55, 0.45, 1.0, 1.5, 0.525]:
    svm = LinearSVC(C=c)
    svm.fit(X_train_normalized, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, metrics.accuracy_score(y_validation, svm.predict(X_validation_normalized))))

# clf_SVM = LinearSVC().fit(X_train_normalized, y_train)
# y_pred = clf_SVM.predict(X_validation_normalized)

# print(metrics.classification_report(y_validation, y_pred))
# print("Accuracy % = ", metrics.accuracy_score(y_validation, y_pred))

Accuracy for C=0.001: 0.7858
Accuracy for C=0.005: 0.8366
Accuracy for C=0.01: 0.857
Accuracy for C=0.05: 0.8856
Accuracy for C=0.1: 0.8976
Accuracy for C=0.5: 0.899
Accuracy for C=0.6: 0.8974
Accuracy for C=0.55: 0.8986
Accuracy for C=0.45: 0.899
Accuracy for C=1.0: 0.8952
Accuracy for C=1.5: 0.8932
Accuracy for C=0.525: 0.8988
