## LIBRARY IMPORTS

In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
import re

## READING FILES

In [4]:
# Load the training file: every line becomes one entry, with surrounding
# whitespace (including the trailing newline) stripped.
# The context manager closes the file handle as soon as reading finishes.
with open('../movie-sentiment-analysis/aclImdb/movie_data/full_train.txt', 'r', encoding='utf-8') as train_file:
    train_reviews = [line.strip() for line in train_file]

In [6]:
# Load the test file: every line becomes one entry, with surrounding
# whitespace (including the trailing newline) stripped.
# The context manager closes the file handle as soon as reading finishes.
with open('../movie-sentiment-analysis/aclImdb/movie_data/full_test.txt', 'r', encoding='utf-8') as test_file:
    test_reviews = [line.strip() for line in test_file]

## CLEANING AND PROCESSING

In [8]:
# Both patterns are raw strings so the backslashes reach the regex engine
# unchanged and Python does not warn about invalid string escapes.

# Deleted outright: . ; : ! ' ? , " ( ) [ ] and any run of digits.
no_space = re.compile(r"[.;:!'?,\"()\[\]]|\d+")
# Replaced by a single space: a doubled <br /> tag, a hyphen, or a slash.
with_space = re.compile(r"<br\s*/><br\s*/>|[-/]")


def cleaningandprocessing(reviews):
    """Lower-case and strip punctuation/markup from each review.

    For every string in ``reviews`` the text is lower-cased, everything
    matched by ``no_space`` is removed, and everything matched by
    ``with_space`` is then replaced with one space.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        Cleaned texts, in the same order as the input.
    """
    # Order matters: ``no_space`` runs first, then ``with_space`` on its result.
    return [
        with_space.sub(" ", no_space.sub("", line.lower()))
        for line in reviews
    ]
# Run the identical cleaning routine over both corpora so the training and
# test text are normalised the same way.
clean_train_reviews, clean_test_reviews = (
    cleaningandprocessing(corpus) for corpus in (train_reviews, test_reviews)
)

## BASELINE: LOGISTIC REGRESSION

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

In [18]:
# Binary bag-of-words features: binary=True records whether a token occurs in
# a review rather than how many times.
baseline_vectorizer = CountVectorizer(binary=True)
# fit_transform learns the vocabulary and builds the training matrix in a
# single pass over the corpus instead of tokenising it twice.
X_train = baseline_vectorizer.fit_transform(clean_train_reviews)
# The test reviews are only transformed, using the vocabulary learned above.
X_test = baseline_vectorizer.transform(clean_test_reviews)

In [19]:
# Labels by position: the first 12500 rows are labelled 1 and the next 12500
# are labelled 0.
# NOTE(review): this presumes the input file holds exactly 25000 reviews with
# the positive ones first -- confirm against how the file was built.
target = [1] * 12500 + [0] * 12500

# Hold-out split for choosing C; random_state pins the shuffle.
# NOTE(review): test_size=0.8 keeps 80% of the rows for evaluation, so the
# models below are fitted on only 20% of the data -- confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    target,
    test_size=0.8,
    random_state=42,
)

In [20]:
# Sweep the regularisation parameter C and report hold-out metrics for each
# value.
for c in [0.01, 0.05, 0.1, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(x_train, y_train)
    # Predict once per model and reuse the result for all three reports
    # (previously the same prediction was recomputed for every metric).
    holdout_pred = lr.predict(x_test)
    print("For: ", c)
    print("\n")
    print(accuracy_score(y_test, holdout_pred))
    print("\n")
    print(confusion_matrix(y_test, holdout_pred))
    print("\n")
    print(classification_report(y_test, holdout_pred))
    print("\n\n")

For:  0.01


0.847


[[8373 1589]
 [1471 8567]]


             precision    recall  f1-score   support

          0       0.85      0.84      0.85      9962
          1       0.84      0.85      0.85     10038

avg / total       0.85      0.85      0.85     20000




For:  0.05


0.8558


[[8457 1505]
 [1379 8659]]


             precision    recall  f1-score   support

          0       0.86      0.85      0.85      9962
          1       0.85      0.86      0.86     10038

avg / total       0.86      0.86      0.86     20000




For:  0.1


0.85705


[[8472 1490]
 [1369 8669]]


             precision    recall  f1-score   support

          0       0.86      0.85      0.86      9962
          1       0.85      0.86      0.86     10038

avg / total       0.86      0.86      0.86     20000




For:  0.5


0.85405


[[8454 1508]
 [1411 8627]]


             precision    recall  f1-score   support

          0       0.86      0.85      0.85      9962
          1       0.85      0.86    

In [21]:
# Final model: C=0.1 gave the highest accuracy among the sweep results shown
# above (0.85705). It is refitted here on the full training matrix X_train.
final_model = LogisticRegression(C=0.1)
# Left as a bare expression so the notebook echoes the fitted estimator.
final_model.fit(X_train, target)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
# Score the final model on the test matrix. The prediction is computed once
# and shared by all three reports instead of being recomputed for each.
# NOTE(review): `target` (built for the training rows) is reused as the test
# labels; this presumes the test file has the same size and label ordering
# as the training file -- confirm.
test_pred = final_model.predict(X_test)
print(accuracy_score(target, test_pred))
print("\n")
print(confusion_matrix(target, test_pred))
print("\n")
print(classification_report(target, test_pred))

0.87932


[[10980  1520]
 [ 1497 11003]]


             precision    recall  f1-score   support

          0       0.88      0.88      0.88     12500
          1       0.88      0.88      0.88     12500

avg / total       0.88      0.88      0.88     25000



### BEST ACCURACY SO FAR: 87.93% (FINAL MODEL, EVALUATED ON THE TEST SET)

In [43]:
def get_stemmed_text(corpus):
    """Return a copy of ``corpus`` with every token replaced by its Porter stem.

    Each review is split on whitespace, every resulting token is passed
    through ``PorterStemmer.stem``, and the stems are joined back together
    with single spaces.

    Parameters
    ----------
    corpus : iterable of str
        Review texts to stem.

    Returns
    -------
    list of str
        One stemmed string per input review, in the original order.
    """
    # Imported locally, so NLTK is only needed when stemming is actually run.
    from nltk.stem.porter import PorterStemmer

    stemmer = PorterStemmer()
    stemmed_corpus = []
    for review in corpus:
        stems = [stemmer.stem(token) for token in review.split()]
        stemmed_corpus.append(' '.join(stems))
    return stemmed_corpus


# Stem the already-cleaned corpora; the cleaned versions are left untouched.
stemmed_reviews_train = get_stemmed_text(clean_train_reviews)
stemmed_reviews_test = get_stemmed_text(clean_test_reviews)