## LIBRARY IMPORT

In [1]:
import pandas as pd
import numpy as np
import os
import re

## READING FILES

In [2]:
# Read the training file into a list with one entry per line, stripping
# leading/trailing whitespace (including the newline) from each entry.
# The context manager closes the file handle as soon as reading finishes
# instead of leaving it open until garbage collection.
with open('../movie-sentiment-analysis/aclImdb/movie_data/full_train.txt','r',encoding='utf-8') as train_file:
    train_reviews = [line.strip() for line in train_file]

In [3]:
# Read the test file into a list with one entry per line, stripping
# leading/trailing whitespace (including the newline) from each entry.
# The context manager closes the file handle as soon as reading finishes
# instead of leaving it open until garbage collection.
with open('../movie-sentiment-analysis/aclImdb/movie_data/full_test.txt','r',encoding='utf-8') as test_file:
    test_reviews = [line.strip() for line in test_file]

## CLEANING AND PROCESSING

In [5]:
# Tokens deleted outright: punctuation, quotes, brackets and runs of digits.
# Raw strings are used so that regex escapes such as \. \d \s reach the regex
# engine untouched instead of being parsed as (invalid) Python string escapes.
no_space = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
# Tokens replaced by a single space: a doubled <br /> tag, hyphens and slashes.
with_space = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")


def cleaningandprocessing(reviews):
    """Lower-case and normalise a collection of review strings.

    Each review is lower-cased, then every match of ``no_space`` is removed,
    and finally every match of ``with_space`` is replaced by one space.
    ``no_space`` does not touch ``<``, ``/`` or ``>``, so doubled ``<br />``
    tags are still intact when ``with_space`` runs.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        Cleaned reviews, in the same order as the input.
    """
    stripped = [no_space.sub("", line.lower()) for line in reviews]
    return [with_space.sub(" ", line) for line in stripped]
# Apply the same cleaning pass to the training and test reviews.
clean_train_reviews, clean_test_reviews = (
    cleaningandprocessing(train_reviews),
    cleaningandprocessing(test_reviews),
)

## REMOVE STOPWORDS

In [6]:
# Fetch the NLTK stopwords corpus before importing the reader that uses it;
# the download call reports "already up-to-date" when the corpus is present.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words used by remove_stop_words to filter review tokens.
english_stop_words = stopwords.words('english')
def remove_stop_words(corpus, stop_words=None):
    """Drop stop words from every review in ``corpus``.

    Each review is split on whitespace, tokens found in the stop-word
    vocabulary are discarded, and the remaining tokens are re-joined with
    single spaces.

    Parameters
    ----------
    corpus : iterable of str
        Review texts to filter.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the module-level ``english_stop_words``.

    Returns
    -------
    list of str
        Filtered reviews, in the same order as the input.
    """
    if stop_words is None:
        stop_words = english_stop_words
    # A set gives constant-time membership checks; testing each word against
    # a list rescans the whole vocabulary for every token.
    stop_set = set(stop_words)
    return [
        ' '.join(word for word in review.split() if word not in stop_set)
        for review in corpus
    ]

# Filter stop words out of the cleaned training and test reviews.
no_stop_words_train, no_stop_words_test = (
    remove_stop_words(clean_train_reviews),
    remove_stop_words(clean_test_reviews),
)

  from collections import Mapping
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srija\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Display the training review at index 1 after stop-word removal.
no_stop_words_train[1]

'homelessness houselessness george carlin stated issue years never plan help street considered human everything going school work vote matter people think homeless lost cause worrying things racism war iraq pressuring kids succeed technology elections inflation worrying theyll next end streets given bet live streets month without luxuries home entertainment sets bathroom pictures wall computer everything treasure see like homeless goddard bolts lesson mel brooks directs stars bolt plays rich man everything world deciding make bet sissy rival jeffery tambor see live streets thirty days without luxuries bolt succeeds wants future project making buildings bets bolt thrown street bracelet leg monitor every move cant step sidewalk hes given nickname pepto vagrant written forehead bolt meets characters including woman name molly lesley ann warren ex dancer got divorce losing home pals sailor howard morris fumes teddy wilson already used streets theyre survivors bolt isnt hes used reaching mu

In [8]:
# Display the same training review (index 1) before stop-word removal.
clean_train_reviews[1]

'homelessness or houselessness as george carlin stated has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school work or vote for the matter most people think of the homeless as just a lost cause while worrying about things such as racism the war on iraq pressuring kids to succeed technology the elections inflation or worrying if theyll be next to end up on the streets but what if you were given a bet to live on the streets for a month without the luxuries you once had from a home the entertainment sets a bathroom pictures on the wall a computer and everything you once treasure to see what its like to be homeless that is goddard bolts lesson mel brooks who directs who stars as bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival jeffery tambor to see if he can live in the streets for thirty days without the luxuries if bolt succeeds he can do what he w

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

In [10]:
# Binary bag-of-words: the vocabulary is learned from the training reviews
# only and then reused to encode the test reviews.
cv = CountVectorizer(binary=True)
X_train = cv.fit_transform(no_stop_words_train)
X_test = cv.transform(no_stop_words_test)

# Labels by position: rows 0..12499 are class 1, rows 12500..24999 are class 0.
# NOTE(review): this assumes full_train.txt has exactly 25000 lines with the
# positive reviews first — confirm against the data file.
target = [1] * 12500 + [0] * 12500

# NOTE(review): test_size=0.8 holds out 80% of the rows for validation, so the
# models in the C sweep are fit on only 20% of the data — confirm this is
# intended rather than train_size=0.8.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    target,
    test_size=0.8,
    random_state=42,
)

In [11]:
# Sweep the inverse regularisation strength C and report validation metrics
# for each value.
for c in [0.01,0.05,0.1,0.25,0.5,1]:
    lr = LogisticRegression(C=c)
    lr.fit(x_train, y_train)
    # Predict once per model and reuse the result for all three metrics
    # instead of re-running inference for every report.
    val_predictions = lr.predict(x_test)
    print("For: ", c)
    print("\n")
    print(accuracy_score(y_test, val_predictions))
    print("\n")
    print(confusion_matrix(y_test, val_predictions))
    print("\n")
    print(classification_report(y_test, val_predictions))
    print("\n\n")

For:  0.01


0.8502


[[8410 1552]
 [1444 8594]]


             precision    recall  f1-score   support

          0       0.85      0.84      0.85      9962
          1       0.85      0.86      0.85     10038

avg / total       0.85      0.85      0.85     20000




For:  0.05


0.85795


[[8489 1473]
 [1368 8670]]


             precision    recall  f1-score   support

          0       0.86      0.85      0.86      9962
          1       0.85      0.86      0.86     10038

avg / total       0.86      0.86      0.86     20000




For:  0.1


0.85835


[[8492 1470]
 [1363 8675]]


             precision    recall  f1-score   support

          0       0.86      0.85      0.86      9962
          1       0.86      0.86      0.86     10038

avg / total       0.86      0.86      0.86     20000




For:  0.25


0.85545


[[8468 1494]
 [1397 8641]]


             precision    recall  f1-score   support

          0       0.86      0.85      0.85      9962
          1       0.85      0.86 

### C=0.1 GIVES THE MAXIMUM VALIDATION ACCURACY: 0.858

In [12]:
# Refit on the full training matrix with C=0.1 and evaluate on the test matrix.
final_model = LogisticRegression(C=0.1)
final_model.fit(X_train, target)
predictions1 = final_model.predict(X_test)

# The test predictions are scored against the same positional labels as the
# training set. Give them their own name and verify the lengths agree so the
# reuse is explicit rather than accidental.
# NOTE(review): assumes full_test.txt is ordered like full_train.txt
# (class 1 rows first, then class 0) — confirm against the data file.
test_target = target
if len(test_target) != X_test.shape[0]:
    raise ValueError(
        "test label count (%d) does not match number of test rows (%d)"
        % (len(test_target), X_test.shape[0])
    )

print("Accuracy score is: ",accuracy_score(test_target,predictions1))
print("\n")
print("Confusion matrix is: ", confusion_matrix(test_target, predictions1))
print("\n")
print("Classification matrix is: ",classification_report(test_target, predictions1))

Accuracy score is:  0.87796


Confusion matrix is:  [[10932  1568]
 [ 1483 11017]]


Classification matrix is:               precision    recall  f1-score   support

          0       0.88      0.87      0.88     12500
          1       0.88      0.88      0.88     12500

avg / total       0.88      0.88      0.88     25000



### TEST ACCURACY AFTER REMOVING STOPWORDS IS 87.80%