## IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import os 
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

##  READ FILE

In [2]:
# Read the training file into a list with one whitespace-stripped entry per line.
# The `with` block guarantees the file handle is closed once the list is built.
with open('../movie-sentiment-analysis/aclImdb/movie_data/full_train.txt','r',encoding='utf-8') as train_file:
    train_review = [line.strip() for line in train_file]


In [3]:
# Read the test file into a list with one whitespace-stripped entry per line.
# The `with` block guarantees the file handle is closed once the list is built.
with open('../movie-sentiment-analysis/aclImdb/movie_data/full_test.txt','r',encoding='utf-8') as test_file:
    test_review = [line.strip() for line in test_file]

## CLEANING AND PROCESSING

In [4]:
# Characters that are deleted outright: . ; : ! ' ? , " ( ) [ ] and any run of digits.
# Raw strings keep the regex escapes (\[, \d, \s) away from Python's own
# string-escape processing, which warns on unrecognised sequences.
no_space = re.compile(r"""[.;:!'?,"()\[\]]|\d+""")
# Sequences that are replaced by a single space: a doubled HTML line break
# (<br /><br />), a hyphen, or a forward slash.
# Note: a lone <br /> does not match the doubled-break alternative; only its
# slash is replaced, so "<br  >" is left behind in the text.
with_space = re.compile(r"<br\s*/><br\s*/>|[-/]")


def cleaningandprocessing(reviews):
    """Lower-case each review and strip punctuation, digits and break markup.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        One cleaned string per input review, in the same order. Each string is
        lower-cased, then everything matched by ``no_space`` is removed, then
        everything matched by ``with_space`` is replaced by a single space.
    """
    # Order matters: deletion runs first, substitution second, exactly as two
    # separate passes would do.
    return [
        with_space.sub(" ", no_space.sub("", line.lower()))
        for line in reviews
    ]
# Run the same cleaning routine over both corpora (training first, then test).
clean_train_reviews, clean_test_reviews = map(
    cleaningandprocessing, (train_review, test_review)
)

## USE OF TF-IDF VECTORIZER

In [5]:
# Fit the TF-IDF vectoriser on the cleaned training reviews and vectorise them
# in a single pass (fit_transform), instead of tokenising the corpus twice with
# separate fit() and transform() calls.
cv = TfidfVectorizer()
X_train = cv.fit_transform(clean_train_reviews)
# The test reviews are only transformed, using the vectoriser fitted above.
X_test = cv.transform(clean_test_reviews)
# Positional labels: indices 0..12499 are labelled 1, indices 12500..24999 are labelled 0.
# NOTE(review): assumes the training file holds exactly 25,000 reviews in that order — confirm.
target = [1 if i <12500 else 0 for i in range(25000)]
# Hold-out split of the training matrix for the hyper-parameter sweep.
# NOTE(review): test_size=0.8 keeps only 20% of the rows for fitting and holds
# out 80% for validation — confirm this ratio is intended.
x_train, x_test, y_train, y_test = train_test_split(X_train, target, test_size=0.8, random_state=42)

In [6]:
# Fit one logistic-regression model per candidate value of C and report its
# accuracy, confusion matrix and classification report on the held-out split.
for c in [0.01,0.05,0.1,0.25,0.5,1]:
    lr = LogisticRegression(C=c)
    lr.fit(x_train, y_train)
    # Predict once per model and reuse the result for all three reports,
    # rather than re-running predict() for every metric.
    val_predictions = lr.predict(x_test)
    print("For: ", c)
    print("\n")
    print(accuracy_score(y_test, val_predictions))
    print("\n")
    print(confusion_matrix(y_test, val_predictions))
    print("\n")
    print(classification_report(y_test, val_predictions))
    print("\n\n")

For:  0.01


0.7201


[[9280  682]
 [4916 5122]]


             precision    recall  f1-score   support

          0       0.65      0.93      0.77      9962
          1       0.88      0.51      0.65     10038

avg / total       0.77      0.72      0.71     20000




For:  0.05


0.7901


[[8048 1914]
 [2284 7754]]


             precision    recall  f1-score   support

          0       0.78      0.81      0.79      9962
          1       0.80      0.77      0.79     10038

avg / total       0.79      0.79      0.79     20000




For:  0.1


0.80695


[[7985 1977]
 [1884 8154]]


             precision    recall  f1-score   support

          0       0.81      0.80      0.81      9962
          1       0.80      0.81      0.81     10038

avg / total       0.81      0.81      0.81     20000




For:  0.25


0.82975


[[8118 1844]
 [1561 8477]]


             precision    recall  f1-score   support

          0       0.84      0.81      0.83      9962
          1       0.82      0.84  

### FOR C = 1 ACCURACY SCORE = 85.9%

In [7]:
# Train the final classifier (C=1) on the full training matrix and score it on
# the vectorised test reviews.
final_model = LogisticRegression(C=1).fit(X_train, target)
predictions1 = final_model.predict(X_test)

# NOTE(review): `target` is the label list passed to fit() above and is reused
# here as the ground truth for the test predictions — this presumably relies on
# the test file having the same size and label ordering; confirm.
reports = (
    ("Accuracy score is: ", accuracy_score),
    ("Confusion matrix is: ", confusion_matrix),
    ("Classification matrix is: ", classification_report),
)
for position, (label, metric) in enumerate(reports):
    # A blank separator is printed between reports, but not before the first one.
    if position:
        print("\n")
    print(label, metric(target, predictions1))

Accuracy score is:  0.88248


Confusion matrix is:  [[11039  1461]
 [ 1477 11023]]


Classification matrix is:               precision    recall  f1-score   support

          0       0.88      0.88      0.88     12500
          1       0.88      0.88      0.88     12500

avg / total       0.88      0.88      0.88     25000



## FINAL MODEL ACCURACY SCORE IS 88.25% (0.88248)