## IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import os 
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC

  from collections import Sequence
  from collections import Iterable
  from collections import Mapping, namedtuple, defaultdict, Sequence


## READING FILES

In [2]:
# Load the training reviews: one review per line, surrounding whitespace stripped.
# The context manager guarantees the file handle is closed once reading finishes
# (the bare `open(...)` inside a for-statement left it open until garbage collection).
with open('../movie-sentiment-analysis/aclImdb/movie_data/full_train.txt', 'r', encoding='utf-8') as train_file:
    train_review = [line.strip() for line in train_file]

In [3]:
# Load the test reviews: one review per line, surrounding whitespace stripped.
# The context manager guarantees the file handle is closed once reading finishes
# (the bare `open(...)` inside a for-statement left it open until garbage collection).
with open('../movie-sentiment-analysis/aclImdb/movie_data/full_test.txt', 'r', encoding='utf-8') as test_file:
    test_review = [line.strip() for line in test_file]

## CLEANING AND PREPROCESSING

In [4]:
# Patterns are raw strings so that regex escapes such as `\d` and `\s` are not
# parsed as (invalid) Python string escapes, which triggers warnings on modern Python.
# Deleted outright: . ; : ! ' ? , " ( ) [ ] and any run of digits.
no_space = re.compile(r"[.;:!'?,\"()\[\]]|\d+")
# Replaced by a single space: a doubled HTML line break, a hyphen, or a slash.
with_space = re.compile(r"<br\s*/><br\s*/>|[-/]")


def cleaningandprocessing(reviews):
    """Normalise raw review strings for vectorisation.

    Each review is lower-cased, then the characters matched by ``no_space``
    are removed, then every match of ``with_space`` is replaced by one space.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        Cleaned reviews, in the same order as the input.
    """
    return [
        with_space.sub(" ", no_space.sub("", line.lower()))
        for line in reviews
    ]
# Run the same cleaning routine over the training corpus and then the test corpus.
clean_train_reviews, clean_test_reviews = (
    cleaningandprocessing(corpus) for corpus in (train_review, test_review)
)

## FINAL MODEL TRAINING AND PREDICTION USING SVM

In [5]:
# Short hand-written stop-word list handed to the vectoriser.
stop_words = [
    'in',
    'of',
    'at',
    'a',
    'the',
]
# Binary (presence/absence) features over unigrams and bigrams.
cv = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 2),
    binary=True,
)

In [6]:
# Fit the vectoriser on the cleaned training reviews only.
# Kept as the cell's last expression, so the fitted vectoriser's repr is shown.
cv.fit(clean_train_reviews)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['in', 'of', 'at', 'a', 'the'], strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [7]:
# Encode both cleaned corpora with the vectoriser fitted on the training reviews.
X_train = cv.transform(clean_train_reviews)
X_test = cv.transform(clean_test_reviews)
# Labels for 25000 rows: the first 12500 are 1, the remaining 12500 are 0.
# NOTE(review): presumably the data files list positive reviews first — confirm.
target = [int(i < 12500) for i in range(25000)]
# NOTE(review): test_size=0.8 holds out 80% of the rows, so the models in the
# sweep are fitted on only 20% of the data — confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    target,
    random_state=42,
    test_size=0.8,
)

In [8]:
# Try several values of the LinearSVC regularisation parameter C and report
# the hold-out metrics for each one.
for C in (0.01, 0.05, 0.1, 0.25, 0.5, 1):
    sv = LinearSVC(C=C).fit(x_train, y_train)
    predictions = sv.predict(x_test)
    # Each `end` reproduces the blank lines the report is separated by.
    print("For :", C, end="\n\n\n")
    print("accuracy score is ", accuracy_score(y_test, predictions), end="\n\n\n")
    print("confusion matrix is ", confusion_matrix(y_test, predictions), end="\n\n\n")
    print(
        "classification report is ",
        classification_report(y_test, predictions),
        end="\n" * 6,
    )

For : 0.01


accuracy score is  0.86645


confusion matrix is  [[8581 1381]
 [1290 8748]]


classification report is               precision    recall  f1-score   support

          0       0.87      0.86      0.87      9962
          1       0.86      0.87      0.87     10038

avg / total       0.87      0.87      0.87     20000




For : 0.05


accuracy score is  0.8632


confusion matrix is  [[8547 1415]
 [1321 8717]]


classification report is               precision    recall  f1-score   support

          0       0.87      0.86      0.86      9962
          1       0.86      0.87      0.86     10038

avg / total       0.86      0.86      0.86     20000




For : 0.1


accuracy score is  0.8623


confusion matrix is  [[8537 1425]
 [1329 8709]]


classification report is               precision    recall  f1-score   support

          0       0.87      0.86      0.86      9962
          1       0.86      0.87      0.86     10038

avg / total       0.86      0.86      0.86     20000

## FOR C = 0.01 THE MAXIMUM VALIDATION ACCURACY SCORE IS 86.645 %

In [10]:
# Refit on the complete training matrix with C=0.01, then label the test matrix.
final_model = LinearSVC(C=0.01).fit(X_train, target)
predictions = final_model.predict(X_test)

In [11]:
# Score the test-set predictions.
# NOTE(review): `target` was built for the training rows and is reused here as
# the test labels; this assumes the test file has the same size and label
# ordering — confirm.
print("Accuracy Score is ", accuracy_score(target, predictions), end="\n\n\n")
print("Confusion matrix is ", confusion_matrix(target, predictions), end="\n\n\n")
print("Classification report is ", classification_report(target, predictions))

Accuracy Score is  0.89932


Confusion matrix is  [[11200  1300]
 [ 1217 11283]]


Classification report is               precision    recall  f1-score   support

          0       0.90      0.90      0.90     12500
          1       0.90      0.90      0.90     12500

avg / total       0.90      0.90      0.90     25000



### TEST ACCURACY OBTAINED IS 89.932 %, WHICH IS ROUGHLY 90 %