In [270]:
class Review:
    """A single review: its text, its numeric score, and a derived label.

    Attributes:
        text: The review body.
        score: The numeric rating attached to the review.
        sentiment: "POSITIVE" or "NEGATIVE", derived from ``score`` once,
            at construction time.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Computed eagerly; changing ``score`` afterwards does not refresh it.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return "POSITIVE" when the score is 3 or higher, else "NEGATIVE"."""
        return "POSITIVE" if self.score >= 3 else "NEGATIVE"

In [271]:
import json

filename = "./data/sentiment/books_small_10000.json"

# The file is read as one JSON object per line; each object must provide the
# 'reviewText' and 'overall' keys used below.
reviews = []
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which differs between systems and can fail on non-ASCII text.
with open(filename, encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Sanity check: show the second review with its score and derived sentiment.
print(reviews[1].text, reviews[1].score, reviews[1].sentiment)

I enjoyed this short book. But it was way way to short ....I can see how easily it would have been to add several chapters. 3.0 POSITIVE


### Fair Distribution

In [272]:
# Split the reviews by label so both classes can be cut to the same size.
positive_part = [review for review in reviews if review.sentiment == "POSITIVE"]
negative_part = [review for review in reviews if review.sentiment == "NEGATIVE"]

# Keep the first n reviews of each class, where n is the size of the smaller class.
n = min(len(positive_part), len(negative_part))
reviews = positive_part[:n] + negative_part[:n]

import random
# Shuffle with a dedicated, seeded generator: the order of `reviews` feeds the
# seeded train/test split later on, so an unseeded shuffle would make that split
# (and every reported metric) change on each run. Using a private Random
# instance leaves the global random state untouched.
random.Random(1).shuffle(reviews)

print(reviews[0].sentiment)

POSITIVE


### Train Test Split

In [273]:
from sklearn.model_selection import train_test_split

# Inputs are the review texts; targets are the matching sentiment labels.
X = [review.text for review in reviews]
y = [review.sentiment for review in reviews]

# Hold out 33% of the reviews for testing, using a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

### Vectorization of Text

In [293]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training texts only and vectorize them in
# the same step.
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)

# The test texts are only transformed with the already-fitted vectorizer.
X_test_vectors = vectorizer.transform(X_test)

### Linear SVM

In [294]:
from sklearn import svm

# Support-vector classifier with a linear kernel, trained on the TF-IDF vectors.
clf_svm = svm.SVC(kernel="linear")
clf_svm.fit(X=X_train_vectors, y=y_train)

### Naive Bayes

In [295]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes, trained on a dense copy of the TF-IDF training matrix.
clf_gnb = GaussianNB()
clf_gnb.fit(X=X_train_vectors.toarray(), y=y_train)

### Logistic Regression

In [296]:
from sklearn.linear_model import LogisticRegression

# Logistic regression allowed up to 1000 solver iterations.
clf_log = LogisticRegression(max_iter=1000)
clf_log.fit(X=X_train_vectors, y=y_train)

### Model Evaluation

In [297]:
# One row per fitted model: the message to print, the classifier, and whether
# it is scored on a dense copy of the test matrix (as it was trained).
for message, classifier, needs_dense in (
    ("The Mean Accuracy of Linear SVM Model:", clf_svm, False),
    ("The Mean Accuracy of Naive Bayes Model:", clf_gnb, True),
    ("The Mean Accuracy of Logistic Regression Model:", clf_log, False),
):
    features = X_test_vectors.toarray() if needs_dense else X_test_vectors
    print(message, classifier.score(features, y_test))

The Mean Accuracy of Linear SVM Model: 0.8286384976525821
The Mean Accuracy of Naive Bayes Model: 0.6173708920187794
The Mean Accuracy of Logistic Regression Model: 0.8262910798122066


In [298]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 of the linear SVM on the test split.
svm_predictions = clf_svm.predict(X_test_vectors)
print(classification_report(y_test, svm_predictions))

              precision    recall  f1-score   support

    NEGATIVE       0.82      0.84      0.83       214
    POSITIVE       0.84      0.82      0.83       212

    accuracy                           0.83       426
   macro avg       0.83      0.83      0.83       426
weighted avg       0.83      0.83      0.83       426



In [299]:
# Per-class precision, recall and F1 of the Naive Bayes model on the test split
# (scored on a dense copy of the test matrix, matching how it was trained).
gnb_predictions = clf_gnb.predict(X_test_vectors.toarray())
print(classification_report(y_test, gnb_predictions))

              precision    recall  f1-score   support

    NEGATIVE       0.62      0.61      0.61       214
    POSITIVE       0.61      0.63      0.62       212

    accuracy                           0.62       426
   macro avg       0.62      0.62      0.62       426
weighted avg       0.62      0.62      0.62       426



In [300]:
# Per-class precision, recall and F1 of the logistic regression on the test split.
log_predictions = clf_log.predict(X_test_vectors)
print(classification_report(y_test, log_predictions))

              precision    recall  f1-score   support

    NEGATIVE       0.83      0.82      0.83       214
    POSITIVE       0.82      0.83      0.83       212

    accuracy                           0.83       426
   macro avg       0.83      0.83      0.83       426
weighted avg       0.83      0.83      0.83       426



### Testing

In [301]:
# Hand-written sentences used as a quick spot check of the logistic regression.
new_test = [
    'I hate this book',
    'Bad book',
    'trash book',
    'Amazing book',
    'This book is a masterpiece',
    'What a waste of time',
]
# Reuse the vectorizer fitted on the training texts; it must not be refitted here.
new_test_vectors = vectorizer.transform(new_test)

print(clf_log.predict(new_test_vectors))

['NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'POSITIVE' 'POSITIVE' 'NEGATIVE']


### Saving model

In [303]:
import os
import pickle

model_path = './model/sentiment_classifier.pkl'

# open() does not create missing parent directories, so make sure the target
# directory exists; exist_ok=True makes this a no-op when it already does.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# NOTE: only the classifier is saved. It was trained on vectors produced by the
# fitted `vectorizer`, which is not persisted here, so the pickle alone cannot
# classify raw text in a fresh session.
with open(model_path, 'wb') as f:
    pickle.dump(clf_log, f)

### Loading the Model for Reuse

In [304]:
# Read the pickled classifier back from disk.
# Unpickling can execute arbitrary code: only load pickle files you trust.
with open('./model/sentiment_classifier.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# The restored model is applied to the vectors computed earlier in the session.
print(model.predict(new_test_vectors))

['NEGATIVE' 'NEGATIVE' 'NEGATIVE' 'POSITIVE' 'POSITIVE' 'NEGATIVE']
