# Classes

In [77]:
class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"
    
class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <=2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

# Load Data

In [78]:
import json

file_name = 'books_small.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        
        
reviews[5].text

'Love the book, great story line, keeps you entertained.for a first novel from this author she did a great job,  Would definitely recommend!'

# Prep Data

In [79]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [80]:
train_x = [x.text for x in train]
train_y = [x.sentiment for x in train]

test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

##### Bag of words

In [81]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)


# Classification

#### SVM

In [82]:
from sklearn.svm import SVC

clf = SVC(kernel='linear')

clf.fit(train_x_vectors, train_y)

svm_score = (clf.score(test_x_vectors, test_y))

print('SVM Score:', svm_score)


SVM Score: 0.8242424242424242


#### Decision Tree

In [83]:
from sklearn.tree import DecisionTreeClassifier

dec = DecisionTreeClassifier()

dec.fit(train_x_vectors, train_y)

dec_score = dec.score(test_x_vectors, test_y)

print('Decision Tree Score:', dec_score)

Decision Tree Score: 0.7575757575757576


#### Naive Bayes

In [84]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

nb.fit(train_x_vectors.toarray(), train_y)

nb_score = nb.score(test_x_vectors.toarray(), test_y)

print('Naive Bayes Score:', nb_score)

Naive Bayes Score: 0.8121212121212121


#### Logistic Regression

In [85]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression()

log.fit(train_x_vectors, train_y)

log_score = log.score(test_x_vectors, test_y)

print('Decision Tree Score:', log_score)

Decision Tree Score: 0.8303030303030303


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# F1 Scores

In [88]:
from sklearn.metrics import f1_score

print(f1_score(test_y, clf.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print(f1_score(test_y, dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print(f1_score(test_y, nb.predict(test_x_vectors.toarray()), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print(f1_score(test_y, log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))


[0.91319444 0.21052632 0.22222222]
[0.86879433 0.11594203 0.07407407]
[0.89678511 0.08510638 0.09090909]
[0.91370558 0.12244898 0.1       ]
