In [3]:
class Sentiment:
    POSITIVE = 'POSITIVE'
    NEUTRAL = 'NEUTRAL'
    NEGATIVE = 'NEGATIVE'

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score >=4:
            return Sentiment.POSITIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.NEGATIVE

In [4]:
import json

file_name = './data/category/Books_small.json'

reviews = []
with open(file_name) as file:
    for line in file:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
print(reviews[34].sentiment)

NEGATIVE


In [9]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [6]:
train_x = [x.text for x in training]
train_y = [y.sentiment for y in training]

test_x = [x.text for x in test]
test_y = [y.sentiment for y in test]


In [22]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

train_x_vectors = vectorizer.fit_transform(train_x)

# Don't use fit cause we just did this, just transform our test_x
test_x_vectors = vectorizer.transform(test_x)


In [27]:
print(training[0].sentiment)

POSITIVE


#### Linear SVM

In [23]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

clf_svm.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

#### Decision Tree


In [25]:
from sklearn.tree import DecisionTreeClassifier

clf_tree = DecisionTreeClassifier()

clf_tree.fit(train_x_vectors, train_y)

clf_tree.predict(test_x_vectors[0])



array(['POSITIVE'], dtype='<U8')

In [37]:
from sklearn.metrics import accuracy_score

# There are two methods of checking accuracy

# prediction = clf_tree.predict(test_x_vectors)
# accuracy_score(test_y, prediction)

clf_tree.score(test_x_vectors, test_y)

0.7606060606060606

In [33]:
prediction = clf_svm.predict(test_x_vectors)
accuracy_score(test_y, prediction)

0.8242424242424242

In [45]:
from sklearn.metrics import f1_score

f1_score(test_y,
         clf_svm.predict(test_x_vectors),
         average=None,
         labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

f1_score(test_y,
         clf_tree.predict(test_x_vectors),
         average=None,
         labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

array([0.87079646, 0.12307692, 0.06666667])