In [36]:
import random

class Sentiment:
    """Namespace of the sentiment labels used as classification targets.

    The labels are plain strings, so they can be compared and counted
    directly (e.g. ``labels.count(Sentiment.NEGATIVE)``).
    """

    # Label values are the upper-case spelling of their own names.
    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"
    NEGATIVE = "NEGATIVE"

class Review:
    """A single review: its text, its numeric score and the derived label.

    Attributes:
        text: The review body.
        score: The numeric rating the review was given.
        sentiment: One of the ``Sentiment`` labels, computed once from
            ``score`` when the object is created.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        Returns:
            ``Sentiment.POSITIVE`` for scores of 4 or more,
            ``Sentiment.NEGATIVE`` for scores of 2 or less, and
            ``Sentiment.NEUTRAL`` for everything in between.
        """
        if self.score >= 4:
            return Sentiment.POSITIVE
        # Use a range rather than ``== 3`` for the neutral band: an exact
        # equality test sent fractional scores such as 3.5 to NEGATIVE.
        # Whole-number scores 1..5 are labelled exactly as before.
        if self.score <= 2:
            return Sentiment.NEGATIVE
        return Sentiment.NEUTRAL

class ReviewContainer:
    """Holds a list of review objects and exposes them as model inputs.

    Each item only needs a ``text`` and a ``sentiment`` attribute.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance the container to equally many NEGATIVE and POSITIVE reviews.

        Both classes are cut down to the size of the smaller one (keeping the
        first items of each), the result is shuffled, and ``self.reviews`` is
        replaced by it. Reviews with any other label (e.g. NEUTRAL) are
        dropped. If either class is empty the container ends up empty.

        Args:
            seed: Optional seed for a reproducible shuffle. When ``None``
                (the default) the module-level ``random`` state is used, as
                before.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        # Trim *both* classes: trimming only the positives left the set
        # unbalanced whenever negatives outnumbered positives.
        keep = min(len(negative), len(positive))
        balanced = negative[:keep] + positive[:keep]

        if seed is None:
            random.shuffle(balanced)
        else:
            # A private generator keeps the global random state untouched.
            random.Random(seed).shuffle(balanced)
        self.reviews = balanced
        

In [37]:
import json

file_name = './data/sentiment/Books_small_10000.json'

# The file is read as JSON Lines: one review object per line.
reviews = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open(file_name, encoding='utf-8') as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline at the end of the
        # file) instead of letting json.loads raise on an empty string.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed random_state makes the
# split repeatable across runs.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# Wrap each split so its text and labels can be extracted and the classes balanced.
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

In [39]:
# evenly_distribute() shuffles with the global `random` module. Seed it so the
# row order (and therefore the `[0]` spot checks and anything else that
# depends on row order further down) is the same on every run.
random.seed(42)

# Balance the training split, then pull out the model inputs and targets.
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()


# Same treatment for the test split.
test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

In [40]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# CountVectorizer -> bag-of-words counts: every occurrence of a word carries the same weight
# TfidfVectorizer -> down-weights words that are common across documents, so rarer words get higher values

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
# Only transform here (no fit): the test text must be encoded with the
# vocabulary and weights that were learned from the training text above.
test_x_vectors = vectorizer.transform(test_x)


In [41]:
# Sanity check: label of the first review in the raw (not yet balanced) training split.
print(training[0].sentiment)

POSITIVE


#### Linear SVM

In [42]:
from sklearn import svm

# Support-vector classifier with a linear kernel.
clf_svm = svm.SVC(kernel='linear')

# Train on the TF-IDF vectors of the balanced training set.
clf_svm.fit(train_x_vectors, train_y)

# Spot check: predicted label for the first row of the test vectors.
clf_svm.predict(test_x_vectors[0])


array(['NEGATIVE'], dtype='<U8')

#### Decision Tree


In [43]:
from sklearn.tree import DecisionTreeClassifier

# Pin random_state: the tree builder uses randomness when choosing splits, so
# without a seed the fitted tree (and its score) can change between runs.
clf_tree = DecisionTreeClassifier(random_state=42)

# Train on the TF-IDF vectors of the balanced training set.
clf_tree.fit(train_x_vectors, train_y)

# Spot check: predicted label for the first row of the test vectors.
clf_tree.predict(test_x_vectors[0])



array(['NEGATIVE'], dtype='<U8')

In [44]:
from sklearn.metrics import accuracy_score

# There are two ways to measure accuracy on the test set.

# 1) Predict first, then compare against the true labels:
#      prediction = clf_tree.predict(test_x_vectors)
#      accuracy_score(test_y, prediction)

# 2) Let the estimator do both steps in one call (used here):
clf_tree.score(test_x_vectors, test_y)

0.6442307692307693

In [45]:
# Accuracy of the linear SVM, computed from explicit predictions.
prediction = clf_svm.predict(test_x_vectors)
accuracy_score(test_y, prediction)

0.8076923076923077

In [46]:
from sklearn.metrics import f1_score

# Per-class F1 for the linear SVM: average=None yields one score per entry
# of `labels`, in that order (POSITIVE first, then NEGATIVE).
f1_score(test_y,
         clf_svm.predict(test_x_vectors),
         average=None,
         labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])

array([0.80582524, 0.80952381])

In [47]:
# Per-class F1 for the decision tree: average=None yields one score per entry
# of `labels`, in that order (POSITIVE first, then NEGATIVE).
f1_score(test_y,
         clf_tree.predict(test_x_vectors),
         average=None,
         labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])

array([0.6372549, 0.6509434])

In [48]:
# Number of NEGATIVE labels in the balanced training targets.
train_y.count(Sentiment.NEGATIVE)

436

#### Test of our model

In [49]:
# Try the model on a hand-written sentence.
test_set = ["I don't like you"]
# New text goes through the already-fitted vectorizer (transform only), so it
# is encoded the same way as the data the classifier was trained on.
new_test = vectorizer.transform(test_set)
clf_svm.predict(new_test)

array(['NEGATIVE'], dtype='<U8')

#### Tuning our model (with Grid Search)

In [56]:
from sklearn.model_selection import GridSearchCV

# Search space: 2 kernels x 5 values of C = 10 candidate settings.
parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
# cv=5 -> each candidate is evaluated with 5-fold cross-validation on the training data.
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [58]:
# Evaluate the grid-searched model on the test vectors.
clf.score(test_x_vectors, test_y)

0.8197115384615384

### Saving model

In [59]:
import os
import pickle

# open(..., 'wb') does not create missing directories, so make sure the
# target folder exists before writing.
os.makedirs('./models', exist_ok=True)

with open('./models/Real-World_AI.pkl', 'wb') as file:
    pickle.dump(clf, file)

# The classifier was trained on vectors produced by the fitted `vectorizer`,
# so it cannot be applied to raw text without it. Save the vectorizer in its
# own file next to the model; the model file itself is unchanged.
with open('./models/Real-World_AI_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

#### Load model


In [60]:
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# you created yourself or otherwise trust.
with open('./models/Real-World_AI.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# The model was fitted on vectors from `vectorizer`, so raw text must be passed
# through that same fitted vectorizer before calling loaded_model.predict().
print(loaded_model)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})
