In [91]:
import random
class Review:
    """A single review: its text, its numeric score, and a derived sentiment label."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # The label is derived from the score once, at construction time.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return "NEGATIVE" for scores <= 2, "NEUTRAL" for a score of 3, else "POSITIVE"."""
        rating = self.score
        if rating <= 2:
            return "NEGATIVE"
        if rating == 3:
            return "NEUTRAL"
        return "POSITIVE"
        
class ReviewContainer:
    """Holds a list of reviews and can rebalance it by sentiment."""

    def __init__(self, reviews):
        self.reviews = reviews

    def evenly_distribute(self):
        """Replace ``self.reviews`` with a shuffled, class-balanced subset.

        Keeps the same number of "NEGATIVE" and "POSITIVE" reviews (the size of
        the smaller class, taken from the front of each class in its current
        order) and drops everything else, including "NEUTRAL" reviews.

        The result is shuffled with the module-level ``random`` generator, so
        call ``random.seed(...)`` beforehand if a reproducible order is needed.
        """
        neg = [r for r in self.reviews if r.sentiment == "NEGATIVE"]
        pos = [r for r in self.reviews if r.sentiment == "POSITIVE"]
        # Truncate whichever class is larger, so the result is balanced even
        # when negatives outnumber positives.
        keep = min(len(neg), len(pos))
        self.reviews = neg[:keep] + pos[:keep]
        random.shuffle(self.reviews)

In [79]:
import json

file_name = './books_small_10000.json'

# The file is read as one JSON object per line; each object supplies the
# review text ('reviewText') and its score ('overall').
reviews = []
# Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. an extra newline at the end of the file),
        # which json.loads would reject.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        

#### TRAINING DATA

In [102]:
from sklearn.model_selection import train_test_split
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# evenly_distribute() shuffles with the global `random` generator; seed it so the
# balanced sets come out in the same order on every run, like the split above.
random.seed(42)

# Balance NEGATIVE/POSITIVE reviews in both the training and the test set.
train_cont = ReviewContainer(training)
train_cont.evenly_distribute()
test_cont = ReviewContainer(test)
test_cont.evenly_distribute()
len(test_cont.reviews)

416

In [103]:
# print(training[0].text)
# Split each balanced container into two parallel lists:
# the review texts (inputs) and their sentiment labels (targets).
train_x, train_y = [], []
for sample in train_cont.reviews:
    train_x.append(sample.text)
    train_y.append(sample.sentiment)

test_x, test_y = [], []
for sample in test_cont.reviews:
    test_x.append(sample.text)
    test_y.append(sample.sentiment)

# print(train_x[0])

#### BAG-OF-WORDS VECTORIZATION

In [127]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Fit the TF-IDF vectorizer on the training texts only; the test texts are
# transformed with that already-fitted vectorizer (no refit on test data).
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)
# print(train_x_vectors.toarray())
# print(train_x_vectors[0])

#### LINEAR SVM

In [128]:
from sklearn import svm

# Support vector classifier with a linear kernel, fitted on the training vectors.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

# print(test_x[0])
# print(test_x_vectors[0])
# Sanity check: predict the label of the first test review.
clf_svm.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### decision tree

In [129]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed: scikit-learn permutes the candidate features at each split, so
# without random_state the fitted tree (and its score) can differ between runs.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Sanity check: predict the label of the first test review.
clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

#### naive bayes

In [130]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
# Gaussian naive Bayes. The TF-IDF matrices are converted with .toarray()
# before being passed to fit/predict, unlike for the other two classifiers.
clf_nb = GaussianNB()
clf_nb.fit(train_x_vectors.toarray(),train_y)

# Sanity check: predict the label of the first test review.
clf_nb.predict(test_x_vectors[0].toarray())

array(['NEGATIVE'], dtype='<U8')

#### evaluation

In [131]:
# Mean accuracy on the test set, one line per model, in the order:
# SVM, decision tree, naive Bayes.
# The naive Bayes model is scored on the .toarray() version of the test matrix.
for fitted_model, test_features in (
    (clf_svm, test_x_vectors),
    (clf_dec, test_x_vectors),
    (clf_nb, test_x_vectors.toarray()),
):
    print(fitted_model.score(test_features, test_y))

0.8076923076923077
0.6610576923076923
0.6610576923076923


In [132]:
# f1 score
from sklearn.metrics import f1_score
# Per-class F1 of the SVM on the test set: average=None returns one score per
# entry of `labels` instead of a single averaged value.
svm_test_predictions = clf_svm.predict(test_x_vectors)
f1_score(
    test_y,
    svm_test_predictions,
    labels=["POSITIVE", "NEGATIVE"],
    average=None,
)
# f1_score(test_y,clf_dec.predict(test_x_vectors), average=None,labels=["POSITIVE","NEUTRAL","NEGATIVE"])

array([0.80582524, 0.80952381])

#### test with own data

In [134]:
# Hand-written examples. They go through the already-fitted vectorizer
# (transform only, no refit) before being classified by the linear SVM.
new_test=["i love this product","fucking disgusting","what is this","nice"]
new_test_vector = vectorizer.transform(new_test)

clf_svm.predict(new_test_vector)

array(['POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE'], dtype='<U8')

#### tuning our model

In [137]:
from sklearn.model_selection import GridSearchCV
# Search grid: every combination of the two kernels and the five C values.
parameters = {'kernel':('linear','rbf'),'C':(1,4,8,16,32)}

svc = svm.SVC()
# cv=5: each combination is evaluated with 5-fold cross-validation on the training data.
clf= GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors,train_y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'kernel': ('linear', 'rbf'), 'C': (1, 4, 8, 16, 32)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [139]:
# Score of the fitted grid-search object on the test set.
print(clf.score(test_x_vectors,test_y))

0.8076923076923077


#### saving the clf classifier

In [141]:
import pickle

# Serialize the fitted grid-search object to disk.
# NOTE(review): only `clf` is saved; the fitted `vectorizer` is not, so anyone
# loading this file still needs that same vectorizer to turn raw text into
# the feature vectors the model was trained on.
with open('./sentiment_classifier.pkl','wb') as f:
    pickle.dump(clf, f)

#### loading the clf model

In [142]:
# Read the classifier back from the file written by the saving cell.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickle files from a source you trust.
with open('./sentiment_classifier.pkl','rb') as f:
    loaded_clf = pickle.load(f)

In [144]:
# Show the first test review and the label the reloaded classifier predicts
# for its already-vectorized form.
print(test_x[0])
loaded_clf.predict(test_x_vectors[0])

I wanted to like this and I wanted to read all of them because it sounded so go. But, I won't. The characters are so immature and the writing was just not good.


array(['NEGATIVE'], dtype='<U8')