In [1]:
import json

In [2]:
file_path = "./data/Books_review.json"

In [3]:
import random

In [4]:
class Review:
    """One review: its text, its numeric score, and a derived sentiment label."""

    def __init__(self, text, score):
        """Store the text and score, then derive the sentiment label once."""
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score to a label.

        Returns "NEGATIVE" for scores <= 2, "NEUTRAL" for a score equal to 3,
        and "POSITIVE" for every other score.
        """
        rating = self.score
        if rating <= 2:
            return "NEGATIVE"
        if rating == 3:
            return "NEUTRAL"
        return "POSITIVE"
        
        
class Reviews_Spread:
    """Container around a list of review objects.

    Each item is expected to expose ``.text`` and ``.sentiment`` attributes.
    """

    def __init__(self,reviews):
        """Keep a reference to the given list of reviews."""
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in the current order."""
        return [review.text for review in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in the current order."""
        return [review.sentiment for review in self.reviews]

    def data_balance(self):
        """Replace ``self.reviews`` with a shuffled, class-balanced subset.

        Only "NEGATIVE" and "POSITIVE" reviews are kept; any other label
        (e.g. "NEUTRAL") is dropped. Both classes are truncated to the size
        of the smaller one, so the result is balanced regardless of which
        class is in the majority. The shuffle uses the global ``random``
        module, so seed it beforehand for a repeatable order.
        """
        negative = [r for r in self.reviews if r.sentiment == "NEGATIVE"]
        positive = [r for r in self.reviews if r.sentiment == "POSITIVE"]
        # Truncate BOTH classes: slicing only the positives leaves the data
        # imbalanced whenever negatives outnumber positives.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)
        
    


In [5]:
reviews = []

In [6]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every review to an already-populated list.
reviews = []

# Each non-empty line of the file is parsed as one JSON object.
# Decode explicitly as UTF-8 instead of relying on the platform default.
with open(file_path, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # json.loads raises on blank lines (e.g. a trailing newline)
        review = json.loads(line)
        reviews.append(Review(review["reviewText"], review["overall"]))

                       
        

In [7]:
reviews[25].text

"Wow...just wow!!!! Omg I absolutely love this sensitive side to Grant!!! My heart hurt for both him and Harlow throughout the book!! I seriously and I mean SERIOUSLY still can't stand Nan!!! I need more....god the wait for book two is gonna kill me!!! You kick some serious a** Abbi and you are one of my absolute favorite authors!!! Thank you so much for writing this series!!!"

In [8]:
reviews[25].score

5.0

In [9]:
reviews[25].sentiment

'POSITIVE'

In [10]:
reviews[10].sentiment

'POSITIVE'

In [11]:
len(reviews)

10000

#### Splitting data into training and test sets

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; random_state is fixed at 42.
training, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)

# Wrap each split so its texts and sentiment labels can be extracted later.
train_container, test_container = Reviews_Spread(training), Reviews_Spread(test)

In [20]:
# data_balance() shuffles with the global `random` module. Seed it here so a
# fresh Restart & Run All yields the same row order (row order feeds the
# unshuffled cross-validation folds used later).
random.seed(42)

# Balance the training split, then pull out parallel lists of texts and labels.
train_container.data_balance()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

# The test split is balanced the same way.
test_container.data_balance()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# Sanity check: both counts should be equal after balancing.
print(train_y.count("POSITIVE"))
print(train_y.count("NEGATIVE"))

436
436


In [21]:
len(train_x),len(test_x)

(872, 416)

#### Bag of words (TF-IDF weighted)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer()

In [23]:
# Fit the vectorizer on the training texts only, then reuse that fitted
# vectorizer to transform the test texts.
train_x_vector = vector.fit_transform(train_x)
test_x_vector = vector.transform(test_x)

In [24]:
print(train_x[0])
print(train_x_vector[0])
train_x_vector.shape

I received a complimentary copy from the author in exchange for an honest review...so I can honestly say...I only finished this book because it was a gifted copy.Wow - I'm so put off and disturbed by the actions of the characters in this book I'm not sure I can continue reading the series. If you enjoyed Cheyenne and Dylan's HEA in When Snow Falls, you might not want to read this book. Cheyenne's on the train to Crazy Town and took Aaron and Presley with her. The whole Cheyenne/Dylan storyline is WRONG. Period. I can't believe the book ends the way it did with them. I thought this book would resolve the huge secret of Wyatt's paternity Cheyenne kept at the end of her book, but instead we're left with an even bigger secret - one that can't possibly bring her any kind of HEA. The Cheyenne/Dylan storyline bothered me so much that it really overshadows everything else in the book.So many characters in the book keeping secrets and lies, supposedly justified, but not really. In addition, the

(872, 8906)

In [25]:
print(test_x[0])
print(test_x_vector[0])
test_x_vector.shape

I really have enjoyed this series and can't wait for the next one to come out. I especially liked that the author gave Kid and Jason their own stories, even though they occurred at the same time. She could have easily put them both into one book and jumped between locations. I am so glad she didn't.The last two installments have not had as much sensuality as the previous 3 books, but the story line is there, so you don't necessarily miss it. I do like how she has not just dropped the other characters in sacrifice to the current beau.This is a great series.
  (0, 8879)	0.05970852573215447
  (0, 8572)	0.10823912621830749
  (0, 8255)	0.09201207568656643
  (0, 8052)	0.07054933230160364
  (0, 8036)	0.07459027771864277
  (0, 7986)	0.10476935339308607
  (0, 7976)	0.07277587875510333
  (0, 7956)	0.07605782076223942
  (0, 7951)	0.0703043261387078
  (0, 7938)	0.0823988133577062
  (0, 7935)	0.08851517348930724
  (0, 7929)	0.2595876394685867
  (0, 7925)	0.048259435249688475
  (0, 7533)	0.056869381

(416, 8906)

##  Modeling 

#### SVC (linear kernel)

In [26]:
from sklearn import svm

svm_clf = svm.SVC(kernel='linear')


In [29]:
test_x[0]

"I really have enjoyed this series and can't wait for the next one to come out. I especially liked that the author gave Kid and Jason their own stories, even though they occurred at the same time. She could have easily put them both into one book and jumped between locations. I am so glad she didn't.The last two installments have not had as much sensuality as the previous 3 books, but the story line is there, so you don't necessarily miss it. I do like how she has not just dropped the other characters in sacrifice to the current beau.This is a great series."

In [27]:
test_y[0]

'POSITIVE'

In [28]:
print(test_x_vector[0])

  (0, 8879)	0.05970852573215447
  (0, 8572)	0.10823912621830749
  (0, 8255)	0.09201207568656643
  (0, 8052)	0.07054933230160364
  (0, 8036)	0.07459027771864277
  (0, 7986)	0.10476935339308607
  (0, 7976)	0.07277587875510333
  (0, 7956)	0.07605782076223942
  (0, 7951)	0.0703043261387078
  (0, 7938)	0.0823988133577062
  (0, 7935)	0.08851517348930724
  (0, 7929)	0.2595876394685867
  (0, 7925)	0.048259435249688475
  (0, 7533)	0.05686938102762462
  (0, 7530)	0.09982061132562985
  (0, 7280)	0.1241593074735902
  (0, 7063)	0.23223125285120996
  (0, 7002)	0.1597970682154747
  (0, 6844)	0.10762914696469325
  (0, 6411)	0.07219878793079816
  (0, 6293)	0.09764801735691563
  (0, 6112)	0.14133288513819856
  (0, 5637)	0.11445055802141185
  (0, 5589)	0.0733682028857757
  (0, 5583)	0.0823988133577062
  :	:
  (0, 3332)	0.11604766185737089
  (0, 3177)	0.046873954377396804
  (0, 2781)	0.0851529922650715
  (0, 2760)	0.11863465982156403
  (0, 2688)	0.09235350074764578
  (0, 2535)	0.15131401474597617
  (0, 24

In [30]:
svm_clf.fit(train_x_vector,train_y)

SVC(kernel='linear')

In [31]:
svm_clf.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [32]:
from sklearn.tree import DecisionTreeClassifier

# Pin random_state: without it the fitted tree (and therefore its score)
# can differ from one run to the next.
clf_dt = DecisionTreeClassifier(random_state=42)
clf_dt.fit(train_x_vector, train_y)

# Sanity check on the first test review.
clf_dt.predict(test_x_vector[0])


array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with scikit-learn's default settings
# (fit returns the estimator itself, so construction and fitting are chained).
log = LogisticRegression().fit(train_x_vector, train_y)

# Sanity check on the first test review.
log.predict(test_x_vector[0])


array(['POSITIVE'], dtype='<U8')

#### Evaluation

In [34]:

# Score each fitted classifier on the test set, printed in the order:
# SVM, decision tree, logistic regression.
for fitted_model in (svm_clf, clf_dt, log):
    print(fitted_model.score(test_x_vector, test_y))

0.8076923076923077
0.6610576923076923
0.8052884615384616


In [35]:
# F1 Scores
from sklearn.metrics import f1_score

f1_score(test_y, svm_clf.predict(test_x_vector), average = None, labels = ["POSITIVE","NEGATIVE"])

array([0.80582524, 0.80952381])

In [36]:
f1_score(test_y, clf_dt.predict(test_x_vector), average = None, labels = ["POSITIVE","NEGATIVE"])


array([0.65693431, 0.66508314])

In [37]:
f1_score(test_y, log.predict(test_x_vector), average = None, labels = ["POSITIVE","NEGATIVE"])


array([0.80291971, 0.80760095])

In [38]:
train_y.count("POSITIVE")

436

In [39]:
train_y.count("NEGATIVE")

436

In [40]:
test_y.count("POSITIVE")

208

In [41]:
test_y.count("NEGATIVE")

208

### Tuning our model (with Grid Search)

In [43]:
from sklearn.model_selection import GridSearchCV

# Candidate SVC settings: 2 kernels x 5 values of C = 10 combinations.
parameters = dict(kernel=('linear', 'rbf'), C=(1, 4, 8, 16, 32))

svc = svm.SVC()
# NOTE: despite its name, `random_search` is an exhaustive GridSearchCV,
# not a randomized search. The name is kept because later cells use it.
random_search = GridSearchCV(estimator=svc, param_grid=parameters, cv=5, verbose=3)
random_search.fit(train_x_vector, train_y)



Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] C=1, kernel=linear ..............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................. C=1, kernel=linear, score=0.874, total=   0.2s
[CV] C=1, kernel=linear ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] .................. C=1, kernel=linear, score=0.811, total=   0.2s
[CV] C=1, kernel=linear ..............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] .................. C=1, kernel=linear, score=0.776, total=   0.2s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.799, total=   0.2s
[CV] C=1, kernel=linear ..............................................
[CV] .................. C=1, kernel=linear, score=0.879, total=   0.2s
[CV] C=1, kernel=rbf .................................................
[CV] ..................... C=1, kernel=rbf, score=0.857, total=   0.2s
[CV] C=1, kernel=rbf .................................................
[CV] ..................... C=1, kernel=rbf, score=0.829, total=   0.3s
[CV] C=1, kernel=rbf .................................................
[CV] ..................... C=1, kernel=rbf, score=0.776, total=   0.2s
[CV] C=1, kernel=rbf .................................................
[CV] ..................... C=1, kernel=rbf, score=0.793, total=   0.2s
[CV] C=1, kernel=rbf .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   11.9s finished


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')},
             verbose=3)

In [44]:
print(random_search.score(test_x_vector, test_y))

0.8197115384615384


## Saving Model

In [45]:
import pickle

# Persist the fitted grid-search object.
with open('./sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(random_search, f)

# The classifier was trained on TF-IDF features, so it cannot score raw text
# by itself. Save the fitted vectorizer alongside it so a fresh session can
# turn new reviews into the same feature space.
with open('./sentiment_vectorizer.pkl', 'wb') as f:
    pickle.dump(vector, f)

#### Load & test model

In [46]:
# Reload the pickled object from disk into `model`.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that you created yourself or otherwise trust.
with open('./sentiment_classifier.pkl', 'rb') as f:
    model = pickle.load(f)

In [47]:
test_x[0]

"I really have enjoyed this series and can't wait for the next one to come out. I especially liked that the author gave Kid and Jason their own stories, even though they occurred at the same time. She could have easily put them both into one book and jumped between locations. I am so glad she didn't.The last two installments have not had as much sensuality as the previous 3 books, but the story line is there, so you don't necessarily miss it. I do like how she has not just dropped the other characters in sacrifice to the current beau.This is a great series."

In [49]:

model.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')