In [98]:
import random
class Review:
    def __init__(self,text,score):
        self.text  = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return 'NEGATIVE'
        if self.score == 3:
            return 'NEUTRAL'
        else :#for scores 4 and 5
            return 'POSITIVE'
#To evenly distribute the classes of POSITIVE and NEGATIVE      
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
    def get_text(self):
        return [x.text for x in self.reviews]
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    def evenly_distribute(self): #to evenly distribute the data between positive and negative
        positive = list(filter(lambda x:x.sentiment=='POSITIVE',self.reviews))
        negative = list(filter(lambda x:x.sentiment=='NEGATIVE',self.reviews))
#         print(len(positive))
#         print(len(negative))
        new_positive = positive[:len(negative)]
        self.reviews = new_positive+negative
        random.shuffle(self.reviews)

In [1]:
import json

In [68]:
#convert the string data format to json format
file_name = 'Book Review_10000.json'
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))
print(reviews[5].score,reviews[5].sentiment)



5.0 POSITIVE


In [69]:
#Splitting the data to train and test data set
from sklearn.model_selection import train_test_split
train,test= train_test_split(reviews,test_size = 0.1,random_state = 42)

In [113]:
train_cont = ReviewContainer(train)
train_cont.evenly_distribute()
X_train = train_cont.get_text()
Y_train = train_cont.get_sentiment()

test_cont = ReviewContainer(test)
test_cont.evenly_distribute()
X_test = test_cont.get_text()
Y_test = test_cont.get_sentiment()

print(len(X_train))
print(X_train[0])
print(Y_train.count('POSITIVE'))
print(Y_train.count('NEGATIVE'))

1176
loved this book...and I hope she writes about these character again.....and that the publisher PRINTS IT IN BOOK FORM AT THE SAME TIME THEY RELEASE THE E BOOK (hint hint) because there ARE those of us out here for whom technology is NOT a friend...this book will definitely be a keeper for your collection.......
588
588


In [105]:
len(X_train)

1176

In [106]:
print(X_train[0],Y_train[0])

I received this ARC in exchange for an honest review. I felt like the story was undeveloped and very slow to point and then rushed into a climax only to abruptly end with a cliffhanger. Vincent was a hot, sexy guy; Kristen was an angsty female. Definitely has the bones to be a great story just not there yet. NEGATIVE


In [124]:
## Transform the string to vector form
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
vectorizer = TfidfVectorizer()
train_X = vectorizer.fit_transform(X_train)
test_X = vectorizer.transform(X_test)

In [125]:
print(train_X[0].toarray())

[[0. 0. 0. ... 0. 0. 0.]]


### Classification

In [126]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

#fit to a linear svm
clf_svm = SVC(kernel = 'linear')
clf_svm.fit(train_X,Y_train)

#fit to a Decision Tree Classifier
clf_dt = DecisionTreeClassifier(random_state = 42)
clf_dt.fit(train_X,Y_train)

#fit to Gaussian NB
clf_GNB = GaussianNB()
clf_GNB.fit(train_X.toarray(),Y_train)

# fit to a logistic regression
clf_lr = LogisticRegression(solver = 'liblinear') #default solver was changed to lbfgs
clf_lr.fit(train_X,Y_train)

LogisticRegression(solver='liblinear')

In [127]:
pred_svm = clf_svm.predict(test_X[0])
print(Y_test[0],pred)

pred_dt = clf_dt.predict(test_X[0])
print(Y_test[0],pred_dt)

pred_GNB = clf_GNB.predict(test_X[0].toarray())
print(Y_test[0],pred)

pred_lr = clf_lr.predict(test_X[0])
print(Y_test[0],pred)

POSITIVE ['POSITIVE']
POSITIVE ['NEGATIVE']
POSITIVE ['POSITIVE']
POSITIVE ['POSITIVE']


### Evaluation

In [128]:
#Mean Accuracy
print(clf_svm.score(test_X,Y_test))
print(clf_dt.score(test_X,Y_test))
print(clf_GNB.score(test_X.toarray(),Y_test))
print(clf_lr.score(test_X,Y_test))

0.8214285714285714
0.6785714285714286
0.5982142857142857
0.8214285714285714


In [129]:
#Calculate the F1-score
from sklearn.metrics import f1_score as fs
pred_svm = clf_svm.predict(test_X)
pred_dt = clf_dt.predict(test_X)
pred_GNB = clf_GNB.predict(test_X.toarray())
pred_lr = clf_lr.predict(test_X)
score_svm = fs(Y_test,pred_svm,average = None,labels = ['POSITIVE','NEGATIVE'])
score_dt = fs(Y_test,pred_dt,average = None)
score_GNB = fs(Y_test,pred_GNB,average = None)
score_lr = fs(Y_test,pred_lr,average = None)
print(score_svm)
print(score_dt)
print(score_GNB)
print(score_lr)

[0.82142857 0.82142857]
[0.67272727 0.68421053]
[0.60176991 0.59459459]
[0.8245614  0.81818182]


In [130]:
## Test the classifier with our own data
test_data = ['great book','don\'t but it','worst book ever purchased']
new_test = vectorizer.transform(test_data)
clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

### Finding the best parameters using GridSearchCV

In [136]:
from sklearn.model_selection import GridSearchCV
parameters_svm = {'kernel':('linear','rbf'),'C':[1,2,3,4,5]}
parameters_lr = {'penalty':('l1', 'l2', 'elasticnet', 'none'),'solver':('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga')}
clf_svm = GridSearchCV(SVC(),parameters_svm)
clf_lr = GridSearchCV(LogisticRegression(),parameters_lr)
clf_svm.fit(train_X,Y_train)
clf_lr.fit(train_X,Y_train)


Traceback (most recent call last):
  File "C:\Users\Dibya\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Dibya\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Dibya\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\Dibya\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Dibya\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, s



GridSearchCV(estimator=LogisticRegression(),
             param_grid={'penalty': ('l1', 'l2', 'elasticnet', 'none'),
                         'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga')})

In [137]:
print(clf_svm.best_estimator_,clf_svm.best_score_)
print(clf_lr.best_estimator_,clf_lr.best_score_)

SVC(C=2) 0.8290551749008295
LogisticRegression(solver='newton-cg') 0.8231049404976559


### Saving the model

In [140]:
import pickle
with open('./models/svm_classifier.pkl','wb') as f:
    pickle.dump(clf_svm,f)

### Load Model

In [141]:
with open('./models/svm_classifier.pkl','rb') as f: 
    loaded_model = pickle.load(f)

In [142]:
print(test_data)
loaded_model.predict(new_test)

['great book', "don't but it", 'worst book ever purchased']


array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')