In [5]:
import numpy as np
import pandas as pd 

### Data Class


In [6]:
import random
class Sentiment:
    NEGATIVE ="NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE ="POSITIVE"
class Review:
    def __init__(self,text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <=2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
             
            
class ReviewContainer :
    def __init__(self,reviews):
        self.reviews = reviews
        
    def get_text(self):
        return[x.text for x in self.reviews]
    
    def get_sentiment(self):
        return[x.sentiment for x in self.reviews]    
    
    def evenly_distrbute(self):
        negative = list(filter(lambda x:x.sentiment == Sentiment.NEGATIVE,self.reviews))
        positive = list(filter(lambda x:x.sentiment == Sentiment.POSITIVE,self.reviews))
        # to make the positive and negative are equal
        positive_shrunk =positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
    
        
        

### Load Data

In [7]:
import json 

filename ='books_small_10000.json'
reviews = []
with open(filename) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))
        

### Prep Data

In [8]:
from sklearn.model_selection import train_test_split

training ,test=train_test_split(reviews ,test_size = 0.33,random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)


In [9]:
# We Prepare x and y (inputs and outputs for the model)
train_container.evenly_distrbute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distrbute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

#### Bag of words vectorization

In [10]:
from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer

vectorizer  = TfidfVectorizer()
train_x_vector=vectorizer.fit_transform(train_x)

test_x_vector=vectorizer.transform(test_x)

### Classification

#### Linear SVM

In [11]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vector,train_y)

clf_svm.predict(test_x_vector[0])

array(['NEGATIVE'], dtype='<U8')

#### Decision tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vector ,train_y)

clf_dec.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vector ,train_y)

clf_log.predict(test_x_vector[0])

array(['NEGATIVE'], dtype='<U8')

### Evaluation

In [14]:
#Mean Accuracy
#check this prediction compares to test_y
print(clf_svm.score(test_x_vector,test_y))
print(clf_dec.score(test_x_vector,test_y))
print(clf_log.score(test_x_vector,test_y))


0.8076923076923077
0.6442307692307693
0.8052884615384616


In [15]:
# F1 score
from sklearn.metrics import f1_score

f1_score(test_y,clf_svm.predict(test_x_vector),average=None,labels=[Sentiment.NEGATIVE,Sentiment.POSITIVE])
f1_score(test_y,clf_dec.predict(test_x_vector),average=None,labels=[Sentiment.NEGATIVE,Sentiment.POSITIVE])
f1_score(test_y,clf_log.predict(test_x_vector),average=None,labels=[Sentiment.NEGATIVE,Sentiment.POSITIVE])

array([0.80760095, 0.80291971])

In [16]:
test_y.count(Sentiment.POSITIVE)
# when we investigate in data we see that most of data is positive

# We will try to Balanace the data  by adding more samples

208

In [17]:
test_set = ['very fun', "bad book do not buy", 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

In [18]:
test_set = ['very fun', "bad book do not buy", 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_log.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

## Turing our model (with Grid Search)

In [19]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vector,train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [21]:
print(clf.score(test_x_vector, test_y))

0.8197115384615384


## Saving Model

In [23]:
import pickle

with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

## Load Model 

In [25]:
with open('./models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [27]:
print(test_x[0])

loaded_clf.predict(test_x_vector[0])

I couldn't stay interested in the places and theEvents of a less than seller rock and roll band.Boring


array(['NEGATIVE'], dtype='<U8')