Load Data

In [3]:
# import json to load jsonfile.
import json

file_name = 'C:/Users/ASUS/Anaconda2/dataset/Books_small_10000.json'

## using json to load line information

with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        print(line) # print all info
    
        print(review['reviewText']) # print only review text
        print(review['overall']) # print only ratings
        
        break  

{"reviewerID": "A1F2H80A1ZNN1N", "asin": "B00GDM3NQC", "reviewerName": "Connie Correll", "helpful": [0, 0], "reviewText": "I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ago and just finished book 5.  Sloane Monroe is a great character and being able to follow her through both private life and her PI life gets a reader very involved!  Although clues may be right in front of the reader, there are twists and turns that keep one guessing until the last page!  These are books you won't be disappointed with.", "overall": 5.0, "summary": "Can't stop reading!", "unixReviewTime": 1390435200, "reviewTime": "01 23, 2014"}

I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ago and just finished book 5.  Sloane Monroe is a great character and being able to follow her through both private life and her PI life gets a reader very involved!  Although clues may be right in front of the reader, there are twists and turns that k

In [4]:
#using reviews as an empty list to append both reviewText and overall rating

reviews = []

with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append((review['reviewText'],review['overall']))
        
reviews[100][0]

u'Good follow up to the original.  A little faster paced than book one, which I found refreshing.  The story fills in a lot of historic details in the origins of the Garden.  While it seemed a bit rushed toward the end, it still qualifies as a good read.  I would recommended it.'

Class Data

In [5]:
#clean it up using a data class functions

import random

class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'


class Review:    
    def __init__(self,text,rating):
        self.t = text
        self.r = rating
        
#sentiment function using if to get negative,nuetral or positive ratings
            
        self.s = self.get_sentiment()        

    def get_sentiment(self):
        if self.r <=2:
            return Sentiment.NEGATIVE
        elif self.r ==3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
        
#class reviewcontainer for even distribution.
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.t for x in self.reviews]
    
    def get_sentiment(self):
        return [x.s for x in self.reviews]
        
        
        
        
        
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.s == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.s == Sentiment.POSITIVE, self.reviews))
        #neutral = list(filter(lambda x: x.s ==Sentiment.NEUTRAL, self.reviews))
        
#shrink the no.of positives to evenly match negatives & assign to self.reviews
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

In [6]:
# instead u can use the class data function in append.()

reviews = []

with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))
        
reviews[1000].s

'POSITIVE'

Prep Data

In [7]:
#import sklearn model and use split data to test nd train

from sklearn.model_selection import train_test_split as t_t_s


training, test = t_t_s(reviews,test_size=0.33,random_state=42)

#splits dataset into 0.33 for training and 0.67 for test. 



train_container = ReviewContainer(training)

test_container = ReviewContainer(test)

In [8]:
print(training[1000].s)

POSITIVE


In [9]:
#& split into x and y variables to predict using text & sentiment
train_container.evenly_distribute() #evenly distribute neg&pos making both 436.

train_x = train_container.get_text() #for training
train_y = train_container.get_sentiment()

test_container.evenly_distribute() #evenly distribute neg&pos making both 208

test_x = test_container.get_text()     # for testing
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.NEGATIVE))
print(train_y.count(Sentiment.POSITIVE))

436
436


Bag of Words Vectorization

In [11]:
# import the countVectorizer
# to change text to numeric variables.

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [12]:
#vectorize training & test dataset to numerical vectors
vectorizer = TfidfVectorizer()

train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

# alternative
#vectorizer.fit(train_x)
#train_x_vectorizer = vectorizer.transform(train_x)


print(train_x[100])
print(train_x_vectors[100].toarray())

train_x_vectors
train_y

Fun book to read.  Lots of humor to keep the reader interested.  Since I believe in spirits but, can't see or hear them, I'm kinda jealous and amazed at the gift that some people have.  Keep writing about Angela. I'm waiting for book three.  Is it out yet?
[[0. 0. 0. ... 0. 0. 0.]]


['NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',

Classification

In [13]:
# Linear SVM- load svm

from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors,train_y)

test_x[100]

clf_svm.predict(test_x_vectors[100])

array(['NEGATIVE'], dtype='|S8')

Decision Tree classifier

In [14]:

from sklearn.tree import DecisionTreeClassifier


clf_dec = DecisionTreeClassifier()

clf_dec.fit(train_x_vectors,train_y)

clf_dec.predict(test_x_vectors[100])

array(['POSITIVE'], dtype='|S8')

Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()

clf_log.fit(train_x_vectors,train_y)

clf_log.predict(test_x_vectors[100])



array(['NEGATIVE'], dtype='|S8')

Naive Bayes 

In [16]:
from sklearn.naive_bayes import BernoulliNB


clf_gnb = BernoulliNB()
 
clf_gnb.fit(train_x_vectors,train_y)

clf_gnb.predict(test_x_vectors[100])

array(['NEGATIVE'], dtype='|S8')

Evalution

In [17]:
# mean accuracy on all test labels
print(clf_svm.score(test_x_vectors,test_y))

print(clf_dec.score(test_x_vectors,test_y))

print(clf_log.score(test_x_vectors,test_y))

print(clf_gnb.score(test_x_vectors,test_y))

0.8076923076923077
0.6754807692307693
0.8028846153846154
0.8269230769230769


In [18]:
#f_1 score to check the wieghted average

from sklearn.metrics import f1_score

In [19]:
#showing the weighted average in relation to various sentiments
print(f1_score(test_y,clf_svm.predict(test_x_vectors),average=None,labels=(Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE)))

print(f1_score(test_y,clf_dec.predict(test_x_vectors),average=None,labels=(Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE)))

print(f1_score(test_y,clf_log.predict(test_x_vectors),average=None,labels=(Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE)))

print(f1_score(test_y,clf_gnb.predict(test_x_vectors),average=None,labels=(Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE)))

[0.80582524 0.         0.80952381]
[0.66992665 0.         0.68085106]
[0.80097087 0.         0.8047619 ]
[0.83561644 0.         0.81725888]


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [20]:
print(test_y.count(Sentiment.POSITIVE))
print(test_y.count(Sentiment.NEGATIVE))

print(train_y.count(Sentiment.NEUTRAL))

print(train_y.count(Sentiment.NEGATIVE))
# check the number of positives,neutrals, negatives in the train_y label

208
208
0
436


 so we'll go bck to ceate a class review container to evenly distribute positive and negative labels on both train and test data
    

In [21]:
# Qualitative analis
test_set = ['not great','bad book not dont buy','waste of time']

new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['NEGATIVE', 'NEGATIVE', 'NEGATIVE'], dtype='|S8')

use GridSearchCV to automatically find the best parameter

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
parameters = {'kernel':('linear','rbf'),'C':(1,4,8,16,32)}

svc =  svm.SVC()

clf = GridSearchCV(svc,parameters,cv=5)
clf.fit(train_x_vectors,train_y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'kernel': ('linear', 'rbf'), 'C': (1, 4, 8, 16, 32)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [24]:
print(clf.score(test_x_vectors,test_y))

0.8052884615384616


use Pickle to Save the Model

In [25]:
import pickle

In [26]:
with open('C:/Users/ASUS/Anaconda2/dataset/models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf,f)    

Load Model

In [27]:
with open('C:/Users/ASUS/Anaconda2/dataset/models/sentiment_classifier.pkl','rb') as f:
    loaded_clf = pickle.load(f)

In [28]:
print(test_x[100])
loaded_clf.predict(test_x_vectors[100])

I don't know if its me or how this is written but I did not like the way it comes across. I felt like I was being talked down to. The story line is good but the narration leaves so much to be desired. The writing doesn't make the character innocent, makes her snobby.


array(['NEGATIVE'], dtype='|S8')