In [1]:
import pandas as pd
import json
import random

## Class

In [2]:
class Sentiment:
    NEGATIVE='NEGATIVE'
    POSITIVE='POSITIVE'
    NEUTRAL='NEUTRAL'
    

class Review:
    def __init__(self,text,score):
        self.text=text
        self.score=score
        self.sentiment=self.get_sentiment()
        
    def get_sentiment(self):
        if self.score<=2:
            return Sentiment.NEGATIVE
        elif self.score==3:
            return Sentiment.NEUTRAL
        else:   # score of 4 & 5
            return Sentiment.POSITIVE

        
class ReviewContainer:
        def __init__(self, reviews):
            self.reviews=reviews
            
        def get_x(self):
            '''geting text from reviews'''
            return [x.text for x in self.reviews]
        
        def get_y(self):
            '''getting sentiment for reviews'''
            return [x.sentiment for x in self.reviews]
        
        def evenly_distribute(self):
            'evenly distributing positive and negative reviews'
            negative = list(filter(lambda x: x.sentiment==Sentiment.NEGATIVE, self.reviews))
            positive = list(filter(lambda x: x.sentiment==Sentiment.POSITIVE, self.reviews))
            positive_shrunk = positive[:len(negative)] 
            self.reviews = negative+positive_shrunk
            random.shuffle(self.reviews)
        

### Load data

In [3]:
#df=pd.read_json(file_path,lines=True)   # using pandas much easier

In [4]:
# reading json file and get the data we require i.e. review text and overall score

file_path='Keith_Galli_Tutorial/data/sentiment/Books_small_10000.json'
reviews=[]
with open (file_path) as f:
    for line in f:
        review= json.loads(line)                                  # convering string into dictionary
        reviews.append(Review(review["reviewText"],review["overall"]))  
        

In [5]:
reviews[5].get_sentiment()

'POSITIVE'

## Preparing data

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
training,test=train_test_split(reviews,random_state=0,test_size=0.3)

In [8]:
train_container = ReviewContainer(training)
train_container.evenly_distribute()

train_x= train_container.get_x()
train_y= train_container.get_y()

test_container = ReviewContainer(test)
test_container.evenly_distribute()

test_x = test_container.get_x()
test_y = test_container.get_y()


In [9]:
test_y.count('POSITIVE')

198

### bag of words vectorisation

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer() # Tfidf gives less preferance to repeatative words

In [12]:
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x) # not fitting model again as it will create another model


In [13]:
train_x_vectors[0]

<1x9116 sparse matrix of type '<class 'numpy.float64'>'
	with 203 stored elements in Compressed Sparse Row format>

## Classification

### linear SVM

In [14]:
from sklearn import svm

In [15]:
cls_svm = svm.SVC(kernel='linear',gamma=1)

cls_svm.fit(train_x_vectors, train_y)

cls_svm.predict(test_x_vectors[0]) # checking the data

array(['POSITIVE'], dtype='<U8')

In [16]:
test_y[0]

'POSITIVE'

### decision tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

In [18]:
cls_dec = DecisionTreeClassifier()

cls_dec.fit(train_x_vectors,train_y)

cls_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Naive Bayes

In [19]:
from sklearn.naive_bayes import GaussianNB

In [20]:
#cls_gnb = GaussianNB()
#cls_gnb.fit(train_x_vectors, train_y)
#cls_gnb.predict(test_x_vectors[0])

### Logistic regression

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
cls_lr = LogisticRegression()
cls_lr.fit(train_x_vectors, train_y)
cls_lr.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Evalution

In [23]:
# mean accuracy
print(cls_svm.score(test_x_vectors, test_y))
print(cls_dec.score(test_x_vectors, test_y))
print(cls_lr.score(test_x_vectors, test_y))

0.8484848484848485
0.6186868686868687
0.8459595959595959


In [24]:
# f1 score
from sklearn.metrics import f1_score
labels = [Sentiment.POSITIVE,Sentiment.NEGATIVE]
f1_score(test_y, cls_svm.predict(test_x_vectors), average=None, labels = labels)

array([0.84848485, 0.84848485])

In [25]:
# qualitative checking
test_set = ['I enjoyed it, thanks','aweful book','not good']
trans_test_set = vectorizer.transform(test_set)

cls_svm.predict(trans_test_set)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

## Tuning model (GRID Search)

In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
parameters={'gamma':(0.1,0.5,1,4,8,16,20), 'kernel':('rbf', 'linear')}

svc=svm.SVC()

cls_grid = GridSearchCV(svc, parameters, cv=4)

cls_grid.fit(train_x_vectors, train_y)


GridSearchCV(cv=4, estimator=SVC(),
             param_grid={'gamma': (0.1, 0.5, 1, 4, 8, 16, 20),
                         'kernel': ('rbf', 'linear')})

In [28]:
cls_grid.best_estimator_
print(cls_grid.score(test_x_vectors, test_y))


0.8585858585858586


### Saving Model

In [29]:
import pickle

In [30]:
with open ('C:/Users/PASHYA/Desktop/Pandas_Demo/Scikit Learn/models/sentiment_classifier.pkl', 'wb') as model:
    pickle.dump(cls_grid, model)

### Loading model`

In [31]:
with open ('C:/Users/PASHYA/Desktop/Pandas_Demo/Scikit Learn/models/sentiment_classifier.pkl', 'rb') as model:
    loaded_cls_grid=pickle.load(model)

In [32]:
print(test_x[0])
print(loaded_cls_grid.predict(test_x_vectors[0]))

One word: AMAZING! I couldn't put it down. I'm scared for Book two and excited all at once. Tori was my favorite. Definitely a must read
['POSITIVE']
