We want to create a model that can understand the good and bad comments from our database. Use any models and feature engineerings necessary to achieve the best result.

# Opening the file

### Creating classes to fit our data into

In [1]:
import random
class Sentiment: #doing clean coding(in case we mistype). a simple class.
    NEGATIVE='NEGATIVE'
    NEUTRAL='NEUTRAL'
    POSITIVE='POSITIVE'


#Creating a class, instead of doing a tuple or a list for reviews because its neater:
class Review:
    def __init__(self,text,score):
        self.text=text
        self.score=score
        self.sentiment = self.get_sentiment() #wasn't in the parantheses because It doesnt take it in.
    
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE #could've done return 'negative' too
        elif self.score ==3:
            return Sentiment.NEUTRAL
        else: #score of 4 or 5
            return Sentiment.POSITIVE

class ReviewContainer: #We need to distribute positive and negative test samples evenly.
    def __init__(self,reviews):
        self.reviews=reviews
    
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self): #We wanna have the same number of positive and negative in container.
        negative=list(filter(lambda x: x.sentiment ==Sentiment.NEGATIVE, self.reviews))
        positive=list(filter(lambda x: x.sentiment ==Sentiment.POSITIVE, self.reviews))
        positive_shrunk=positive[:len(negative)] #Shrinking it to negative size
        self.reviews=negative+positive_shrunk
        random.shuffle(self.reviews)

### Loading the json file into our classes

In [2]:
import json

file_name = 'Books_small_10000.json' #we need reviewText and overall from the file

reviews=[]
with open(file_name) as f:#Opens a file
    for line in f: #goes to every line(if its a bunch of dictionary, goes to every dictionary) [which is a text]
        review= json.loads(line) #telling it its a json file and loading it
        reviews.append(Review(review['reviewText'],review['overall'])) #passing the review as a Review(text,score)
reviews[5].score #If it was a tuple, we would have to do reviews[5][1]
reviews[5].sentiment #thanks to our classes.

'POSITIVE'

# Data Preparation

In [3]:
from sklearn.model_selection import train_test_split

training,test=train_test_split(reviews, test_size=0.33, random_state=42) #shift+tab in parantheses to get the documentation

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)

In [4]:
train_container.evenly_distribute()
train_x = train_container.get_text() #[x.text for x in training]
train_y = train_container.get_sentiment() #[x.sentiment for x in training]

test_container.evenly_distribute()
test_x = test_container.get_text() #[x.text for x in test]
test_y = test_container.get_sentiment() #[x.sentiment for x in test]

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

# train_x=[x.text for x in training]
# train_y=[x.sentiment for x in training]

# test_x=[x.text for x in test]
# test_y=[x.sentiment for x in test]


436
436


### Bag of words vectorization

In [5]:
#setting up a list like [0,0,1,0,0] for every word in sentence.

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
#Tfidf is a better method, it excludes commonly used words.
vectorizer = TfidfVectorizer()
train_x_vectors=vectorizer.fit_transform(train_x)#fits and then transforms. could be done seperately.
#Now its not a string, but a vector(matrix)

test_x_vectors=vectorizer.transform(test_x) #This is TEST data so we don't have to fit, just transform.

train_x_vectors[0].toarray() #shows our vectors as array

array([[0., 0., 0., ..., 0., 0., 0.]])

# Classification

### Linear SVM

In [6]:
from sklearn import svm

clf_svm=svm.SVC(kernel='linear')#classifier SVM()

clf_svm.fit(train_x_vectors,train_y) #Training

clf_svm.predict(test_x_vectors[0]) #Predicting

array(['NEGATIVE'], dtype='<U8')

### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors,train_y)

clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

#### Random Forrest

In [8]:
from sklearn.ensemble import RandomForestClassifier

clf_RFC=RandomForestClassifier()
clf_RFC.fit(train_x_vectors,train_y)

clf_RFC.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Naive Bayes

#### GaussianNB

In [9]:
from sklearn.naive_bayes import GaussianNB
#Note: This one requires arrays, so our data needs to be in arrays.

clf_gnb=GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(),train_y)

clf_gnb.predict(test_x_vectors[0].toarray())


array(['NEGATIVE'], dtype='<U8')

#### MultinomialNB

In [10]:
from sklearn.naive_bayes import MultinomialNB

clf_MNB=MultinomialNB()
clf_MNB.fit(train_x_vectors,train_y)

clf_MNB.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

clf_log=LogisticRegression()
clf_log.fit(train_x_vectors,train_y)

clf_log.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

# Evaluation

Mean accuracy:

In [12]:
#.score command compares predicted values with actual test values
print(clf_svm.score(test_x_vectors,test_y)) #81
print(clf_dec.score(test_x_vectors,test_y)) #77
print(clf_RFC.score(test_x_vectors,test_y)) #83
print(clf_gnb.score(test_x_vectors.toarray(),test_y)) #65
print(clf_MNB.score(test_x_vectors,test_y)) #83
print(clf_log.score(test_x_vectors,test_y)) #84

0.8076923076923077
0.6730769230769231
0.78125
0.6610576923076923
0.8125
0.8052884615384616


F1 Score

In [13]:
from sklearn.metrics import f1_score

print(f1_score(test_y,clf_svm.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
#It's good for POSITIVE,but trash for NEGATIVE and NEUTRAL
print(f1_score(test_y,clf_dec.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print(f1_score(test_y,clf_RFC.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print(f1_score(test_y,clf_gnb.predict(test_x_vectors.toarray()),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print(f1_score(test_y,clf_MNB.predict(test_x_vectors.toarray()),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print(f1_score(test_y,clf_log.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))


#They are all terrible at our negative predicting

[0.80582524 0.         0.80952381]
[0.66502463 0.         0.68075117]
[0.77858881 0.         0.78384798]
[0.65693431 0.         0.66508314]
[0.79144385 0.         0.82969432]
[0.80291971 0.         0.80760095]


  _warn_prf(
  _warn_prf(
  _warn_prf(


To investigate, we can check our data:

In [14]:
test_y.count(Sentiment.POSITIVE) #Counting our negative reviews. we should make our test data balanced.

208

# Tuning our model(with Grid search)

In [15]:
from sklearn.model_selection import GridSearchCV

parameters={'kernel':('linear','rbf'), 'C':(1,4,8)}
#GridSearchCV will check every one of these for our model
svc=svm.SVC()
clf=GridSearchCV(svc,parameters,cv=5)
clf.fit(train_x_vectors,train_y)


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8), 'kernel': ('linear', 'rbf')})

# Saving the models

In [16]:
import pickle
with open ('sentiment_classifier.pkl','wb') as f:#wb=write buffer
    pickle.dump(clf,f) #clf_WE gets dumped into that file
with open ('vectorizer.pkl','wb') as f:
    pickle.dump(vectorizer,f)

# Load model

In [17]:
with open ('sentiment_classifier.pkl','rb') as f: #read buffer
    loaded_clf=pickle.load(f)
with open ('vectorizer.pkl','rb') as f: #read buffer
    loaded_vectorizer=pickle.load(f)

In [18]:
loaded_clf.predict(test_x_vectors[0]) #So It's all loaded

array(['NEGATIVE'], dtype='<U8')