We want to build a model that can tell good (positive) comments from bad (negative) ones in our database. Use whatever models and feature engineering are necessary to achieve the best result.

# Loading

In [403]:
import pandas as pd
import numpy as np
import json
import random

Creating a class for our data:

In [404]:
class Reviews:
    """A single review: its text, its numeric score, and a derived sentiment label."""

    def __init__(self, text, score):
        """Store the review text and score, and compute the sentiment label once."""
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score to a label.

        Returns:
            'POSITIVE' for a score of 4 or more, 'NEGATIVE' for a score of
            2 or less, and 'NEUTRAL' for anything in between.
        """
        rating = self.score
        if rating >= 4:
            return 'POSITIVE'
        return 'NEGATIVE' if rating <= 2 else 'NEUTRAL'
        

Loading json file:

In [405]:
# The file is read as JSON Lines: every non-empty line is parsed as one review object.
file_name = 'Books_small_10000.json'

reviews = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (which is not UTF-8 on every system).
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        review = json.loads(line)
        # Keep only the text and the rating, wrapped in a Reviews object.
        reviews.append(Reviews(review['reviewText'], review['overall']))

# Sanity check: show the derived label of the second review.
print(reviews[1].sentiment)


NEUTRAL


# Prep

We need to even out the number of POSITIVE and NEGATIVE reviews in both the training and test sets.

In [445]:
from sklearn.model_selection import train_test_split

SEED = 42  # one seed for the split and the down-sampling, so reruns give identical sets


def balance_by_sentiment(items, rng):
    """Return a shuffled list with equally many POSITIVE and NEGATIVE reviews.

    NEUTRAL reviews are dropped. Whichever class is larger is randomly
    down-sampled to the size of the smaller one, so the result is balanced
    regardless of which class dominates the input.

    Args:
        items: iterable of objects exposing a ``sentiment`` attribute.
        rng: a ``random.Random`` instance; passing a seeded one makes the
            selection and ordering reproducible.

    Returns:
        A new list; ``items`` itself is not modified.
    """
    positives = [item for item in items if item.sentiment == 'POSITIVE']
    negatives = [item for item in items if item.sentiment == 'NEGATIVE']
    per_class = min(len(positives), len(negatives))

    # Shuffle before slicing so the kept subset is a random sample.
    rng.shuffle(positives)
    rng.shuffle(negatives)

    balanced = positives[:per_class] + negatives[:per_class]
    rng.shuffle(balanced)  # interleave the two classes
    return balanced


train, test = train_test_split(reviews, test_size=0.33, random_state=SEED)

# A dedicated generator keeps the sampling reproducible without touching
# the global `random` state.
balance_rng = random.Random(SEED)

# Training data
train1 = balance_by_sentiment(train, balance_rng)
train_x = [x.text for x in train1]
train_y = [x.sentiment for x in train1]

# Test data
test1 = balance_by_sentiment(test, balance_rng)
test_x = [x.text for x in test1]
test_y = [x.sentiment for x in test1]

# Guard the invariant the rest of the notebook relies on.
assert train_y.count('POSITIVE') == train_y.count('NEGATIVE')
assert test_y.count('POSITIVE') == test_y.count('NEGATIVE')


# Bag of words

In [446]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# TF-IDF weighted bag of words. CountVectorizer (imported above) is the
# raw-count alternative and can be substituted here.
vectorizer = TfidfVectorizer()

# Fit on the training texts only; the test texts are transformed with the
# already-fitted vectorizer so both share the same feature columns.
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# Display the training matrix as a dense array (cell output).
train_x_vectors.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Model

In [450]:
from sklearn.ensemble import GradientBoostingClassifier

# NOTE(review): these hyper-parameters equal the best ones printed by the
# grid-search cell, but that cell tunes an XGBClassifier, not this estimator.
clf2 = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=2,
    random_state=0,  # fixed seed so the fitted model is reproducible
)
clf2.fit(train_x_vectors, train_y)

# The former bare `clf2.predict(test_x_vectors)` call was removed: its result
# was discarded, so it only cost time. The score below is the cell output.
clf2.score(test_x_vectors, test_y)

0.7764423076923077

In [451]:
from sklearn.metrics import f1_score
# average=None yields one F1 value per class rather than a single averaged score.
# NOTE(review): presumably ordered by sorted label (NEGATIVE, POSITIVE) — confirm.
test_predictions = clf2.predict(test_x_vectors)
f1_score(test_y, test_predictions, average=None)

array([0.77590361, 0.77697842])

# Optimizing using GridSearchCV

In [449]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# `xgb` was used below without ever being imported, so this cell raised
# NameError on a fresh kernel.
import xgboost as xgb

# Recent XGBoost releases reject string class labels, so encode the
# 'NEGATIVE' / 'POSITIVE' strings as integers before fitting. The log-loss
# used for scoring does not depend on how the classes are named.
label_encoder = LabelEncoder()
train_y_encoded = label_encoder.fit_transform(train_y)

clf = xgb.XGBClassifier()

# Every combination below is evaluated: 3 * 3 * 3 = 27 candidates, each with 3-fold CV.
parameters = {
    'n_estimators': [100, 150, 200],
    'max_depth': [1, 2, 3],
    'learning_rate': [0.1, 0.2, 0.5],
}

grid = GridSearchCV(
    clf,
    parameters,
    n_jobs=4,
    scoring="neg_log_loss",  # negated log-loss: values closer to 0 are better
    cv=3,
)

grid.fit(train_x_vectors, train_y_encoded)
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))




Best: -0.499818 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}
