<h1>Prep</h1>

<h2>Classes Iniciais</h2>

In [1]:
import random

class Sentiment:
    """String labels for the two sentiment classes used throughout the notebook."""

    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE"
    
class Review:
    """One review: its text, its numeric score and the sentiment derived from the score."""

    def __init__(self, text, score):
        """Store ``text`` and ``score`` and compute ``sentiment`` once, at construction."""
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return ``Sentiment.NEGATIVE`` when ``score <= 3``, otherwise ``Sentiment.POSITIVE``."""
        return Sentiment.NEGATIVE if self.score <= 3 else Sentiment.POSITIVE

    def __repr__(self):
        """Two-line representation showing the text and the score."""
        return f'Text: {self.text}\nScore: {self.score}'
    
class ReviewContainer():
    """Wraps a list of reviews and exposes their texts and sentiment labels as lists."""

    def __init__(self, reviews):
        """Keep ``reviews``; each item is expected to expose ``text`` and ``sentiment``."""
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` so both sentiment classes have the same size.

        The larger class is randomly down-sampled to the size of the smaller one,
        whichever of the two it is, and the combined list is shuffled. Reviews
        whose sentiment is neither label are dropped. ``self.reviews`` is rebound
        to a new list; the list passed to the constructor is not modified.
        Uses the module-level ``random`` generator, so call ``random.seed(...)``
        beforehand for a repeatable result.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]
        per_class = min(len(negative), len(positive))

        random.shuffle(positive)
        # Only shuffle the negatives when they are the class being cut down, so the
        # sample dropped from them is random rather than "whatever came last".
        if len(negative) > per_class:
            random.shuffle(negative)

        self.reviews = negative[:per_class] + positive[:per_class]
        random.shuffle(self.reviews)

<h2>Dados Base</h2>

In [2]:
import json

file_name = './data/books_large.json'

# The file is read as JSON Lines: one JSON object per line, each providing
# the "reviewText" and "overall" fields used below.
reviews = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        review = json.loads(line)
        reviews.append(Review(review["reviewText"], review["overall"]))

<h2>Prep Data</h2>

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for evaluation; the fixed random_state keeps the split repeatable.
training, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)

# Wrap each split so texts and labels can be extracted from it.
train_container, test_container = ReviewContainer(training), ReviewContainer(test)

In [4]:
# evenly_distribute() shuffles with Python's `random` module. Seed it so the
# balanced subsets, and every score computed from them, are the same on each run.
random.seed(42)

# Balance the training split, then pull out parallel text / label lists.
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

# Same for the held-out split.
test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

<h2>Bag of Words Vectorization</h2>

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  

# TfidfVectorizer is used here; CountVectorizer() (imported above) is the alternative.
vectorizer = TfidfVectorizer()
# Fit on the training texts only, then reuse the fitted vectorizer for the test texts.
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

# The fit_transform call above could also be written as two steps:
#   vectorizer.fit(train_x)
#   train_x_vectors = vectorizer.transform(train_x)

# Classification

## Linear SVM

In [7]:
from sklearn import svm

# Linear-kernel SVM trained on the vectorized training texts.
# fit() returns the estimator, so echoing it still displays its repr.
clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, train_y)
clf_svm

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
# Predict the label of the first test review (row 0 of the test matrix).
clf_svm.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

## Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings, trained on the vectorized training texts.
# fit() returns the estimator, so echoing it still displays its repr.
clf_tree = DecisionTreeClassifier().fit(train_x_vectors, train_y)
clf_tree

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [10]:
# Predict the label of the first test review (row 0 of the test matrix).
clf_tree.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes; this notebook always feeds it a dense array (.toarray()).
# fit() returns the estimator, so echoing it still displays its repr.
clf_NB = GaussianNB().fit(train_x_vectors.toarray(), train_y)
clf_NB

GaussianNB(priors=None, var_smoothing=1e-09)

In [12]:
# Predict the label of the first test review; the row is converted to a dense
# array, matching how clf_NB was fitted.
clf_NB.predict(test_x_vectors[0].toarray())

array(['NEGATIVE'], dtype='<U8')

## Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the iteration cap raised to 500.
# fit() returns the estimator, so echoing it still displays its repr.
clf_log = LogisticRegression(max_iter=500).fit(train_x_vectors, train_y)
clf_log

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
# Predict the label of the first test review. clf_log is fitted and scored on the
# sparse matrix everywhere else in this notebook, so no dense conversion is needed.
clf_log.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

# Evaluation

In [15]:
# Mean accuracy of each classifier on the test split, printed as "<name>: <score>".
# The flag marks the model that is fed a dense array instead of the sparse matrix.
for name, model, needs_dense in (
    ('SVM', clf_svm, False),
    ('Decision Tree', clf_tree, False),
    ('Naive Bayes', clf_NB, True),
    ('Logistic', clf_log, False),
):
    features = test_x_vectors.toarray() if needs_dense else test_x_vectors
    print(f'{name}: {model.score(features, test_y)}')

SVM: 0.7879924953095685
Decision Tree: 0.6144465290806754
Naive Bayes 0.5722326454033771
Logistic: 0.775797373358349


In [16]:
from sklearn.metrics import f1_score

# Per-class F1 for each classifier, reported in the order [POSITIVE, NEGATIVE].
f1_labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE]
for name, model, needs_dense in (
    ('SVM', clf_svm, False),
    ('Decision Tree', clf_tree, False),
    ('Naive Bayes', clf_NB, True),
    ('Logistic', clf_log, False),
):
    # clf_NB is fed a dense array; the other models take the sparse matrix.
    features = test_x_vectors.toarray() if needs_dense else test_x_vectors
    print(f'{name}: {f1_score(test_y, model.predict(features), average=None, labels=f1_labels)}')

SVM: [0.78638941 0.78957169]
Decision Tree: [0.6183844  0.61042654]
Naive Bayes: [0.58992806 0.55294118]
Logistic: [0.77259753 0.77890842]


In [17]:
# Number of POSITIVE and NEGATIVE labels in the training set, in that order.
print(*(train_y.count(label) for label in (Sentiment.POSITIVE, Sentiment.NEGATIVE)))

1089 1089


In [18]:
# Hand-written sentences with the labels we expect for them.
test_set = [
    'I really enjoyed this, 5 stars',
    'bad book do not buy',
    'horrible, waste of time',
    'the best product i ever bought',
]
expected = [
    Sentiment.POSITIVE,
    Sentiment.NEGATIVE,
    Sentiment.NEGATIVE,
    Sentiment.POSITIVE,
]
# Reuse the already-fitted vectorizer for the new sentences.
new_test = vectorizer.transform(test_set)

print(f'Reality: {expected}')
for name, model, needs_dense in (
    ('SVM', clf_svm, False),
    ('Tree', clf_tree, False),
    ('NB', clf_NB, True),
    ('Logistic', clf_log, False),
):
    # clf_NB is fed a dense array; the other models take the sparse matrix.
    sample = new_test.toarray() if needs_dense else new_test
    print(f'{name}: {model.predict(sample)}')

Reality: ['POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE']
SVM: ['POSITIVE' 'NEGATIVE' 'NEGATIVE' 'POSITIVE']
Tree: ['POSITIVE' 'NEGATIVE' 'NEGATIVE' 'POSITIVE']
NB: ['POSITIVE' 'POSITIVE' 'NEGATIVE' 'NEGATIVE']
Logistic: ['POSITIVE' 'NEGATIVE' 'NEGATIVE' 'POSITIVE']


# Tuning our Model (with Grid Search)

In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate SVC settings: every kernel / C combination below is tried.
parameters = {
    'kernel': ('rbf', 'sigmoid'),
    'C': (1, 4, 8),
}

svc = svm.SVC()
# fit() returns the search object, so echoing it still displays its repr.
clf = GridSearchCV(svc, parameters).fit(train_x_vectors, train_y)
clf

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 4, 8), 'kernel': ('rbf', 'sigmoid')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [21]:
# Per-class F1 of the grid-searched SVM on the test split, in the order [POSITIVE, NEGATIVE].
print(f'SVM Grid: {f1_score(test_y, clf.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE])}')
# The recorded output of this cell also shows the hand-written sentences being checked,
# but the code that printed those lines was missing; it is restored here.
print(f'Reality: {expected}')
print(f'SVM Grid: {clf.predict(new_test)}')

SVM Grid: [0.78062678 0.78591288]
Reality: ['POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE']
SVM Grid: ['POSITIVE' 'NEGATIVE' 'NEGATIVE' 'NEGATIVE']


# Saving Model

## Save

In [26]:
import os
import pickle

# open(..., 'wb') does not create missing directories, so make sure ./models exists.
os.makedirs('./models', exist_ok=True)

# NOTE: only the classifier is stored. Predicting on raw text after loading also needs
# the fitted `vectorizer`, which this cell does not persist.
with open('./models/sentiment_clf.pkl', 'wb') as f:
    pickle.dump(clf, f)

## Load

In [27]:
# Read the pickled classifier back from disk. Unpickling can execute arbitrary code,
# so only load files you trust.
with open('./models/sentiment_clf.pkl', 'rb') as model_file:
    loaded_clf = pickle.load(model_file)

In [30]:
# Sanity check of the reloaded model: text, true label and predicted label of test review 1.
print(test_x[1], test_y[1], loaded_clf.predict(test_x_vectors[1]))

Be sure you have a pencil so you can write down all those cookie recipes.  The story itself is basically the usual fare, but still a good read.  At least I wasn't bored with it even though the plot was typical. NEGATIVE ['NEGATIVE']
