In [None]:
import random
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus; clean_text below reads stopwords.words('english').
nltk.download('stopwords')

class Sentiment:
  """String labels for the three sentiment classes a review can fall into."""

  # Each constant's value is simply its own name as a string.
  NEGATIVE = "NEGATIVE"
  NEUTRAL = "NEUTRAL"
  POSITIVE = "POSITIVE"

class Review:
  """A single review: its text, its numeric score and the derived sentiment label.

  Args:
    text: The review body.
    score: Numeric rating; see get_sentiment for how it maps to a label.
    cleaner: Optional callable applied to `text` before it is stored
      (for example ReviewContainer.clean_text). When None (the default) the
      text is stored unchanged.
  """

  def __init__(self, text, score, cleaner=None):
    # The original tried to clean the text in a statement placed after the
    # return statements of get_sentiment, where it could never run and where
    # neither `text` nor `clean_text` was in scope. Cleaning now happens
    # here, and only when the caller asks for it.
    self.text = text if cleaner is None else cleaner(text)
    self.score = score
    self.sentiment = self.get_sentiment()

  def get_sentiment(self):
    """Map self.score to a Sentiment label.

    Returns:
      Sentiment.NEGATIVE when score <= 2, Sentiment.NEUTRAL when score == 3,
      Sentiment.POSITIVE for every other value.
    """
    if self.score <= 2:
      return Sentiment.NEGATIVE
    if self.score == 3:
      return Sentiment.NEUTRAL
    # NOTE: any score that is neither <= 2 nor exactly 3 lands here, which
    # includes fractional values strictly between 2 and 3.
    return Sentiment.POSITIVE


class ReviewContainer:
  """Wraps a list of review objects and exposes their texts and labels.

  Args:
    reviews: List of objects that have `.text` and `.sentiment` attributes.
  """

  def __init__(self, reviews):
    self.reviews = reviews

  def get_text(self):
    """Return the `.text` of every review, in container order."""
    return [x.text for x in self.reviews]

  def get_sentiment(self):
    """Return the `.sentiment` of every review, in container order."""
    return [x.sentiment for x in self.reviews]

  def evenly_distribute(self):
    """Downsample in place so every sentiment class has the same size.

    Each class is randomly sampled (without replacement) down to the size of
    the smallest class, then the combined list is shuffled. Replaces
    self.reviews; uses the module-level `random` state, so results vary
    between runs unless that state is seeded by the caller.
    """
    negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
    neutral = [x for x in self.reviews if x.sentiment == Sentiment.NEUTRAL]
    positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]

    min_size = min(len(negative), len(neutral), len(positive))

    negative = random.sample(negative, min_size)
    neutral = random.sample(neutral, min_size)
    positive = random.sample(positive, min_size)

    self.reviews = negative + neutral + positive
    random.shuffle(self.reviews)

  @staticmethod
  def clean_text(text, stop_words=None):
    """Lowercase `text`, strip non-word characters and drop stopwords.

    Declared as a staticmethod because it takes no `self`; without the
    decorator, calling it on an instance passed the instance as `text`.

    Args:
      text: The string to clean.
      stop_words: Optional iterable of words to remove. When None, the NLTK
        English stopword list is used.

    Returns:
      The cleaned text with words separated by single spaces.
    """
    if stop_words is None:
      stop_words = stopwords.words('english')
    # Build the lookup once; the original re-read the stopword list for
    # every single word of the text.
    stop_words = set(stop_words)

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import json
file_name = '/content/Books_small_10000.json'
reviews = []
# The file is read as JSON Lines: one review object per line. The encoding is
# given explicitly so decoding does not depend on the platform default.
with open(file_name, encoding='utf-8') as f:
  for line in f:
    # Skip blank lines (e.g. a trailing newline), which json.loads would reject.
    if not line.strip():
      continue
    review = json.loads(line)
    reviews.append(Review(review['reviewText'], review['overall']))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed seed keeps the split stable.
training, test = train_test_split(
  reviews,
  test_size=0.33,
  random_state=42,
)

# Wrap each split so texts and labels can be pulled out as parallel lists.
train_container, test_container = ReviewContainer(training), ReviewContainer(test)

In [None]:
# Balance both splits first (train, then test, the same order as before so the
# random stream is consumed identically), then read out texts and labels.
train_container.evenly_distribute()
test_container.evenly_distribute()

train_x, train_y = train_container.get_text(), train_container.get_sentiment()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF features over 1- and 2-grams, with document-frequency cut-offs
# at both ends (max_df / min_df).
vectorizer = TfidfVectorizer(
  ngram_range=(1, 2),
  max_df=0.95,
  min_df=5,
)

# Fit the vocabulary on the training texts only; the test texts are merely
# transformed with that fitted vocabulary.
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

In [None]:
from sklearn import svm

# Linear-kernel support vector classifier trained on the TF-IDF features.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

# Sanity check: show the first test review and the label predicted for it.
# (A bare `test_x[0]` expression used to sit here; it is not the last line
# of the cell, so it displayed nothing and has been dropped.)
print(test_x[0])
clf_svm.predict(test_x_vectors[0].reshape(1, -1))

I picked up this book because I love the Oxford English Dictionary, and lexicography in general. The story itself is fascinating, but Winchester's telling does it a great disservice, in large part because he spends far more time than necessary on numerous parts of the story. I came away from this with the sense that there just wasn't enough for a book (despite the fact that this is a very short book and its font and size suggest a young-adult audience). Winchester apparently doesn't have the talent to turn what is an amazing story into an enjoyable book. One further annoyance: he refers on occasion to pictures, but there are no photos in the book. There are drawings scattered throughout the book, but they are unlabeled and serve no real person. The decisions about illustration just reinforce the sense that this is a shoddily conceived product.


array(['NEGATIVE'], dtype='<U8')

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded so the tree (and therefore its scores) is reproducible across runs,
# matching the random_state=42 used for the split and the random forest.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)
clf_dec.predict(test_x_vectors[0].reshape(1, -1))

array(['POSITIVE'], dtype='<U8')

In [None]:
from sklearn.naive_bayes import GaussianNB

# The sparse TF-IDF matrices are converted to dense arrays before being
# handed to GaussianNB, both for fitting and for prediction.
clf_gnb = GaussianNB().fit(train_x_vectors.toarray(), train_y)

clf_gnb.predict(test_x_vectors[0].toarray())

array(['NEGATIVE'], dtype='<U8')

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the TF-IDF features.
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Predicted label for the first test review.
clf_log.predict(test_x_vectors[0].reshape(1, -1))

array(['NEGATIVE'], dtype='<U8')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 200-tree forest with a fixed seed so repeated runs build the same model.
clf_rf = RandomForestClassifier(
  n_estimators=200,
  random_state=42,
)
clf_rf.fit(train_x_vectors, train_y)

In [None]:
# Print each fitted classifier's score on the test set, one per line, in the
# order SVM, decision tree, GaussianNB, logistic regression, random forest.
# GaussianNB is the only one given the dense form of the features.
for fitted_model, test_features in (
  (clf_svm, test_x_vectors),
  (clf_dec, test_x_vectors),
  (clf_gnb, test_x_vectors.toarray()),
  (clf_log, test_x_vectors),
  (clf_rf, test_x_vectors),
):
  print(fitted_model.score(test_features, test_y))

0.6217948717948718
0.4358974358974359
0.5096153846153846
0.6330128205128205
0.5865384615384616


In [None]:
# F1 Scores
from sklearn.metrics import f1_score

# Per-class F1 (average=None), reported in this fixed label order.
sentiment_labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL]

print(f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=sentiment_labels))
print(f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=sentiment_labels))
print(f1_score(test_y, clf_gnb.predict(test_x_vectors.toarray()), average=None, labels=sentiment_labels))
print(f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=sentiment_labels))
# The random forest was scored for accuracy above but missing from this
# comparison; include it so both cells cover the same five models.
print(f1_score(test_y, clf_rf.predict(test_x_vectors), average=None, labels=sentiment_labels))

[0.70822943 0.61204819 0.55092593]
[0.48101266 0.40686275 0.42247191]
[0.53908356 0.51980198 0.47780127]
[0.71495327 0.63546798 0.54589372]


In [None]:
# A few hand-written phrases, vectorized with the already-fitted vectorizer
# and classified by the linear SVM.
test_set = [
  'very fun',
  "not good",
  'horrible waste of time',
]
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

In [None]:
from sklearn.model_selection import GridSearchCV

# One grid per kernel: `gamma` is only used by the rbf kernel, so crossing it
# with the linear kernel just refits identical models. The set of distinct
# models searched is unchanged; the duplicate linear candidates are gone.
c_values = [1, 4, 8, 16, 32]
parameters = [
  {'kernel': ['linear'], 'C': c_values},
  {'kernel': ['rbf'], 'C': c_values, 'gamma': ['scale', 'auto']},
]
svc = svm.SVC()
# 5-fold cross-validation, selecting the candidate with the best macro F1.
clf = GridSearchCV(svc, parameters, cv=5, scoring='f1_macro')

clf.fit(train_x_vectors, train_y)

In [None]:
# Per-class F1 of the grid-searched SVM on the test set.
tuned_predictions = clf.predict(test_x_vectors)
print(f1_score(
  test_y,
  tuned_predictions,
  average=None,
  labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL],
))

[0.70822943 0.61204819 0.55092593]


In [None]:
import os
import pickle

# Make sure the target directory exists; open(..., 'wb') does not create it
# and would raise FileNotFoundError on a fresh checkout.
os.makedirs('./models', exist_ok=True)

# NOTE(review): only the classifier is saved. Predicting on new raw text also
# needs the fitted `vectorizer`, which is not persisted by this cell.
with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [None]:
# Read the pickled classifier back from disk. pickle.load can execute
# arbitrary code, so only use it on files you trust.
with open('./models/sentiment_classifier.pkl', 'rb') as model_file:
    loaded_clf = pickle.load(model_file)

In [None]:
# Check the reloaded model on the first test review: show the text, then
# the label predicted for its feature row.
sample_idx = 0
print(test_x[sample_idx])
loaded_clf.predict(test_x_vectors[sample_idx])

I picked up this book because I love the Oxford English Dictionary, and lexicography in general. The story itself is fascinating, but Winchester's telling does it a great disservice, in large part because he spends far more time than necessary on numerous parts of the story. I came away from this with the sense that there just wasn't enough for a book (despite the fact that this is a very short book and its font and size suggest a young-adult audience). Winchester apparently doesn't have the talent to turn what is an amazing story into an enjoyable book. One further annoyance: he refers on occasion to pictures, but there are no photos in the book. There are drawings scattered throughout the book, but they are unlabeled and serve no real person. The decisions about illustration just reinforce the sense that this is a shoddily conceived product.


array(['NEGATIVE'], dtype='<U8')