Comment Review

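**Sentiment** holds the label constants (a plain class of string constants, not an `enum.Enum`). **Review** wraps one review's text and score and derives its sentiment from the score. **ReviewContainer** holds a list of reviews and can balance the positive and negative classes.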

In [1]:
import random
class Sentiment:
  """Namespace for the three string labels a review can be given."""
  POSITIVE = "POSITIVE"
  NEUTRAL = "NEUTRAL"
  NEGATIVE = "NEGATIVE"
class Review:
  """One review: its text, its numeric score and the sentiment derived from the score.

  ``sentiment`` is computed once, in ``__init__``; it is not refreshed if
  ``score`` is reassigned afterwards.
  """
  def __init__(self, text, score):
    self.text = text
    self.score = score
    self.sentiment = self.get_sentiment()
  def get_sentiment(self):
    """Map ``self.score`` to a ``Sentiment`` label.

    Scores up to 2 are NEGATIVE, scores above 2 and up to 3 are NEUTRAL,
    and anything higher is POSITIVE.
    """
    if self.score <= 2:
      return Sentiment.NEGATIVE
    # Range check instead of `== 3`: with an equality test a fractional
    # score such as 2.5 fell through to the POSITIVE branch.
    elif self.score <= 3:
      return Sentiment.NEUTRAL
    else:
      return Sentiment.POSITIVE

class ReviewContainer:
  """Holds a list of reviews and exposes their texts and labels as parallel lists."""
  def __init__(self, reviews):
    self.reviews = reviews

  def get_text(self):
    """Return the ``text`` of every held review, in the current order."""
    return [x.text for x in self.reviews]
  def get_sentiment(self):
    """Return the ``sentiment`` of every held review, in the current order."""
    return [x.sentiment for x in self.reviews]
  def evenly_distribute(self):
    """Rebalance ``self.reviews`` to equal numbers of NEGATIVE and POSITIVE reviews.

    Both classes are truncated to the size of the smaller one, reviews with
    any other sentiment (e.g. NEUTRAL) are dropped, and the result is
    shuffled with ``random.shuffle``. ``self.reviews`` is rebound to a new
    list; the list originally passed in is not modified. The number of
    negative and positive reviews kept is printed, one per line.
    """
    negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
    positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]
    # Cap BOTH classes: truncating only the positives leaves the set
    # unbalanced whenever the negatives are the majority.
    keep = min(len(negative), len(positive))
    negative_shrunk = negative[:keep]
    positive_shrunk = positive[:keep]
    self.reviews = negative_shrunk + positive_shrunk
    random.shuffle(self.reviews)
    print(len(negative_shrunk))
    print(len(positive_shrunk))


**Load** the data from file.

In [2]:
import json

# The file is read as newline-delimited JSON: one review object per line.
file_name = './data/Books_small_10000.json'
reviews = []
# Explicit UTF-8: without it open() uses the platform's default encoding,
# which can fail on or mis-decode non-ASCII review text.
with open(file_name, encoding='utf-8') as file:
  for line in file:
    # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
    if not line.strip():
      continue
    review = json.loads(line)
    # Only the review text and its 'overall' score are kept.
    reviews.append(Review(review['reviewText'], review['overall']))

**Prep Data**

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed random_state keeps the
# split itself reproducible.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# evenly_distribute() shuffles with the global `random` module (imported in
# the first cell). Seed it so the balanced sets come out in the same order on
# every run, just like the split above.
random.seed(42)

# Balance each set; evenly_distribute() prints the per-class counts it keeps.
train_container = ReviewContainer(training)
train_container.evenly_distribute()

test_container = ReviewContainer(test)
test_container.evenly_distribute()

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Parallel lists per split: review texts (inputs) and sentiment labels (targets).
train_x, train_y = train_container.get_text(), train_container.get_sentiment()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

**Bag of Words**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF features are used here; CountVectorizer() (also imported) is the
# drop-in alternative if plain token counts are wanted instead.
vectorizer = TfidfVectorizer()

# Fit on the training text only; the test text is transformed with the
# vocabulary learned from training.
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# Sanity check: first training review and its (dense) feature row.
print(train_x[0], train_x_vectors[0].toarray(), sep="\n")

In [None]:
from sklearn import svm

# Linear-kernel support vector classifier fitted on the vectorized training reviews.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

# Show the first test review next to its predicted label. A bare `test_x[0]`
# in the middle of a cell is evaluated and thrown away (only the cell's last
# expression is displayed), so it has to be printed explicitly.
print(test_x[0])
clf_svm.predict(test_x_vectors[0])

**Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fixed random_state: tree fitting permutes features randomly at each split,
# so without a seed the fitted tree and its scores can change between runs.
# 0 matches the seed used for LogisticRegression in this notebook.
clf_dec = DecisionTreeClassifier(random_state=0)
clf_dec.fit(train_x_vectors, train_y)

# Predicted label for the first test review.
clf_dec.predict(test_x_vectors[0])

**Naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB
import numpy as np

# The sparse feature matrices are expanded to dense ndarrays before being
# handed to GaussianNB, both for fitting and for prediction.
clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)
clf_gnb.predict(test_x_vectors[0].toarray())

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained.
clf_lr = LogisticRegression(random_state=0).fit(train_x_vectors, train_y)

# Predicted label for the first test review.
clf_lr.predict(test_x_vectors[0])

**Evaluation**

In [None]:
def _show_accuracy(label, model, features):
  """Print `label` followed by `model.score` on the test labels, times 100."""
  print(label + str(model.score(features, test_y)*100))

# The naive Bayes model was fitted on dense arrays, so it is scored on a dense copy too.
_show_accuracy("SVM : ", clf_svm, test_x_vectors)
_show_accuracy("DecisionTreeClassifier : ", clf_dec, test_x_vectors)
_show_accuracy("Naive Bayes : ", clf_gnb, np.asarray(test_x_vectors.todense()))
_show_accuracy("Logistic Regression : ", clf_lr, test_x_vectors)

**F1 Scores**

In [None]:
from sklearn.metrics import f1_score

def _show_f1(label, predictions):
  """Print `label` followed by the per-class F1 scores, ordered [POSITIVE, NEGATIVE]."""
  per_class = f1_score(test_y, predictions, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])
  print(label + str(per_class))

# average=None keeps one score per label instead of collapsing them into a single number.
_show_f1("SVM Score : ", clf_svm.predict(test_x_vectors))
_show_f1("DecisionTreeClassifier Score : ", clf_dec.predict(test_x_vectors))
_show_f1("Naive Bayes Score : ", clf_gnb.predict(np.asarray(test_x_vectors.todense())))
_show_f1("Logistic Regression Score : ", clf_lr.predict(test_x_vectors))

**Test for custom values**

In [None]:
# Hand-written comments, vectorized with the already-fitted vectorizer
# (transform only, no refitting).
custom_test_set = [
  "who is still able to relate in 2023",
  "waste of time",
  "fucking awesome",
  "use me as a dislike button",
]
custom_test = vectorizer.transform(custom_test_set)

# One predicted label per comment, in the order listed above.
clf_svm.predict(custom_test)

**Tuning Model**

In [None]:
from sklearn.model_selection import GridSearchCV

# Search grid: every kernel is combined with every value of C.
parameters = {
  'kernel': ('linear', 'rbf'),
  'C': (1, 4, 8, 16, 32),
}

# Grid search with cv=5 over a fresh SVC.
svc = svm.SVC()
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
clf.fit(train_x_vectors, train_y)

In [None]:
# Score of the grid-searched classifier on the test vectors (a fraction here,
# not multiplied by 100 as in the Evaluation cell).
print(clf.score(test_x_vectors, test_y))

**Saving Model**

In [None]:
import os
import pickle

# Create the output directory first: open(..., 'wb') does not create missing
# parent directories, so without ./models/ this cell would fail with
# FileNotFoundError.
os.makedirs('./models', exist_ok=True)

# Persist the grid-searched classifier.
with open('./models/sentiment_classifier.pkl', 'wb') as f:
  pickle.dump(clf, f)

# Persist the fitted vectorizer as well: new text has to be transformed with
# the same vectorizer before the classifier can predict on it.
with open('./models/category_vectorizer.pkl', 'wb') as f:
  pickle.dump(vectorizer, f)

**Load Model**

In [None]:
# Restore both saved objects.
# NOTE: pickle.load can execute arbitrary code; only load files you wrote
# yourself or otherwise trust.
with open('./models/sentiment_classifier.pkl', 'rb') as clf_file:
  loaded_clf = pickle.load(clf_file)

# This rebinds `vectorizer` to the unpickled copy.
with open('./models/category_vectorizer.pkl', 'rb') as vectorizer_file:
  vectorizer = pickle.load(vectorizer_file)

In [None]:
# Short hand-written phrases to try against the reloaded classifier.
test_set = [
  "its okay",
  "a good read",
  "bad",
]
new_test = vectorizer.transform(test_set)

# One predicted label per phrase, in the order listed above.
loaded_clf.predict(new_test)

In [None]:
# Show the first test review, then the reloaded classifier's prediction for it.
print(test_x[0])
loaded_clf.predict(test_x_vectors[0])