### **Data Class**

In [1]:
import random

class Category:
    """String labels for the product categories a review can be filed under."""

    BOOKS = "BOOKS"
    CLOTHING = "CLOTHING"
    ELECTRONICS = "ELECTRONICS"
    GROCERY = "GROCERY"
    PATIO = "PATIO"

class Sentiment:
    """String labels for the sentiment buckets a review can fall into."""

    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"

class Review:
    """A single product review with its category, text, score and sentiment.

    Attributes:
        category: Category label supplied by the caller.
        text: Review text supplied by the caller.
        score: Numeric rating supplied by the caller.
        sentiment: Sentiment label derived from ``score`` at construction time.
    """

    def __init__(self, category, text, score):
        self.category = category
        self.text = text
        self.score = score
        # Computed once here; later changes to ``score`` do not refresh it.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        Scores up to and including 2 are NEGATIVE, scores above 2 and up to
        3 are NEUTRAL, and anything higher is POSITIVE.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        # ``<= 3`` rather than ``== 3`` so a fractional rating such as 2.5 is
        # not reported as POSITIVE.
        if self.score <= 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE

# Positive and negative sentiments are unevenly distributed in the raw reviews, so ReviewContainer can rebalance them before training.
class ReviewContainer:
    """Hold a list of reviews and expose their fields as parallel lists.

    Each item in ``reviews`` is expected to provide ``text``, ``sentiment``
    and ``category`` attributes.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def get_category(self):
        """Return the ``category`` of every review, in container order."""
        return [x.category for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance to equal numbers of NEGATIVE and POSITIVE reviews.

        Keeps the first ``n`` reviews of each of the two classes, where ``n``
        is the size of the smaller class, and drops everything else
        (including NEUTRAL reviews). The kept reviews are shuffled with the
        module-level ``random`` generator. ``self.reviews`` is rebound to a
        new list; the list originally passed in is not modified.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]
        # Trim BOTH classes to the smaller one; trimming only the positives
        # leaves the set unbalanced whenever negatives are the majority.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)

### **Load Data**

In [2]:
import json

# Each file is read line by line, one JSON object per line; every review in a
# file receives the category listed at the same position in file_categories.
file_names = ['./data/category/Books_small.json',
              './data/category/Clothing_small.json',
              './data/category/Electronics_small.json',
              './data/category/Grocery_small.json',
              './data/category/Patio_small.json']
file_categories = [Category.BOOKS, Category.CLOTHING, Category.ELECTRONICS, Category.GROCERY, Category.PATIO]

reviews = []
# zip keeps each path aligned with its category without manual index bookkeeping.
for file_name, file_category in zip(file_names, file_categories):
    # Explicit encoding: open() otherwise uses the platform default, which is
    # not UTF-8 on every system.
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            review = json.loads(line)
            reviews.append(Review(file_category, review['reviewText'], review['overall']))

# Spot check: category of one loaded review.
reviews[8].category

'BOOKS'

### **Prep Data**

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for evaluation; the fixed seed keeps the split repeatable.
train, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container, test_container = ReviewContainer(train), ReviewContainer(test)

# Sentiment rebalancing is left disabled; the targets below are categories.
# train_container.evenly_distribute()
# test_container.evenly_distribute()
X_train, y_train = train_container.get_text(), train_container.get_category()
X_test, y_test = test_container.get_text(), test_container.get_category()

# Spot check: number of ELECTRONICS reviews in the training targets.
y_train.count(Category.ELECTRONICS)

673

#### Bag of Words Vectorization (TF-IDF)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF weighted bag-of-words features for the review text.
vectorizer = TfidfVectorizer()
# Fit on the training text only; the test text is transformed with the same
# fitted vectorizer so both matrices share one feature space.
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

### **Classification**

#### Linear SVM

In [5]:
from sklearn.svm import SVC

# Linear-kernel SVM. ``gamma`` is deliberately not passed: it only
# parameterises the rbf/poly/sigmoid kernels and is ignored by 'linear'.
svm = SVC(C=16, kernel='linear')
svm.fit(X_train_vectors, y_train)

# Sanity check: show the first held-out review and the category predicted for it.
print(X_test[0])
print(svm.predict(X_test_vectors[0]))

I love the tennis bracelet , but too big for my wrist. I have to send it back -:))).....Thank you
['CLOTHING']


#### Decision Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed so the fitted tree (and its score) is reproducible across runs;
# 42 matches the seed used for the train/test split.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_vectors, y_train)

# Sanity check on the first held-out review.
decision_tree.predict(X_test_vectors[0])

array(['ELECTRONICS'], dtype='<U11')

#### Naive Bayes

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import MultinomialNB

# Dense copies are built once and shared by all four models. ``toarray()``
# returns a plain ndarray; ``todense()`` returns ``np.matrix``, which recent
# scikit-learn releases reject as estimator input.
X_train_dense = X_train_vectors.toarray()
first_test_dense = X_test_vectors[0].toarray()

gaussian_nb = GaussianNB()
gaussian_nb.fit(X_train_dense, y_train)  # GaussianNB requires dense input
gaussian_nb.predict(first_test_dense)

bernoulli_nb = BernoulliNB()
bernoulli_nb.fit(X_train_dense, y_train)
bernoulli_nb.predict(first_test_dense)

# NOTE(review): CategoricalNB is documented for integer-coded categorical
# features, whereas TF-IDF weights are continuous — confirm this model
# belongs in the comparison.
categorical_nb = CategoricalNB()
categorical_nb.fit(X_train_dense, y_train)
categorical_nb.predict(first_test_dense)

multinomial_nb = MultinomialNB()
multinomial_nb.fit(X_train_dense, y_train)

# Only this last expression is rendered as the cell's output.
multinomial_nb.predict(first_test_dense)



array(['CLOTHING'], dtype='<U11')

#### Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
logistic = LogisticRegression(max_iter=1000).fit(X_train_vectors, y_train)

# Sanity check on the first held-out review.
logistic.predict(X_test_vectors[0])

array(['CLOTHING'], dtype='<U11')

### **Evaluation**

#### Mean Accuracy

In [9]:
# Dense copy of the test features, built once and shared by the Naive Bayes scorers.
X_test_dense = X_test_vectors.toarray()

# (label, fitted model, test features) — one row per reported score.
scored_models = [
    ("SVM", svm, X_test_vectors),
    ("Decision Tree", decision_tree, X_test_vectors),
    ("Gaussian Naive Bayes", gaussian_nb, X_test_dense),
    ("Bernoulli Naive Bayes", bernoulli_nb, X_test_dense),
    # CategoricalNB scoring stays disabled.
    # NOTE(review): presumably because TF-IDF weights are not the integer
    # category codes CategoricalNB expects — confirm before re-enabling.
    # ("Categorical Naive Bayes", categorical_nb, X_test_dense),
    ("Multinomial Naive Bayes", multinomial_nb, X_test_dense),
    ("Logistic Regression", logistic, X_test_vectors),
]

for label, model, features in scored_models:
    print(f"{label} Score : ", end="")
    print(model.score(features, y_test))

SVM Score : 0.9084848484848485
Decision Tree Score : 0.6842424242424242
Gaussian Naive Bayes Score : 0.8109090909090909
Bernoulli Naive Bayes Score : 0.7363636363636363
Multinomial Naive Bayes Score : 0.9187878787878788
Logistic Regression Score : 0.9127272727272727


#### F1 Score

In [10]:
from sklearn.metrics import f1_score

# Per-category F1 for the SVM: average=None yields one score per entry of
# ``labels``, in the order listed.
f1_score(
    y_true=y_test,
    y_pred=svm.predict(X_test_vectors),
    labels=(Category.BOOKS, Category.CLOTHING, Category.ELECTRONICS, Category.GROCERY, Category.PATIO),
    average=None,
)

array([0.95393759, 0.89795918, 0.88217523, 0.89953632, 0.90822785])

In [11]:
# Short hand-written phrases used as a quick qualitative check of the SVM.
test_set = [
    "long-lasting battery",
    "bad book do not buy",
    "very comfortable linen",
    "not great for garden",
    "very light and enjoyable read",
    "very fresh",
    "this book is badly underrated",
]
# Reuse the already-fitted vectorizer so the phrases land in the training feature space.
new_test = vectorizer.transform(test_set)

svm.predict(new_test)

array(['ELECTRONICS', 'BOOKS', 'CLOTHING', 'PATIO', 'BOOKS', 'GROCERY',
       'BOOKS'], dtype='<U11')

### **Model Tuning (Grid Search)**

In [12]:
from sklearn.model_selection import GridSearchCV

# Search space: both kernels crossed with five values of C.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': (0.1, 1, 8, 16, 32),
}

# 5-fold cross-validated grid search over an otherwise default SVC.
svc = SVC()
classifier = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
classifier.fit(X_train_vectors, y_train)

In [14]:
# Score of the fitted grid search on the held-out test set.
classifier.score(X=X_test_vectors, y=y_test)

0.916969696969697