In [8]:
import pandas as pd
import json
import re

In [2]:
class Category:
    """Label constants for the product-review categories used in this notebook.

    Every attribute is a plain string equal to its own name. These strings are
    the class labels attached to each review when the data files are loaded.
    """

    BOOKS = "BOOKS"
    PATIO = "PATIO"
    GROCERY = "GROCERY"
    CLOTHING = "CLOTHING"
    ELECTRONICS = "ELECTRONICS"
    
class Review:
    """A single review: its text paired with a category label."""

    def __init__(self, text, category):
        """Store ``text`` and ``category`` unchanged as instance attributes.

        Args:
            text: The review body.
            category: The label for this review (the loading cell passes one
                of the ``Category`` constants).
        """
        self.text, self.category = text, category

# LOADING THE INDIVIDUAL JSON DATA FILES INTO A SINGLE DATASET

In [7]:
# Each data file is paired, by position, with the category label that is
# assigned to every review read from it.
file_names = ['Books_small.json', 'Patio_small.json', 'Grocery_small.json',
              'Clothing_small.json', 'Electronics_small.json']
categories = [Category.BOOKS, Category.PATIO, Category.GROCERY, Category.CLOTHING, Category.ELECTRONICS]

# Accumulates one Review object per parsed line across all files.
reviews = []

# zip() walks both lists in lockstep, so no manual index bookkeeping is needed.
for file_name, category in zip(file_names, categories):
    # JSON text is UTF-8; passing the encoding explicitly keeps the load from
    # depending on the platform's default locale encoding.
    with open(file_name, encoding='utf-8') as f:
        # Each non-blank line is parsed as one standalone JSON object.
        for line in f:
            if not line.strip():
                # Tolerate blank / trailing empty lines instead of letting
                # json.loads raise on them.
                continue
            review = json.loads(line)
            # Only the 'reviewText' field is kept, tagged with the file's category.
            reviews.append(Review(review['reviewText'], category))

# TO SPLIT OUR DATA INTO TRAINING AND TEST DATASETS

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the reviews for evaluation; the fixed random_state keeps the
# split the same from run to run.
training, test = train_test_split(reviews, random_state=42, test_size=0.3)

# Separate each subset into model inputs (review text) and targets (category label).
x_train, y_train = [r.text for r in training], [r.category for r in training]
x_test, y_test = [r.text for r in test], [r.category for r in test]

# CONVERTING TEXT INPUT INTO VECTORS

In [16]:
# TfidfVectorizer is preferred over CountVectorizer because it accounts for
# both the frequency and the importance of words.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training text only; the test text is then
# transformed with that already-fitted vectorizer.
x_train_vectors = vectorizer.fit_transform(raw_documents=x_train)
x_test_vectors = vectorizer.transform(raw_documents=x_test)



# BUILDING OUR MODEL

In [17]:
from sklearn.svm import SVC

# Baseline classifier: an SVC constructed with no arguments (library defaults).
model = SVC()

# The value returned by fit() is kept as `clf` and used to label the held-out vectors.
clf = model.fit(X=x_train_vectors, y=y_train)
y_pred = clf.predict(X=x_test_vectors)

In [19]:
# The report lists per-class precision, recall, f1-score and support;
# accuracy is an important overall metric in it.
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

       BOOKS       0.99      0.93      0.96       316
    CLOTHING       0.92      0.93      0.93       301
 ELECTRONICS       0.84      0.93      0.88       290
     GROCERY       0.94      0.96      0.95       284
       PATIO       0.95      0.88      0.91       309

    accuracy                           0.93      1500
   macro avg       0.93      0.93      0.93      1500
weighted avg       0.93      0.93      0.93      1500



In [28]:
# GridSearchCV is used to find the best parameter values for the model.
from sklearn.model_selection import GridSearchCV

# Candidate settings: two kernels crossed with five values of C.
parameter = {
    'kernel': ('linear', 'rbf'),
    'C': [0.1, 1, 8, 16, 32],
}
svc = SVC()

# NOTE: this rebinds `clf`, which previously held the plain fitted SVC.
clf = GridSearchCV(estimator=svc, param_grid=parameter, cv=5)
fit = clf.fit(x_train_vectors, y_train)

# Evaluate the fitted search object on the held-out set.
y_pred_cv = fit.predict(x_test_vectors)
print(classification_report(y_true=y_test, y_pred=y_pred_cv))

# Spot check on a few hand-written phrases; the last expression is the cell's displayed result.
test_set = ['good read', 'beautiful clothing', 'television is clear']
fit_set = vectorizer.transform(test_set)
clf.predict(fit_set)

              precision    recall  f1-score   support

       BOOKS       0.99      0.93      0.96       316
    CLOTHING       0.92      0.94      0.93       301
 ELECTRONICS       0.86      0.93      0.89       290
     GROCERY       0.93      0.96      0.95       284
       PATIO       0.94      0.88      0.91       309

    accuracy                           0.93      1500
   macro avg       0.93      0.93      0.93      1500
weighted avg       0.93      0.93      0.93      1500



array(['BOOKS', 'CLOTHING', 'ELECTRONICS'], dtype='<U11')

In [30]:
from sklearn.linear_model import LogisticRegression

# Second candidate: logistic regression with default settings, trained on the
# same training vectors as the SVM.
lr_model = LogisticRegression()
lr_fit = lr_model.fit(X=x_train_vectors, y=y_train)
lr_pred = lr_fit.predict(X=x_test_vectors)

print(classification_report(y_true=y_test, y_pred=lr_pred))

# Spot check on hand-written phrases; the last expression is the cell's displayed result.
test_set = [
    'good read',
    'beautiful clothing',
    'television is clear',
    'lovely garden',
    'shopping list',
]
fit_set = vectorizer.transform(test_set)
lr_model.predict(fit_set)

              precision    recall  f1-score   support

       BOOKS       0.99      0.94      0.97       316
    CLOTHING       0.91      0.93      0.92       301
 ELECTRONICS       0.86      0.91      0.89       290
     GROCERY       0.93      0.96      0.94       284
       PATIO       0.94      0.89      0.91       309

    accuracy                           0.93      1500
   macro avg       0.93      0.93      0.93      1500
weighted avg       0.93      0.93      0.93      1500



array(['BOOKS', 'CLOTHING', 'ELECTRONICS', 'CLOTHING', 'GROCERY'],
      dtype='<U11')

In [32]:
from sklearn.naive_bayes import GaussianNB

# Third candidate: Gaussian naive Bayes. The vectorizer output is converted
# with .toarray() before every fit/predict call on this model.
nb_model = GaussianNB()
nb_fit = nb_model.fit(X=x_train_vectors.toarray(), y=y_train)
nb_pred = nb_fit.predict(X=x_test_vectors.toarray())

print(classification_report(y_true=y_test, y_pred=nb_pred))

# Spot check on hand-written phrases; the last expression is the cell's displayed result.
test_set = [
    'good read',
    'beautiful clothing',
    'television is clear',
    'lovely garden',
    'shopping list',
]
fit_set = vectorizer.transform(test_set)
nb_model.predict(fit_set.toarray())

              precision    recall  f1-score   support

       BOOKS       0.91      0.80      0.85       316
    CLOTHING       0.83      0.72      0.77       301
 ELECTRONICS       0.82      0.80      0.81       290
     GROCERY       0.86      0.84      0.85       284
       PATIO       0.70      0.90      0.79       309

    accuracy                           0.81      1500
   macro avg       0.82      0.81      0.81      1500
weighted avg       0.82      0.81      0.81      1500



array(['CLOTHING', 'CLOTHING', 'ELECTRONICS', 'CLOTHING', 'CLOTHING'],
      dtype='<U11')

# FILTERING INPUT DATA 

In [39]:
test_set1 = [
    'good read##',
    'beautiful clothing!',
    'television is clear?',
    'lovely garden',
    'shopping list',
]

# Drop every character that is neither a word character nor whitespace.
filtered_test_set = [
    re.sub(pattern=r'[^\w\s]', repl='', string=phrase) for phrase in test_set1
]

print(filtered_test_set)
new_fit = vectorizer.transform(filtered_test_set)

# The SVM model (`clf`) is used for the final prediction: it showed the
# highest accuracy, closely followed by logistic regression.
clf.predict(new_fit)

['good read', 'beautiful clothing', 'television is clear', 'lovely garden', 'shopping list']
['BOOKS' 'CLOTHING' 'ELECTRONICS' 'PATIO' 'GROCERY']
