In [5]:
import gzip
import json

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [6]:
def parse(path):
  """Yield one decoded JSON value per non-blank line of a gzip-compressed file.

  Args:
    path: Location of the gzip-compressed, one-JSON-document-per-line file.

  Yields:
    The result of ``json.loads`` for each non-blank line, in file order.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the gzip handle once the generator is
  # exhausted or closed; previously the handle was never closed.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Blank lines (e.g. a trailing newline at end of file) carry no record.
      if not l.strip():
        continue
      yield json.loads(l)

In [7]:
def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Each record is keyed by its position in the stream (0, 1, 2, ...), and
  those keys become the row index of the returned frame.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

In [8]:
# Load the full gzip-compressed review file into a single DataFrame.
# The path has no directory component, so it resolves against the working directory.
df = getDF('Office_Products_5.json.gz')

In [15]:
# Spot-check the record at row position 46 to see which fields a review carries.
print(df.iloc[46])

overall                                                         5.0
verified                                                       True
reviewTime                                              05 17, 2017
reviewerID                                           A1PJSSJSKSVCTM
asin                                                     0310802636
style                                {'Format:': ' Misc. Supplies'}
reviewerName                                             ShopN4Me12
reviewText        perfect  fit for the bible I purchased for a c...
summary           perfect fit for the bible I purchased for a child
unixReviewTime                                           1494979200
vote                                                            NaN
image                                                           NaN
Name: 46, dtype: object


In [18]:
# Collapse the `overall` score into three string labels:
#   1.0, 2.0 -> '1'   |   3.0 -> '2'   |   4.0, 5.0 -> '3'
# Series.map produces NaN for any `overall` value that is not a key of this dict.
# NOTE(review): the labels presumably mean negative / neutral / positive — confirm.
df['rating'] = df['overall'].map({1.0: '1', 2.0: '1', 3.0: '2', 4.0: '3', 5.0: '3'})

In [19]:
# Show the first record to confirm the new `rating` column is present.
# NOTE(review): the saved output for this cell also lists a `category` column that
# none of the visible cells create — likely a deleted or out-of-view cell; confirm
# the notebook still works under Restart Kernel -> Run All.
print(df.iloc[0])

overall                                                         4.0
verified                                                       True
reviewTime                                               11 7, 2017
reviewerID                                           A2NIJTYWADLK57
asin                                                     0140503528
style                                    {'Format:': ' Board book'}
reviewerName                                            cotton clay
reviewText        kids like story BUT while i really wanted a bo...
summary                          good story, small size book though
unixReviewTime                                           1510012800
vote                                                            NaN
image                                                           NaN
category                                                          C
rating                                                            3
Name: 0, dtype: object


In [21]:
# Drop reviews without text in place; the vectorizer cannot tokenize missing values.
df.dropna(subset=['reviewText'], inplace=True)  # Remove rows with missing 'reviewText'

# Build TF-IDF features, keeping at most 5000 vocabulary terms.
# NOTE(review): the vectorizer is fit on every review before the train/test split
# happens, so the held-out reviews influence the vocabulary and IDF weights.
# Consider splitting the raw text first and fitting on the training part only.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['reviewText'])


In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['rating'],
    test_size=0.2,
    random_state=42,
)


In [24]:
# Fit a logistic-regression classifier on the TF-IDF training features.
# `saga` shuffles the data while optimizing, so random_state is pinned to make
# the fitted model repeatable (the train/test split uses the same seed).
classifier = LogisticRegression(solver='saga', max_iter=1000, random_state=42)
classifier.fit(X_train, y_train)


LogisticRegression(max_iter=1000, solver='saga')

In [25]:
# Score the classifier on the held-out split.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print a detailed classification report (per-class precision, recall, F1, support)
print(classification_report(y_test, y_pred))


Accuracy: 0.9021052434246293
              precision    recall  f1-score   support

           1       0.71      0.59      0.64     11061
           2       0.44      0.13      0.20     10055
           3       0.92      0.98      0.95    138913

    accuracy                           0.90    160029
   macro avg       0.69      0.57      0.60    160029
weighted avg       0.88      0.90      0.88    160029



In [27]:
# Smoke-test the pipeline on two hand-written reviews.
new_reviews = ["This product is great!", "I'm hate this product."]
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# feature columns line up with what the classifier was trained on.
new_reviews_tfidf = tfidf_vectorizer.transform(new_reviews)
predictions = classifier.predict(new_reviews_tfidf)
print(predictions)


['3' '1']
