In [None]:
import pandas as pd 
import numpy as np
import json
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

from sklearn.metrics import confusion_matrix, classification_report


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.layers import Dense, Input
from tf_keras.optimizers import Adam,SGD
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tokenizers import BertWordPieceTokenizer
import transformers
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

import logging

transformers.logging.set_verbosity_error()

## Import Dataset

In [2]:


# Path to the JSON Lines file (one review object per line)
file_path = 'All_Beauty_5.json'

# Parse every non-blank line into a dict.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default codec, and blank lines are skipped instead of letting
# json.loads raise on an empty string.
with open(file_path, 'r', encoding='utf-8') as file:
    json_objects = [json.loads(line) for line in file if line.strip()]

# One row per review, one column per JSON key
df = pd.DataFrame(json_objects)



In [3]:
# Preview the first rows to sanity-check the parsed columns
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,,
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25.0,
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3.0,


## Preprocessing

In [4]:
# Download NLTK resources
# 'punkt' / 'punkt_tab' back word_tokenize (recent NLTK releases look up
# 'punkt_tab', older ones 'punkt'), 'stopwords' backs the stop-word list and
# 'wordnet' backs WordNetLemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/unamed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/unamed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/unamed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Shared NLP helpers used by the text-preprocessing step
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
# Kept as a set so membership checks while filtering tokens are cheap
stop_words = {word for word in stopwords.words('english')}


In [6]:
def preprocess_text(text):
    """Normalise one review: tokenise, drop stop words, stem, then lemmatise.

    Parameters
    ----------
    text : object
        Raw review text. Non-string values are converted with ``str``.
        Missing values (``None`` or a float NaN) are passed through as
        ``None`` instead of being stringified.

    Returns
    -------
    str or None
        The processed tokens joined by single spaces, or ``None`` when the
        input was missing.
    """
    # Missing reviews must stay missing: str(None) / str(nan) would turn them
    # into the literal words "None" / "nan", which a later dropna() can no
    # longer detect. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return None

    # Tokenization
    tokens = word_tokenize(str(text))

    # Stop words removal (case-insensitive comparison against the stop list)
    kept = [tok for tok in tokens if tok.lower() not in stop_words]

    # Stemming
    stemmed = [ps.stem(tok) for tok in kept]

    # Lemmatization, applied to the already-stemmed tokens
    lemmatized = [lemmatizer.lemmatize(tok) for tok in stemmed]

    # Join tokens back into a single string
    return ' '.join(lemmatized)

In [7]:

# Build the modelling frame from `df` without mutating it.
# Working on a copy keeps the raw reviews intact and makes this cell safe to
# re-run (shifting the ratings in place would subtract 1 again on every run).
# Rows with a missing review or rating are dropped *before* preprocessing so
# that no missing value is ever turned into text.
data = df.dropna(subset=['reviewText', 'overall']).copy()
data["reviewText"] = data["reviewText"].apply(preprocess_text)
# Shift the ratings down by one so the class ids start at 0, then store them
# as integers for use as classification labels.
data["overall"] = (data["overall"].astype(float) - 1).astype(int)

In [13]:
# Split the dataset into train and validation sets.
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts.
training_sentences, test_sentences, training_labels, test_labels = train_test_split(
    data["reviewText"],
    data["overall"],
    test_size=.2,
    random_state=42,
)

In [14]:
# Sanity check for missing training sentences.
# `None in series` tests the Series *index labels*, not its values, so the
# check is done on the values with isna() instead.
if training_sentences.isna().any():
    print("yes")

### Tokenization

In [None]:
# Load the tokenizer that belongs to the pretrained 'bert-base-cased' checkpoint.
# NOTE(review): the review text was stemmed upstream in preprocess_text, which
# may have altered its casing — confirm a cased checkpoint is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Tokenise both splits with identical settings (truncation and padding enabled)
encoding_options = {"truncation": True, "padding": True}

train_encodings = tokenizer(training_sentences.to_list(), **encoding_options)

validation_encodings = tokenizer(test_sentences.to_list(), **encoding_options)

In [None]:
# Wrap each split as a tf.data.Dataset of (encoded features, label) pairs

train_features = dict(train_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, training_labels))

validation_features = dict(validation_encodings)
validation_dataset = tf.data.Dataset.from_tensor_slices((validation_features, test_labels))

## BERT 

In [138]:
import os
# Ask TensorFlow to use the legacy Keras implementation.
# NOTE(review): this cell runs after `import tensorflow` / `import transformers`
# in the first cell; the flag presumably has to be set before those imports to
# take effect on a fresh kernel — confirm and move it to the top of the notebook.
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [139]:
# Pretrained 'bert-base-cased' with a 5-label sequence-classification head.
# NOTE(review): 5 presumably matches star ratings 1-5 shifted to 0-4 by the
# `overall - 1` step upstream — confirm against the label values.
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased',num_labels=5)

In [140]:
# Adam with a small learning rate; the loss is the model's own
# hf_compute_loss and accuracy is tracked as the metric.
optimizer = Adam(learning_rate=2e-5)
model.compile(
    optimizer=optimizer,
    loss=model.hf_compute_loss,
    metrics=['accuracy'],
)

In [None]:
# Batching is done by the tf.data pipelines below, so `batch_size` is not
# passed to fit(): Keras expects it to be left unset for Dataset inputs.
batch_size = 8

# Shuffle with a buffer as large as the training set so each epoch draws from
# the whole split; a 100-element buffer only reorders examples locally.
train_batches = train_dataset.shuffle(len(train_dataset)).batch(batch_size)
# Validation data is not shuffled: ordering does not affect the metrics.
validation_batches = validation_dataset.batch(batch_size)

history = model.fit(train_batches,
          epochs=3,
          validation_data=validation_batches, verbose=1)

In [None]:
# `plt` is imported here because no other cell of the notebook imports it;
# without this line the cell raises NameError on a fresh kernel.
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

# We plot train and validation accuracy per epoch

fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train')
ax.plot(history.history['val_accuracy'], label='val')
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
# Epochs are whole numbers, so only integer ticks are shown on the x axis
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.legend(loc='upper left')
plt.show()

In [None]:
# We save the fine-tuned model, load it back and evaluate the reloaded copy
# on the held-out split.
#
# The previous version of this cell referenced `holdout_dataset`, which no
# visible cell defines, read "./output_model", which no visible cell writes,
# and evaluated `model` rather than the reloaded `loaded_model`.
# NOTE(review): `validation_dataset` is the only held-out split built in this
# notebook, so it stands in for the holdout set — confirm that is intended.

model.save_pretrained("./output_model")
loaded_model = TFBertForSequenceClassification.from_pretrained("./output_model")
# The reloaded model has not been compiled; reuse the training loss/metric
# configuration so evaluate() reports loss and accuracy.
loaded_model.compile(optimizer=Adam(learning_rate=2e-5),
                     loss=loaded_model.hf_compute_loss,
                     metrics=['accuracy'])
result = loaded_model.evaluate(validation_dataset.batch(8))
dict(zip(loaded_model.metrics_names, result))

In [None]:
# After that we create the confusion matrix of our predictions

# Predict a class id for every held-out example. The dataset is batched
# without shuffling so the predictions stay aligned with `test_labels`.
logits = model.predict(validation_dataset.batch(8)).logits
pred_label = np.argmax(logits, axis=1)

# num_classes matches the 5 labels the classifier was created with
cm = tf.math.confusion_matrix(
    test_labels, pred_label, num_classes=5, weights=None, dtype=tf.dtypes.int32,
    name=None
).numpy()

print("confusion matrix\n",cm)