## Model Evaluation

To evaluate the performance of the trained models, we can compute the classification reports which contain the precision, recall, and F1-score for each class. We can also compute the accuracy of the models on the test set.

But first we need to load the data:

In [None]:
# Importing the libraries
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import load_model
import numpy as np

In [8]:
# Initialize an empty DataFrame to store the preprocessed data
df_preprocessed = pd.DataFrame()

# Load the preprocessed data chunks into the DataFrame
# The number of chunks may vary depending on the chunk size in the previous step

for i in range(5): # for 5k reviews
# for i in range(10): # for 100k reviews
# for i in range(20): # for 1M reviews
# for i in range(70): # for the entire dataset
    print(f'Loading chunk {i}')
    # Load the chunk
    chunk = pd.read_json(f'preprocessing/preprocessed_reviews_chunk_{i}.json')

    # Append the chunk to the DataFrame
    df_bert = pd.concat([df_preprocessed, chunk])
    df_cnn = df_bert.copy()
    df_lstm = df_bert.copy()

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 10000
# Max number of words in each review.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

# Extract the strings from the dictionaries in the 'text' column
df_cnn['text'] = df_cnn['text'].apply(lambda x: list(x.values())[0] if isinstance(x, dict) else x)

# Extract the ratings from the dictionaries in the 'stars' column
df_cnn['stars'] = df_cnn['stars'].apply(lambda x: list(x.values())[0] if isinstance(x, dict) else x)

# Create a tokenizer
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@\[\]^_`{|}~', lower=True)

# Fit the tokenizer on the texts
tokenizer.fit_on_texts(df_cnn['text'].values)

# Vocabulary size
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Transform text to sequence of integers
X = tokenizer.texts_to_sequences(df_cnn['text'].values)

# Pad sequences
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

# One-hot encode labels
Y = pd.get_dummies(df_cnn['stars']).values
print('Shape of label tensor:', Y.shape)

# Split the data
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)




Loading chunk 0
Loading chunk 1
Loading chunk 2
Loading chunk 3
Loading chunk 4
Found 6468 unique tokens.
Shape of data tensor: (1000, 250)
Shape of label tensor: (1000, 5)


In [6]:
# Load the trained CNN model
model_cnn = load_model('models/sentiment_analysis_model_cnn_5k.h5')

# Predict the labels for the test set
y_pred_cnn = model_cnn.predict(X_test)

# Convert the predictions from categorical to label encoded
y_pred_cnn = np.argmax(y_pred_cnn, axis=1)

# Convert the true labels from categorical to label encoded
y_true = np.argmax(Y_test, axis=1)

# Compute the classification report for the CNN model
classification_report_cnn = classification_report(y_true, y_pred_cnn)
print("Classification Report for CNN: \n", classification_report_cnn)

# Compute the accuracy of the CNN model
accuracy_cnn = accuracy_score(y_true, y_pred_cnn)
print("Accuracy of CNN: ", accuracy_cnn)


Classification Report for CNN: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00        20
           4       0.52      1.00      0.68        52

    accuracy                           0.52       100
   macro avg       0.10      0.20      0.14       100
weighted avg       0.27      0.52      0.36       100

Accuracy of CNN:  0.52


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Evaluation of LSTM model

In [17]:
# Load the trained LSTM model
model_lstm = load_model('models/sentiment_analysis_model_lstm_5k.h5')

# Predict the labels for the test set
y_pred_lstm = model_lstm.predict(X_test)

# Convert the predictions from categorical to label encoded
y_pred_lstm = np.argmax(y_pred_lstm, axis=1)

# Compute the classification report for the LSTM model
classification_report_lstm = classification_report(y_true, y_pred_lstm)
print("Classification Report for LSTM: \n", classification_report_lstm)

# Compute the accuracy of the LSTM model
accuracy_lstm = accuracy_score(y_true, y_pred_lstm)
print("Accuracy of LSTM: ", accuracy_lstm)

2023-06-11 15:53:47.871661: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-11 15:53:47.875187: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-11 15:53:47.877819: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-06-11 15:53:48.237243: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-11 15:53:48.238361: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-11 15:53:48.240187: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Classification Report for LSTM: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00        20
           4       0.52      1.00      0.68        52

    accuracy                           0.52       100
   macro avg       0.10      0.20      0.14       100
weighted avg       0.27      0.52      0.36       100

Accuracy of LSTM:  0.52


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Evaluation of BERT model

In [14]:
# Load Libraries
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification

# Load the BERT tokenizer and model
tokenizer_bert = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model_bert = TFDistilBertForSequenceClassification.from_pretrained('models/sentiment_analysis_model_bert_5k')

# Tokenize the test set
test_encodings = tokenizer_bert(df_bert['text'].to_list(), truncation=True, padding=True)

# Convert the tokenized test set to a TensorFlow dataset
test_dataset = tf.data.Dataset.from_tensor_slices((
dict(test_encodings),
df_bert['stars'].to_list()
))

# Batch the test set
test_dataset = test_dataset.batch(16)

# Predict the labels for the test set
y_pred_bert = model_bert.predict(test_dataset)

# Convert the predictions from categorical to label encoded
y_pred_bert = np.argmax(y_pred_bert.logits, axis=1)

# Compute the classification report for the BERT model
classification_report_bert = classification_report(df_bert['stars'].to_list(), y_pred_bert)
print("Classification Report for BERT: \n", classification_report_bert)

# Compute the accuracy of the BERT model
accuracy_bert = accuracy_score(df_bert['stars'].to_list(), y_pred_bert)
print("Accuracy of BERT: ", accuracy_bert)

Some layers from the model checkpoint at models/sentiment_analysis_model_bert_100k were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at models/sentiment_analysis_model_bert_100k and are newly initialized: ['dropout_99']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2023-06-11 15:41:20.6158

Classification Report for BERT: 
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       114
           2       0.00      0.00      0.00        72
           3       0.15      0.59      0.24       112
           4       0.17      0.36      0.23       256
           5       0.00      0.00      0.00       446

    accuracy                           0.16      1000
   macro avg       0.06      0.19      0.09      1000
weighted avg       0.06      0.16      0.08      1000

Accuracy of BERT:  0.158


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Model Comparison

We can compare the performance of the three models by looking at their accuracy and F1-scores.

In [16]:
#Compare the models
print(f"Accuracy of CNN: {accuracy_cnn}, LSTM: {accuracy_lstm}, BERT: {accuracy_bert}")
print("\nClassification report for CNN:\n", classification_report_cnn)
print("\nClassification report for LSTM:\n", classification_report_lstm)
print("\nClassification report for BERT:\n", classification_report_bert)

Accuracy of CNN: 0.52, LSTM: 0.52, BERT: 0.158

Classification report for CNN:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00        20
           4       0.52      1.00      0.68        52

    accuracy                           0.52       100
   macro avg       0.10      0.20      0.14       100
weighted avg       0.27      0.52      0.36       100


Classification report for LSTM:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00        20
           4       0.52      1.00      0.68        52

    accuracy                           0.52       100
   macro avg     