In [None]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import torch

from matplotlib import pyplot as plt
import seaborn as sns

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense, Input
from keras.layers import TimeDistributed
from keras.layers import LSTM, GRU, Bidirectional, SimpleRNN, RNN
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical 

from transformers import BertTokenizer, BertModel

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import json

from sklearn.metrics import f1_score, accuracy_score
from keras.callbacks import Callback
import numpy as np
import pickle

In [None]:
# Load the three ATE splits (train / validation / test) from the BIO-tagged
# dataset directory. Each file is opened in a `with` block so the handle is
# closed deterministically, and decoded as UTF-8 (the JSON default) instead of
# whatever the platform locale happens to be.
with open('../Dataset/BIO_Tagged/ATE_train.json', encoding='utf-8') as train_file:
    train_data = json.load(train_file)

with open('../Dataset/BIO_Tagged/ATE_val.json', encoding='utf-8') as val_file:
    val_data = json.load(val_file)

with open('../Dataset/BIO_Tagged/ATE_test.json', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

In [None]:
# Load the three pre-computed embedding sources that prepareData() consumes.
# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file, so
# only load pickles that were produced locally / come from a trusted source.
# The `with` blocks make sure every file handle is closed after loading.
with open('Word Embeddings/word2vec.pkl', 'rb') as word2vec_file:
    word2vec = pickle.load(word2vec_file)

with open('Word Embeddings/glove.pkl', 'rb') as glove_file:
    glove = pickle.load(glove_file)

with open('Word Embeddings/bert.pkl', 'rb') as bert_file:
    bert = pickle.load(bert_file)

In [None]:
# word_to_idx = {}

# for case in train_data:
#     for text in train_data[case]['text'].split(' '):
#         if text not in word_to_idx:
#             word_to_idx[text] = len(word_to_idx)

# for case in test_data:
#     for text in test_data[case]['text'].split(' '):
#         if text not in word_to_idx:
#             word_to_idx[text] = len(word_to_idx)

# for case in val_data:
#     for text in val_data[case]['text'].split(' '):
#         if text not in word_to_idx:
#             word_to_idx[text] = len(word_to_idx)

In [None]:
# tag_to_ix = {}

# for case in train_data:
#     for tag in train_data[case]['labels']:
#         if tag not in tag_to_ix:
#             tag_to_ix[tag] = len(tag_to_ix)

# for case in test_data:
#     for tag in test_data[case]['labels']:
#         if tag not in tag_to_ix:
#             tag_to_ix[tag] = len(tag_to_ix)

# for case in val_data:
#     for tag in val_data[case]['labels']:
#         if tag not in tag_to_ix:
#             tag_to_ix[tag] = len(tag_to_ix)

# tag_to_ix['START_TAG'] = len(tag_to_ix)
# tag_to_ix['END_TAG'] = len(tag_to_ix)

In [None]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained("bert-base-uncased")

In [None]:
# embedding_mat = np.zeros((len(word_to_idx), 768))

# for word, idx in tqdm(word_to_idx.items()):
#     try:
#         tokens = tokenizer.batch_encode_plus([word], return_tensors='pt', add_special_tokens=False)
#     except:
#         tokens = tokenizer.batch_encode_plus(['unk'], return_tensors='pt', add_special_tokens=False)
#         continue
#     embeddings = None
#     with torch.no_grad():
#         try:
#             outputs = model(**tokens)
#             embeddings = outputs.last_hidden_state
#         except:
#             tokens = tokenizer.batch_encode_plus(['unk'], return_tensors='pt', add_special_tokens=False)
#             outputs = model(**tokens)
#             embeddings = outputs.last_hidden_state
#     embeddings = embeddings.squeeze(0)
#     word_embeddings = embeddings.mean(dim = 0)
#     embedding_mat[idx] = word_embeddings.squeeze(0).numpy()

In [None]:
# pickle.dump(embedding_mat, open('Word Embeddings/bert.pkl', 'wb'))

In [None]:
# def create_embedding_matrix(model, word_index, embedding_dim):
#     embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
#     for word, i in word_index.items():
#         if word in model.wv:
#             embedding_matrix[i] = model.wv[word]
#         # Else, you can choose to initialize randomly or use a special token here
#     return embedding_matrix

In [None]:
def _as_token_list(value):
    """Return ``value`` as a list of tokens.

    Strings are split on single spaces (the same rule the commented-out
    vocabulary cells of this notebook use); any other iterable is copied
    into a list unchanged.
    """
    if isinstance(value, str):
        return value.split(' ')
    return list(value)


def _first_appearance_rows(*datasets):
    """Map each lower-cased token to its row in a first-appearance vocabulary.

    Tokens are numbered in the order they are first seen while walking the
    given datasets, counting every distinct raw (case-sensitive) token. When
    several raw spellings lower-case to the same word, the first one wins.
    """
    rows = {}
    seen = set()
    for dataset in datasets:
        for item in dataset.values():
            for token in _as_token_list(item["text"]):
                if token in seen:
                    continue
                # Row number == number of distinct raw tokens seen before this one.
                rows.setdefault(token.lower(), len(seen))
                seen.add(token)
    return rows


def _make_embedding_matrix(word_index, embedding_type, embedding_dim, row_order_datasets):
    """Build the (vocab_size, embedding_dim) matrix used to initialise the Embedding layer.

    Row 0 (padding) and rows of words without a known vector stay all-zero.
    """
    matrix = np.zeros((len(word_index) + 1, embedding_dim))

    if isinstance(embedding_type, np.ndarray):
        # A bare array cannot be looked up by word. The Tokenizer index (frequency
        # rank, 1-based) is NOT the row number of such an array, so translate each
        # word to its row explicitly.
        # NOTE(review): assumes the array rows follow first-appearance order over
        # train, test, val (in that order) as in the commented-out cell that
        # writes 'Word Embeddings/bert.pkl' - TODO confirm.
        rows = _first_appearance_rows(*row_order_datasets)
        for word, index in word_index.items():
            row = rows.get(word)
            if row is not None and row < len(embedding_type):
                matrix[index, :] = embedding_type[row]
        return matrix

    for word, index in word_index.items():
        try:
            matrix[index, :] = embedding_type[word]
        except KeyError:
            # Out-of-vocabulary word: keep the zero vector.
            continue
    return matrix


def prepareData(train_data, val_data, test_data, embedding_type, max_sequence_length=50):
    """Turn the three dataset splits into padded model inputs, one-hot targets and an embedding matrix.

    Args:
        train_data, val_data, test_data: dicts whose values are dicts with a
            "text" entry and a "labels" entry. Assumes one label per
            space-separated token of "text" - TODO confirm against the dataset.
        embedding_type: source of pretrained vectors. Either an object supporting
            ``embedding_type[word]`` lookup (its ``vector_size`` attribute is used
            when present, otherwise 768 is assumed) or a 2-D ``numpy.ndarray``.
        max_sequence_length: length every word/tag sequence is padded or
            truncated to (``pad_sequences`` defaults, i.e. at the front).

    Returns:
        Tuple ``(X_train, X_val, X_test, Y_train, Y_val, Y_test, vocab_size,
        embedding_dim, max_sequence_length, num_classes, embedding_matrix,
        word_tokenizer, tag_tokenizer)``. The ``Y_*`` arrays are one-hot encoded
        with ``num_classes`` columns; index 0 is reserved for padding in both the
        word and the tag vocabularies.
    """
    splits = (train_data, val_data, test_data)

    # Pre-split the texts so the Tokenizer treats every list entry as one token.
    # Feeding raw strings would apply Keras' default punctuation filters, which
    # drops/splits tokens and shifts the words out of alignment with the tags.
    split_tokens = [[_as_token_list(item["text"]) for item in split.values()] for split in splits]
    split_tags = [[_as_token_list(item["labels"]) for item in split.values()] for split in splits]

    # Fit on the concatenation of all splits. (Merging the dicts instead would
    # silently discard samples whenever two splits share a key.)
    word_tokenizer = Tokenizer()
    word_tokenizer.fit_on_texts([tokens for part in split_tokens for tokens in part])
    vocab_size = len(word_tokenizer.word_index) + 1  # +1 for the zero-padding index

    X_train, X_val, X_test = [
        pad_sequences(word_tokenizer.texts_to_sequences(part), maxlen=max_sequence_length)
        for part in split_tokens
    ]

    tag_tokenizer = Tokenizer()
    tag_tokenizer.fit_on_texts([tags for part in split_tags for tags in part])
    num_classes = len(tag_tokenizer.word_index) + 1  # +1 for the zero-padding index

    # Pass num_classes explicitly: otherwise the one-hot width is inferred from
    # the largest tag id present in each split and may differ between splits
    # (and from the model's output layer).
    Y_train, Y_val, Y_test = [
        to_categorical(
            pad_sequences(tag_tokenizer.texts_to_sequences(part), maxlen=max_sequence_length),
            num_classes=num_classes,
        )
        for part in split_tags
    ]

    if isinstance(embedding_type, np.ndarray):
        embedding_dim = embedding_type.shape[1]
    else:
        embedding_dim = getattr(embedding_type, 'vector_size', 768)

    embedding_matrix = _make_embedding_matrix(
        word_tokenizer.word_index,
        embedding_type,
        embedding_dim,
        (train_data, test_data, val_data),
    )

    return X_train, X_val, X_test, Y_train, Y_val, Y_test, vocab_size, embedding_dim, max_sequence_length, num_classes, embedding_matrix, word_tokenizer, tag_tokenizer

In [None]:
# Run prepareData() once per embedding source and file its first eleven return
# values under data[<embedding name>][<key>]. The last two return values (the
# fitted tokenizers) are kept as module-level names; each iteration rebinds
# them, so they end up holding the tokenizers from the final ('bert') call.
_PREPARED_KEYS = (
    'X_train', 'X_val', 'X_test',
    'Y_train', 'Y_val', 'Y_test',
    'vocab_size', 'embedding_dim', 'max_sequence_length', 'num_classes',
    'embedding_matrix',
)
_EMBEDDING_SOURCES = (('word2vec', word2vec), ('glove', glove), ('bert', bert))

data = {_name: {} for _name, _ in _EMBEDDING_SOURCES}

for _name, _vectors in _EMBEDDING_SOURCES:
    *_prepared, word_tokenizer, tag_tokenizer = prepareData(train_data, val_data, test_data, _vectors)
    data[_name].update(zip(_PREPARED_KEYS, _prepared))

In [None]:
class MacroF1ScoreCallback(Callback):
    """Keras callback that records token-level macro-F1 on two datasets after every epoch.

    Args:
        train_data: ``(inputs, one_hot_targets)`` pair for the training set.
        val_data: ``(inputs, one_hot_targets)`` pair for the validation set.

    Attributes:
        train_f1s: one training macro-F1 value per finished epoch.
        val_f1s: one validation macro-F1 value per finished epoch.

    Positions whose true tag id is 0 are excluded from the score. In this
    notebook id 0 is the padding index written by ``pad_sequences`` in
    prepareData(), so counting it would inflate the metric with trivially
    correct padding predictions.
    """

    def __init__(self, train_data, val_data):
        super().__init__()
        self.train_data = train_data
        self.val_data = val_data
        self.train_f1s = []
        self.val_f1s = []

    def _macro_f1(self, inputs, one_hot_targets):
        """Return macro-F1 of the current model over the non-padding positions."""
        true_ids = np.argmax(one_hot_targets, axis=-1)
        # verbose=0: avoid two extra progress bars per epoch in the cell output.
        pred_ids = np.argmax(self.model.predict(inputs, verbose=0), axis=-1)

        real_positions = true_ids != 0
        if not real_positions.any():
            return 0.0

        true_real = true_ids[real_positions]
        pred_real = pred_ids[real_positions]
        # Average only over tags that really occur, so a stray "padding"
        # prediction counts as an error but not as an extra class.
        return f1_score(true_real, pred_real, labels=np.unique(true_real), average='macro')

    def on_epoch_end(self, epoch, logs=None):
        """Compute, store and print the train/validation macro-F1 for this epoch."""
        train_f1 = self._macro_f1(self.train_data[0], self.train_data[1])
        val_f1 = self._macro_f1(self.val_data[0], self.val_data[1])

        self.train_f1s.append(train_f1)
        self.val_f1s.append(val_f1)
        print(f'Epoch {epoch + 1} - Train Macro-F1: {train_f1:.4f} - Val Macro-F1: {val_f1:.4f}')


## VANILLA RNN

In [None]:
def trainRNN(embedding, epochs=50, batch_size=128, units=64):
    """Train, save and plot a vanilla-RNN sequence tagger for one embedding type.

    The model is Embedding (initialised from the prepared embedding matrix and
    left trainable) -> SimpleRNN -> per-timestep softmax over the tag classes.
    It is saved to ``Models/t2_rnn_<embedding>.h5`` and the loss / macro-F1
    curves are shown.

    Args:
        embedding: key into the module-level ``data`` dict
            ('word2vec', 'glove' or 'bert').
        epochs: number of training epochs.
        batch_size: mini-batch size passed to ``fit``.
        units: number of recurrent units in the SimpleRNN layer.
    """
    prepared = data[embedding]
    X_train, Y_train = prepared['X_train'], prepared['Y_train']
    X_val, Y_val = prepared['X_val'], prepared['Y_val']

    rnn_model = Sequential()
    rnn_model.add(Embedding(input_dim=prepared['vocab_size'],
                            output_dim=prepared['embedding_dim'],
                            input_length=prepared['max_sequence_length'],
                            weights=[prepared['embedding_matrix']],
                            trainable=True))
    # return_sequences=True: one output per token, as needed for tagging.
    rnn_model.add(SimpleRNN(units, return_sequences=True))
    rnn_model.add(TimeDistributed(Dense(prepared['num_classes'], activation='softmax')))

    rnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

    macro_f1_callback = MacroF1ScoreCallback(train_data=(X_train, Y_train), val_data=(X_val, Y_val))

    history = rnn_model.fit(X_train, Y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            validation_data=(X_val, Y_val),
                            callbacks=[macro_f1_callback])

    rnn_model.save(f'Models/t2_rnn_{embedding}.h5')

    _, (loss_ax, f1_ax) = plt.subplots(1, 2, figsize=(12, 5))

    loss_ax.set_title(f'Loss - {embedding} RNN')
    loss_ax.plot(history.history['loss'], label='Training Loss')
    loss_ax.plot(history.history['val_loss'], label='Validation Loss')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    f1_ax.set_title(f'Macro F1 - {embedding} RNN')
    f1_ax.plot(macro_f1_callback.train_f1s, label='Training Macro-F1')
    f1_ax.plot(macro_f1_callback.val_f1s, label='Validation Macro-F1')
    f1_ax.set_xlabel('Epochs')
    f1_ax.set_ylabel('Macro-F1 Score')
    f1_ax.legend()

    plt.show()


In [None]:
# Train the vanilla RNN tagger on the word2vec-initialised inputs; saves the model and shows its curves.
trainRNN('word2vec')

In [None]:
# Train the vanilla RNN tagger on the GloVe-initialised inputs; saves the model and shows its curves.
trainRNN('glove')

In [None]:
# Train the vanilla RNN tagger on the BERT-initialised inputs; saves the model and shows its curves.
trainRNN('bert')

## LSTM

In [None]:
def trainLSTM(embedding, epochs=50, batch_size=128, units=64):
    """Train, save and plot an LSTM sequence tagger for one embedding type.

    The model is Embedding (initialised from the prepared embedding matrix and
    left trainable) -> LSTM -> per-timestep softmax over the tag classes.
    It is saved to ``Models/t2_lstm_<embedding>.h5`` and the loss / macro-F1
    curves are shown.

    Args:
        embedding: key into the module-level ``data`` dict
            ('word2vec', 'glove' or 'bert').
        epochs: number of training epochs.
        batch_size: mini-batch size passed to ``fit``.
        units: number of recurrent units in the LSTM layer.
    """
    prepared = data[embedding]
    X_train, Y_train = prepared['X_train'], prepared['Y_train']
    X_val, Y_val = prepared['X_val'], prepared['Y_val']

    lstm_model = Sequential()
    lstm_model.add(Embedding(input_dim=prepared['vocab_size'],
                             output_dim=prepared['embedding_dim'],
                             input_length=prepared['max_sequence_length'],
                             weights=[prepared['embedding_matrix']],
                             trainable=True))
    # return_sequences=True: one output per token, as needed for tagging.
    lstm_model.add(LSTM(units, return_sequences=True))
    lstm_model.add(TimeDistributed(Dense(prepared['num_classes'], activation='softmax')))

    lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

    macro_f1_callback = MacroF1ScoreCallback(train_data=(X_train, Y_train), val_data=(X_val, Y_val))

    history = lstm_model.fit(X_train, Y_train,
                             batch_size=batch_size,
                             epochs=epochs,
                             validation_data=(X_val, Y_val),
                             callbacks=[macro_f1_callback])

    lstm_model.save(f'Models/t2_lstm_{embedding}.h5')

    _, (loss_ax, f1_ax) = plt.subplots(1, 2, figsize=(12, 5))

    loss_ax.set_title(f'Loss - {embedding} LSTM')
    loss_ax.plot(history.history['loss'], label='Training Loss')
    loss_ax.plot(history.history['val_loss'], label='Validation Loss')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    f1_ax.set_title(f'Macro F1 - {embedding} LSTM')
    f1_ax.plot(macro_f1_callback.train_f1s, label='Training Macro-F1')
    f1_ax.plot(macro_f1_callback.val_f1s, label='Validation Macro-F1')
    f1_ax.set_xlabel('Epochs')
    f1_ax.set_ylabel('Macro-F1 Score')
    f1_ax.legend()

    plt.show()


In [None]:
# Train the LSTM tagger on the word2vec-initialised inputs; saves the model and shows its curves.
trainLSTM('word2vec')

In [None]:
# Train the LSTM tagger on the GloVe-initialised inputs; saves the model and shows its curves.
trainLSTM('glove')

In [None]:
# Train the LSTM tagger on the BERT-initialised inputs; saves the model and shows its curves.
trainLSTM('bert')

## GRU

In [None]:
def trainGRU(embedding, epochs=50, batch_size=128, units=64):
    """Train, save and plot a GRU sequence tagger for one embedding type.

    The model is Embedding (initialised from the prepared embedding matrix and
    left trainable) -> GRU -> per-timestep softmax over the tag classes.
    It is saved to ``Models/t2_gru_<embedding>.h5`` and the loss / macro-F1
    curves are shown.

    Args:
        embedding: key into the module-level ``data`` dict
            ('word2vec', 'glove' or 'bert').
        epochs: number of training epochs.
        batch_size: mini-batch size passed to ``fit``.
        units: number of recurrent units in the GRU layer.
    """
    prepared = data[embedding]
    X_train, Y_train = prepared['X_train'], prepared['Y_train']
    X_val, Y_val = prepared['X_val'], prepared['Y_val']

    gru_model = Sequential()
    gru_model.add(Embedding(input_dim=prepared['vocab_size'],
                            output_dim=prepared['embedding_dim'],
                            input_length=prepared['max_sequence_length'],
                            weights=[prepared['embedding_matrix']],
                            trainable=True))
    # return_sequences=True: one output per token, as needed for tagging.
    gru_model.add(GRU(units, return_sequences=True))
    gru_model.add(TimeDistributed(Dense(prepared['num_classes'], activation='softmax')))

    gru_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

    macro_f1_callback = MacroF1ScoreCallback(train_data=(X_train, Y_train), val_data=(X_val, Y_val))

    history = gru_model.fit(X_train, Y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            validation_data=(X_val, Y_val),
                            callbacks=[macro_f1_callback])

    gru_model.save(f'Models/t2_gru_{embedding}.h5')

    _, (loss_ax, f1_ax) = plt.subplots(1, 2, figsize=(12, 5))

    loss_ax.set_title(f'Loss - {embedding} GRU')
    loss_ax.plot(history.history['loss'], label='Training Loss')
    loss_ax.plot(history.history['val_loss'], label='Validation Loss')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    f1_ax.set_title(f'Macro F1 - {embedding} GRU')
    f1_ax.plot(macro_f1_callback.train_f1s, label='Training Macro-F1')
    f1_ax.plot(macro_f1_callback.val_f1s, label='Validation Macro-F1')
    f1_ax.set_xlabel('Epochs')
    f1_ax.set_ylabel('Macro-F1 Score')
    f1_ax.legend()

    plt.show()


In [None]:
# Train the GRU tagger on the word2vec-initialised inputs; saves the model and shows its curves.
trainGRU('word2vec')

In [None]:
# Train the GRU tagger on the GloVe-initialised inputs; saves the model and shows its curves.
trainGRU('glove')

In [None]:
# Train the GRU tagger on the BERT-initialised inputs; saves the model and shows its curves.
trainGRU('bert')

## EVALUATION

In [None]:
def get_aspect_terms(test_pred, X_test, index_word=None, aspect_tag_ids=None):
    """Collect, per sentence, the words whose predicted tag marks an aspect term.

    Args:
        test_pred: 2-D sequence of predicted tag ids (sentences x positions).
        X_test: 2-D sequence of word ids with the same layout as ``test_pred``.
        index_word: mapping from word id to word. Defaults to the module-level
            ``word_tokenizer.index_word``.
        aspect_tag_ids: iterable of tag ids that count as "part of an aspect
            term". Defaults to every id in the module-level
            ``tag_tokenizer.word_index`` whose tag is not the outside tag "O"
            (compared case-insensitively).

    Returns:
        A list with one list of words per sentence, in sentence order.

    Why the defaults look like this: Keras' Tokenizer numbers entries by
    descending frequency, so tag id 1 is simply the most common tag - in BIO
    data presumably "O" (TODO confirm on the dataset) - and must not be
    hard-coded as the aspect tag. Positions whose word id is not in
    ``index_word`` (notably the padding id 0) are skipped instead of raising
    ``KeyError``.
    """
    if index_word is None:
        index_word = word_tokenizer.index_word
    if aspect_tag_ids is None:
        aspect_tag_ids = [
            tag_id
            for tag, tag_id in tag_tokenizer.word_index.items()
            if tag.lower() != 'o'
        ]
    aspect_ids = set(aspect_tag_ids)

    aspect_terms = []
    for tag_row, word_row in zip(test_pred, X_test):
        sentence_terms = []
        for tag_id, word_id in zip(tag_row, word_row):
            if int(tag_id) not in aspect_ids:
                continue
            word = index_word.get(int(word_id))
            if word is not None:  # None -> padding / unknown id
                sentence_terms.append(word)
        aspect_terms.append(sentence_terms)
    return aspect_terms

In [None]:
def evaluate_model(model, X_test, Y_test):
    """Print token-level macro-F1 and accuracy for ``model`` and return its predicted aspect terms.

    Args:
        model: trained Keras model whose ``predict`` returns per-position class scores.
        X_test: padded word-id array.
        Y_test: one-hot encoded tag array matching ``X_test``.

    Returns:
        The result of ``get_aspect_terms`` for the predicted tags: one list of
        words per test sentence.

    Both metrics are computed only over positions whose true tag id is not 0.
    Id 0 is the padding index written by ``pad_sequences`` in prepareData();
    including it would let the (trivially predictable) padding dominate the
    accuracy and add an artificial extra class to the macro average.
    """
    Y_pred = np.argmax(model.predict(X_test), axis=-1)
    Y_true = np.argmax(Y_test, axis=-1)

    aspect_terms = get_aspect_terms(Y_pred, X_test)

    real_positions = Y_true != 0
    true_tags = Y_true[real_positions]
    pred_tags = Y_pred[real_positions]

    if true_tags.size == 0:
        # Nothing but padding: the metrics are undefined, so say so instead of failing.
        print('No non-padding tokens to evaluate.')
        return aspect_terms

    # Restrict the macro average to tags that actually occur in the gold data.
    macro_f1 = f1_score(true_tags, pred_tags, labels=np.unique(true_tags), average="macro")
    accuracy = accuracy_score(true_tags, pred_tags)

    print(f'Macro F1 Score: {macro_f1:.4f}')
    print(f'Accuracy: {accuracy*100:.2f}%')

    return aspect_terms

In [None]:
# Reload the nine saved taggers (3 architectures x 3 embedding types) from the
# HDF5 files written by trainRNN / trainLSTM / trainGRU.
rnn_word2vec = load_model('Models/t2_rnn_word2vec.h5')
rnn_glove = load_model('Models/t2_rnn_glove.h5')
rnn_bert = load_model('Models/t2_rnn_bert.h5')

lstm_word2vec = load_model('Models/t2_lstm_word2vec.h5')
lstm_glove = load_model('Models/t2_lstm_glove.h5')
lstm_bert = load_model('Models/t2_lstm_bert.h5')

gru_word2vec = load_model('Models/t2_gru_word2vec.h5')
gru_glove = load_model('Models/t2_gru_glove.h5')
gru_bert = load_model('Models/t2_gru_bert.h5')


# Evaluate every model on the test split prepared for its own embedding type.
# evaluate_model() prints macro-F1 and accuracy; its return value (the words
# extracted per test sentence) is kept in a <model>_aspect_terms variable.
print("RNN Word2Vec")
rnn_word2vec_aspect_terms = evaluate_model(rnn_word2vec, data['word2vec']['X_test'], data['word2vec']['Y_test'])

print()
print("RNN Glove")
rnn_glove_aspect_terms = evaluate_model(rnn_glove, data['glove']['X_test'], data['glove']['Y_test'])

print()
print("RNN bert")
rnn_bert_aspect_terms = evaluate_model(rnn_bert, data['bert']['X_test'], data['bert']['Y_test'])

print()
print("LSTM Word2Vec")
lstm_word2vec_aspect_terms = evaluate_model(lstm_word2vec, data['word2vec']['X_test'], data['word2vec']['Y_test'])

print()
print("LSTM Glove")
lstm_glove_aspect_terms = evaluate_model(lstm_glove, data['glove']['X_test'], data['glove']['Y_test'])

print()
print("LSTM bert")
lstm_bert_aspect_terms = evaluate_model(lstm_bert, data['bert']['X_test'], data['bert']['Y_test'])

print()
print("GRU Word2Vec")
gru_word2vec_aspect_terms = evaluate_model(gru_word2vec, data['word2vec']['X_test'], data['word2vec']['Y_test'])

print()
print("GRU Glove")
gru_glove_aspect_terms = evaluate_model(gru_glove, data['glove']['X_test'], data['glove']['Y_test'])

print()
print("GRU bert")
gru_bert_aspect_terms = evaluate_model(gru_bert, data['bert']['X_test'], data['bert']['Y_test'])