# BERT Embeddings

Korišćenje *BERT embeddings*-a kao ulaz za klasifikaciju teksta uz pomoć neuronske mreže. 
Porediće se 4 modela:
* MLP sa Keras Embeddings
* GRU RNN sa Keras Embeddings
* MLP sa BERT Embeddings
* GRU RNN sa BERT Embeddings

In [None]:
import numpy as np

from tensorflow import keras
from keras.models import Sequential
from keras.datasets import imdb
from keras.layers import Dense, Embedding, GRU, Flatten, Input
from keras.preprocessing import sequence

import torch
from transformers import BertTokenizer, BertModel

from matplotlib import pyplot as plt
%matplotlib inline

# Fix the random seeds for repeatable runs. Seeding NumPy alone does not
# control the Keras RNG used for weight initialisation, so Keras is seeded too.
np.random.seed(1244)
keras.utils.set_random_seed(1244)

## Keras Embeddings

Učitavanje i priprema podataka za Keras Embeddings

In [None]:
# Settings used by the cells below: value passed as `num_words` when loading
# IMDB, the length every review is padded/truncated to, and the epoch count.
max_features = 1000
maxlen = 10
epochs = 10

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Bring both splits to the same fixed length.
x_train, x_test = (
    sequence.pad_sequences(reviews, maxlen=maxlen) for reviews in (x_train, x_test)
)

# Report the shape and the first example of every array.
for label, value in (
    ("Train data shape X: ", x_train.shape),
    ("Train data example X: ", x_train[0]),
    ("\nTrain data shape y: ", y_train.shape),
    ("Train data example y:", y_train[0]),
    ("\nTest data shape: ", x_test.shape),
    ("Test data example X: ", x_test[0]),
    ("\nTest data shape y: ", y_test.shape),
    ("Test data example y: ", y_test[0]),
):
    print(label, value)

### MLP

In [None]:
# MLP on top of a trainable Keras Embedding layer: the embedded sequence is
# flattened and passed through two hidden Dense layers to a sigmoid output.
model = Sequential(
    [
        Embedding(input_dim=max_features, output_dim=100),
        Flatten(),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# The test split doubles as the validation set; the returned history is kept
# for the comparison plots at the end.
res_mlp = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=epochs,
    batch_size=256,
)

model.summary()

### GRU

In [None]:
# GRU on top of a trainable Keras Embedding layer; only the final GRU output
# (return_sequences=False) feeds the sigmoid classifier.
model = Sequential(
    [
        Embedding(input_dim=max_features, output_dim=100),
        GRU(20, return_sequences=False),
        Dense(1, activation="sigmoid"),
    ]
)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# The test split doubles as the validation set; the returned history is kept
# for the comparison plots at the end.
res_gru = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=epochs,
    batch_size=256,
)

model.summary()

## BERT

Koristimo pretrenirani BERT model iz *transformers* biblioteke da generišemo *embeddings* za neuronske mreže.

#### Učitavanje modela

In [None]:
# One checkpoint name is used for both objects so that the tokenizer and the
# encoder are loaded from the same pretrained checkpoint.
bert_model_name = "bert-base-uncased"
bert_model = BertModel.from_pretrained(bert_model_name)
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

#### Pretprocesiranje podataka za BERT

In [None]:
def preprocess_data(x_data, tokenizer, maxlen):
    """Convert sequences into fixed-length tokenizer inputs for BERT.

    Every sequence in ``x_data`` is turned into one text by converting its
    items with ``str`` and joining them with single spaces. The texts are
    then passed to ``tokenizer``.

    Args:
        x_data: Iterable of sequences (one sequence per example).
        tokenizer: Callable tokenizer that accepts the keyword arguments used
            below and returns a mapping with ``input_ids`` and
            ``attention_mask``.
        maxlen: Value passed as ``max_length``; texts are truncated and
            padded to this length.

    Returns:
        Tuple ``(input_ids, attention_mask)``:
        input_ids - token IDs of the input text;
        attention_mask - 1 for real tokens, 0 for padding.
    """
    texts = [" ".join(str(item) for item in seq) for seq in x_data]
    encodings = tokenizer(
        texts,
        truncation=True,
        # Pad to `max_length` instead of to the longest text of this call, so
        # separate calls (train / test) always produce the same sequence
        # length and can feed the same fixed-shape network.
        padding="max_length",
        max_length=maxlen,
        return_tensors="pt",
    )
    return encodings["input_ids"], encodings["attention_mask"]

# x_train / x_test hold integer word indices, not text. Passing them straight
# to the tokenizer would make BERT embed the number strings, so the indices
# are first mapped back to words.
# imdb.load_data (with its defaults) reserves 0 = padding, 1 = start marker,
# 2 = out-of-vocabulary and shifts real word indices by 3 relative to
# imdb.get_word_index().
_index_to_word = {index + 3: word for word, index in imdb.get_word_index().items()}
_index_to_word[2] = tokenizer.unk_token  # keep the position of unknown words


def _decode_reviews(x_data):
    """Return each index sequence as a list of words; padding/start markers are dropped."""
    return [
        [_index_to_word[index] for index in seq if index in _index_to_word]
        for seq in x_data
    ]


x_train_ids, x_train_masks = preprocess_data(_decode_reviews(x_train), tokenizer, maxlen)
x_test_ids, x_test_masks = preprocess_data(_decode_reviews(x_test), tokenizer, maxlen)

#### Generisanje BERT Embeddings

In [None]:
def get_bert_embeddings(input_ids, attention_mask, batch_size=256):
    """Run the module-level ``bert_model`` and return its last hidden state.

    The rows are pushed through the model in chunks of ``batch_size`` so that
    a large dataset does not have to fit into a single forward pass. The
    per-chunk results are concatenated along the first dimension.

    Args:
        input_ids: Tensor of token IDs, one row per example.
        attention_mask: Tensor matching ``input_ids`` row for row.
        batch_size: Number of rows per forward pass. ``None`` (or a value not
            smaller than the number of rows) runs everything in one pass.

    Returns:
        ``last_hidden_state`` of the model output for all rows.

    Raises:
        ValueError: If ``batch_size`` is neither ``None`` nor positive.
    """
    if batch_size is not None and batch_size <= 0:
        raise ValueError("batch_size must be a positive integer or None")

    bert_model.eval()  # put the model into evaluation mode
    n_rows = input_ids.shape[0]

    with torch.no_grad():
        # Single pass: also covers empty input, where chunking would leave
        # nothing to concatenate.
        if batch_size is None or batch_size >= n_rows:
            outputs = bert_model(input_ids, attention_mask=attention_mask)
            return outputs.last_hidden_state

        chunks = []
        for start in range(0, n_rows, batch_size):
            stop = start + batch_size
            outputs = bert_model(
                input_ids[start:stop],
                attention_mask=attention_mask[start:stop],
            )
            chunks.append(outputs.last_hidden_state)

    return torch.cat(chunks, dim=0)

# Embed the train split first, then the test split.
train_embeddings, test_embeddings = (
    get_bert_embeddings(ids, masks)
    for ids, masks in ((x_train_ids, x_train_masks), (x_test_ids, x_test_masks))
)

# Report the shape and the first example of every array.
for label, value in (
    ("Train data shape X: ", train_embeddings.shape),
    ("Train data example X: ", train_embeddings[0]),
    ("\nTrain data shape y: ", y_train.shape),
    ("Train data example y:", y_train[0]),
    ("\nTest data shape: ", test_embeddings.shape),
    ("Test data example X: ", test_embeddings[0]),
    ("\nTest data shape y: ", y_test.shape),
    ("Test data example y: ", y_test[0]),
):
    print(label, value)

### MLP

Promena oblika podataka u oblik neophodan za ulaz u Keras MLP:

In [None]:
# Conversion from (batch_size, seq_len, hidden_dim) to (batch_size, hidden_dim)
# batch_size - number of inputs in the batch
# seq_len - number of tokens in the sequence
# hidden_dim - dimensionality of the BERT embedding - 768
def _masked_mean(embeddings, attention_mask):
    """Average token embeddings over dim 1, counting only positions whose mask is 1.

    A plain ``mean(dim=1)`` would also average in the vectors produced at
    padding positions.
    """
    weights = attention_mask.unsqueeze(-1).to(embeddings.dtype)
    token_counts = weights.sum(dim=1).clamp(min=1)  # guard against all-padding rows
    return (embeddings * weights).sum(dim=1) / token_counts


train_embeddings_mlp = _masked_mean(train_embeddings, x_train_masks).numpy()
test_embeddings_mlp = _masked_mean(test_embeddings, x_test_masks).numpy()

In [None]:
# MLP fed with the pooled BERT vectors: the input width is taken from the
# second dimension of the pooled training array.
model = Sequential(
    [
        Input(shape=(train_embeddings_mlp.shape[1],)),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# The test split doubles as the validation set; the returned history is kept
# for the comparison plots at the end.
res_mlp_bert = model.fit(
    train_embeddings_mlp,
    y_train,
    validation_data=(test_embeddings_mlp, y_test),
    epochs=epochs,
    batch_size=256,
)

model.summary()

### GRU

Prebacivanje podataka u *numpy* nizove za ulaz u GRU:

In [None]:
# The GRU consumes the full per-token embeddings, so both tensors are only
# converted to NumPy arrays here.
train_embeddings_gru, test_embeddings_gru = (
    tensor.numpy() for tensor in (train_embeddings, test_embeddings)
)

In [None]:
# GRU fed with per-token BERT embeddings: the input shape is every dimension
# of the training array after the first one.
model = Sequential(
    [
        Input(shape=train_embeddings_gru.shape[1:]),
        GRU(20, return_sequences=False),
        Dense(1, activation="sigmoid"),
    ]
)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# The test split doubles as the validation set; the returned history is kept
# for the comparison plots at the end.
res_gru_bert = model.fit(
    train_embeddings_gru,
    y_train,
    validation_data=(test_embeddings_gru, y_test),
    epochs=epochs,
    batch_size=256,
)

model.summary()

## Rezultati

### Loss

In [None]:
# One train curve and one validation curve per model, drawn in the order
# MLP, GRU, MLP BERT, GRU BERT.
x = range(epochs)
for result, train_label, val_label in (
    (res_mlp, "MLP train", "MLP val"),
    (res_gru, "GRU train", "GRU val"),
    (res_mlp_bert, "MLP BERT train", "MLP BERT val"),
    (res_gru_bert, "GRU BERT train", "GRU BERT val"),
):
    plt.plot(x, result.history["loss"], label=train_label)
    plt.plot(x, result.history["val_loss"], label=val_label)

plt.title("Loss")
# Anchor the legend outside the axes, to the right of the plot.
plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
plt.show()

### Accuracy

In [None]:
# One train curve and one validation curve per model, drawn in the order
# MLP, GRU, MLP BERT, GRU BERT.
x = range(epochs)
for result, train_label, val_label in (
    (res_mlp, "MLP train", "MLP val"),
    (res_gru, "GRU train", "GRU val"),
    (res_mlp_bert, "MLP BERT train", "MLP BERT val"),
    (res_gru_bert, "GRU BERT train", "GRU BERT val"),
):
    plt.plot(x, result.history["accuracy"], label=train_label)
    plt.plot(x, result.history["val_accuracy"], label=val_label)

plt.title("Accuracy")
# Anchor the legend outside the axes, to the right of the plot.
plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
plt.show()