In [2]:
import os
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

# Set TensorFlow logging level (ERROR = 3, WARNING = 2, INFO = 1, ALL = 0)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # suppress most logs
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Bidirectional,  Layer, LSTM, Dense, Dropout, Input, Lambda, LayerNormalization, GlobalAveragePooling1D, MultiHeadAttention
from gensim.models import KeyedVectors, Word2Vec
from transformers import TFBertModel, BertTokenizer
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K


EMBEDDING_DIM = 300

df_train = pd.read_csv("C19_train.csv", encoding_errors="ignore")
df_test = pd.read_csv("C19_test.csv", encoding_errors="ignore")



df_train["Sentiment"] = df_train["Sentiment"].apply(
    lambda x: "Positive" if x in ["Positive", "Extremely Positive"] else "Negative"
)
df_test["Sentiment"] = df_test["Sentiment"].apply(
    lambda x: "Positive" if x in ["Positive", "Extremely Positive"] else "Negative"
)
EMBEDDING_DIM = 100
MAX_LEN = 100

df_train["Sentiment"] = df_train["Sentiment"].str.capitalize()
df_test["Sentiment"] = df_test["Sentiment"].str.capitalize()

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train["Sentiment"])
y_test = label_encoder.transform(df_test["Sentiment"])
num_classes = len(label_encoder.classes_)


2025-04-03 16:11:31.901678: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743711092.499450 1000557 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743711092.600174 1000557 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743711093.711292 1000557 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743711093.711366 1000557 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743711093.711373 1000557 computation_placer.cc:177] computation placer alr

In [3]:
def word_2_vec(with_grad: bool):
    print("Loading Word2Vec model...")
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_train["OriginalTweet"])

    X_train_seq = tokenizer.texts_to_sequences(df_train["OriginalTweet"])
    X_test_seq = tokenizer.texts_to_sequences(df_test["OriginalTweet"])

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    y_train_cat = to_categorical(y_train, num_classes)
    y_test_cat = to_categorical(y_test, num_classes)

    word_index = tokenizer.word_index

    w2v_model = KeyedVectors.load_word2vec_format(
        "GoogleNews-vectors-negative300.bin", binary=True
    )
    sentences = df_train["OriginalTweet"].apply(lambda x: x.split()).tolist()
    w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    EMBEDDING_DIM = 100
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

    for word, i in word_index.items():
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]
        model = Sequential()
    model.add(
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=with_grad
        )
    )
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    # Train
    history = model.fit(
        X_train_pad, y_train_cat,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        shuffle=True
    )
    model.summary()


    print(model.evaluate(X_test_pad, y_test_cat))


In [4]:
def glove(with_grad: bool):
    print("Loading GloVe...")
    GLOVE_PATH = "glove.6B.100d.txt"  
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_train["OriginalTweet"])

    X_train_seq = tokenizer.texts_to_sequences(df_train["OriginalTweet"])
    X_test_seq = tokenizer.texts_to_sequences(df_test["OriginalTweet"])

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    y_train_cat = to_categorical(y_train, num_classes)
    y_test_cat = to_categorical(y_test, num_classes)

    word_index = tokenizer.word_index
    embeddings_index = {}
    with open(GLOVE_PATH, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector



    model = Sequential()
    model.add(
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=with_grad
        )
    )
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    history = model.fit(
        X_train_pad, y_train_cat,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        shuffle=True
    )
    model.summary()

    print(model.evaluate(X_test_pad, y_test_cat))


In [6]:
class SelfAttention(tf.keras.layers.Layer):
    def __init__(self, num_heads, key_dim):
        super(SelfAttention, self).__init__()
        self.attn = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)

    def call(self, inputs):
        return self.attn(inputs, inputs)

def glove_bilstm_attention():
    print("Loading GloVe...")
    GLOVE_PATH = "glove.6B.100d.txt"  
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_train["OriginalTweet"])

    X_train_seq = tokenizer.texts_to_sequences(df_train["OriginalTweet"])
    X_test_seq = tokenizer.texts_to_sequences(df_test["OriginalTweet"])

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    y_train_cat = to_categorical(y_train, num_classes)
    y_test_cat = to_categorical(y_test, num_classes)

    word_index = tokenizer.word_index
    embeddings_index = {}
    with open(GLOVE_PATH, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector


    model = Sequential()
    model.add(
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=False
        )
    )
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(SelfAttention(num_heads=3, key_dim=64)) 
    model.add(LayerNormalization())
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    history = model.fit(
        X_train_pad, y_train_cat,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        shuffle=True

    )
    model.summary()

    print(model.evaluate(X_test_pad, y_test_cat))


In [15]:
def bert(with_grad=False): 
    label_encoder = LabelEncoder()
    df_train["label"] = label_encoder.fit_transform(df_train["Sentiment"])
    df_test["label"] = label_encoder.transform(df_test["Sentiment"])
    num_classes = len(label_encoder.classes_)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    
    def tokenize_texts(texts, max_len=128):
        return tokenizer(
            list(texts),
            truncation=True,
            padding='max_length',
            max_length=max_len,
            return_tensors='tf'
        )
    
    train_encodings = tokenize_texts(df_train["OriginalTweet"])
    test_encodings = tokenize_texts(df_test["OriginalTweet"])
    
    y_train = tf.keras.utils.to_categorical(df_train["label"], num_classes)
    y_test = tf.keras.utils.to_categorical(df_test["label"], num_classes)
    
    bert_model = TFBertModel.from_pretrained("bert-base-uncased")
    bert_model.trainable = with_grad
    input_ids = Input(shape=(128,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(128,), dtype=tf.int32, name="attention_mask")


    bert_output = tf.keras.layers.Lambda(
        lambda inputs: bert_model(
            input_ids=inputs[0],
            attention_mask=inputs[1],
        ).last_hidden_state,
        output_shape=(128, 768)  # (sequence_length, hidden_size)
        )([input_ids, attention_mask])

    

    cls_token = bert_output[:, 0, :]
    
    x = Dropout(0.5)(cls_token)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.5)(x)
    output = Dense(num_classes, activation="softmax")(x)

    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    
    model.fit(
        {
            "input_ids": train_encodings["input_ids"],
            "attention_mask": train_encodings["attention_mask"]
        },
        y_train,
        validation_data=(
            {
                "input_ids": test_encodings["input_ids"],
                "attention_mask": test_encodings["attention_mask"]
            },
            y_test
        ),
        epochs=5,
        batch_size=32
    )
    bert_model.summary()
    model.summary()
    print(model.evaluate(
        {
            "input_ids": test_encodings["input_ids"],
            "attention_mask": test_encodings["attention_mask"]
        },
        y_test
    ))



In [3]:
def elmo():
    df_train_small = df_train.sample(10000, random_state=42)  # or even 500
    df_test_small = df_test.sample(1000, random_state=42)

    df = pd.concat([df_train_small, df_test_small])
    label_encoder = LabelEncoder()
    df["SentimentEncoded"] = label_encoder.fit_transform(df["Sentiment"])
    
    train_df = df.iloc[:len(df_train_small)]
    test_df = df.iloc[len(df_train_small):]
    
    elmo = hub.load("https://tfhub.dev/google/elmo/2")
    
    def elmo_embed(sentences):
        return elmo.signatures["default"](tf.constant(sentences))["elmo"]
    
    def batched_elmo_embed(texts, batch_size=64, max_len=64):
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            emb = elmo_embed(batch).numpy()  # ✅ FIXED: no ['elmo']
            padded = np.zeros((len(batch), max_len, 1024))
            for j, seq in enumerate(emb):
                cut = min(len(seq), max_len)
                padded[j, :cut, :] = seq[:cut]
            embeddings.append(padded)
        return np.concatenate(embeddings, axis=0)

    X_train = batched_elmo_embed(train_df["OriginalTweet"].tolist())
    X_test = batched_elmo_embed(test_df["OriginalTweet"].tolist())

    
    y_train = train_df["SentimentEncoded"].values
    y_test = test_df["SentimentEncoded"].values
    
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(None, 1024)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(len(label_encoder.classes_), activation="softmax")
    ])
    
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    
    # Train the model
    model.fit(X_train, y_train, epochs=5, batch_size=4, validation_data=(X_test, y_test))
    model.summary()
    print(model.evaluate(X_test, y_test))



In [8]:
tf.config.set_soft_device_placement(True)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"GPUs detected: {len(gpus)}")
else:
    print("No GPUs detected.")


GPUs detected: 3


In [9]:
print("W2V NO GRAD + 2 BiLSTM")
word_2_vec(with_grad = False)

W2V NO GRAD + 2 BiLSTM
Loading Word2Vec model...


I0000 00:00:1743707194.600510  998073 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1743707194.602648  998073 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9796 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:81:00.0, compute capability: 7.5
I0000 00:00:1743707194.606406  998073 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 1
I0000 00:00:1743707194.606702  998073 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 11430 MB memory:  -> device: 1, name: Tesla P100-PCIE-12GB, pci bus id: 0000:02:00.0, compute capability: 6.0
I0000 00:00:1743707194.607025  998073 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 2
I0000 00:00:1743707194.607256  998073 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 11430 MB memory:  -> device: 2, name: Tesla P100-PCIE-12GB, pci bus id: 0000:03

Epoch 1/5


I0000 00:00:1743707200.822706  998505 cuda_dnn.cc:529] Loaded cuDNN version 90600


[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 42ms/step - accuracy: 0.6191 - loss: 0.6526 - val_accuracy: 0.6849 - val_loss: 0.5780
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 42ms/step - accuracy: 0.7114 - loss: 0.5657 - val_accuracy: 0.7206 - val_loss: 0.5549
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 42ms/step - accuracy: 0.7373 - loss: 0.5372 - val_accuracy: 0.7362 - val_loss: 0.5307
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 42ms/step - accuracy: 0.7476 - loss: 0.5220 - val_accuracy: 0.7247 - val_loss: 0.5355
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 42ms/step - accuracy: 0.7555 - loss: 0.5041 - val_accuracy: 0.7432 - val_loss: 0.5112


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.7458 - loss: 0.5199
[0.5133284330368042, 0.7509215474128723]


In [10]:
print("W2V WITH GRAD + 2 BiLSTM")
word_2_vec(with_grad = True)


W2V WITH GRAD + 2 BiLSTM
Loading Word2Vec model...
Epoch 1/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 44ms/step - accuracy: 0.6483 - loss: 0.6116 - val_accuracy: 0.8379 - val_loss: 0.3736
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 43ms/step - accuracy: 0.9292 - loss: 0.1985 - val_accuracy: 0.7964 - val_loss: 0.4925
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 43ms/step - accuracy: 0.9858 - loss: 0.0448 - val_accuracy: 0.8328 - val_loss: 0.5835
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 43ms/step - accuracy: 0.9943 - loss: 0.0190 - val_accuracy: 0.8207 - val_loss: 0.6833
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 43ms/step - accuracy: 0.9975 - loss: 0.0095 - val_accuracy: 0.8358 - val_loss: 0.7693


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.8341 - loss: 0.7938
[0.7889688611030579, 0.8335966467857361]


In [11]:
print("GLOVE WITH NO GRAD + 2 BiLSTM")
glove(with_grad = False)

GLOVE WITH NO GRAD + 2 BiLSTM
Loading GloVe...
Epoch 1/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 43ms/step - accuracy: 0.6516 - loss: 0.6170 - val_accuracy: 0.7583 - val_loss: 0.5074
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 42ms/step - accuracy: 0.7711 - loss: 0.4955 - val_accuracy: 0.7962 - val_loss: 0.4487
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 42ms/step - accuracy: 0.7965 - loss: 0.4482 - val_accuracy: 0.8200 - val_loss: 0.4142
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 42ms/step - accuracy: 0.8307 - loss: 0.3986 - val_accuracy: 0.8358 - val_loss: 0.3880
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 42ms/step - accuracy: 0.8508 - loss: 0.3555 - val_accuracy: 0.8486 - val_loss: 0.3592


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.8496 - loss: 0.3642
[0.3686657249927521, 0.8483412265777588]


In [12]:
print("GLOVE WITH GRAD + 2 BiLSTM")
glove(with_grad = True)

GLOVE WITH GRAD + 2 BiLSTM
Loading GloVe...
Epoch 1/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 44ms/step - accuracy: 0.6716 - loss: 0.5874 - val_accuracy: 0.8642 - val_loss: 0.3157
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 43ms/step - accuracy: 0.9221 - loss: 0.2138 - val_accuracy: 0.8618 - val_loss: 0.3457
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 43ms/step - accuracy: 0.9811 - loss: 0.0562 - val_accuracy: 0.8681 - val_loss: 0.4717
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 43ms/step - accuracy: 0.9921 - loss: 0.0251 - val_accuracy: 0.8812 - val_loss: 0.5177
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 43ms/step - accuracy: 0.9962 - loss: 0.0123 - val_accuracy: 0.8618 - val_loss: 0.7110


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.8772 - loss: 0.6604
[0.6552151441574097, 0.8741443157196045]


In [16]:
print("BERT WITH GRAD + 2 BiLSTM")
bert(True)


BERT WITH GRAD + 2 BiLSTM


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 107ms/step - accuracy: 0.5977 - loss: 0.6799 - val_accuracy: 0.6482 - val_loss: 0.6387
Epoch 2/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 99ms/step - accuracy: 0.6417 - loss: 0.6360 - val_accuracy: 0.6656 - val_loss: 0.6178
Epoch 3/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 99ms/step - accuracy: 0.6524 - loss: 0.6264 - val_accuracy: 0.6783 - val_loss: 0.6128
Epoch 4/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 99ms/step - accuracy: 0.6584 - loss: 0.6218 - val_accuracy: 0.6788 - val_loss: 0.6156
Epoch 5/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 99ms/step - accuracy: 0.6540 - loss: 0.6243 - val_accuracy: 0.6759 - val_loss: 0.6136
Model: "tf_bert_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBer

[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 90ms/step - accuracy: 0.6665 - loss: 0.6201
[0.6136287450790405, 0.6758820414543152]


In [17]:
print("BERT WITHOUT GRAD + 2 BiLSTM")
bert(False)


BERT WITHOUT GRAD + 2 BiLSTM


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 107ms/step - accuracy: 0.5902 - loss: 0.6886 - val_accuracy: 0.6651 - val_loss: 0.6199
Epoch 2/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 99ms/step - accuracy: 0.6486 - loss: 0.6307 - val_accuracy: 0.6725 - val_loss: 0.6286
Epoch 3/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 99ms/step - accuracy: 0.6518 - loss: 0.6237 - val_accuracy: 0.6864 - val_loss: 0.6111
Epoch 4/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 99ms/step - accuracy: 0.6556 - loss: 0.6208 - val_accuracy: 0.6780 - val_loss: 0.6159
Epoch 5/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 99ms/step - accuracy: 0.6568 - loss: 0.6200 - val_accuracy: 0.6806 - val_loss: 0.6127
Model: "tf_bert_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBer

[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 90ms/step - accuracy: 0.6672 - loss: 0.6196
[0.6126747727394104, 0.6806213855743408]


In [4]:
print("ELMO WITHOUT GRAD + 2BiLSTM")
elmo() 

ELMO WITHOUT GRAD + 2BiLSTM


I0000 00:00:1743711132.314608 1000557 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1743711132.317160 1000557 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9796 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:81:00.0, compute capability: 7.5
I0000 00:00:1743711132.320989 1000557 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 1
I0000 00:00:1743711132.321230 1000557 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 11430 MB memory:  -> device: 1, name: Tesla P100-PCIE-12GB, pci bus id: 0000:02:00.0, compute capability: 6.0
I0000 00:00:1743711132.321504 1000557 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 2
I0000 00:00:1743711132.321730 1000557 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 11430 MB memory:  -> device: 2, name: Tesla P100-PCIE-12GB, pci bus id: 0000:03

Epoch 1/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 34ms/step - accuracy: 0.6556 - loss: 0.6184 - val_accuracy: 0.7800 - val_loss: 0.4990
Epoch 2/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 33ms/step - accuracy: 0.8001 - loss: 0.4378 - val_accuracy: 0.7870 - val_loss: 0.4576
Epoch 3/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 32ms/step - accuracy: 0.8666 - loss: 0.3257 - val_accuracy: 0.7930 - val_loss: 0.4491
Epoch 4/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 33ms/step - accuracy: 0.9002 - loss: 0.2404 - val_accuracy: 0.7730 - val_loss: 0.4872
Epoch 5/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 33ms/step - accuracy: 0.9415 - loss: 0.1525 - val_accuracy: 0.7650 - val_loss: 0.6435


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7733 - loss: 0.6095
[0.6435228586196899, 0.7649999856948853]


In [8]:
print("GLOVE WITH BILSTM and MULTIHEAD ATTENTION")
glove_bilstm_attention()



GLOVE WITH BILSTM and MULTIHEAD ATTENTION
Loading GloVe...
Epoch 1/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 32ms/step - accuracy: 0.6514 - loss: 0.6280 - val_accuracy: 0.7891 - val_loss: 0.4600
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 31ms/step - accuracy: 0.8058 - loss: 0.4474 - val_accuracy: 0.8372 - val_loss: 0.3642
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 32ms/step - accuracy: 0.8542 - loss: 0.3601 - val_accuracy: 0.8669 - val_loss: 0.3335
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 31ms/step - accuracy: 0.8737 - loss: 0.3146 - val_accuracy: 0.8807 - val_loss: 0.3022
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 31ms/step - accuracy: 0.8937 - loss: 0.2747 - val_accuracy: 0.8865 - val_loss: 0.2830


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8675 - loss: 0.3155
[0.3029524087905884, 0.8746708631515503]
