In [7]:
import os
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

# Set TensorFlow logging level (ERROR = 3, WARNING = 2, INFO = 1, ALL = 0)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # suppress most logs
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Bidirectional,  Layer, LSTM, Dense, Dropout, Input, Lambda, LayerNormalization, GlobalAveragePooling1D, MultiHeadAttention
from gensim.models import KeyedVectors, Word2Vec
from transformers import TFBertModel, BertTokenizer
import tensorflow_hub as hub
import pandas as pd
import numpy as np

EMBEDDING_DIM = 300

df_train = pd.read_csv("C19_train.csv", encoding_errors="ignore")
df_test = pd.read_csv("C19_test.csv", encoding_errors="ignore")


df_train["Sentiment"] = df_train["Sentiment"].apply(
    lambda x: "Positive" if x in ["Positive", "Extremely Positive"] else "Negative"
)
df_test["Sentiment"] = df_test["Sentiment"].apply(
    lambda x: "Positive" if x in ["Positive", "Extremely Positive"] else "Negative"
)
EMBEDDING_DIM = 100
MAX_LEN = 100

df_train["Sentiment"] = df_train["Sentiment"].str.capitalize()
df_test["Sentiment"] = df_test["Sentiment"].str.capitalize()

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train["Sentiment"])
y_test = label_encoder.transform(df_test["Sentiment"])
num_classes = len(label_encoder.classes_)


NameError: name 'F1Score' is not defined

In [2]:
def word_2_vec(with_grad: bool):
    print("Loading Word2Vec model...")
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_train["OriginalTweet"])

    X_train_seq = tokenizer.texts_to_sequences(df_train["OriginalTweet"])
    X_test_seq = tokenizer.texts_to_sequences(df_test["OriginalTweet"])

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    y_train_cat = to_categorical(y_train, num_classes)
    y_test_cat = to_categorical(y_test, num_classes)

    word_index = tokenizer.word_index

    w2v_model = KeyedVectors.load_word2vec_format(
        "GoogleNews-vectors-negative300.bin", binary=True
    )
    sentences = df_train["OriginalTweet"].apply(lambda x: x.split()).tolist()
    w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    EMBEDDING_DIM = 100
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

    for word, i in word_index.items():
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]
        model = Sequential()
    model.add(
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=with_grad
        )
    )
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    # Train
    history = model.fit(
        X_train_pad, y_train_cat,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        shuffle=True
    )
    model.summary()


    loss, acc = model.evaluate(X_test_pad, y_test_cat)
    print(f"Test Accuracy: {acc:.2f}")


In [3]:
def glove(with_grad: bool):
    print("Loading GloVe...")
    GLOVE_PATH = "glove.6B.100d.txt"  
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_train["OriginalTweet"])

    X_train_seq = tokenizer.texts_to_sequences(df_train["OriginalTweet"])
    X_test_seq = tokenizer.texts_to_sequences(df_test["OriginalTweet"])

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    y_train_cat = to_categorical(y_train, num_classes)
    y_test_cat = to_categorical(y_test, num_classes)

    word_index = tokenizer.word_index
    embeddings_index = {}
    with open(GLOVE_PATH, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector



    model = Sequential()
    model.add(
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=with_grad
        )
    )
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    history = model.fit(
        X_train_pad, y_train_cat,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        shuffle=True
    )
    model.summary()

    loss, acc = model.evaluate(X_test_pad, y_test_cat)
    print(f"Test Accuracy: {acc:.2f}")


In [4]:
class SelfAttention(tf.keras.layers.Layer):
    def __init__(self, num_heads, key_dim):
        super(SelfAttention, self).__init__()
        self.attn = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)

    def call(self, inputs):
        return self.attn(inputs, inputs)

def glove_bilstm_attention():
    print("Loading GloVe...")
    GLOVE_PATH = "glove.6B.100d.txt"  
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_train["OriginalTweet"])

    X_train_seq = tokenizer.texts_to_sequences(df_train["OriginalTweet"])
    X_test_seq = tokenizer.texts_to_sequences(df_test["OriginalTweet"])

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    y_train_cat = to_categorical(y_train, num_classes)
    y_test_cat = to_categorical(y_test, num_classes)

    word_index = tokenizer.word_index
    embeddings_index = {}
    with open(GLOVE_PATH, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector


    model = Sequential()
    model.add(
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=False
        )
    )
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(SelfAttention(num_heads=3, key_dim=64)) 
    model.add(LayerNormalization())
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    history = model.fit(
        X_train_pad, y_train_cat,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        shuffle=True

    )
    model.summary()

    loss, acc = model.evaluate(X_test_pad, y_test_cat)
    print(f"Test Accuracy: {acc:.2f}")


In [5]:
def bert(with_grad=False): 
    label_encoder = LabelEncoder()
    df_train["label"] = label_encoder.fit_transform(df_train["Sentiment"])
    df_test["label"] = label_encoder.transform(df_test["Sentiment"])
    num_classes = len(label_encoder.classes_)
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    
    def tokenize_texts(texts, max_len=128):
        return tokenizer(
            list(texts),
            truncation=True,
            padding='max_length',
            max_length=max_len,
            return_tensors='tf'
        )
    
    train_encodings = tokenize_texts(df_train["OriginalTweet"])
    test_encodings = tokenize_texts(df_test["OriginalTweet"])
    
    y_train = tf.keras.utils.to_categorical(df_train["label"], num_classes)
    y_test = tf.keras.utils.to_categorical(df_test["label"], num_classes)
    
    bert_model = TFBertModel.from_pretrained("bert-base-cased")
    bert_model.trainable = with_grad
    input_ids = Input(shape=(128,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(128,), dtype=tf.int32, name="attention_mask")


    bert_output = tf.keras.layers.Lambda(
        lambda inputs: bert_model(
            input_ids=inputs[0],
            attention_mask=inputs[1],
        ).last_hidden_state,
        output_shape=(128, 768)  # (sequence_length, hidden_size)
        )([input_ids, attention_mask])

    

    cls_token = bert_output[:, 0, :]
    
    x = Dropout(0.5)(cls_token)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.5)(x)
    output = Dense(num_classes, activation="softmax")(x)

    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    
    model.fit(
        {
            "input_ids": train_encodings["input_ids"],
            "attention_mask": train_encodings["attention_mask"]
        },
        y_train,
        validation_data=(
            {
                "input_ids": test_encodings["input_ids"],
                "attention_mask": test_encodings["attention_mask"]
            },
            y_test
        ),
        epochs=5,
        batch_size=32
    )
    bert_model.summary()
    model.summary()
    return model


In [9]:
def elmo():
    df_train_small = df_train.sample(10000, random_state=42)  # or even 500
    df_test_small = df_test.sample(1000, random_state=42)

    df = pd.concat([df_train_small, df_test_small])
    label_encoder = LabelEncoder()
    df["SentimentEncoded"] = label_encoder.fit_transform(df["Sentiment"])
    
    train_df = df.iloc[:len(df_train_small)]
    test_df = df.iloc[len(df_train_small):]
    
    elmo = hub.load("https://tfhub.dev/google/elmo/2")
    
    def elmo_embed(sentences):
        return elmo.signatures["default"](tf.constant(sentences))["elmo"]
    
    def batched_elmo_embed(texts, batch_size=64, max_len=64):
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            emb = elmo_embed(batch).numpy()  # ✅ FIXED: no ['elmo']
            padded = np.zeros((len(batch), max_len, 1024))
            for j, seq in enumerate(emb):
                cut = min(len(seq), max_len)
                padded[j, :cut, :] = seq[:cut]
            embeddings.append(padded)
        return np.concatenate(embeddings, axis=0)

    X_train = batched_elmo_embed(train_df["OriginalTweet"].tolist())
    X_test = batched_elmo_embed(test_df["OriginalTweet"].tolist())

    
    y_train = train_df["SentimentEncoded"].values
    y_test = test_df["SentimentEncoded"].values
    
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(None, 1024)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(len(label_encoder.classes_), activation="softmax")
    ])
    
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    
    # Train the model
    model.fit(X_train, y_train, epochs=5, batch_size=4, validation_data=(X_test, y_test))
    model.summary()


In [7]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"GPUs detected: {len(gpus)}")
else:
    print("No GPUs detected.")


GPUs detected: 3


In [8]:
print("GLOVE WITH BILSTM and MULTIHEAD ATTENTION")
with tf.device('/GPU:0'):
    glove_bilstm_attention()



GLOVE WITH BILSTM and MULTIHEAD ATTENTION
Loading GloVe...


I0000 00:00:1743530286.893779 3623456 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1743530287.073746 3623456 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30506 MB memory:  -> device: 0, name: NVIDIA RTX 5000 Ada Generation, pci bus id: 0000:d8:00.0, compute capability: 8.9
I0000 00:00:1743530287.077546 3623456 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 1
I0000 00:00:1743530287.078107 3623456 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 9796 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:3b:00.0, compute capability: 7.5
I0000 00:00:1743530287.078627 3623456 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 2
I0000 00:00:1743530287.079074 3623456 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 9796 MB memory:  -> device: 2, name: NVIDIA GeForce RTX 2080 Ti, pci 

Epoch 1/5


I0000 00:00:1743530305.573225 3624436 cuda_dnn.cc:529] Loaded cuDNN version 90600


[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 27ms/step - accuracy: 0.6566 - loss: 0.6237 - val_accuracy: 0.7908 - val_loss: 0.4538
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 27ms/step - accuracy: 0.8007 - loss: 0.4505 - val_accuracy: 0.8401 - val_loss: 0.3762
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 27ms/step - accuracy: 0.8424 - loss: 0.3829 - val_accuracy: 0.8627 - val_loss: 0.3396
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 28ms/step - accuracy: 0.8724 - loss: 0.3188 - val_accuracy: 0.8710 - val_loss: 0.3204
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 28ms/step - accuracy: 0.8861 - loss: 0.2896 - val_accuracy: 0.8805 - val_loss: 0.2961


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.8671 - loss: 0.3137
Test Accuracy: 0.87


In [9]:
print("W2V NO GRAD + 2 BiLSTM")
word_2_vec(with_grad = False)

W2V NO GRAD + 2 BiLSTM
Loading Word2Vec model...
Epoch 1/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 37ms/step - accuracy: 0.6076 - loss: 0.6580 - val_accuracy: 0.6936 - val_loss: 0.5894
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.7091 - loss: 0.5675 - val_accuracy: 0.7177 - val_loss: 0.5571
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.7338 - loss: 0.5375 - val_accuracy: 0.7252 - val_loss: 0.5411
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.7473 - loss: 0.5176 - val_accuracy: 0.7374 - val_loss: 0.5228
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.7635 - loss: 0.4948 - val_accuracy: 0.7563 - val_loss: 0.5075


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.7443 - loss: 0.5164
Test Accuracy: 0.75


In [10]:
print("W2V WITH GRAD + 2 BiLSTM")
word_2_vec(with_grad = True)


W2V WITH GRAD + 2 BiLSTM
Loading Word2Vec model...
Epoch 1/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 37ms/step - accuracy: 0.6578 - loss: 0.6108 - val_accuracy: 0.8513 - val_loss: 0.3433
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.9296 - loss: 0.1908 - val_accuracy: 0.8336 - val_loss: 0.4400
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.9859 - loss: 0.0440 - val_accuracy: 0.8394 - val_loss: 0.6083
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 36ms/step - accuracy: 0.9954 - loss: 0.0167 - val_accuracy: 0.8246 - val_loss: 0.9629
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.9983 - loss: 0.0082 - val_accuracy: 0.8472 - val_loss: 0.7459


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.8404 - loss: 0.8326
Test Accuracy: 0.84


In [11]:
print("GLOVE WITH NO GRAD + 2 BiLSTM")
glove(with_grad = False)

GLOVE WITH NO GRAD + 2 BiLSTM
Loading GloVe...
Epoch 1/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 37ms/step - accuracy: 0.6677 - loss: 0.6102 - val_accuracy: 0.7566 - val_loss: 0.5037
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 37ms/step - accuracy: 0.7666 - loss: 0.4969 - val_accuracy: 0.7539 - val_loss: 0.4870
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.8028 - loss: 0.4388 - val_accuracy: 0.8110 - val_loss: 0.4151
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.8299 - loss: 0.3936 - val_accuracy: 0.8311 - val_loss: 0.3831
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.8496 - loss: 0.3552 - val_accuracy: 0.8358 - val_loss: 0.3708


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.8331 - loss: 0.3798
Test Accuracy: 0.84


In [12]:
print("GLOVE WITH GRAD + 2 BiLSTM")
glove(with_grad = True)

GLOVE WITH GRAD + 2 BiLSTM
Loading GloVe...
Epoch 1/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 38ms/step - accuracy: 0.6815 - loss: 0.5800 - val_accuracy: 0.8598 - val_loss: 0.3280
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.9205 - loss: 0.2157 - val_accuracy: 0.8409 - val_loss: 0.3806
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.9797 - loss: 0.0603 - val_accuracy: 0.8793 - val_loss: 0.3634
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.9920 - loss: 0.0266 - val_accuracy: 0.8710 - val_loss: 0.5883
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 37ms/step - accuracy: 0.9964 - loss: 0.0141 - val_accuracy: 0.8790 - val_loss: 0.5369


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.8780 - loss: 0.5480
Test Accuracy: 0.87


In [13]:
print("BERT WITH GRAD + 2 BiLSTM")
bert(True)


BERT WITH GRAD + 2 BiLSTM


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/5


I0000 00:00:1743531506.459741 3624438 service.cc:152] XLA service 0x233786c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1743531506.459773 3624438 service.cc:160]   StreamExecutor device (0): NVIDIA RTX 5000 Ada Generation, Compute Capability 8.9
I0000 00:00:1743531506.459780 3624438 service.cc:160]   StreamExecutor device (1): NVIDIA GeForce RTX 2080 Ti, Compute Capability 7.5
I0000 00:00:1743531506.459783 3624438 service.cc:160]   StreamExecutor device (2): NVIDIA GeForce RTX 2080 Ti, Compute Capability 7.5


[1m   7/1287[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m27s[0m 21ms/step - accuracy: 0.5128 - loss: 0.9577

I0000 00:00:1743531516.765368 3624438 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 40ms/step - accuracy: 0.5649 - loss: 0.7006 - val_accuracy: 0.6556 - val_loss: 0.6388
Epoch 2/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 23ms/step - accuracy: 0.6158 - loss: 0.6543 - val_accuracy: 0.6559 - val_loss: 0.6335
Epoch 3/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 23ms/step - accuracy: 0.6306 - loss: 0.6444 - val_accuracy: 0.6622 - val_loss: 0.6320
Epoch 4/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 23ms/step - accuracy: 0.6280 - loss: 0.6427 - val_accuracy: 0.6611 - val_loss: 0.6352
Epoch 5/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 23ms/step - accuracy: 0.6313 - loss: 0.6438 - val_accuracy: 0.6538 - val_loss: 0.6342
Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      m

<Functional name=functional_5, built=True>

In [10]:
print("ELMO WITHOUT GRAD + 2BiLSTM")
elmo() 

ELMO WITHOUT GRAD + 2BiLSTM
Epoch 1/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 28ms/step - accuracy: 0.6338 - loss: 0.6332 - val_accuracy: 0.7800 - val_loss: 0.4748
Epoch 2/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 26ms/step - accuracy: 0.8163 - loss: 0.4170 - val_accuracy: 0.7950 - val_loss: 0.4523
Epoch 3/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 26ms/step - accuracy: 0.8649 - loss: 0.3259 - val_accuracy: 0.7970 - val_loss: 0.5057
Epoch 4/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 25ms/step - accuracy: 0.9042 - loss: 0.2331 - val_accuracy: 0.7930 - val_loss: 0.4877
Epoch 5/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 26ms/step - accuracy: 0.9448 - loss: 0.1459 - val_accuracy: 0.7940 - val_loss: 0.6295
