In [2]:
import os
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

# Set TensorFlow logging level (ERROR = 3, WARNING = 2, INFO = 1, ALL = 0)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # suppress most logs
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Bidirectional,  Layer, LSTM, Dense, Dropout, Input, Lambda, LayerNormalization, GlobalAveragePooling1D, MultiHeadAttention
from gensim.models import KeyedVectors, Word2Vec
from transformers import TFBertModel, BertTokenizer
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K


EMBEDDING_DIM = 300

df_train = pd.read_csv("C19_train.csv", encoding_errors="ignore")
df_test = pd.read_csv("C19_test.csv", encoding_errors="ignore")


df_train["Sentiment"] = df_train["Sentiment"].apply(
    lambda x: "Positive" if x in ["Positive", "Extremely Positive"] else "Negative"
)
df_test["Sentiment"] = df_test["Sentiment"].apply(
    lambda x: "Positive" if x in ["Positive", "Extremely Positive"] else "Negative"
)
EMBEDDING_DIM = 100
MAX_LEN = 100

df_train["Sentiment"] = df_train["Sentiment"].str.capitalize()
df_test["Sentiment"] = df_test["Sentiment"].str.capitalize()

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train["Sentiment"])
y_test = label_encoder.transform(df_test["Sentiment"])
num_classes = len(label_encoder.classes_)


2025-04-01 18:49:58.767005: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743547799.398772 3542429 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743547799.496517 3542429 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743547800.553080 3542429 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743547800.553161 3542429 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743547800.553169 3542429 computation_placer.cc:177] computation placer alr

In [29]:
def word_2_vec(with_grad: bool):
    print("Loading Word2Vec model...")
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_train["OriginalTweet"])

    X_train_seq = tokenizer.texts_to_sequences(df_train["OriginalTweet"])
    X_test_seq = tokenizer.texts_to_sequences(df_test["OriginalTweet"])

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    y_train_cat = to_categorical(y_train, num_classes)
    y_test_cat = to_categorical(y_test, num_classes)

    word_index = tokenizer.word_index

    w2v_model = KeyedVectors.load_word2vec_format(
        "GoogleNews-vectors-negative300.bin", binary=True
    )
    sentences = df_train["OriginalTweet"].apply(lambda x: x.split()).tolist()
    w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    EMBEDDING_DIM = 100
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

    for word, i in word_index.items():
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]
        model = Sequential()
    model.add(
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=with_grad
        )
    )
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    # Train
    history = model.fit(
        X_train_pad, y_train_cat,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        shuffle=True
    )
    model.summary()


    print(model.evaluate(X_test_pad, y_test_cat))


In [30]:
def glove(with_grad: bool):
    print("Loading GloVe...")
    GLOVE_PATH = "glove.6B.100d.txt"  
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_train["OriginalTweet"])

    X_train_seq = tokenizer.texts_to_sequences(df_train["OriginalTweet"])
    X_test_seq = tokenizer.texts_to_sequences(df_test["OriginalTweet"])

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    y_train_cat = to_categorical(y_train, num_classes)
    y_test_cat = to_categorical(y_test, num_classes)

    word_index = tokenizer.word_index
    embeddings_index = {}
    with open(GLOVE_PATH, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector



    model = Sequential()
    model.add(
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=with_grad
        )
    )
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    history = model.fit(
        X_train_pad, y_train_cat,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        shuffle=True
    )
    model.summary()

    print(model.evaluate(X_test_pad, y_test_cat))


In [9]:
class SelfAttention(tf.keras.layers.Layer):
    def __init__(self, num_heads, key_dim):
        super(SelfAttention, self).__init__()
        self.attn = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)

    def call(self, inputs):
        return self.attn(inputs, inputs)

def glove_bilstm_attention():
    print("Loading GloVe...")
    GLOVE_PATH = "glove.6B.100d.txt"  
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_train["OriginalTweet"])

    X_train_seq = tokenizer.texts_to_sequences(df_train["OriginalTweet"])
    X_test_seq = tokenizer.texts_to_sequences(df_test["OriginalTweet"])

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    y_train_cat = to_categorical(y_train, num_classes)
    y_test_cat = to_categorical(y_test, num_classes)

    word_index = tokenizer.word_index
    embeddings_index = {}
    with open(GLOVE_PATH, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector


    model = Sequential()
    model.add(
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=False
        )
    )
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(SelfAttention(num_heads=3, key_dim=64)) 
    model.add(LayerNormalization())
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    history = model.fit(
        X_train_pad, y_train_cat,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        shuffle=True

    )
    model.summary()

    print(model.evaluate(X_test_pad, y_test_cat))


In [32]:
def bert(with_grad=False): 
    label_encoder = LabelEncoder()
    df_train["label"] = label_encoder.fit_transform(df_train["Sentiment"])
    df_test["label"] = label_encoder.transform(df_test["Sentiment"])
    num_classes = len(label_encoder.classes_)
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    
    def tokenize_texts(texts, max_len=128):
        return tokenizer(
            list(texts),
            truncation=True,
            padding='max_length',
            max_length=max_len,
            return_tensors='tf'
        )
    
    train_encodings = tokenize_texts(df_train["OriginalTweet"])
    test_encodings = tokenize_texts(df_test["OriginalTweet"])
    
    y_train = tf.keras.utils.to_categorical(df_train["label"], num_classes)
    y_test = tf.keras.utils.to_categorical(df_test["label"], num_classes)
    
    bert_model = TFBertModel.from_pretrained("bert-base-cased")
    bert_model.trainable = with_grad
    input_ids = Input(shape=(128,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(128,), dtype=tf.int32, name="attention_mask")


    bert_output = tf.keras.layers.Lambda(
        lambda inputs: bert_model(
            input_ids=inputs[0],
            attention_mask=inputs[1],
        ).last_hidden_state,
        output_shape=(128, 768)  # (sequence_length, hidden_size)
        )([input_ids, attention_mask])

    

    cls_token = bert_output[:, 0, :]
    
    x = Dropout(0.5)(cls_token)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.5)(x)
    output = Dense(num_classes, activation="softmax")(x)

    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    
    model.fit(
        {
            "input_ids": train_encodings["input_ids"],
            "attention_mask": train_encodings["attention_mask"]
        },
        y_train,
        validation_data=(
            {
                "input_ids": test_encodings["input_ids"],
                "attention_mask": test_encodings["attention_mask"]
            },
            y_test
        ),
        epochs=5,
        batch_size=32
    )
    bert_model.summary()
    model.summary()
    print(model.evaluate(
        {
            "input_ids": test_encodings["input_ids"],
            "attention_mask": test_encodings["attention_mask"]
        },
        y_test
    ))



In [6]:
def elmo():
    df_train_small = df_train.sample(10000, random_state=42)  # or even 500
    df_test_small = df_test.sample(1000, random_state=42)

    df = pd.concat([df_train_small, df_test_small])
    label_encoder = LabelEncoder()
    df["SentimentEncoded"] = label_encoder.fit_transform(df["Sentiment"])
    
    train_df = df.iloc[:len(df_train_small)]
    test_df = df.iloc[len(df_train_small):]
    
    elmo = hub.load("https://tfhub.dev/google/elmo/2")
    
    def elmo_embed(sentences):
        return elmo.signatures["default"](tf.constant(sentences))["elmo"]
    
    def batched_elmo_embed(texts, batch_size=64, max_len=64):
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            emb = elmo_embed(batch).numpy()  # ✅ FIXED: no ['elmo']
            padded = np.zeros((len(batch), max_len, 1024))
            for j, seq in enumerate(emb):
                cut = min(len(seq), max_len)
                padded[j, :cut, :] = seq[:cut]
            embeddings.append(padded)
        return np.concatenate(embeddings, axis=0)

    X_train = batched_elmo_embed(train_df["OriginalTweet"].tolist())
    X_test = batched_elmo_embed(test_df["OriginalTweet"].tolist())

    
    y_train = train_df["SentimentEncoded"].values
    y_test = test_df["SentimentEncoded"].values
    
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(None, 1024)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(len(label_encoder.classes_), activation="softmax")
    ])
    
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    
    # Train the model
    model.fit(X_train, y_train, epochs=5, batch_size=4, validation_data=(X_test, y_test))
    model.summary()
    print(model.evaluate(X_test, y_test))



In [34]:
tf.config.set_soft_device_placement(True)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"GPUs detected: {len(gpus)}")
else:
    print("No GPUs detected.")


GPUs detected: 3


In [35]:
print("W2V NO GRAD + 2 BiLSTM")
word_2_vec(with_grad = False)

W2V NO GRAD + 2 BiLSTM
Loading Word2Vec model...
Epoch 1/5




[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 43ms/step - accuracy: 0.6210 - loss: 0.6519 - val_accuracy: 0.6958 - val_loss: 0.5819
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 42ms/step - accuracy: 0.7060 - loss: 0.5727 - val_accuracy: 0.7194 - val_loss: 0.5570
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 42ms/step - accuracy: 0.7335 - loss: 0.5423 - val_accuracy: 0.7332 - val_loss: 0.5291
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 42ms/step - accuracy: 0.7465 - loss: 0.5219 - val_accuracy: 0.7471 - val_loss: 0.5168
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 42ms/step - accuracy: 0.7631 - loss: 0.4970 - val_accuracy: 0.7561 - val_loss: 0.5022


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.7515 - loss: 0.5076
[0.506100594997406, 0.7548710107803345]


In [36]:
print("W2V WITH GRAD + 2 BiLSTM")
word_2_vec(with_grad = True)


W2V WITH GRAD + 2 BiLSTM
Loading Word2Vec model...
Epoch 1/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 45ms/step - accuracy: 0.6586 - loss: 0.6076 - val_accuracy: 0.8338 - val_loss: 0.3864
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 45ms/step - accuracy: 0.9276 - loss: 0.1995 - val_accuracy: 0.8355 - val_loss: 0.4085
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 45ms/step - accuracy: 0.9852 - loss: 0.0468 - val_accuracy: 0.8409 - val_loss: 0.5341
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 45ms/step - accuracy: 0.9947 - loss: 0.0175 - val_accuracy: 0.8105 - val_loss: 0.9397
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 45ms/step - accuracy: 0.9968 - loss: 0.0111 - val_accuracy: 0.8107 - val_loss: 1.1624


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.7935 - loss: 1.3521
[1.2515798807144165, 0.7993680834770203]


In [37]:
print("GLOVE WITH NO GRAD + 2 BiLSTM")
glove(with_grad = False)

GLOVE WITH NO GRAD + 2 BiLSTM
Loading GloVe...
Epoch 1/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 43ms/step - accuracy: 0.6390 - loss: 0.6301 - val_accuracy: 0.7561 - val_loss: 0.5202
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 43ms/step - accuracy: 0.8219 - loss: 0.4065 - val_accuracy: 0.8241 - val_loss: 0.3999
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 43ms/step - accuracy: 0.8428 - loss: 0.3657 - val_accuracy: 0.8290 - val_loss: 0.3837


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.8324 - loss: 0.3855
[0.3877251446247101, 0.8346498012542725]


In [38]:
print("GLOVE WITH GRAD + 2 BiLSTM")
glove(with_grad = True)

GLOVE WITH GRAD + 2 BiLSTM
Loading GloVe...
Epoch 1/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 45ms/step - accuracy: 0.6802 - loss: 0.5766 - val_accuracy: 0.8700 - val_loss: 0.3118
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 45ms/step - accuracy: 0.9922 - loss: 0.0255 - val_accuracy: 0.8717 - val_loss: 0.5431
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 45ms/step - accuracy: 0.9951 - loss: 0.0155 - val_accuracy: 0.8673 - val_loss: 0.8272


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.8685 - loss: 0.7704
[0.798363447189331, 0.8636124134063721]


In [39]:
print("BERT WITH GRAD + 2 BiLSTM")
bert(True)


BERT WITH GRAD + 2 BiLSTM


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/5


I0000 00:00:1743545174.256154 3540074 service.cc:152] XLA service 0x7fc135d036c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1743545174.256451 3540074 service.cc:160]   StreamExecutor device (0): Tesla P100-PCIE-12GB, Compute Capability 6.0
I0000 00:00:1743545174.256773 3540074 service.cc:160]   StreamExecutor device (1): Tesla P100-PCIE-12GB, Compute Capability 6.0
I0000 00:00:1743545174.257301 3540074 service.cc:160]   StreamExecutor device (2): Tesla P100-PCIE-12GB, Compute Capability 6.0


[1m   1/1287[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:26:58[0m 15s/step - accuracy: 0.5938 - loss: 0.8164

I0000 00:00:1743545180.408984 3540074 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 146ms/step - accuracy: 0.5693 - loss: 0.7001 - val_accuracy: 0.6535 - val_loss: 0.6374
Epoch 2/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 150ms/step - accuracy: 0.6223 - loss: 0.6505 - val_accuracy: 0.6559 - val_loss: 0.6352
Epoch 3/5
[1m 416/1287[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m2:01[0m 139ms/step - accuracy: 0.6309 - loss: 0.6422

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 152ms/step - accuracy: 0.6312 - loss: 0.6427 - val_accuracy: 0.6538 - val_loss: 0.6275
Epoch 4/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 152ms/step - accuracy: 0.6315 - loss: 0.6427 - val_accuracy: 0.6590 - val_loss: 0.6320
Epoch 5/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 152ms/step - accuracy: 0.6330 - loss: 0.6427 - val_accuracy: 0.6603 - val_loss: 0.6296
Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108310272 (413.17 MB)
Trainable params: 108310272 (413.17 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 138ms/step - accuracy: 0.6551 - loss: 0.6343
[0.6296168565750122, 0.660347580909729]


In [40]:
print("BERT WITHOUT GRAD + 2 BiLSTM")
bert(False)


BERT WITH GRAD + 2 BiLSTM


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 154ms/step - accuracy: 0.5621 - loss: 0.7091 - val_accuracy: 0.6493 - val_loss: 0.6422
Epoch 2/5
[1m1287/1287[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 152ms/step - accuracy: 0.6142 - loss: 0.6534 - val_accuracy: 0.6614 - val_loss: 0.6360
Epoch 3/5
[1m 466/1287[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m1:54[0m 139ms/step - accuracy: 0.6230 - loss: 0.6456

KeyboardInterrupt: 

In [7]:
print("ELMO WITHOUT GRAD + 2BiLSTM")
elmo() 

ELMO WITHOUT GRAD + 2BiLSTM
Epoch 1/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 31ms/step - accuracy: 0.6553 - loss: 0.6239 - val_accuracy: 0.7900 - val_loss: 0.4583
Epoch 2/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 30ms/step - accuracy: 0.8220 - loss: 0.4144 - val_accuracy: 0.8080 - val_loss: 0.4367
Epoch 3/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 30ms/step - accuracy: 0.8607 - loss: 0.3285 - val_accuracy: 0.8010 - val_loss: 0.4668
Epoch 4/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 30ms/step - accuracy: 0.9046 - loss: 0.2339 - val_accuracy: 0.7960 - val_loss: 0.4992
Epoch 5/5
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 30ms/step - accuracy: 0.9453 - loss: 0.1484 - val_accuracy: 0.7900 - val_loss: 0.6228


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8019 - loss: 0.6054
[0.6228476166725159, 0.7900000214576721]


In [10]:
print("GLOVE WITH BILSTM and MULTIHEAD ATTENTION")
glove_bilstm_attention()



GLOVE WITH BILSTM and MULTIHEAD ATTENTION
Loading GloVe...




Epoch 1/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 32ms/step - accuracy: 0.6551 - loss: 0.6258 - val_accuracy: 0.7906 - val_loss: 0.4510
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 32ms/step - accuracy: 0.8107 - loss: 0.4338 - val_accuracy: 0.8491 - val_loss: 0.3515
Epoch 3/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 32ms/step - accuracy: 0.8472 - loss: 0.3651 - val_accuracy: 0.8588 - val_loss: 0.3234
Epoch 4/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 31ms/step - accuracy: 0.8766 - loss: 0.3162 - val_accuracy: 0.8749 - val_loss: 0.3019
Epoch 5/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 31ms/step - accuracy: 0.8937 - loss: 0.2777 - val_accuracy: 0.8814 - val_loss: 0.2964


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.8670 - loss: 0.3254
[0.3254137933254242, 0.8723012208938599]
