In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Bidirectional,  Layer, LSTM, Dense, Dropout, Input, Lambda, LayerNormalization, GlobalAveragePooling1D, MultiHeadAttention
from gensim.models import KeyedVectors, Word2Vec
from transformers import TFBertModel, BertTokenizer
import tensorflow as tf
import pandas as pd
import numpy as np
import tensorflow_hub as hub

# --- Configuration -----------------------------------------------------------
# Width of every embedding vector used below (matches glove.6B.100d.txt).
EMBEDDING_DIM = 100
# Every tokenised tweet is padded/truncated to this many tokens.
MAX_LEN = 100
# Raw sentiment labels that are folded into the "Positive" class.
POSITIVE_LABELS = ("Positive", "Extremely Positive")


def _to_binary_sentiment(label):
    """Map a raw sentiment label to "Positive" or "Negative".

    Labels listed in POSITIVE_LABELS become "Positive"; every other value
    becomes "Negative".
    """
    return "Positive" if label in POSITIVE_LABELS else "Negative"


# --- Data --------------------------------------------------------------------
# encoding_errors="ignore" silently drops bytes that cannot be decoded.
df_train = pd.read_csv("C19_train.csv", encoding_errors="ignore")
df_test = pd.read_csv("C19_test.csv", encoding_errors="ignore")

# Collapse the original sentiment labels into a binary target.
df_train["Sentiment"] = df_train["Sentiment"].apply(_to_binary_sentiment)
df_test["Sentiment"] = df_test["Sentiment"].apply(_to_binary_sentiment)

# --- Labels ------------------------------------------------------------------
# The encoder is fitted on the training labels only and reused for the test set.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train["Sentiment"])
y_test = label_encoder.transform(df_test["Sentiment"])
num_classes = len(label_encoder.classes_)

In [11]:
def word_2_vec(with_grad: bool):
    """Train Word2Vec on the training tweets and fit a 2-layer BiLSTM classifier.

    The tweets are tokenised with a Keras ``Tokenizer``, a Word2Vec model is
    trained from scratch on those same tokens, and its vectors initialise the
    ``Embedding`` layer of the classifier. The model is trained for 5 epochs
    (10% of the training data held out for validation), summarised, and then
    evaluated on the test set; the test accuracy is printed.

    Args:
        with_grad: If True the embedding weights are fine-tuned during
            training; if False they stay frozen.
    """
    print("Loading Word2Vec model...")
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_train["OriginalTweet"])

    X_train_seq = tokenizer.texts_to_sequences(df_train["OriginalTweet"])
    X_test_seq = tokenizer.texts_to_sequences(df_test["OriginalTweet"])

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    y_train_cat = to_categorical(y_train, num_classes)
    y_test_cat = to_categorical(y_test, num_classes)

    word_index = tokenizer.word_index

    # Train Word2Vec on exactly the tokens the tokenizer produced, so that the
    # keys of `word_index` and the Word2Vec vocabulary agree. Splitting the raw
    # text on whitespace instead would yield differently normalised tokens and
    # leave their embedding rows at zero.
    index_word = tokenizer.index_word
    sentences = [[index_word[idx] for idx in seq] for seq in X_train_seq]
    # Embeddings are learned from the training tweets only; no pretrained
    # vector file is read.
    w2v_model = Word2Vec(
        sentences, vector_size=EMBEDDING_DIM, window=5, min_count=1, workers=4
    )

    # Row 0 is left as zeros: index 0 is what pad_sequences pads with.
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]

    # Built once, after the embedding matrix is complete.
    model = Sequential()
    model.add(
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=with_grad
        )
    )
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    # Train
    model.fit(
        X_train_pad, y_train_cat,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        shuffle=True
    )
    model.summary()

    loss, acc = model.evaluate(X_test_pad, y_test_cat)
    print(f"Test Accuracy: {acc:.2f}")


In [12]:
def glove(with_grad: bool):
    """Fit and evaluate a 2-layer BiLSTM classifier on GloVe embeddings.

    Tokenises the tweets, reads the GloVe vectors from ``glove.6B.100d.txt``,
    uses them to initialise the ``Embedding`` layer, trains for 5 epochs with a
    10% validation split, prints the model summary and the test accuracy.

    Args:
        with_grad: Whether the embedding weights are trainable.
    """
    print("Loading GloVe...")
    glove_file = "glove.6B.100d.txt"

    tweet_tokenizer = Tokenizer()
    tweet_tokenizer.fit_on_texts(df_train["OriginalTweet"])

    train_inputs = pad_sequences(
        tweet_tokenizer.texts_to_sequences(df_train["OriginalTweet"]), maxlen=MAX_LEN
    )
    test_inputs = pad_sequences(
        tweet_tokenizer.texts_to_sequences(df_test["OriginalTweet"]), maxlen=MAX_LEN
    )

    train_targets = to_categorical(y_train, num_classes)
    test_targets = to_categorical(y_test, num_classes)

    vocab = tweet_tokenizer.word_index
    vocab_size = len(vocab) + 1

    # Each line of the GloVe file is "<word> <coef> <coef> ...".
    glove_vectors = {}
    with open(glove_file, encoding="utf8") as handle:
        for raw_line in handle:
            fields = raw_line.split()
            glove_vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')

    # Words without a GloVe vector keep an all-zero row.
    embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
    for token, row in vocab.items():
        if token in glove_vectors:
            embedding_matrix[row] = glove_vectors[token]

    model = Sequential([
        Embedding(
            input_dim=vocab_size,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=with_grad,
        ),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(64)),
        Dropout(0.5),
        Dense(64, activation="relu"),
        Dropout(0.5),
        Dense(num_classes, activation="softmax"),
    ])
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    model.fit(
        train_inputs,
        train_targets,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        shuffle=True,
    )
    model.summary()

    _, acc = model.evaluate(test_inputs, test_targets)
    print(f"Test Accuracy: {acc:.2f}")


In [13]:
class SelfAttention(tf.keras.layers.Layer):
    """Multi-head self-attention: the input attends to itself.

    Thin wrapper around ``MultiHeadAttention`` that uses the same tensor as
    query and value, so it can be dropped into a ``Sequential`` model (which
    passes a single input to each layer).

    Args:
        num_heads: Number of attention heads.
        key_dim: Size of each attention head for query and key.
        **kwargs: Standard ``Layer`` keyword arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, num_heads, key_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can reproduce the constructor call.
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.attn = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)

    def call(self, inputs):
        """Return the attention output with `inputs` as both query and value."""
        return self.attn(inputs, inputs)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super().get_config()
        config.update({"num_heads": self.num_heads, "key_dim": self.key_dim})
        return config

def glove_bilstm_attention(with_grad: bool = False, num_heads: int = 3, key_dim: int = 64):
    """Fit and evaluate a GloVe + BiLSTM + multi-head self-attention classifier.

    Tokenises the tweets, initialises the ``Embedding`` layer from
    ``glove.6B.100d.txt``, then stacks a BiLSTM, self-attention, layer
    normalisation and average pooling before the dense classification head.
    Trains for 5 epochs with a 10% validation split, prints the model summary
    and the test accuracy.

    Args:
        with_grad: Whether the embedding weights are trainable. Defaults to
            False (frozen embeddings).
        num_heads: Number of attention heads. Defaults to 3.
        key_dim: Per-head query/key size. Defaults to 64.
    """
    print("Loading GloVe...")
    GLOVE_PATH = "glove.6B.100d.txt"
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_train["OriginalTweet"])

    X_train_seq = tokenizer.texts_to_sequences(df_train["OriginalTweet"])
    X_test_seq = tokenizer.texts_to_sequences(df_test["OriginalTweet"])

    X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

    y_train_cat = to_categorical(y_train, num_classes)
    y_test_cat = to_categorical(y_test, num_classes)

    word_index = tokenizer.word_index

    # Each line of the GloVe file is "<word> <coef> <coef> ...".
    embeddings_index = {}
    with open(GLOVE_PATH, encoding="utf8") as f:
        for line in f:
            values = line.split()
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    # Words without a GloVe vector keep an all-zero row.
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector

    model = Sequential()
    model.add(
        Embedding(
            input_dim=len(word_index) + 1,
            output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_LEN,
            trainable=with_grad
        )
    )
    # return_sequences=True keeps one vector per token for the attention layer.
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(SelfAttention(num_heads=num_heads, key_dim=key_dim))
    model.add(LayerNormalization())
    # Collapse the token axis into a single vector per tweet.
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    model.fit(
        X_train_pad, y_train_cat,
        validation_split=0.1,
        epochs=5,
        batch_size=32,
        shuffle=True
    )
    model.summary()

    loss, acc = model.evaluate(X_test_pad, y_test_cat)
    print(f"Test Accuracy: {acc:.2f}")


In [14]:
def bert(with_grad=False): #TODO: GPU
    """Train a dense classification head on top of BERT's [CLS] representation.

    Tweets are tokenised to a fixed length, passed through a pretrained BERT
    encoder, and the hidden state of the first token feeds a small dense
    classifier. The model is trained for 3 epochs, using the test set as
    validation data, and its summary is printed.

    Side effect: adds an integer ``label`` column to ``df_train`` and
    ``df_test``.

    Args:
        with_grad: Forwarded to BERT as its ``training`` flag.
            NOTE(review): this toggles BERT's training-mode behaviour inside
            the Lambda layer; whether BERT's own weights actually receive
            gradient updates when wrapped this way should be confirmed.

    Returns:
        The fitted Keras ``Model``.
    """
    # Tokenizer and encoder must come from the SAME checkpoint: the token ids
    # produced by one vocabulary are meaningless to a model trained on another.
    checkpoint = "bert-base-uncased"
    max_len = 128    # fixed sequence length for tokenisation and model inputs
    hidden_size = 768  # declared width of BERT's last hidden state

    label_encoder = LabelEncoder()
    df_train["label"] = label_encoder.fit_transform(df_train["Sentiment"])
    df_test["label"] = label_encoder.transform(df_test["Sentiment"])
    num_classes = len(label_encoder.classes_)
    tokenizer = BertTokenizer.from_pretrained(checkpoint)

    def tokenize_texts(texts, max_len=max_len):
        # Pad/truncate every text to exactly `max_len` tokens.
        return tokenizer(
            list(texts),
            truncation=True,
            padding='max_length',
            max_length=max_len,
            return_tensors='tf'
        )

    train_encodings = tokenize_texts(df_train["OriginalTweet"])
    test_encodings = tokenize_texts(df_test["OriginalTweet"])

    y_train = tf.keras.utils.to_categorical(df_train["label"], num_classes)
    y_test = tf.keras.utils.to_categorical(df_test["label"], num_classes)

    bert_model = TFBertModel.from_pretrained(checkpoint)
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    # BERT is invoked through a Lambda layer on the two symbolic inputs.
    bert_output = tf.keras.layers.Lambda(
        lambda inputs: bert_model(
            input_ids=inputs[0],
            attention_mask=inputs[1],
            training=with_grad
        ).last_hidden_state,
        output_shape=(max_len, hidden_size)  # (sequence_length, hidden_size)
    )([input_ids, attention_mask])

    # Position 0 of the sequence is the [CLS] token.
    cls_token = bert_output[:, 0, :]

    x = Dropout(0.5)(cls_token)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.5)(x)
    output = Dense(num_classes, activation="softmax")(x)

    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    model.fit(
        {
            "input_ids": train_encodings["input_ids"],
            "attention_mask": train_encodings["attention_mask"]
        },
        y_train,
        # The test set doubles as validation data here.
        validation_data=(
            {
                "input_ids": test_encodings["input_ids"],
                "attention_mask": test_encodings["attention_mask"]
            },
            y_test
        ),
        epochs=3,
        batch_size=32
    )
    model.summary()
    return model


In [None]:
def elmo(embed_batch_size: int = 32):
    """Embed tweets with ELMo and train a 2-layer BiLSTM classifier on them.

    The tweets are converted to per-token ELMo vectors up front (the ELMo
    module itself is not part of the trained model), then a BiLSTM classifier
    is trained for 5 epochs using the test set as validation data.

    Args:
        embed_batch_size: Number of tweets sent to the ELMo module per forward
            pass. Embedding in mini-batches keeps the module from having to
            process the whole corpus in a single call.

    Raises:
        ValueError: If ``embed_batch_size`` is smaller than 1.
    """
    if embed_batch_size < 1:
        raise ValueError("embed_batch_size must be >= 1")

    df = pd.concat([df_train, df_test])
    label_encoder = LabelEncoder()
    df["SentimentEncoded"] = label_encoder.fit_transform(df["Sentiment"])

    # Positional split: the first len(df_train) rows are the training set.
    train_df = df.iloc[:len(df_train)]
    test_df = df.iloc[len(df_train):]

    # Named `elmo_module` so it does not shadow this function's own name.
    elmo_module = hub.load("https://tfhub.dev/google/elmo/2")
    embed_fn = elmo_module.signatures["default"]

    def elmo_embed(sentences):
        """Return a (n_sentences, longest_sequence, features) float array."""
        # NOTE(review): assumes the "elmo" output is (batch, tokens, 1024),
        # consistent with the model's Input shape below - confirm against the
        # module documentation.
        chunks = [
            embed_fn(tf.constant(sentences[start:start + embed_batch_size]))["elmo"].numpy()
            for start in range(0, len(sentences), embed_batch_size)
        ]
        if not chunks:
            return np.zeros((0, 0, 1024), dtype=np.float32)

        # Batches may differ in sequence length; zero-pad them on the right to
        # the longest one so they can be stacked into a single array.
        longest = max(chunk.shape[1] for chunk in chunks)
        stacked = np.zeros(
            (len(sentences), longest, chunks[0].shape[2]), dtype=chunks[0].dtype
        )
        row = 0
        for chunk in chunks:
            stacked[row:row + chunk.shape[0], :chunk.shape[1]] = chunk
            row += chunk.shape[0]
        return stacked

    X_train = elmo_embed(train_df["OriginalTweet"].tolist())
    X_test = elmo_embed(test_df["OriginalTweet"].tolist())

    y_train = train_df["SentimentEncoded"].values
    y_test = test_df["SentimentEncoded"].values

    model = tf.keras.Sequential([
        # Variable-length sequences of 1024-dimensional vectors.
        tf.keras.layers.Input(shape=(None, 1024)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(len(label_encoder.classes_), activation="softmax")
    ])

    # Integer labels are used directly, hence the sparse loss.
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    model.summary()

    # Train the model
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))


In [15]:
# Report whether TensorFlow can see any GPU device.
gpus = tf.config.list_physical_devices('GPU')
print(f"GPUs detected: {len(gpus)}" if gpus else "No GPUs detected.")


No GPUs detected.


In [16]:
# Experiment: frozen GloVe embeddings -> BiLSTM -> multi-head self-attention.
print("GLOVE WITH BILSTM and MULTIHEAD ATTENTION")
glove_bilstm_attention()

GLOVE WITH BILSTM and MULTIHEAD ATTENTION
Loading GloVe...
Epoch 1/5




[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 36ms/step - accuracy: 0.6413 - loss: 0.6394 - val_accuracy: 0.7813 - val_loss: 0.4592
Epoch 2/5
[1m1158/1158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 39ms/step - accuracy: 0.8032 - loss: 0.4517 - val_accuracy: 0.8372 - val_loss: 0.3721
Epoch 3/5
[1m 361/1158[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m30s[0m 38ms/step - accuracy: 0.8435 - loss: 0.3808

KeyboardInterrupt: 

In [None]:
# Experiment: Word2Vec embeddings kept frozen, two stacked BiLSTM layers.
print("W2V NO GRAD + 2 BiLSTM")
word_2_vec(with_grad=False)

In [None]:
# Experiment: Word2Vec embeddings fine-tuned, two stacked BiLSTM layers.
print("W2V WITH GRAD + 2 BiLSTM")
word_2_vec(with_grad=True)


In [None]:
# Experiment: GloVe embeddings kept frozen, two stacked BiLSTM layers.
print("GLOVE WITH NO GRAD + 2 BiLSTM")
glove(with_grad=False)

In [None]:
# Experiment: GloVe embeddings fine-tuned, two stacked BiLSTM layers.
print("GLOVE WITH GRAD + 2 BiLSTM")
glove(with_grad=True)

In [None]:
# Experiment: BERT encoder with a dense head on the [CLS] token (the BERT
# model contains no BiLSTM layers, so the label no longer claims any).
print("BERT WITH GRAD + CLS DENSE HEAD")
# bert() returns a single object, the fitted model, so it must not be unpacked
# into two names.
model = bert(True)


In [8]:
# Experiment: precomputed ELMo embeddings fed to two stacked BiLSTM layers.
# NOTE(review): the error recorded under this cell mentions a hub.KerasLayer
# inside a Sequential model, which the elmo() definition in this notebook does
# not use - the output looks stale; re-run the cell to confirm.
print("ELMO WITHOUT GRAD + 2BiLSTM")
elmo()

ELMO WITHOUT GRAD + 2BiLSTM


ValueError: Only instances of `keras.Layer` can be added to a Sequential model. Received: <tensorflow_hub.keras_layer.KerasLayer object at 0x3958851d0> (of type <class 'tensorflow_hub.keras_layer.KerasLayer'>)