# Attention-based BiLSTM

The first model tested is an attention based BiLSTM. This model will act as a benchmark used to compare the results of the LLMs to. Furthermore, the reasoning behind choosing this type of model specifically is that BiLSTMs do not suffer from the vanishing gradients problems like RNNs. Evidently, utilizing the attention mechanism in this model should further boost its perfmance.

## Imports

In [1]:
!pip install fasttext
!pip install keras-tuner


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313472 sha256=5f02b72055b872031ca7dd7c4892c22a1848643d1e045bf1d71b9334f02c6cc9
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [None]:
from keras.layers import *
from keras.models import *
from keras import backend as K
import numpy as np
import pandas as pd
import os
import re
import nltk
import gensim
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import F1Score
import pickle
from tensorflow.keras.models import load_model
from nltk.stem import WordNetLemmatizer
import fasttext
import tensorflow as tf
from tensorflow.keras.layers import Layer
import keras_tuner as kt
from tensorflow.keras.optimizers import SGD, AdamW, RMSprop
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense

## Data Pre-processing

There are further pre-processing steps required for this model to perform well. The first one is to keep only alphaneumeric characters, while also removing stopwords. Links were also removed from the dataset for this model as it is not able to recognize them for what they are. Finally, the train, validation, and test datasets are tokenized and stored.

In [None]:

nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def tweet_to_words(tweet):
    """Convert tweet text into a sequence of words while keeping punctuation."""
    text = tweet.lower()
    # Modify regex to keep specific punctuation
    text = re.sub(r"[^a-zA-Z0-9 ]", " ", text)  # Notice the space inside the brackets
    words = text.split()
    words = [w for w in words if w not in stopwords.words("italian")]
    words = [PorterStemmer().stem(w) for w in words]
    return words


def remove_links_from_tweets(tweet):
    """Remove URLs from tweets."""
    url_pattern = r"https?://\S+|www\.\S+"
    return re.sub(url_pattern, "", tweet)


# Load datasets
#train_set = pd.read_csv('/content/drive/MyDrive/Thesis/Data/train_alt.csv')
train_set = pd.read_csv('/content/drive/MyDrive/Italian thesis/Training dataset/train.csv')
train_set=train_set.dropna()

y_train = train_set['label']
train_set.drop('label', axis=1, inplace=True)

#val_set = pd.read_csv('/content/drive/MyDrive/Thesis/Data/val_alt.csv')
val_set = pd.read_csv('/content/drive/MyDrive/Italian thesis/Training dataset/val.csv')
y_val = val_set['label']
val_set.drop('label', axis=1, inplace=True)

#test_set = pd.read_csv('/content/drive/MyDrive/Thesis/Data/test_alt.csv')
test_set = pd.read_csv('/content/drive/MyDrive/Italian thesis/Training dataset/test.csv')
y_test = test_set['label']
test_set.drop('label', axis=1, inplace=True)

# Preprocess tweets
for dataset in [train_set, val_set, test_set]:
    dataset['italian text'] = dataset['italian text'].apply(remove_links_from_tweets)
    dataset['italian text'] = dataset['italian text'].apply(tweet_to_words)

# Tokenize and pad sequences
max_words = 5000
max_len = 50

def tokenize_pad_sequences(text, tokenizer=None, fit=True):
    if fit:
        tokenizer = Tokenizer(num_words=max_words, lower=True)
        tokenizer.fit_on_texts(text)

    sequences = tokenizer.texts_to_sequences(text)
    padded = pad_sequences(sequences, padding="post", maxlen=max_len)

    return padded, tokenizer

# Fit tokenizer on training data
X_train, tokenizer = tokenize_pad_sequences(train_set['italian text'], fit=True)
X_val, _ = tokenize_pad_sequences(val_set['italian text'], tokenizer, fit=False)
X_test, _ = tokenize_pad_sequences(test_set['italian text'], tokenizer, fit=False)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## Word Embeddings

The word embeddings used for this model are from a fastText model pre-trained on an italian corpus. They have a dimensions of 300, were trained using CBOW, with n-gram of lengths 5, have a window size of 5 and 10 negatives. They were trained using Common Crawl and Wikipedia.

In [None]:


# Load the FastText model
print("Loading FastText model...")
ft = fasttext.load_model('/content/drive/MyDrive/Italian thesis/cc.it.300.bin')

# Save tokenizer for future use
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Create embedding matrix
embedding_size = 300  # FastText vectors have 300 dimensions
embedding_matrix = np.zeros((max_words, embedding_size))

word_index = tokenizer.word_index
for word, i in word_index.items():
    if i < max_words:
        embedding_matrix[i] = ft.get_word_vector(word)



Loading FastText model...


## Model Building

In this section, the first thing done is defining the attention mechanism. Then, the model itself is defined. The final architecture has a word embeddings layer that can be finetuned, one Bidirectional LSTM layers, and two dense layers. Multiple model hyperparameters are finetuned before the final ones are chosen.

In [None]:


class Attention(Layer):
    def __init__(self, return_sequences=True):
        super(Attention, self).__init__()
        self.return_sequences = return_sequences

    def build(self, input_shape):
        self.W = self.add_weight(name="att_weight",
                                 shape=(input_shape[-1], 1),
                                 initializer="normal",
                                 trainable=True)

        self.b = self.add_weight(name="att_bias",
                                 shape=(input_shape[1], 1),
                                 initializer="zeros",
                                 trainable=True)

        super(Attention, self).build(input_shape)

    def call(self, inputs):
        e = tf.nn.tanh(tf.tensordot(inputs, self.W, axes=1) + self.b)
        attention_weights = tf.nn.softmax(e, axis=1)

        output = inputs * attention_weights

        if self.return_sequences:
            return output
        else:
            return tf.reduce_sum(output, axis=1)


### Hyperparameter Tuning

In [None]:

# Early Stopping Callback
early_stopping = EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)
nn_f1 = F1Score(name='f1_score', average='macro')

class MyHyperModel(kt.HyperModel):
    def __init__(self, max_words, embedding_size, embedding_matrix, max_len):
        self.max_words = max_words
        self.embedding_size = embedding_size
        self.embedding_matrix = embedding_matrix
        self.max_len = max_len

    def build(self, hp):
        model = Sequential()
        model.add(Embedding(input_dim=self.max_words,
                            output_dim=self.embedding_size,
                            weights=[self.embedding_matrix],
                            input_length=self.max_len,
                            trainable=True))

        model.add(Bidirectional(LSTM(hp.Int("units_lstm_1", min_value=32, max_value=128, step=32), return_sequences=True)))
        #model.add(Bidirectional(LSTM(hp.Int("units_lstm_2", min_value=16, max_value=64, step=16), return_sequences=True)))

        model.add(Attention(return_sequences=False))  # Attention layer

        model.add(Dense(hp.Int("dense_units", min_value=128, max_value=512, step=64), activation="relu"))  # Tunable Dense layer
        model.add(Dropout(hp.Float("dropout", min_value=0.3, max_value=0.6, step=0.1)))
        model.add(Dense(4, activation="softmax"))

        # Select optimizer dynamically
        optimizer_choice = hp.Choice("optimizer", ["SGD", "AdamW", "RMSprop"])
        learning_rate = hp.Float("learning_rate", min_value=0.0001, max_value=0.1, sampling="log")
        weight_decay = hp.Float("weight_decay", min_value=1e-6, max_value=1e-3, sampling="log")

        if optimizer_choice == "SGD":
            momentum = hp.Float("momentum", min_value=0.7, max_value=0.9, step=0.1)
            optimizer = SGD(learning_rate=learning_rate, momentum=momentum, weight_decay=weight_decay)
        elif optimizer_choice == "AdamW":
            beta_1 = hp.Float("beta_1", min_value=0.85, max_value=0.99, step=0.05)
            beta_2 = hp.Float("beta_2", min_value=0.95, max_value=0.999, step=0.05)
            optimizer = AdamW(learning_rate=learning_rate, beta_1=beta_1, beta_2=beta_2, weight_decay=weight_decay)
        elif optimizer_choice == "RMSprop":
            rho = hp.Float("rho", min_value=0.8, max_value=0.99, step=0.05)
            optimizer = RMSprop(learning_rate=learning_rate, rho=rho, decay=weight_decay)

        model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy", nn_f1])
        return model

    def fit(self, hp, model, *args, **kwargs):
        return model.fit(
            *args,
            batch_size=hp.Choice("batch_size", [32, 64, 128]),
            **kwargs,
        )

# Initialize KerasTuner
tuner = kt.Hyperband(
    MyHyperModel(max_words, embedding_size, embedding_matrix, max_len),
    objective="val_accuracy",
    max_epochs=20,
    factor=3,
    directory="my_tuner_dir",
    project_name="lstm_tuning",
)

# Run the search
tuner.search(X_train, y_train,
             validation_data=(X_val, y_val),
             epochs=10,
             callbacks=[early_stopping])

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]


Trial 30 Complete [00h 00m 17s]
val_accuracy: 0.7387754917144775

Best val_accuracy So Far: 0.7612245082855225
Total elapsed time: 00h 08m 13s


In [None]:
print(f"Best Hyperparameters:\n"
      f"  - Units (LSTM 1): {best_hps.get('units_lstm_1')}\n"
      #f"  - Units (LSTM 2): {best_hps.get('units_lstm_2')}\n"
      f"  - Dropout: {best_hps.get('dropout')}\n"
      f"  - Optimizer: {best_hps.get('optimizer')}\n"
      f"  - Learning Rate: {best_hps.get('learning_rate')}\n"
      f"  - Weight Decay: {best_hps.get('weight_decay')}\n"
      f"  - Units(Dense): {best_hps.get('dense_units')}\n"
      f"  - Batch Size:{best_hps.get('batch_size')}")



if best_hps.get("optimizer") == "SGD":
    print(f"  - Momentum: {best_hps.get('momentum')}")
elif best_hps.get("optimizer") == "AdamW":
    print(f"  - Beta 1: {best_hps.get('beta_1')}")
    print(f"  - Beta 2: {best_hps.get('beta_2')}")
elif best_hps.get("optimizer") == "RMSprop":
    print(f"  - Rho: {best_hps.get('rho')}")

Best Hyperparameters:
  - Units (LSTM 1): 64
  - Dropout: 0.4
  - Optimizer: RMSprop
  - Learning Rate: 0.005332108747674453
  - Weight Decay: 1.2567208642033443e-05
  - Units(Dense): 192
  - Batch Size:32
  - Rho: 0.9


### Final Model

In [None]:


nn_f1 = F1Score(name='f1_score', average='macro')
early_stopping = EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)

model = Sequential(name="AB-BiLSTM")
model.add(Embedding(input_dim=max_words,
                    output_dim=embedding_size,
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=True))

model.add(Bidirectional(LSTM(64, return_sequences=True)))

model.add(Attention(return_sequences=False))  

model.add(Dense(192, activation="relu"))  
model.add(Dropout(0.4))
model.add(Dense(4, activation="softmax"))

optimizer = RMSprop(learning_rate=0.005332108747674453, rho=0.9, decay=1.2567208642033443e-05)

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy", nn_f1])

model.summary()


In [13]:

history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping],
                    batch_size=32,
                    epochs=20,
                    verbose=1)



Epoch 1/20
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.5783 - f1_score: 0.4209 - loss: 0.9588 - val_accuracy: 0.7367 - val_f1_score: 0.4279 - val_loss: 0.6498
Epoch 2/20
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.7823 - f1_score: 0.4240 - loss: 0.5291 - val_accuracy: 0.7340 - val_f1_score: 0.4267 - val_loss: 0.6913
Epoch 3/20
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.8431 - f1_score: 0.4245 - loss: 0.4017 - val_accuracy: 0.7503 - val_f1_score: 0.4220 - val_loss: 0.6730
Epoch 4/20
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.8900 - f1_score: 0.4241 - loss: 0.2967 - val_accuracy: 0.7231 - val_f1_score: 0.4244 - val_loss: 0.8376
Epoch 5/20
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9184 - f1_score: 0.4251 - loss: 0.2195 - val_accuracy: 0.7082 - val_f1_score: 0.42

## Results


By looking at the metrics of the AB-BiLSTM model’s performance as shown by the performance metrics, it is evident that the model does not perform at a good enough level. The model has an overall accuracy and F1-score of 75%. The underlying numbers further explain this as it is evident the model has a hard time specifically when it comes to distinguishing between the ’Depression’ and ’Suicidal’ classes, with the class specific for both sitting at 63% and 71% respectively. This can be explained by some of the overlap between both mental health conditions, which can share multiple symptoms. This makes it understandably hard for the model to distinguish between the classes in some of the data points. The model performs much better on the ’Normal’ and the ’Anxiety’ classes as they are more ’obvious’.

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

test_loss, test_acc, test_f1 = model.evaluate(X_test, y_test, verbose=0)

y_pred = model.predict(X_test, batch_size=64)

y_pred_classes = y_pred.argmax(axis=-1)

test_precision = precision_score(y_test, y_pred_classes, average='macro')
test_recall = recall_score(y_test, y_pred_classes, average='macro')
test_f1_scor = f1_score(y_test, y_pred_classes, average='macro')

print(f"Test Accuracy: {test_acc}")
print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")
print(f"Test F1 Score: {test_f1_scor}")


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
Test Accuracy: 0.7517006993293762
Test Precision: 0.7524316916295322
Test Recall: 0.7595370370370371
Test F1 Score: 0.7534062985888131


In [15]:
from sklearn.metrics import classification_report

# Generate the classification report
report = classification_report(y_test, y_pred_classes, target_names=['Depression', 'Normal', 'Anxiety', 'Suicidal'])

# Print the report
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

  Depression       0.68      0.55      0.60       400
      Normal       0.86      0.91      0.88       400
     Anxiety       0.80      0.85      0.83       270
    Suicidal       0.67      0.74      0.70       400

    accuracy                           0.75      1470
   macro avg       0.75      0.76      0.75      1470
weighted avg       0.75      0.75      0.75      1470



In [16]:
from sklearn.metrics import confusion_matrix
id2label = {0: 'Depression', 1: 'Normal', 2: 'Anxiety', 3: 'Suicidal'}
label2id = {v: k for k, v in id2label.items()}
cm = confusion_matrix(y_test, y_pred_classes)
cm_labels = np.array([id2label[i] for i in range(len(id2label))])
cm_with_labels = pd.DataFrame(cm, index=cm_labels, columns=cm_labels)
print("Confusion Matrix:")
print(cm_with_labels)

Confusion Matrix:
            Depression  Normal  Anxiety  Suicidal
Depression         218      26       38       118
Normal               5     363       13        19
Anxiety             22      10      229         9
Suicidal            76      24        5       295


In [17]:
model.summary()

## Indicator Predictions

In order to build the indicator, the model will be used to classify a different set of tweets. These tweets are sampled (3000 for each day) from a dataset that contains 15 million tweets from the first 5 months of 2020. The same pre-processing steps are applied to this new dataset, these tweets are then using as input and the final predictions as well as the confidence values are then stored.

In [None]:
from tqdm import tqdm
tqdm.pandas()
data = pd.read_csv('/content/drive/MyDrive/Italian thesis/Training dataset/italian_with_predictions_large.csv')

In [None]:
import re
import nltk
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def tweet_to_words(tweet):
    """Convert tweet text into a sequence of words while keeping punctuation."""
    text = tweet.lower()
    text = re.sub(r"[^a-zA-Z0-9\s.!?,;:()\"\'-]", " ", text)  
    words = text.split()
    words = [w for w in words if w not in stopwords.words("italian")]
    words = [lemmatizer.lemmatize(w) for w in words] 
    return words


def remove_links_from_tweets(tweet):
    """Remove URLs from tweets."""
    url_pattern = r"https?://\S+|www\.\S+"
    return re.sub(url_pattern, "", tweet)

with open("tokenizer.pickle", "rb") as handle:
    tokenizer = pickle.load(handle)

max_len = 50

def predict_sentiment(text, model, tokenizer):
    text = remove_links_from_tweets(text) 
    words = tweet_to_words(text)  

    sequences = tokenizer.texts_to_sequences([words])
    padded_sequence = pad_sequences(sequences, padding="post", maxlen=max_len)

    prediction = model.predict(padded_sequence, verbose=0)

    predicted_label = np.argmax(prediction, axis=-1)[0]  
    return predicted_label

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

data['lstm_prediction'] = data['testo'].progress_apply(lambda x: predict_sentiment(x, model, tokenizer))

100%|██████████| 608000/608000 [12:33:25<00:00, 13.45it/s]


In [None]:
data.to_csv("/content/drive/MyDrive/Italian thesis/Training dataset/italian_with_predictions_large.csv", index=False)

print("Predictions added to the DataFrame.")

Predictions added to the DataFrame.


In [None]:
data.head()

Unnamed: 0,testo,tweet_date,llama_prediction,lstm_prediction
0,tollivincenzo avendo milioni a riserva era il ...,2020-01-31,1,1
1,rt matteosalvinimi dati ufficiali istat econom...,2020-01-31,1,3
2,vado a letto con la consapevolezza che la mia ...,2020-01-31,1,1
3,oggi alle finisce il mercato ma se ne apre un ...,2020-01-31,1,1
4,rt danvmor ahahahah non è una sessione di merc...,2020-01-31,1,1
