# LSTM Model

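Reference: [Pytorch6_LSTM_Text_Classification.ipynb](https://github.com/LukeDitria/pytorch_tutorials/blob/main/section12_sequential/solutions/Pytorch6_LSTM_Text_Classification.ipynb) (LSTM text-classification notebook from the LukeDitria/pytorch_tutorials repository)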

In [None]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

In [None]:
# Each dataset ships as a pre-processed train/test pair of CSV files.

# First dataset
train_df, test_df = map(pd.read_csv, ("processed_train.csv", "processed_test.csv"))

# Second dataset
train_df1, test_df1 = map(pd.read_csv, ("processed_train1.csv", "processed_test1.csv"))

# Combined dataset
train_df2, test_df2 = map(
    pd.read_csv,
    ("combined_processed_train.csv", "combined_processed_test.csv"),
)

# Eval dataset (no training split is loaded for it)
test_df3 = pd.read_csv("scam_dataset_eval_processed.csv")

In [None]:
# Row/column counts of the first dataset's train and test splits, one per line
print(train_df.shape, test_df.shape, sep="\n")

(1280, 3)
(320, 3)


In [None]:
# Preview the first rows of the first training split
train_df.head()

Unnamed: 0,text,type,label
0,Hello is this John My name is Officer Johnson ...,ssn,1
1,Hello is this John My name is Officer Johnson ...,ssn,1
2,Hello is this Mr Johnson My name is Officer Ja...,ssn,1
3,Hello is this John My name is Officer Johnson ...,ssn,1
4,Hello this is Officer Johnson from the Social ...,ssn,1


In [None]:
from transformers import BertTokenizer

# Every split is padded/truncated to this many tokens so all tensors share one shape
MAX_LEN = 100

# Load Pretrained BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_texts(df, max_length=MAX_LEN):
    """Tokenize the ``text`` column of ``df`` into fixed-length TensorFlow tensors.

    Args:
        df: DataFrame holding the raw strings in a ``text`` column.
        max_length: Number of tokens per row; longer texts are truncated and
            shorter ones are padded up to this length.

    Returns:
        The tokenizer output (``input_ids``, ``token_type_ids`` and
        ``attention_mask``), each with ``max_length`` columns.
    """
    return tokenizer(
        list(df['text']),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )


# Tokenize text
# First dataset
train_tokens = tokenize_texts(train_df)
test_tokens = tokenize_texts(test_df)

# Second dataset
train_tokens1 = tokenize_texts(train_df1)
test_tokens1 = tokenize_texts(test_df1)

# Combined dataset
train_tokens2 = tokenize_texts(train_df2)
test_tokens2 = tokenize_texts(test_df2)

# Eval dataset
test_tokens3 = tokenize_texts(test_df3)

In [None]:
# Inspect the tokenizer output for the first training split
# (input_ids, token_type_ids and attention_mask tensors)
train_tokens

{'input_ids': <tf.Tensor: shape=(1280, 100), dtype=int32, numpy=
array([[  101,  7592,  2003, ...,  2017,  2065,   102],
       [  101,  7592,  2003, ..., 11082,  2074,   102],
       [  101,  7592,  2003, ...,  3036,  2193,   102],
       ...,
       [  101,  7632,  2003, ...,     0,     0,     0],
       [  101,  7632,  2003, ...,     0,     0,     0],
       [  101,  7632,  2003, ...,     0,     0,     0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1280, 100), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1280, 100), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=in

In [None]:
# Create LSTM model
from tensorflow.keras import layers
import os

# Embedding: https://www.tensorflow.org/tutorials/text/word_embeddings
# Turns positive integers (indexes) into dense vectors of fixed size. (other approach could be one-hot-encoding)

# Word embeddings give us a way to use an efficient, dense representation in which similar words have
# a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a
# dense vector of floating point values (the length of the vector is a parameter you specify).

SEED = 42
# One call seeds Python's `random`, NumPy and TensorFlow. The previous manual
# np.random.seed / tf.random.set_seed pair left Python's `random` unseeded.
keras.utils.set_random_seed(SEED)

# Force deterministic TensorFlow operations
os.environ["TF_DETERMINISTIC_OPS"] = "1"  # Ensures deterministic ops
tf.config.experimental.enable_op_determinism()  # Forces deterministic execution (TF 2.9+)

# One embedding row per token id the tokenizer can emit
vocab_size = tokenizer.vocab_size

model = keras.models.Sequential()

# The embedding layer takes an integer matrix of shape (batch, input_length);
# every token id must be strictly less than vocab_size.
# Its output has shape (batch, input_length, 128): one 128-dim vector per token.
model.add(layers.Embedding(vocab_size, 128))

# NOTE(review): padded positions (token id 0 in the tokenizer output) are not
# masked, so the LSTM also steps over padding. Consider `mask_zero=True` on the
# embedding — confirm the effect on results before changing.
model.add(layers.LSTM(64, dropout=0.2))

# Single sigmoid unit: the output is a probability for the positive label (1)
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

In [None]:
# The model's last layer is a sigmoid, so the loss receives probabilities
# rather than raw logits (from_logits=False).
metrics = ["accuracy"]
optim = keras.optimizers.Adam(learning_rate=0.01)
loss = keras.losses.BinaryCrossentropy(from_logits=False)

model.compile(
    optimizer=optim,
    loss=loss,
    metrics=metrics,
)

In [None]:
# Only the token ids are used as model input; the attention masks and token
# type ids produced by the tokenizer are not used here.
# Suffix convention: (none) = first dataset, 1 = second, 2 = combined, 3 = eval.
X_train, X_val = train_tokens['input_ids'], test_tokens['input_ids']
y_train, y_val = train_df['label'], test_df['label']

X_train1, X_val1 = train_tokens1['input_ids'], test_tokens1['input_ids']
y_train1, y_val1 = train_df1['label'], test_df1['label']

X_train2, X_val2 = train_tokens2['input_ids'], test_tokens2['input_ids']
y_train2, y_val2 = train_df2['label'], test_df2['label']

X_val3, y_val3 = test_tokens3['input_ids'], test_df3['label']

# Train on the combined dataset and validate on its test split
model.fit(
    x=X_train2,
    y=y_train2,
    validation_data=(X_val2, y_val2),
    epochs=3,
    verbose=2,
)

Epoch 1/3
140/140 - 18s - 127ms/step - accuracy: 0.9984 - loss: 0.0064 - val_accuracy: 0.9946 - val_loss: 0.0191
Epoch 2/3
140/140 - 16s - 112ms/step - accuracy: 0.9998 - loss: 0.0021 - val_accuracy: 0.9955 - val_loss: 0.0141
Epoch 3/3
140/140 - 15s - 109ms/step - accuracy: 1.0000 - loss: 2.3998e-04 - val_accuracy: 0.9955 - val_loss: 0.0150


<keras.src.callbacks.history.History at 0x7c8a29394190>

In [None]:
# The model emits one sigmoid probability per row; threshold at 0.5 to get
# hard 0/1 labels and flatten them into a plain Python list.
probabilities = model.predict(X_val2)
predictions = (probabilities > 0.5).astype(int).ravel().tolist()

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step


In [None]:
# Score the thresholded predictions against the combined dataset's test labels
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_test, y_pred = y_val2, predictions

# Each scorer is called as scorer(y_true, y_pred), in the order the names are unpacked
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report each metric on its own line as "<label> <value>"
for label, value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1-score:', f1),
):
    print(label, value)

Accuracy: 0.9955357142857143
Precision: 0.9964221824686941
Recall: 0.9946428571428572
F1-score: 0.9955317247542449


In [None]:
# Tokenizer vocabulary size; this is the Embedding layer's input dimension
print(vocab_size)

30522


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import optuna

# Hyperparameter finetuning
def objective(trial):
    """Train one LSTM configuration and return its final validation accuracy.

    Optuna calls this once per trial. The trial picks the embedding size, LSTM
    width, dropout rate, learning rate and epoch count from fixed candidate
    lists; a fresh Embedding -> LSTM -> sigmoid model is then trained on
    ``X_train``/``y_train`` and validated on ``X_val``/``y_val``.

    Args:
        trial: The ``optuna`` trial used to sample the hyperparameters.

    Returns:
        The validation accuracy recorded after the last training epoch.
    """
    # Release state left behind by models from earlier trials so memory does
    # not accumulate over the study.
    keras.backend.clear_session()
    # Re-seed so every trial starts from the same random state: a trial's score
    # then depends on its hyperparameters, not on how many trials ran before it.
    keras.utils.set_random_seed(SEED)

    embedding_dim = trial.suggest_categorical("embedding_dim", [32, 64, 128])
    lstm_units = trial.suggest_categorical("lstm_units", [32, 64, 128])
    dropout_rate = trial.suggest_categorical("dropout", [0.1, 0.2, 0.3, 0.4, 0.5])
    learning_rate = trial.suggest_categorical("lr", [1e-5, 1e-4, 1e-3, 1e-2])
    epochs = trial.suggest_categorical("epochs", [3, 5])

    model = keras.Sequential([
        layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        layers.LSTM(lstm_units, dropout=dropout_rate),
        layers.Dense(1, activation="sigmoid")
    ])

    model.compile(
        loss=keras.losses.BinaryCrossentropy(from_logits=False),
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=["accuracy"]
    )

    # NOTE(review): tuning uses the first dataset (X_train / X_val) while the
    # main model above is trained on the combined one (X_train2) — confirm this
    # is intended. X_val comes from the test split, so the selected
    # hyperparameters have seen the test data.
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=0
    )

    # Score the trial by the last epoch's validation accuracy (not the best epoch)
    return history.history["val_accuracy"][-1]

# Run the study
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# TPESampler is the sampler named explicitly here so that it can be seeded.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=10)

# Print the best parameters
print("Best parameters found:")
print(study.best_trial.params)


[I 2025-04-23 03:54:45,421] A new study created in memory with name: no-name-b1f49def-3107-4491-accb-bc5a685e3878
[I 2025-04-23 03:55:01,271] Trial 0 finished with value: 0.699999988079071 and parameters: {'embedding_dim': 128, 'lstm_units': 32, 'dropout': 0.1, 'lr': 0.0001, 'epochs': 3}. Best is trial 0 with value: 0.699999988079071.
[I 2025-04-23 03:55:18,396] Trial 1 finished with value: 0.956250011920929 and parameters: {'embedding_dim': 32, 'lstm_units': 64, 'dropout': 0.5, 'lr': 0.0001, 'epochs': 5}. Best is trial 1 with value: 0.956250011920929.
[I 2025-04-23 03:55:30,857] Trial 2 finished with value: 0.699999988079071 and parameters: {'embedding_dim': 32, 'lstm_units': 64, 'dropout': 0.2, 'lr': 1e-05, 'epochs': 3}. Best is trial 1 with value: 0.956250011920929.
[I 2025-04-23 03:56:41,467] Trial 3 finished with value: 0.809374988079071 and parameters: {'embedding_dim': 64, 'lstm_units': 64, 'dropout': 0.2, 'lr': 0.001, 'epochs': 3}. Best is trial 1 with value: 0.956250011920929.

Best parameters found:
{'embedding_dim': 128, 'lstm_units': 64, 'dropout': 0.2, 'lr': 0.01, 'epochs': 3}
