## Detection of spam using LTSM and attention

### Dataset

https://www.kaggle.com/datasets/mandygu/lingspam-dataset

In [1]:
!kaggle datasets download -d mandygu/lingspam-dataset

Dataset URL: https://www.kaggle.com/datasets/mandygu/lingspam-dataset
License(s): unknown
Downloading lingspam-dataset.zip to /content
 64% 2.00M/3.12M [00:00<00:00, 2.51MB/s]
100% 3.12M/3.12M [00:01<00:00, 3.16MB/s]


In [2]:
import zipfile
import os

with zipfile.ZipFile('/content/lingspam-dataset.zip', 'r') as zip_ref:
  zip_ref.extractall('kaggle_spam_ds')

In [3]:
import pandas as pd

kaggle_df = pd.read_csv('kaggle_spam_ds/messages.csv')
kaggle_df.head()

Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0


### Preprocessing of dataset

In [4]:
from sklearn.model_selection import train_test_split

N = 1000

kaggle_df = kaggle_df.sample(n=N, random_state=42)

Removing of puntuations

In [5]:
import string

punctuations_list = string.punctuation

def remove_punctuations(text):
  temp = str.maketrans('', '', punctuations_list)
  return text.translate(temp)

kaggle_df['message']= kaggle_df['message'].apply(lambda x: remove_punctuations(x))
kaggle_df.head()

Unnamed: 0,subject,message,label
1905,ials ( 6th lang teacher ed ),the university of edinburgh institute for appl...,0
1055,esl in children cut off from their native lang...,2 years ago my wife and i adopted two kids fr...,0
2471,free live sexxx ! ! ! !,attention video sex lovers ...,1
1133,ausschreibung professur universitaet heidelberg,am sprachwissenschaftlichen institut der unive...,0
1200,cssi conference on spatial cognition,mind iii annual conference of the cognitive s...,0


Removing of stop words

In [6]:
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
def remove_stopwords(text):
  stop_words = stopwords.words('english')
  imp_words = []

  for word in str(text).split():
    word = word.lower()
    if word not in stop_words:
      imp_words.append(word)

  output = " ".join(imp_words)
  return output


kaggle_df['message'] = kaggle_df['message'].apply(lambda text: remove_stopwords(text))
kaggle_df.head()

Unnamed: 0,subject,message,label
1905,ials ( 6th lang teacher ed ),university edinburgh institute applied languag...,0
1055,esl in children cut off from their native lang...,2 years ago wife adopted two kids russia time ...,0
2471,free live sexxx ! ! ! !,attention video sex lovers never pay video sex...,1
1133,ausschreibung professur universitaet heidelberg,sprachwissenschaftlichen institut der universi...,0
1200,cssi conference on spatial cognition,mind iii annual conference cognitive science s...,0


In [8]:
emails = kaggle_df['message']
labels = kaggle_df['label']

X_train, X_test, y_train, y_test = train_test_split(emails, labels, test_size=0.2, random_state=42)

Tokinazing of text using pretrained BERT

In [9]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [11]:
from tqdm import tqdm

embeddings = []
for email in tqdm(X_train):
  inputs = tokenizer(email, return_tensors='pt', truncation=True, max_length=512)
  inputs = {k: v.to(device) for k, v in inputs.items()}

  with torch.no_grad():
    outputs = model(**inputs)

  word_embeddings = outputs.last_hidden_state
  embeddings.append(word_embeddings)

100%|██████████| 800/800 [14:31<00:00,  1.09s/it]


## LSTM

In [38]:
from torch import nn


class LSTM_classificator(nn.Module):
  def __init__(self, vocab_size, hidden_dim, output_dim, embedding_dim=256, num_heads=8, droupout=0.5):
    super(LSTM_classificator, self).__init__()

    # embedding layer
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    # lstm
    self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)

    # multihead attention
    self.attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads)

    # hidden layer
    self.dense = nn.Sequential(
        nn.Linear(hidden_dim, hidden_dim),
        nn.ReLU(),
        nn.Dropout(droupout)
    )

    # output in [0, 1]
    self.output = nn.Sequential(
        nn.Linear(hidden_dim, output_dim),
        nn.Sigmoid()
    )

  def forward(self, input_sequence):
    embeddings = self.embedding(input_sequence)
    hidden_states, _ = self.lstm(embeddings)

    # permute shape (needed for multuhead attention)
    hidden_states = hidden_states.permute(1, 0, 2)  # (sequence_length, batch_size, hidden_dim)

    # attention
    context_vector, _ = self.attention(hidden_states, hidden_states, hidden_states)

    # permute back
    context_vector = context_vector.permute(1, 0, 2)  # (batch_size, sequence_length, hidden_dim)
    context_vector = context_vector.mean(dim=1)  # (batch_size, hidden_dim)

    dense_output = self.dense(context_vector)
    output = self.output(dense_output)

    return output

## Training loop

Hyperparameters

In [39]:
input_dim = tokenizer.vocab_size + 1
hidden_dim = 256
output_dim = 1
droupout = 0.5

In [40]:
model = LSTM_classificator(input_dim, hidden_dim, output_dim, droupout=droupout)

In [41]:
batch_size = 32
lr = 0.0001
n_epochs = 10
decay_factor = 1.00004

In [42]:
model = model.to(device)
bce = nn.BCELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=decay_factor)

In [47]:
from torch.utils.data import DataLoader, TensorDataset


tokenized_inputs = []
for text in tqdm(X_train):
  embedding = tokenizer(text, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
  embedding = embedding['input_ids'].squeeze(0)
  tokenized_inputs.append(embedding)

labels = torch.tensor(y_train.values, dtype=torch.float32)

train_data = TensorDataset(torch.stack(tokenized_inputs), labels)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

100%|██████████| 800/800 [00:13<00:00, 59.80it/s] 


TODO: to be improved (hyperparams, plots, techniques to improve accuracy) + evaluation + testing on another dataset

In [48]:
losses = []

for epoch in range(n_epochs):
  print(f"Epoch {epoch+1}/{n_epochs}")
  epoch_loss = 0.0

  for batch_X, batch_y in train_loader:
    batch_X = batch_X.to(device)
    batch_y = batch_y.to(device)

    # forward
    output_y = model(batch_X)  # (batch_size, 1)
    output_y = output_y.squeeze(1)  # (batch_size,)

    # loss
    loss = bce(output_y, batch_y.float())

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # add loss value
    epoch_loss += loss.item()

  scheduler.step()

  epoch_loss /= len(train_loader)
  losses.append(epoch_loss)
  print(f"Loss: {epoch_loss:.4f}")


Epoch 1/10
Epoch 1, Loss: 0.6225
Epoch 2/10
Epoch 2, Loss: 0.5710
Epoch 3/10
Epoch 3, Loss: 0.5528
Epoch 4/10
Epoch 4, Loss: 0.5411
Epoch 5/10
Epoch 5, Loss: 0.5118
Epoch 6/10
Epoch 6, Loss: 0.4611
Epoch 7/10
Epoch 7, Loss: 0.3951
Epoch 8/10
Epoch 8, Loss: 0.2634
Epoch 9/10
Epoch 9, Loss: 0.1751
Epoch 10/10
Epoch 10, Loss: 0.1431
