In [2]:
import pandas as pd

# Load the dataset
data = pd.read_csv('dataset/email.csv')

# Display the first few rows
print(data.head())


  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
from sklearn.preprocessing import LabelEncoder

# Convert categories to numeric labels
label_encoder = LabelEncoder()
data['Category'] = label_encoder.fit_transform(data['Category'])

# Get label mappings
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)


Label Mapping: {'ham': 0, 'spam': 1, '{"mode":"full"': 2}


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data['Message'], data['Category'], test_size=0.2, random_state=42
)


In [5]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')




In [6]:
def tokenize_texts(texts, labels, tokenizer, max_length=128):
    inputs = tokenizer(
        list(texts),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    inputs['labels'] = torch.tensor(labels.tolist())
    return inputs

import torch

train_inputs = tokenize_texts(X_train, y_train, tokenizer)
test_inputs = tokenize_texts(X_test, y_test, tokenizer)


In [9]:
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(
    train_inputs['input_ids'], train_inputs['attention_mask'], train_inputs['labels']
)
test_dataset = TensorDataset(
    test_inputs['input_ids'], test_inputs['attention_mask'], test_inputs['labels']
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [14]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [15]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training Loop
from tqdm import tqdm

for epoch in range(3):  # Train for 3 epochs
    print(f"Epoch {epoch + 1}")
    epoch_loss = 0
    for batch in tqdm(train_loader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f"Epoch Loss: {epoch_loss}")


Epoch 1


100%|██████████| 279/279 [08:06<00:00,  1.74s/it]


Epoch Loss: 26.240346199600026
Epoch 2


100%|██████████| 279/279 [07:56<00:00,  1.71s/it]


Epoch Loss: 8.157004839740694
Epoch 3


100%|██████████| 279/279 [08:03<00:00,  1.73s/it]

Epoch Loss: 4.322584908339195





In [22]:
model.save_pretrained('./phishing-detector')
tokenizer.save_pretrained('./phishing-detector')


('./phishing-detector/tokenizer_config.json',
 './phishing-detector/special_tokens_map.json',
 './phishing-detector/vocab.txt',
 './phishing-detector/added_tokens.json')

In [21]:
model.eval()
from sklearn.metrics import classification_report

predictions, true_labels = [], []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

label_mapping = {0: 'Ham', 1: 'Spam'}  # Replace with your actual label names
labels = list(label_mapping.keys())    # [0, 1]
target_names = list(label_mapping.values())  # ['Ham', 'Spam']

# Generate the classification report
print(classification_report(true_labels, predictions, labels=labels, target_names=target_names))

100%|██████████| 70/70 [00:34<00:00,  2.05it/s]

              precision    recall  f1-score   support

         Ham       1.00      0.99      0.99       958
        Spam       0.96      0.97      0.97       157

    accuracy                           0.99      1115
   macro avg       0.98      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115






In [13]:
unique_labels = set(labels.tolist())  # Extract unique labels
print(unique_labels)  # Verify they are in the range [0, num_labels - 1]


{0, 1, 2}


In [20]:
print(f"Unique true_labels: {set(true_labels)}")
print(f"Unique predictions: {set(predictions)}")


Unique true_labels: {0, 1}
Unique predictions: {0, 1}
