In [2]:
import os
import pandas as pd

# Base path of the project. The default is the original dev-container
# location; set the SPAM_BERT_BASE_DIR environment variable to run the
# notebook from a different checkout without editing this cell.
BASE_DIR = os.environ.get(
    "SPAM_BERT_BASE_DIR",
    "/workspaces/Transformers/SpamClassificationUsingBert",
)

# Dataset path, always derived from BASE_DIR
DATA_PATH = os.path.join(BASE_DIR, "Data", "spam.csv")

print(f" Data path: {DATA_PATH}")


 Data path: /workspaces/Transformers/SpamClassificationUsingBert/Data/spam.csv


In [3]:
# Peek at the raw file exactly as stored on disk (before any column cleanup)
df = pd.read_csv(
    DATA_PATH,
    encoding="latin-1",
)
df.head(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [4]:
# Read the CSV file (latin-1 avoids Unicode errors in the UCI spam dataset)
df = pd.read_csv(DATA_PATH, encoding="latin-1")

# The raw file names its columns v1 (label) and v2 (message text)
if {"v1", "v2"}.issubset(df.columns):
    df = df.rename(columns={"v1": "label", "v2": "text"})

# Fail here with a clear message instead of a bare KeyError further down
missing_columns = {"label", "text"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Expected 'label' and 'text' columns in {DATA_PATH}; "
        f"missing: {sorted(missing_columns)}"
    )

# Select the two useful columns BEFORE dropping NaN rows: the extra
# 'Unnamed: N' columns are empty in the preview, so a dropna() over the
# whole frame would discard rows that are otherwise valid.
df = df[["label", "text"]].dropna().reset_index(drop=True)

print("Data loaded successfully!")
print(f"Total rows: {len(df)}")
print(f"Unique labels: {df['label'].unique()}")

Data loaded successfully!
Total rows: 5572
Unique labels: ['ham' 'spam']


In [5]:
# Convert 'ham' → 0 and 'spam' → 1
LABEL_TO_ID = {'ham': 0, 'spam': 1}
df['label_encoded'] = df['label'].map(LABEL_TO_ID)

# Series.map() yields NaN for any value missing from the mapping (and turns
# the column into float), so surface unexpected labels immediately.
unmapped_labels = list(df.loc[df['label_encoded'].isna(), 'label'].unique())
if unmapped_labels:
    raise ValueError(f"Unexpected label values (cannot encode): {unmapped_labels}")

# Guarantee an integer dtype for the downstream torch.long labels
df['label_encoded'] = df['label_encoded'].astype('int64')

# Verify results
print(df[['label', 'label_encoded']].head())


  label  label_encoded
0   ham              0
1   ham              0
2  spam              1
3   ham              0
4   ham              0


In [6]:
# Show the first five rows with both the text label and its integer encoding
df.head(n=5)

Unnamed: 0,label,text,label_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
# The string label is redundant now that the integer encoding exists
df = df.drop(columns=['label'])
df.head(n=5)

Unnamed: 0,text,label_encoded
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
# Give the integer column the plain name 'label' used by the rest of the notebook
df = df.rename({'label_encoded': 'label'}, axis='columns')
df.head(n=5)

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
# Class balance check: number of rows per label (0 = ham, 1 = spam)
df['label'].value_counts()

label
0    4825
1     747
Name: count, dtype: int64

## We have to handle class imbalance

We simply undersample the majority class by picking 1000 ham samples, since we do not have a GPU machine and a smaller dataset keeps training affordable.

In [10]:
# Keep every spam row and undersample ham to reduce the class imbalance
df_spam = df[df.label == 1]
df_ham = df[df.label == 0]

# A fixed random_state makes the sampled subset (and therefore the split and
# the training data built from it) identical on every run. min() guards
# against asking for more rows than exist.
df_ham_small = df_ham.sample(n=min(1000, len(df_ham)), random_state=42)
df_small = pd.concat([df_spam, df_ham_small])

df_small.label.value_counts()

label
0    1000
1     747
Name: count, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split


# Stratified 80/20 split so both partitions keep the same ham/spam ratio
X_train, X_test, y_train, y_test = train_test_split(
    df_small["text"],
    df_small["label"],
    random_state=42,
    stratify=df_small["label"],
    test_size=0.2,
)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))
print("\nTrain label distribution:\n", y_train.value_counts())
print("\nTest label distribution:\n", y_test.value_counts())


Training set size: 1397
Testing set size: 350

Train label distribution:
 label
0    800
1    597
Name: count, dtype: int64

Test label distribution:
 label
0    200
1    150
Name: count, dtype: int64


In [12]:
from transformers import BertTokenizer

# Tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
def tokenize_texts(texts, tokenizer, max_len=128):
    """Tokenize a batch of texts into fixed-length PyTorch tensors.

    Args:
        texts: The messages to encode. May be a pandas Series / NumPy array
            (anything exposing ``.tolist()``), any other iterable of strings,
            or a single string (treated as a batch of one).
        tokenizer: A callable Hugging Face tokenizer (e.g. ``BertTokenizer``).
        max_len: Every sequence is padded or truncated to this many tokens.

    Returns:
        Whatever the tokenizer returns for the batch; the call requests
        PyTorch tensors and an attention mask.
    """
    # Normalise the input to a plain list of strings for the tokenizer
    if isinstance(texts, str):
        # list("abc") would split the string into characters, so wrap it
        text_list = [texts]
    elif hasattr(texts, "tolist"):
        text_list = texts.tolist()
    else:
        text_list = list(texts)

    # Calling the tokenizer directly is the documented batch-encoding entry
    # point; it supersedes the legacy batch_encode_plus() method.
    encoded = tokenizer(
        text_list,
        add_special_tokens=True,      # [CLS], [SEP]
        max_length=max_len,
        padding='max_length',         # pad all to max_len
        truncation=True,              # truncate longer texts
        return_attention_mask=True,   # create attention masks
        return_tensors='pt',          # return PyTorch tensors
    )
    return encoded


In [14]:
# Encode both splits with the same tokenizer and the default max length
train_encodings = tokenize_texts(texts=X_train, tokenizer=tokenizer)
test_encodings = tokenize_texts(texts=X_test, tokenizer=tokenizer)

print("Tokenization complete!")
for split_name, split_encodings in (("Train", train_encodings), ("Test", test_encodings)):
    print(f"{split_name} input_ids shape:", split_encodings['input_ids'].shape)


Tokenization complete!
Train input_ids shape: torch.Size([1397, 128])
Test input_ids shape: torch.Size([350, 128])


In [15]:
# Inspect one tokenized sample
# sample_idx = 0
# print("Original text:\n", X_train.iloc[sample_idx])
# print("\nToken IDs:\n", train_encodings['input_ids'][sample_idx])
# print("\nDecoded back:\n", tokenizer.decode(train_encodings['input_ids'][sample_idx]))


In [16]:
import torch
from torch.utils.data import Dataset


In [17]:
class SpamDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to an
            indexable batch whose first dimension is the sample index.
        labels: One integer class id per sample. Accepts a pandas Series
            (any index), a NumPy array, a list/tuple, or a tensor.

    Raises:
        ValueError: If any encoding field has a different number of samples
            than ``labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings

        # Convert once to a positional long tensor. This drops any pandas
        # index (so lookups are purely positional) and avoids building a new
        # tensor on every __getitem__ call.
        label_values = labels.to_numpy() if hasattr(labels, "to_numpy") else labels
        self.labels = torch.as_tensor(label_values, dtype=torch.long)

        # A length mismatch would otherwise only surface mid-epoch as an
        # IndexError, or silently ignore trailing samples.
        for key, val in self.encodings.items():
            if len(val) != len(self.labels):
                raise ValueError(
                    f"Encoding '{key}' has {len(val)} samples but "
                    f"{len(self.labels)} labels were given"
                )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample: every encoding field at ``idx`` plus ``'labels'``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item


In [18]:
# Wrap each split's encodings and labels in a SpamDataset
train_dataset = SpamDataset(encodings=train_encodings, labels=y_train)
test_dataset = SpamDataset(encodings=test_encodings, labels=y_test)

print(" Dataset objects created successfully!")
print("Train samples:", len(train_dataset))
print("Test samples:", len(test_dataset))


 Dataset objects created successfully!
Train samples: 1397
Test samples: 350


In [19]:
from transformers import BertForSequenceClassification

# Pretrained BERT encoder with a two-class (ham / spam) classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm import tqdm

# Training hyperparameters, named so they can be tuned in one place
SEED = 42
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
num_epochs = 2

# Seed torch's global RNG so the shuffled batch order drawn by the training
# DataLoader (and dropout masks) start from the same state on every run.
torch.manual_seed(SEED)

# Device setup: use the GPU when one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# DataLoaders: shuffle only the training data; keep evaluation order fixed
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Optimizer & scheduler: linear decay over the whole run, with no warmup
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

: 

In [None]:
# Training loop
model.train()

# The context manager closes the progress bar even if a step raises, so an
# interrupted run does not leave a dangling bar in the output.
with tqdm(total=num_training_steps) as progress_bar:
    for epoch in range(num_epochs):
        for batch in train_loader:
            # Move every tensor of the batch to the model's device
            batch = {k: v.to(device) for k, v in batch.items()}

            # The batch includes 'labels', so the model output carries the loss
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            # Surface the current loss so progress (or divergence) is visible
            progress_bar.set_postfix(epoch=epoch + 1, loss=f"{loss.item():.4f}")
            progress_bar.update(1)

print("Training complete!")

  0%|          | 0/176 [00:00<?, ?it/s]