In [14]:
import pandas as pd
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import re
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [3]:

# --- Data Loading (from your provided code) ---
def load_data(file_path):
    encodings_to_try = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
    for encoding in encodings_to_try:
        try:
            data = pd.read_csv(file_path, encoding=encoding)
            print(f"Successfully loaded data with encoding: {encoding}")
            return data
        except UnicodeDecodeError:
            print(f"Failed to load with encoding: {encoding}")
            continue
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return None
    print("Could not load the file with any of the attempted encodings.")
    return None

df = load_data('spam.csv')

if df is not None:
    print("\nDataFrame Head:")
    print(df.head())
    print("\nDataFrame Info:")
    df.info()

df = df[["Spam", "Message"]]
print(df.head())


Successfully loaded data with encoding: utf-8

DataFrame Head:
   Spam                                            Message Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Spam        5572 non-null   object
 1   Message     5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12

In [4]:
# --- NLTK Downloads and Preprocessing Setup (from your provided code) ---
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stopwords = set(stopwords.words('english'))
porterStemmer = PorterStemmer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [token for token in tokens if token not in string.punctuation]
    tokens = [token for token in tokens if token not in stopwords]
    tokens = [porterStemmer.stem(token) for token in tokens]
    processed_text = ' '.join(tokens)
    processed_text = re.sub(r'http\S+|www\S+|https\S+', '', processed_text, flags=re.MULTILINE)
    processed_text = re.sub(r'\@\w+|\#', '', processed_text)
    processed_text = re.sub(r'\d+', '', processed_text)
    processed_text = re.sub(r'\s+', ' ', processed_text).strip()
    return processed_text

df['Processed_Message'] = df['Message'].apply(preprocess_text)
print(df.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   Spam                                            Message  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                   Processed_Message  
0  go jurong point crazi avail bugi n great world...  
1                              ok lar joke wif u oni  
2  free entri wkli comp win fa cup final tkt may ...  
3                u dun say earli hor u c alreadi say  
4               nah think goe usf live around though  


In [5]:
# --- Feature and Target Variable Definition ---
X = df['Processed_Message']
y = df['Spam'].map({'ham':0,'spam':1 })
# --- Oversampling with RandomOverSampler (Applied to original X and y) ---
# It's better to split first, then oversample only the training data to avoid data leakage.
# However, for simplicity and to directly follow your last provided code structure:
# Ensure X is a DataFrame for RandomOverSampler if it's a Series
ros = RandomOverSampler(random_state=42)
X_resampled_df, y_resampled = ros.fit_resample(pd.DataFrame(X), y)

# Convert X_resampled_df back to a Series or list of strings for BERT tokenization
X_resampled_list = X_resampled_df.iloc[:, 0].tolist() # Get the 'Processed_Message' column as a list
y_resampled_list = y_resampled.tolist()

print(f"Original: {Counter(y)}, Resampled: {Counter(y_resampled)}")



Original: Counter({0: 4825, 1: 747}), Resampled: Counter({0: 4825, 1: 4825})


In [6]:
# --- Train-Test Split (after oversampling) ---
# Note: When using oversampling *before* splitting, ensure your test set truly represents unseen data.
# A more robust approach often is: split -> oversample train -> tokenize train/test.
# But following your current structure, we'll split the already resampled data.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_resampled_list, y_resampled_list, test_size=0.2, random_state=42, stratify=y_resampled_list
)

print(f"\nShape of X_train_raw: {len(X_train_raw)}")
print(f"Shape of y_train_raw: {len(y_train_raw)}")
print(f"Shape of X_test_raw: {len(X_test_raw)}")
print(f"Shape of y_test_raw: {len(y_test_raw)}")
print(f"Training set distribution: {Counter(y_train_raw)}")
print(f"Test set distribution: {Counter(y_test_raw)}")



Shape of X_train_raw: 7720
Shape of y_train_raw: 7720
Shape of X_test_raw: 1930
Shape of y_test_raw: 1930
Training set distribution: Counter({1: 3860, 0: 3860})
Test set distribution: Counter({1: 965, 0: 965})


In [7]:
# --- BERT Tokenization ---
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

max_length = 64 # Define the maximum length for padding

print("\nTokenizing training data...")
input_ids_train = []
attention_masks_train = []
for sent in X_train_raw:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True, # Add '[CLS]' and '[SEP]'
        max_length=max_length,   # Pad & truncate all sentences
        padding='max_length',    # Pad to max_length
        truncation=True,         # Truncate to max_length
        return_attention_mask=True,
        return_tensors='pt',     # Return PyTorch tensors
    )
    input_ids_train.append(encoded_dict['input_ids'])
    attention_masks_train.append(encoded_dict['attention_mask'])


Tokenizing training data...


In [8]:
# Convert lists of tensors to single tensors
input_ids_train = torch.cat(input_ids_train, dim=0)
attention_masks_train = torch.cat(attention_masks_train, dim=0)
labels_train = torch.tensor(y_train_raw)


In [9]:
print("Tokenizing test data...")
input_ids_test = []
attention_masks_test = []
for sent in X_test_raw:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids_test.append(encoded_dict['input_ids'])
    attention_masks_test.append(encoded_dict['attention_mask'])

input_ids_test = torch.cat(input_ids_test, dim=0)
attention_masks_test = torch.cat(attention_masks_test, dim=0)
labels_test = torch.tensor(y_test_raw)

Tokenizing test data...


In [11]:
# Create PyTorch DataLoaders
batch_size = 32

train_dataset = TensorDataset(input_ids_train, attention_masks_train, labels_train)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

test_dataset = TensorDataset(input_ids_test, attention_masks_test, labels_test)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

print("\nBERT inputs prepared.")
print(f"Shape of input_ids_train: {input_ids_train.shape}")
print(f"Shape of labels_train: {labels_train.shape}")
print(f"Shape of input_ids_test: {input_ids_test.shape}")
print(f"Shape of labels_test: {labels_test.shape}")
print(f"Number of training batches: {len(train_dataloader)}")
print(f"Number of test batches: {len(test_dataloader)}")


BERT inputs prepared.
Shape of input_ids_train: torch.Size([7720, 64])
Shape of labels_train: torch.Size([7720])
Shape of input_ids_test: torch.Size([1930, 64])
Shape of labels_test: torch.Size([1930])
Number of training batches: 242
Number of test batches: 61


In [12]:

# --- BERT Model Fine-tuning ---
# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels = 2, # Binary classification
    output_attentions = False,
    output_hidden_states = False,
)
model.to(device)

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [21]:
# Optimizer & Scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
from transformers import get_linear_schedule_with_warmup

epochs = 3 # A common number of epochs for BERT fine-tuning, adjust if needed
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [22]:
# Function to compute metrics
def compute_metrics(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    accuracy = accuracy_score(labels_flat, pred_flat)
    f1 = f1_score(labels_flat, pred_flat, average='binary') # 'binary' for 2 classes
    precision = precision_score(labels_flat, pred_flat, average='binary')
    recall = recall_score(labels_flat, pred_flat, average='binary')
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [23]:
# Training loop
print("\nStarting BERT fine-tuning...")
for epoch_i in range(0, epochs):
    print(f'======== Epoch {epoch_i + 1} / {epochs} ========')
    print('Training...')

    total_train_loss = 0
    model.train() # Set model to training mode

    for step, batch in enumerate(train_dataloader):
        if step % 50 == 0 and not step == 0:
            print(f'  Batch {step} of {len(train_dataloader)}.')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad() # Clear previously calculated gradients

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels) # labels are used to calculate loss internally by BertForSequenceClassification

        loss = outputs.loss # Get the loss
        total_train_loss += loss.item() # Add to total loss
        loss.backward() # Perform a backward pass to calculate gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Clip the norm of the gradients
        optimizer.step() # Update model parameters
        scheduler.step() # Update the learning rate

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"\n  Average training loss: {avg_train_loss:.4f}")

    print("\nRunning Validation...")
    model.eval() # Set model to evaluation mode

    eval_accuracy = []
    eval_f1 = []
    eval_precision = []
    eval_recall = []
    
    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad(): # Disable gradient calculation for evaluation
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
        
        logits = outputs.logits # Get the logits (raw output scores)
        
        # Move logits and labels to CPU to compute metrics
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        metrics = compute_metrics(logits, label_ids)
        eval_accuracy.append(metrics['accuracy'])
        eval_f1.append(metrics['f1'])
        eval_precision.append(metrics['precision'])
        eval_recall.append(metrics['recall'])

    print(f"  Validation Accuracy: {np.mean(eval_accuracy):.4f}")
    print(f"  Validation F1 Score: {np.mean(eval_f1):.4f}")
    print(f"  Validation Precision: {np.mean(eval_precision):.4f}")
    print(f"  Validation Recall: {np.mean(eval_recall):.4f}")

print("\nTraining complete!")


Starting BERT fine-tuning...
Training...
  Batch 50 of 242.
  Batch 100 of 242.
  Batch 150 of 242.
  Batch 200 of 242.

  Average training loss: 0.0697

Running Validation...
  Validation Accuracy: 0.9928
  Validation F1 Score: 0.9930
  Validation Precision: 0.9954
  Validation Recall: 0.9910
Training...
  Batch 50 of 242.
  Batch 100 of 242.
  Batch 150 of 242.
  Batch 200 of 242.

  Average training loss: 0.0129

Running Validation...
  Validation Accuracy: 0.9933
  Validation F1 Score: 0.9931
  Validation Precision: 0.9887
  Validation Recall: 0.9980
Training...
  Batch 50 of 242.
  Batch 100 of 242.
  Batch 150 of 242.
  Batch 200 of 242.

  Average training loss: 0.0052

Running Validation...
  Validation Accuracy: 0.9954
  Validation F1 Score: 0.9953
  Validation Precision: 0.9908
  Validation Recall: 1.0000

Training complete!


In [24]:
model.save_pretrained('./fine_tuned_bert_model/')
tokenizer.save_pretrained('./fine_tuned_bert_model/')

('./fine_tuned_bert_model/tokenizer_config.json',
 './fine_tuned_bert_model/special_tokens_map.json',
 './fine_tuned_bert_model/vocab.txt',
 './fine_tuned_bert_model/added_tokens.json')

In [25]:
loaded_model = BertForSequenceClassification.from_pretrained('./fine_tuned_bert_model/')
loaded_tokenizer = BertTokenizer.from_pretrained('./fine_tuned_bert_model/')

In [26]:
from sklearn.metrics import classification_report
# Evaluate the model on the test set
model.eval()
predictions, true_labels = [], []
for batch in test_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)
predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
print("\nClassification Report:")
print(classification_report(np.argmax(predictions, axis=1), true_labels, target_names=['ham', 'spam']))
# Print shapes of the tensors
print(f"Shape of input_ids_train: {input_ids_train.shape}")
print(f"Shape of labels_train: {labels_train.shape}")
print(f"Shape of input_ids_test: {input_ids_test.shape}")
print(f"Shape of labels_test: {labels_test.shape}") 


Classification Report:
              precision    recall  f1-score   support

         ham       0.99      1.00      1.00       956
        spam       1.00      0.99      1.00       974

    accuracy                           1.00      1930
   macro avg       1.00      1.00      1.00      1930
weighted avg       1.00      1.00      1.00      1930

Shape of input_ids_train: torch.Size([7720, 64])
Shape of labels_train: torch.Size([7720])
Shape of input_ids_test: torch.Size([1930, 64])
Shape of labels_test: torch.Size([1930])
