In [None]:
!pip install transformers
!pip install scikit-learn
!pip install torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.nn import CrossEntropyLoss
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments


In [None]:
# Load the raw dataset; the cell reads the 'Category' and 'Message' columns.
raw_df = pd.read_csv('/content/Email_Balanced (1).csv')  # Your CSV file here

# Rows missing a category or a message cannot be used. Dropping them up front also
# stops str(NaN) from turning a missing message into the literal text "nan".
df = raw_df.dropna(subset=['Category', 'Message']).copy()

# Encode labels: 'ham' = 0, 'spam' = 1
df['label'] = df['Category'].map({'ham': 0, 'spam': 1})

# Series.map leaves NaN for any category outside the mapping (e.g. different casing
# or stray whitespace). Fail loudly here instead of training on NaN/float labels.
unknown_rows = df['label'].isna()
if unknown_rows.any():
    unknown_values = sorted(df.loc[unknown_rows, 'Category'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Category' column: {unknown_values}")
df['label'] = df['label'].astype(int)

# Basic text cleaning
df['Message'] = df['Message'].astype(str).str.lower()

# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['Message'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42
)



In [None]:
# 'balanced' class weights computed from the training labels, stored as a float
# tensor so they can be passed to the loss function.
label_classes = np.unique(y_train)
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=label_classes,
    y=y_train,
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float)
print("Class Weights:", class_weights)


Class Weights: tensor([1.0008, 0.9992])


In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Both splits are tokenized with the same settings: truncate to at most 512 tokens
# and pad the sequences within each call to a common length.
tokenize_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(list(X_train), **tokenize_options)
test_encodings = tokenizer(list(X_test), **tokenize_options)

class EmailDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values; only ``.items()`` and positional indexing of each value
            are used.
        labels: One class id per example. Any iterable is accepted (list,
            NumPy array, pandas Series); it is copied into a list.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise as a list so __getitem__ always indexes by position. A pandas
        # Series would otherwise be indexed by its (shuffled) index labels.
        self.labels = list(labels)

    def __getitem__(self, idx):
        """Return the features and label of example ``idx`` as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force int64: float labels would otherwise yield float targets, which
        # CrossEntropyLoss does not accept as class indices.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# Wrap each split's encodings together with its labels (passed as plain lists).
train_dataset = EmailDataset(encodings=train_encodings, labels=list(y_train))
test_dataset = EmailDataset(encodings=test_encodings, labels=list(y_test))

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.


In [None]:
# Load the 'distilbert-base-uncased' checkpoint configured for two output labels.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook's ``class_weights``.

    ``class_weights`` is read from the notebook's global namespace, so the cell
    that computes it must have run before training starts.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; must contain ``"labels"``, which is removed
                from the dict before the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted as part of the method signature;
                unused here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Take the device from the logits rather than `model.device`: when the model
        # is wrapped (e.g. torch.nn.DataParallel on multi-GPU) the wrapper has no
        # `.device` attribute, while the logits always live where the loss is computed.
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss


In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    # Evaluate on the eval dataset at the end of every epoch.
    eval_strategy="epoch",
    # Log the training loss once per epoch as well. With the default step-based
    # logging interval, short runs never reach a logging step and the results
    # table shows "No log" in the Training Loss column.
    logging_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

In [None]:
# Collect the trainer's inputs in one place, then build it. The test split is
# used as the evaluation dataset during training.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
}
trainer = CustomTrainer(**trainer_kwargs)

trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnagavallikareethu[0m ([33mnagavallikareethu-nri-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.075288
2,No log,0.114822


Epoch,Training Loss,Validation Loss
1,No log,0.075288
2,No log,0.114822


In [None]:
# Run the trained model on the test split and pick the highest-scoring class per row.
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=1)

# Compute every metric first, then report them together.
accuracy = accuracy_score(y_test, pred_labels)
report = classification_report(y_test, pred_labels)
matrix = confusion_matrix(y_test, pred_labels)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)


In [None]:
# Save the model and its tokenizer side by side so the folder can be reloaded
# with from_pretrained.
save_dir = './fine_tuned_distilbert_spam_model/'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./fine_tuned_distilbert_spam_model/tokenizer_config.json',
 './fine_tuned_distilbert_spam_model/special_tokens_map.json',
 './fine_tuned_distilbert_spam_model/vocab.txt',
 './fine_tuned_distilbert_spam_model/added_tokens.json')

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Reload the tokenizer and the fine-tuned model from the folder written by the save step.
model_path = "./fine_tuned_distilbert_spam_model/"  # This is the folder where you saved the model
tokenizer = DistilBertTokenizer.from_pretrained(model_path)

# eval() returns the module itself, so loading and switching to evaluation mode
# can be chained into one statement.
model = DistilBertForSequenceClassification.from_pretrained(model_path).eval()

# Function to make predictions
def predict_email(text):
    """Classify a single email text as ham or spam.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Email content to classify.

    Returns:
        Tuple ``(label, confidence)``. ``label`` is ``"Ham"`` when the predicted
        class is 0 and ``"Spam"`` otherwise; ``confidence`` is the softmax
        probability of the predicted class as a float.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Put the input tensors on the model's device so inference also works when the
    # model has been moved to a GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    # Score the first (only) row explicitly. An argmax over the whole flattened
    # tensor is only a valid class index when the batch holds exactly one row.
    probs = torch.nn.functional.softmax(outputs.logits, dim=1)[0]
    predicted_class = torch.argmax(probs).item()

    label = "Ham" if predicted_class == 0 else "Spam"
    confidence = probs[predicted_class].item()

    return label, confidence

# Ask for an email body interactively.
prompt = "Enter the email content to check (Ham/Spam): "
user_input = input(prompt)

# Classify the entered text and report the label with its confidence as a percentage.
label, confidence = predict_email(user_input)
print(f"\nPrediction: {label} (Confidence: {confidence*100:.2f}%)")


Enter the email content to check (Ham/Spam): Congratulations! You’ve won $1 million in our international lottery. Send your personal details to claim your prize.

Prediction: Spam (Confidence: 99.89%)
