# 1. Install Required Libraries

In [None]:
!pip install -q transformers==4.44.2 datasets torch scikit-learn pandas numpy matplotlib seaborn tqdm huggingface_hub

# 2. Hugging Face Login (optional — skip if you do not have a Hugging Face account)

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# notebook_login() prompts for an access token inside the notebook.
from huggingface_hub import notebook_login
notebook_login()

# 3. Imports & Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import torch
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_curve, auc
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    AutoModelForCausalLM, TrainingArguments, Trainer, EarlyStoppingCallback, DistilBertConfig
)
from datasets import Dataset, load_dataset
import re
import warnings
import os
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')
# Set WANDB_DISABLED before any training code runs.
os.environ["WANDB_DISABLED"] = "true"

# 4. Device configuration

In [None]:
# Choose the compute device once; later cells move models and tensors onto it.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
if has_cuda:
    # Allow TF32 kernels for both matmul and cuDNN.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print("GPU:", torch.cuda.get_device_name(0),
          f"({total_gb:.1f} GB)")
print("Using device:", device)

# 5. Set random seeds for reproducibility

In [None]:
# Seed Python, NumPy and PyTorch with one shared value.
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
if torch.cuda.is_available():
    # Seed every visible CUDA device as well.
    torch.cuda.manual_seed_all(SEED)

# 6. Load Dataset from Hugging Face

In [None]:
# Load the train split and convert it to a pandas DataFrame for the cleaning steps below.
print("Loading dataset...")
dataset = load_dataset("zefang-liu/phishing-email-dataset", split='train')
df = dataset.to_pandas()

# Quick look at the raw label column before any cleaning.
email_type_col = df['Email Type']
print("Raw dataset shape:", df.shape)
print("Email Type unique values:", email_type_col.unique())
print("\nLabel distribution before cleaning:\n", email_type_col.value_counts())

# 7. Label mapping (robust for any label names)

In [None]:
# Build the class-name -> integer id mapping from the labels that are actually present.
#
# Rows with a missing 'Email Type' are removed first. If NaN stays in unique(), it becomes
# a dictionary key with its own class id, so those rows are never NaN in 'label' and the
# later dropna(subset=['label']) cannot remove them.
missing_type = df['Email Type'].isna()
if missing_type.any():
    print("Dropping rows with missing Email Type:", int(missing_type.sum()))
    df = df[~missing_type].reset_index(drop=True)

observed_types = list(df['Email Type'].unique())

# Later cells treat class 1 as phishing (plot titles, augmentation fallbacks, ROC column),
# so pin that order when exactly the two expected names are present. For any other label
# set, fall back to order of first appearance as before.
canonical_order = ['Safe Email', 'Phishing Email']
if set(observed_types) == set(canonical_order):
    observed_types = canonical_order

label_map = {name: i for i, name in enumerate(observed_types)}
df['label'] = df['Email Type'].map(label_map)
print("Label mapping used:", label_map)
print("Labels after mapping:", df['label'].value_counts(dropna=False))

# 8. Enhanced Preprocessing

In [None]:
# 'http\S+' already matches everything 'https\S+' would, so the extra alternative was dead.
_url_re = re.compile(r'http\S+|www\S+', flags=re.MULTILINE)
_email_re = re.compile(r'\S+@\S+')
# Group 1 protects the placeholders inserted below; every other character that is not a
# letter, whitespace, '.', '?' or '!' is replaced by a space.
_special_re = re.compile(r'(\[URL\]|\[EMAIL\])|[^a-zA-Z\s\.\?\!]')
_spaces_re = re.compile(r'\s+')

def preprocess_text(text):
    """Normalise one raw email body for the classifier.

    Steps, in order: lower-case and strip; replace URLs with ``[URL]`` and e-mail
    addresses with ``[EMAIL]``; replace every remaining character other than letters,
    whitespace and ``. ? !`` with a space; collapse whitespace runs.

    The text is lower-cased before the upper-case placeholders are inserted, so a
    literal "[url]" typed in the original message is not mistaken for a placeholder.

    Args:
        text: The raw email text. Any non-string value (e.g. None/NaN) is treated as empty.

    Returns:
        The cleaned string, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()
    text = _url_re.sub(' [URL] ', text)
    text = _email_re.sub(' [EMAIL] ', text)
    # Keep the placeholders intact; previously their brackets were stripped here,
    # leaving a bare upper-case word in otherwise lower-case text.
    text = _special_re.sub(lambda m: m.group(1) or ' ', text)
    text = _spaces_re.sub(' ', text).strip()
    return text

# 9. Apply preprocessing

In [None]:
# Clean every email body element-wise; non-string entries come back as "".
df['Email Text'] = df['Email Text'].map(preprocess_text)

# 10. Data cleaning

In [None]:
print("Before label cleaning:", df.shape)
# Drop rows without a label and force an integer dtype. A column that ever held NaN is
# float64, and the classifier needs integer class ids.
df = df.dropna(subset=['label']).assign(label=lambda frame: frame['label'].astype(int))
print("After label cleaning:", df.shape)

# Texts of 10 characters or fewer carry almost no signal after preprocessing.
df = df[df['Email Text'].str.len() > 10]
print("After removing short texts:", df.shape)

# Remove exact duplicate texts BEFORE the train/test split. Otherwise the same email can
# end up on both sides of the split and inflate the evaluation metrics.
df = df.drop_duplicates(subset=['Email Text']).reset_index(drop=True)
print("After removing duplicate texts:", df.shape)

print("NaN in label:", df['label'].isna().sum())
print("NaN in text:", df['Email Text'].isna().sum())

# 11. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing, keeping the class proportions equal in both parts.
model_frame = df[['Email Text', 'label']]
train_df, test_df = train_test_split(
    model_frame,
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

first_train_text = train_df['Email Text'].iloc[0]
print("Sample preprocessed email:\n", first_train_text[:200], "...")

# 12. Data Augmentation with DistilGPT2 (Colab-optimized)

In [None]:
# Load the text generator used only for augmentation.
print("\nLoading DistilGPT2 for data augmentation...")
gen_model_name = 'distilgpt2'

gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
# Reuse the end-of-sequence token as the padding token.
gen_tokenizer.pad_token = gen_tokenizer.eos_token

gen_model = AutoModelForCausalLM.from_pretrained(gen_model_name).to(device)
# The generator is never trained here, so switch off gradients on all of its parameters.
gen_model.requires_grad_(False)

def generate_email(prompt, max_length=100, temperature=0.9, top_p=0.9):
    """Sample a continuation of ``prompt`` from the module-level DistilGPT2 generator.

    Args:
        prompt: Text the generated email should start from.
        max_length: Forwarded to ``generate`` as ``max_length``.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.

    Returns:
        Only the newly generated text (prompt excluded), with surrounding whitespace
        stripped.
    """
    # Calling the tokenizer (instead of .encode) also yields the attention mask, which is
    # passed to generate() explicitly because pad and EOS share one token id here.
    encoded = gen_tokenizer(prompt, return_tensors='pt').to(device)
    prompt_token_count = encoded['input_ids'].shape[1]
    with torch.no_grad():
        outputs = gen_model.generate(
            **encoded,
            max_length=max_length,
            temperature=temperature,
            do_sample=True,
            top_p=top_p,
            pad_token_id=gen_tokenizer.eos_token_id,
            repetition_penalty=1.2,
            num_return_sequences=1
        )
    # Cut the prompt off by token count, not by character count of the decoded string.
    # Decoding is not guaranteed to reproduce the prompt character-for-character, so
    # slicing with len(prompt) could drop or leak characters.
    new_tokens = outputs[0][prompt_token_count:]
    return gen_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

phishing_prompts = [
    "Urgent: Your account security has been compromised. Please verify your identity immediately at: ",
    "Important: Unusual login activity detected on your account. Confirm your details here: ",
    "Alert: Your subscription payment failed. Update your payment information now: "
]
safe_prompts = [
    "Hello, here's our monthly newsletter with updates on our services. ",
    "Thank you for your recent inquiry. We'll get back to you within 24 hours. ",
    "Reminder: Your upcoming appointment is scheduled for tomorrow at 3 PM. "
]

num_aug = 60  # Reduced for Colab memory


def _generate_augmented(prompts, total, tag):
    """Generate about ``total`` synthetic emails, split evenly across ``prompts``.

    Each email is the prompt plus a sampled continuation, passed through
    ``preprocess_text`` so that synthetic rows get the same cleaning as the real
    training rows. A generation that raises is reported and skipped.

    Args:
        prompts: Seed prompts to continue.
        total: Target number of emails; each prompt gets ``total // len(prompts)``.
        tag: Short name shown in the progress-bar description.

    Returns:
        List of cleaned synthetic email strings.
    """
    per_prompt = total // len(prompts)
    emails = []
    for prompt in prompts:
        for _ in tqdm(range(per_prompt), desc=f'{tag}: {prompt[:20]}'):
            try:
                continuation = generate_email(prompt, max_length=80)
            except Exception as e:
                # Best effort: one failed sample must not abort the whole augmentation run.
                print("Error:", e)
                continue
            emails.append(preprocess_text(prompt + continuation))
    return emails


print("Generating augmented phishing emails...")
aug_phishing = _generate_augmented(phishing_prompts, num_aug, 'Phishing')

print("Generating augmented safe emails...")
aug_safe = _generate_augmented(safe_prompts, num_aug, 'Safe')

aug_phishing_df = pd.DataFrame({'Email Text': aug_phishing, 'label': label_map.get('Phishing Email', 1)})
aug_safe_df = pd.DataFrame({'Email Text': aug_safe, 'label': label_map.get('Safe Email', 0)})

# Synthetic rows are added to the training split only; the test split stays untouched.
train_aug_df = pd.concat([train_df, aug_phishing_df, aug_safe_df], ignore_index=True)
train_aug_df = train_aug_df.sample(frac=1, random_state=42).reset_index(drop=True)
print("Augmented train size:", len(train_aug_df))
if len(aug_phishing_df) > 0:
    print("Sample augmented phishing:\n", aug_phishing_df['Email Text'].iloc[0][:200], "...")
else:
    # Every generation failed; avoid an IndexError on the empty frame.
    print("No augmented phishing emails were generated.")

# 13. Visualization

In [None]:
# Side-by-side class counts: the cleaned dataset (left) and the augmented training set (right).
fig, (ax_original, ax_augmented) = plt.subplots(1, 2, figsize=(10, 4))

sns.countplot(data=df, x='label', ax=ax_original)
ax_original.set_title('Original: Phishing (1) vs Safe (0)')

sns.countplot(data=train_aug_df, x='label', ax=ax_augmented)
ax_augmented.set_title('Augmented: Phishing (1) vs Safe (0)')

fig.tight_layout()
plt.show()

# 14. Free up memory

In [None]:
# The generator is no longer needed: drop both references, then release cached GPU blocks.
del gen_model
del gen_tokenizer
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# 15. Load DistilBERT Tokenizer and Model

In [None]:
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of rows for the classifier.

    Reads the 'Email Text' column of ``examples`` and returns the tokenizer output,
    truncated and padded to exactly 128 tokens per row.
    """
    email_texts = examples['Email Text']
    return tokenizer(
        email_texts,
        max_length=128,
        padding='max_length',
        truncation=True,
    )

def _build_torch_dataset(frame):
    """Convert a text/label DataFrame into a tokenized dataset that yields torch tensors."""
    ds = Dataset.from_pandas(frame[['Email Text', 'label']])
    ds = ds.map(tokenize_function, batched=True)
    # Expose only the model inputs and the label, as torch tensors.
    ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    return ds

train_dataset = _build_torch_dataset(train_aug_df)
test_dataset = _build_torch_dataset(test_df)
print(f"Tokenized train: {len(train_dataset)}, Test: {len(test_dataset)}")

# DistilBERT names its dropout settings `dropout` and `attention_dropout`.
# `hidden_dropout_prob` / `attention_probs_dropout_prob` are BERT field names; passed to
# DistilBertConfig they are stored as unused extra attributes, so the model silently kept
# its default dropout instead of the intended 0.2.
config = DistilBertConfig.from_pretrained(
    model_name,
    num_labels=2,
    dropout=0.2,
    attention_dropout=0.2
)
# Load the pretrained encoder with a classification head and move it to the chosen device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config
).to(device)

# 16. TrainingArguments (Colab-optimized)

In [None]:
# Training configuration. Effective train batch size per device is 8 * 2 = 16 because of
# gradient accumulation.
training_args = TrainingArguments(
    output_dir='./phishguard-results',
    num_train_epochs=2,
    per_device_train_batch_size=8,     # Lowered for Colab RAM
    per_device_eval_batch_size=16,     # Lowered for Colab RAM
    gradient_accumulation_steps=2,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=30,
    # `eval_strategy` is the current name; `evaluation_strategy` is deprecated in the
    # pinned transformers release (4.44.2).
    eval_strategy='epoch',
    save_strategy='epoch',           # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    # The string "none" disables all reporting integrations. Passing None does NOT: in
    # transformers 4.x it falls back to "all installed integrations".
    report_to="none",
    fp16=torch.cuda.is_available(),
    dataloader_pin_memory=False,
)

# 17. Metrics

In [None]:
def compute_metrics(pred):
    """Compute weighted F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        Dict with keys 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'f1': f1_score(y_true, y_pred, average='weighted'),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
    }

# 18. Trainer

In [None]:
# Stop early if the monitored metric fails to improve for two consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# NOTE(review): the test split doubles as the evaluation set used for checkpoint selection.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# 19. Train!

In [None]:
# Run the fine-tuning loop configured above. The call is left as the cell's last
# expression so its return value is displayed as the cell output.
print("Starting training...")
trainer.train()

# 20. Save Model & Tokenizer

In [None]:
# Write the model and then the tokenizer into the same directory, so the pair can be
# reloaded together for inference.
output_path = './phishguard-model'
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(output_path)
print("Model trained and saved!")

# 21. Evaluation

In [None]:
print("Evaluating model...")
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# Order class names by integer id, so report rows and heatmap ticks line up with the ids
# instead of depending on dictionary insertion order.
class_names = [name for name, _ in sorted(label_map.items(), key=lambda item: item[1])]

print("\nClassification Report:")
print(classification_report(labels, preds, target_names=class_names))

cm = confusion_matrix(labels, preds)

# Convert logits to class probabilities with a numerically stable softmax. The raw logit
# of one class is not a valid ranking score on its own: the decision depends on it
# relative to the other class's logit.
logits = np.asarray(predictions.predictions, dtype=np.float64)
shifted = logits - logits.max(axis=1, keepdims=True)
exp_logits = np.exp(shifted)
class_probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)

# Score the ROC on the phishing class, falling back to id 1 if that name is absent.
positive_id = label_map.get('Phishing Email', 1)
fpr, tpr, _ = roc_curve(labels, class_probs[:, positive_id], pos_label=positive_id)
roc_auc = auc(fpr, tpr)

fig, (ax_cm, ax_roc) = plt.subplots(1, 2, figsize=(10, 4))

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax_cm)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('True')
ax_cm.set_xlabel('Predicted')

ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal = performance of a random classifier.
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('Receiver Operating Characteristic')
ax_roc.legend(loc="lower right")

fig.tight_layout()
plt.show()

print(f"Overall F1-Score: {f1_score(labels, preds, average='weighted'):.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

# 22. Inference Function

In [None]:
# Reload the saved artifacts from disk, so inference does not depend on the in-memory trainer.
loaded_tokenizer = AutoTokenizer.from_pretrained('./phishguard-model')
loaded_model = AutoModelForSequenceClassification.from_pretrained('./phishguard-model').to(device)

def predict_phishing(email_text):
    """Classify one raw email string with the reloaded model.

    The text goes through the same ``preprocess_text`` cleaning and the same
    128-token tokenization used for training.

    Args:
        email_text: Raw email body.

    Returns:
        Tuple ``(label, confidence, probs)``: the predicted class name from
        ``label_map``, the probability of that class, and the full per-class
        probability vector as a NumPy array.
    """
    cleaned = preprocess_text(email_text)
    encoded = loaded_tokenizer(
        cleaned,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    ).to(device)

    with torch.no_grad():
        logits = loaded_model(**encoded).logits

    probs = torch.softmax(logits, dim=-1)
    pred = torch.argmax(probs, dim=-1).item()

    # Reverse lookup: class id -> class name.
    matching_names = [name for name, class_id in label_map.items() if class_id == pred]
    label = matching_names[0]
    confidence = probs[0][pred].item()
    return label, confidence, probs.cpu().numpy()[0]

# 23. Test Model

In [None]:
# Hand-written examples: two phishing-style messages and two ordinary ones.
test_emails = [
    "Urgent: Your account is compromised. Click http://fake.com/reset now!",
    "Hi, meeting at 3 PM tomorrow. Regards, John.",
    "Congratulations! You've won a $1000 gift card. Claim your prize at: http://prize.xyz",
    "Your invoice #INV-2023-987 is ready for payment. Please review the attached document.",
]

print("\nTesting model with sample emails:")
for i, email in enumerate(test_emails):
    label, confidence, probs = predict_phishing(email)
    # Assemble the four report lines, then emit them in a single print call.
    report_lines = [
        f"\nEmail {i+1}:",
        f"Text: {email[:100]}...",
        f"Prediction: {label} (Confidence: {confidence:.4f})",
        f"Probabilities: {probs}",
    ]
    print("\n".join(report_lines))

print("\nPhishGuard training and evaluation completed!")