In [1]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Inference in this cell is pinned to the CPU.
device = torch.device("cpu")

# Build the two-label BERT classifier, then overwrite its weights with the
# fine-tuned checkpoint (the freshly initialised classifier head reported by
# from_pretrained is replaced by load_state_dict).
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,  # Adjust num_labels as needed
)
model.load_state_dict(
    torch.load('path to fine-tuned bert model', map_location=device)
)
# Move to the target device and switch to evaluation mode in one step.
model.to(device).eval()

# Tokenizer matching the base checkpoint's vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define the prediction function
def predict_email(text, model, tokenizer, max_len=128):
    """Classify a single email and return the predicted label index.

    Args:
        text: Raw email text to classify.
        model: Sequence-classification model. It is called with ``input_ids``
            plus ``attention_mask`` and ``token_type_ids`` keywords and must
            return an object exposing ``.logits``.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length every input is padded or truncated to.

    Returns:
        int: Index of the largest logit for the (single-item) batch.
    """
    # Tokenize the input text into fixed-length id / mask / segment lists
    encoded = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True
    )

    # Place the inputs on the same device as the model that was passed in,
    # instead of relying on a notebook-global `device` that may be stale or
    # undefined. Parameter-less models fall back to the CPU.
    try:
        model_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        model_device = torch.device("cpu")

    # Convert each field to a (1, max_len) tensor: unsqueeze adds the batch axis
    input_ids = torch.tensor(encoded['input_ids']).unsqueeze(0).to(model_device)
    attention_mask = torch.tensor(encoded['attention_mask']).unsqueeze(0).to(model_device)
    token_type_ids = torch.tensor(encoded['token_type_ids']).unsqueeze(0).to(model_device)

    # Predict without tracking gradients
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        prediction = torch.argmax(outputs.logits, dim=1).item()

    return prediction

# Example usage: classify one email with the `model` and `tokenizer` loaded
# in the first cell. Replace the placeholder string with the email body.
email_text = ''' put your email text here '''
prediction = predict_email(email_text, model, tokenizer)
# `prediction` is the integer class index returned by predict_email.
print(f"Predicted label: {prediction}")


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import pandas as pd
import os

# Function to preprocess text
def preprocess_text(text, tokenizer, max_len=128, target_device=None):
    """Tokenize one text into single-item batch tensors for a BERT-style model.

    Args:
        text: Raw text to encode.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length the encoding is padded or truncated to.
        target_device: Device the tensors are moved to. When omitted, the
            notebook-level ``device`` variable is used (looked up at call
            time), which preserves the behaviour of existing callers.

    Returns:
        tuple: ``(input_ids, attention_mask, token_type_ids)``, each a tensor
        with a leading batch axis of size 1.
    """
    if target_device is None:
        # Backward-compatible fallback to the global set elsewhere in the notebook.
        target_device = device

    encoded = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True
    )

    def _as_batch(key):
        # unsqueeze(0) adds the batch axis the model expects
        return torch.tensor(encoded[key]).unsqueeze(0).to(target_device)

    return _as_batch('input_ids'), _as_batch('attention_mask'), _as_batch('token_type_ids')

# Use the GPU when one is visible to torch, otherwise stay on the CPU,
# and move the already-loaded model there.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Email type -> numerical label
label_mapping = {'Safe Email': 0, 'Phishing Email': 1}

# Read the dataset, attach the numeric `label` column, then discard rows
# whose 'Email Text' is null.
dataset = (
    pd.read_csv(r"cleaned_dataset.csv")
    .assign(label=lambda frame: frame['Email Type'].map(label_mapping))
    .dropna(subset=['Email Text'])
)

# Checkpointing: reload the labels written by a previous (interrupted) run so
# that only the not-yet-scored rows are processed below.
checkpoint_path = r'progress_checkpoint.csv'
if os.path.exists(checkpoint_path):
    progress_df = pd.read_csv(checkpoint_path)
    true_labels = progress_df['true_labels'].tolist()
    pred_labels = progress_df['pred_labels'].tolist()
else:
    true_labels = []
    pred_labels = []
    progress_df = pd.DataFrame(columns=['true_labels', 'pred_labels'])
start_index = len(progress_df)
total_rows = len(dataset)

# Report where this run starts
if start_index >= total_rows:
    print("Checkpoint indicates all data processed.")
else:
    print(f"Resuming from index: {start_index}")

# Make predictions
model.eval()
with torch.no_grad():
    # enumerate(start=...) only offsets the counter; it does not skip rows.
    # Slice positionally so a resumed run continues with the first unscored
    # row instead of re-scoring the beginning of the dataset.
    remaining_rows = dataset.iloc[start_index:]
    for idx, (_, row) in enumerate(remaining_rows.iterrows(), start=start_index):
        text = row['Email Text']
        true_label = row['label']
        input_ids, attention_mask, token_type_ids = preprocess_text(text, tokenizer)

        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        prediction = torch.argmax(outputs.logits, dim=1).item()

        true_labels.append(true_label)
        pred_labels.append(prediction)

        if idx % 100 == 0:  # Save every 100 steps
            # Build the frame from the lists on demand rather than growing a
            # DataFrame row by row with pd.concat (quadratic in row count).
            pd.DataFrame(
                {'true_labels': true_labels, 'pred_labels': pred_labels}
            ).to_csv(checkpoint_path, index=False)
            print(f'Processed {idx + 1} / {total_rows}')

# Final save: one row per scored email, in dataset order
progress_df = pd.DataFrame({'true_labels': true_labels, 'pred_labels': pred_labels})
progress_df.to_csv(checkpoint_path, index=False)

# Drop rows where either label is missing (e.g. an email type that is not in
# the label mapping), then normalise both columns to plain ints: after a CSV
# round trip they may be floats or objects.
cleaned_df = progress_df.dropna()
true_labels = cleaned_df['true_labels'].astype(int).tolist()
pred_labels = cleaned_df['pred_labels'].astype(int).tolist()

# Class ids used throughout the notebook: 0 = safe, 1 = phishing
class_ids = [0, 1]

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, pred_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='weighted')
# Pin the label set so the matrix is always 2x2, even when only one class
# appears in the evaluated rows; otherwise the labelled DataFrame below fails
# with a shape mismatch.
conf_matrix = confusion_matrix(true_labels, pred_labels, labels=class_ids)

# Print evaluation metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

# Optional: Convert confusion matrix to a DataFrame for better visualization
conf_matrix_df = pd.DataFrame(conf_matrix, index=['True_Safe', 'True_Phishing'], columns=['Pred_Safe', 'Pred_Phishing'])
print(conf_matrix_df)
