# Import libraries

In [None]:
import os
import sys
import warnings
import pandas as pd
import json
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

# Hide warnings: install a blanket 'ignore' filter so that no warning of any
# category is shown in the cell outputs below.
warnings.filterwarnings('ignore')


# Import and prepare dataset

In [None]:
# Load the test split of the full dataset.
test_df = pd.read_parquet('/data/workspace/dataset/full-dataset/raw/test.parquet')

# Missing subjects / plain-text bodies become empty strings.
for text_column in ("Subject", "text_plain"):
    test_df[text_column] = test_df[text_column].fillna("")

# Drop every row whose 'target_3' value is 'self_phishing'.
not_self_phishing = test_df['target_3'] != 'self_phishing'
test_df = test_df[not_self_phishing]

test_df.head()

In [None]:
# Keep only the fields needed for the Phishsense payload. reset_index() moves
# the current index into a regular column.
# NOTE(review): the payload-building loop reads row['path'], so the index is
# presumably named 'path' -- confirm against the parquet file.
payload_columns = ['Subject', 'text_plain', 'target_1']
json_df = test_df[payload_columns].copy().reset_index()
json_df.head()

In [None]:
# Build the Phishsense request payload: emails are grouped by their
# 'target_1' label and then keyed by path.
output = {'malicious': {}, 'benign': {}}

# Walk the needed columns in lockstep instead of using DataFrame.iterrows(),
# which constructs a new Series for every row and is slow on a full dataset.
for label, path, subject, body in zip(
    json_df['target_1'],
    json_df['path'],
    json_df['Subject'],
    json_df['text_plain'],
):
    # Each email is stored as single-element lists of subjects and bodies.
    # A label other than 'malicious' / 'benign' raises KeyError here, and a
    # repeated path overwrites the earlier entry for that label.
    output[label][path] = {
        'subjects': [subject],
        'bodies': [body]
    }

# Save to JSON file (encoding pinned so the result does not depend on the
# platform's locale default).
with open("/data/workspace/dataset/phishsense/phishsense_input.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)

# Run Phishsense Model

Open a terminal and run the following commands:
1. `cd /data/phishsense-1/app` 
2. `source ./venv/bin/activate` 
3. `python -uB src/app.py`

The following curl command sends the input to Phishsense and saves the model's predictions to a file.

In [None]:
# Send phishsense_input.json as the request body to localhost:8080 and write the response to phishsense_output.json.
!curl \
    -H "Content-Type: application/json" \
    -d @/data/workspace/dataset/phishsense/phishsense_input.json \
    -o /data/workspace/dataset/phishsense/phishsense_output.json \
    localhost:8080


# Read in Phishsense's predictions

In [None]:
# Read in the phishsense output JSON file.
# The file is the raw response saved by curl; JSON exchanged between systems
# is UTF-8, so decode it explicitly rather than relying on the platform's
# locale-dependent default encoding.
with open("/data/workspace/dataset/phishsense/phishsense_output.json", "r", encoding="utf-8") as f:
    phishsense_prediction = json.load(f)

In [None]:
# Flatten the nested prediction JSON (label -> path -> {"body", "subject"})
# into one flat record per email.

# Phishsense category name -> suffix used in the DataFrame column names.
category_suffixes = {
    "CEO Fraud": "ceo_fraud",
    "Legitimate": "legitimate",
    "Phishing": "phishing",
    "Spam": "spam",
}

rows = []

for label, emails in phishsense_prediction.items():
    for path, data in emails.items():
        record = {"path": path}
        # Body scores first, then subject scores; a category missing from the
        # response is recorded as 0.
        for part in ("body", "subject"):
            part_scores = data[part]
            for category, suffix in category_suffixes.items():
                record[f"{part}_{suffix}"] = part_scores.get(category, 0)
        # Keep the top-level key (benign/malicious) each email was filed under.
        record["label"] = label
        rows.append(record)

# Convert to DataFrame
phishsense_pred_df = pd.DataFrame(rows)

In [None]:
# Collapse the four Phishsense categories into a binary verdict:
# CEO fraud / phishing -> 'malicious', legitimate / spam -> 'benign'.
body_map = {
    'body_ceo_fraud': 'malicious',
    'body_phishing': 'malicious',
    'body_legitimate': 'benign',
    'body_spam': 'benign',
}
subject_map = {
    'subject_ceo_fraud': 'malicious',
    'subject_phishing': 'malicious',
    'subject_legitimate': 'benign',
    'subject_spam': 'benign',
}

body_score_columns = ['body_ceo_fraud', 'body_legitimate', 'body_phishing', 'body_spam']
subject_score_columns = ['subject_ceo_fraud', 'subject_legitimate', 'subject_phishing', 'subject_spam']

# idxmax(axis=1) yields, per row, the name of the highest-scoring column; that
# column name is then translated to benign/malicious.
# NOTE(review): on ties idxmax should pick the first listed column, so a row
# whose scores are all equal would resolve to *_ceo_fraud -> 'malicious'.
top_body_column = phishsense_pred_df[body_score_columns].idxmax(axis=1)
phishsense_pred_df['body_prediction'] = top_body_column.replace(body_map)

top_subject_column = phishsense_pred_df[subject_score_columns].idxmax(axis=1)
phishsense_pred_df['subject_prediction'] = top_subject_column.replace(subject_map)

In [None]:
# Independent copy holding only what the body-model evaluation needs:
# the email path, its true label and the body model's verdict.
body_eval_columns = ['path', 'label', 'body_prediction']
body_pred_df = phishsense_pred_df[body_eval_columns].copy()
body_pred_df.head()


In [None]:
# Independent copy holding only what the subject-model evaluation needs:
# the email path, its true label and the subject model's verdict.
subject_eval_columns = ['path', 'label', 'subject_prediction']
subject_pred_df = phishsense_pred_df[subject_eval_columns].copy()
subject_pred_df.head()

# Calculate the confusion matrix

## Phishsense - Body Model's prediction

In [None]:
# Ground truth is the 'label' column; the prediction is the body model's
# score collapsed to benign/malicious.
y_true_body = body_pred_df['label']
y_pred_body = body_pred_df['body_prediction']

# Pin the class order instead of letting scikit-learn infer it from the data.
# This guarantees a 2x2 matrix (so the four-way unpacking below cannot fail
# when one class never occurs) and keeps the plotted tick labels tied to the
# same ordering as the matrix rows/columns.
class_labels = ['benign', 'malicious']
cm_body = confusion_matrix(y_true_body, y_pred_body, labels=class_labels)
# With 'benign' as the first (negative) class, ravel() yields TN, FP, FN, TP.
tn_body, fp_body, fn_body, tp_body = cm_body.ravel()
disp = ConfusionMatrixDisplay(confusion_matrix=cm_body, display_labels=class_labels)

disp.plot()
print("Confusion Matrix:")
plt.show()

report_body = classification_report(y_true_body, y_pred_body)
print("\nClassification Report of Phishsense Body Model:\n", report_body)

# FPR = FP / (FP + TN): share of benign emails predicted malicious.
fpr_body = fp_body / (fp_body + tn_body)
# FNR = FN / (FN + TP): share of malicious emails predicted benign.
fnr_body = fn_body / (fn_body + tp_body)

print(f"\nFalse Positive Rate (FPR): {fpr_body:.4f}")
print(f"\nFalse Negative Rate (FNR): {fnr_body:.4f}")

## Phishsense - Subject Model's prediction

In [None]:
# Ground truth is the 'label' column; the prediction is the subject model's
# score collapsed to benign/malicious.
y_true_subject = subject_pred_df['label']
y_pred_subject = subject_pred_df['subject_prediction']

# Pin the class order instead of letting scikit-learn infer it from the data.
# This guarantees a 2x2 matrix (so the four-way unpacking below cannot fail
# when one class never occurs) and keeps the plotted tick labels tied to the
# same ordering as the matrix rows/columns.
class_labels = ['benign', 'malicious']
cm_subject = confusion_matrix(y_true_subject, y_pred_subject, labels=class_labels)
# With 'benign' as the first (negative) class, ravel() yields TN, FP, FN, TP.
tn_subject, fp_subject, fn_subject, tp_subject = cm_subject.ravel()
disp = ConfusionMatrixDisplay(confusion_matrix=cm_subject, display_labels=class_labels)

disp.plot()
print("Confusion Matrix:")
plt.show()

report_subject = classification_report(y_true_subject, y_pred_subject)
print("\nClassification Report of Phishsense Subject Model:\n", report_subject)

# FPR = FP / (FP + TN): share of benign emails predicted malicious.
fpr_subject = fp_subject / (fp_subject + tn_subject)
# FNR = FN / (FN + TP): share of malicious emails predicted benign.
fnr_subject = fn_subject / (fn_subject + tp_subject)

print(f"\nFalse Positive Rate (FPR): {fpr_subject:.4f}")
print(f"\nFalse Negative Rate (FNR): {fnr_subject:.4f}")