In [18]:
import re

import numpy as np
import pandas as pd
from langdetect import DetectorFactory, detect

# Text Cleaning Function
# Compiled once at definition time so repeated clean_text() calls reuse it.
# Alternatives, in order: http(s) URLs, bare "www." hosts, e-mail addresses.
_URL_EMAIL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|'
    r'www\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}|'
    # The TLD class is letters only; a '|' inside the brackets would be a
    # literal pipe, not alternation, and would over-match text like "a@b.c|d".
    r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b',
    re.IGNORECASE,
)
# A standalone "fw"/"re" token, an optional colon, then at least one whitespace.
_MAIL_PREFIX_PATTERN = r'(?<!\w)(?:fw|re)\s*:?\s+'


def clean_text(col):
    """Normalise a pandas Series of e-mail text for feature extraction.

    Steps, applied in order:
      1. remove URLs and e-mail addresses;
      2. delete apostrophes (so "don't" becomes "dont");
      3. remove "Fw"/"Re" prefix tokens (case-insensitive, colon optional).
         The pattern is not anchored, so such a token is removed wherever it
         appears in the text, not only at the start;
      4. replace every remaining punctuation character and underscore with a
         space;
      5. collapse whitespace runs to a single space and strip both ends;
      6. replace nulls with the empty string.

    Args:
        col: pandas Series holding text (nulls allowed).

    Returns:
        A new Series with the same index, containing cleaned strings and ''
        where the input was null.
    """
    # A column with no non-null values (or no rows) is usually numeric
    # (read_csv yields float64 for an all-empty column); the .str accessor
    # rejects that dtype, so fall back to object first.
    if pd.api.types.is_numeric_dtype(col) and col.isna().all():
        col = col.astype(object)

    # Remove any urls or emails for more normalization
    col = col.str.replace(_URL_EMAIL_PATTERN, '', regex=True)
    # Delete apostrophes outright so contractions stay a single token
    col = col.str.replace("'", '', regex=False)
    # Remove all email prefixes (Fw/Re)
    col = col.str.replace(_MAIL_PREFIX_PATTERN, '', case=False, regex=True)
    # Punctuation becomes a space (not '') so adjacent words are not fused;
    # '_' is listed separately because \w treats it as a word character
    col = col.str.replace(r'[^\w\s]|_', ' ', regex=True)
    # Collapse spaces/tabs/newlines to one space and trim the ends
    col = col.str.replace(r'\s+', ' ', regex=True).str.strip()
    # Convert all nulls to empty string for easier feature extraction
    return col.fillna('')

def is_english(text):
    """Return True when langdetect classifies *text* as English ('en').

    Best-effort: any input that cannot be classified (non-string values,
    blank strings, or text on which detection fails) yields False instead of
    raising, so the function can be used directly as a row filter.
    """
    # Nothing to detect on non-strings or blank text; answer directly rather
    # than relying on detect() raising.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Detection failures mean "not English"; unlike a bare `except:`,
        # KeyboardInterrupt and SystemExit still propagate.
        return False

# Step 1: Load the dataset
file_path = r'data/raw_nazario_phishing_emails.csv'
# NOTE(review): read_csv already consumes the header line, so .iloc[1:] also
# discards the first data record - confirm that row is meant to be skipped.
nazario_df = pd.read_csv(file_path).iloc[1:]

# Step 2: Apply text cleaning to 'subject' and 'body'
nazario_df['subject'] = clean_text(nazario_df['subject'])
nazario_df['body'] = clean_text(nazario_df['body'])

# Step 3: Expose the 'urls' column under the name 'url_label'
# (a copy, not a rename; the original 'urls' column is dropped in Step 4)
nazario_df['url_label'] = nazario_df['urls']

# Step 4: Remove unnecessary columns and keep only the required ones
nazario_df = nazario_df[['subject', 'body', 'url_label', 'label']]

# Step 5: Replace any empty strings with null, then drop every row that has a
# null in any of the four kept columns
nazario_df = nazario_df.replace('', np.nan).dropna()

# Step 6: Drop duplicates
nazario_df = nazario_df.drop_duplicates()

# Step 7: Drop rows whose email body is not detected as English
# langdetect is randomised unless a seed is set, so without this the same
# body could be kept on one run and dropped on the next.
DetectorFactory.seed = 0
nazario_df = nazario_df[nazario_df['body'].apply(is_english)]

# Step 8: Save cleaned dataframe (left disabled; uncomment to write the file)
# nazario_df.to_csv('data/clean_nazario_phishing_emails.csv', index=False)

# Nazario dataframe preview
nazario_df

Unnamed: 0,subject,body,url_label,label
1,Verify Your Account,Business with cPanel WHM Dear client Our Techn...,1,1
2,Helpdesk Mailbox Alert,Your two incoming mails were placed on pending...,1,1
3,IT Service Help Desk,Password will expire in 3 days Click Here To V...,0,1
4,Final USAA Reminder Update Your Account Now,To ensure delivery to your inbox please add to...,1,1
5,utf 8 Q Dear 20Client 20 3a 20Update 20Your 20...,PayPal Secure Dear Client We have noticed that...,1,1
...,...,...,...,...
1560,Receipt for Your Payment to FTX,PayPal You sent a payment of 699 99 USD to FTX...,0,1
1561,Rectify Your Password With monkey org,monkey org Hi jose Pa s sword for Will Expire ...,1,1
1562,Netflix Were having some trouble with your cur...,HELLO Please note that your monthly payment ha...,1,1
1563,Your MetaMask wallet will be suspended,Verify your MetaMask Wallet Our system has sho...,1,1
