In [None]:
import pandas as pd
import re
import numpy as np
from langdetect import detect

# Note: Common text preprocessing practices (lowercasing and stemming) are applied only after data combination and feature extraction
def text_preprocessing(col):
    """Normalize a Series of raw email text (subject or body).

    Steps, in order: strip URLs and email addresses, delete apostrophes,
    strip "Fw"/"Re" markers, turn punctuation and underscores into spaces,
    collapse whitespace, and finally replace nulls with ''.

    Args:
        col: pandas Series of strings; null entries are allowed.

    Returns:
        A Series with the same index holding the cleaned strings, with every
        null replaced by an empty string.
    """
    # Regex pattern to remove urls and emails for better normalization.
    # The TLD class is [A-Za-z]; a "|" inside the brackets would be a literal
    # pipe and would glue pipe-separated addresses into one bogus match.
    url_email_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|'
                                   r'www\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}|'
                                   r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b', re.IGNORECASE)
    # Remove any urls or emails for more normalization
    col = col.str.replace(url_email_pattern, '', regex=True)
    # Delete apostrophes before the prefix stripping below, so a contraction
    # such as "we're" collapses to "were" instead of exposing a standalone "re"
    col = col.str.replace("'", '', regex=False)
    # Remove all email prefixes (Fw/Re): the marker must start a token and be
    # followed by an optional colon plus at least one whitespace character
    col = col.str.replace(r'(?<!\w)(?:fw|re)\s*:?\s+', '', case=False, regex=True)
    # Remove punctuations - replace with space instead of empty string to account for stemming
    col = col.str.replace(r'[^\w\s]|[_-]', ' ', regex=True)
    # Replace multiple spaces, tabs(\t), newlines(\n) with single space for uniformity and strip surrounding white space
    col = col.str.replace(r'\s+', ' ', regex=True).str.strip()
    # Convert all nulls to empty string for easier feature extraction
    return col.fillna('')

def is_english(text):
    """Return True when langdetect classifies *text* as English ('en').

    Any value that cannot be classified (null, non-string, blank, or text
    on which ``detect`` fails) yields False.

    NOTE(review): langdetect documents its algorithm as non-deterministic
    unless ``DetectorFactory.seed`` is set; borderline rows may flip
    between runs - confirm whether a fixed seed is wanted.
    """
    # Nulls, non-strings and blank strings can never be English; answer
    # directly instead of paying for an exception inside detect() per row
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort: treat undetectable text as non-English, but let
        # KeyboardInterrupt/SystemExit propagate so long runs can be stopped
        return False

# Note: Set to head(1000) for faster lookup, full 500k+ rows was used during cleaning
legit_emails_df = pd.read_parquet('data/raw_enron_legit_emails.parquet', columns=['message']).head(1000)

# Regex pattern to specifically capture email subject text, skipping leading "Re" reply markers.
# - [ \t]* (instead of \s*) keeps the match on the header line, so an empty subject
#   is captured as '' rather than swallowing the newline and capturing the next header.
# - A reply marker must be followed by ':' or whitespace, so subjects such as
#   "Report" are not truncated to "port".
subject_pattern = re.compile(r'[Ss]ubject:[ \t]*(?:[Rr][Ee](?::[ \t]*|[ \t]+))*(.*)')
legit_emails_df['subject'] = legit_emails_df['message'].str.extract(subject_pattern, expand=False)

# Regex pattern to specifically capture email body text (everything after the first blank line)
body_pattern = re.compile(r'\n\n(.+)', re.DOTALL)
legit_emails_df['body'] = legit_emails_df['message'].str.extract(body_pattern, expand=False)
# Remove unnecessary [Forwarded by (content) Subject] portion in email body
legit_emails_df['body'] = legit_emails_df['body'].str.replace(r'[Ff]orwarded by[\s\S]*?[Ss]ubject', '', regex=True)

# Regex pattern to check for urls in email body and save as url_label
url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|'
               r'www\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}')
# na=False: a message without a body yields a null here, and a null cannot be cast to int
legit_emails_df['url_label'] = legit_emails_df['body'].str.contains(url_pattern, regex=True, na=False).astype(int)

# Set phishing label of legit emails to 0 by default
legit_emails_df['label'] = 0

# Apply text preprocessing to email subject and body for cleaner output
legit_emails_df['subject'] = text_preprocessing(legit_emails_df['subject'])
legit_emails_df['body'] = text_preprocessing(legit_emails_df['body'])

# Set cleaned df
legit_emails_df = legit_emails_df[['subject', 'body', 'url_label', 'label']]

# Drop exact duplicate rows for more diversity
# (drop_duplicates compares all four columns here, not only the subject)
legit_emails_df = legit_emails_df.drop_duplicates()

# Replace potential empty strings to null, then drop all nulls
legit_emails_df = legit_emails_df.replace('', np.nan).dropna()

# Drop non-english rows (both subject and body must be detected as English)
legit_emails_df = legit_emails_df[legit_emails_df['subject'].apply(is_english) & legit_emails_df['body'].apply(is_english)]

# Save cleaned dataframe to parquet file
# legit_emails_df.to_parquet('data/clean_enron_legit_emails.parquet', index=False)

# Preview cleaned dataframe
legit_emails_df