In [5]:
import pandas as pd
import numpy as np
import re
from langdetect import detect

# Matches, in order: http(s) URLs, bare "www." URLs (with an optional trailing
# path) and email addresses. Compiled once because clean_text is called for
# several columns.
_URL_EMAIL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|'
    # The optional "/..." tail removes the path of a bare www URL as well;
    # without it "www.site.com/login" would leave "login" behind as a token.
    r'www\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}(?:/[^\s"<>]*)?|'
    # TLD class is letters only: a "|" inside [...] is a literal pipe, not an
    # alternation, so it must not appear here.
    r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b',
    re.IGNORECASE,
)


def clean_text(col: pd.Series) -> pd.Series:
    """Normalise a column of raw email text for feature extraction.

    Steps, applied in this order:

    1. Remove URLs and email addresses.
    2. Delete apostrophes (so "don't" becomes "dont").
    3. Remove "fw" / "re" tokens (case-insensitive, optional colon) that are
       followed by whitespace. The match is not anchored to the start of the
       string, so a standalone "re" or "fw" word anywhere in the text is
       removed too.
    4. Replace every remaining punctuation character and underscore with a
       space, so that words joined by punctuation stay separate tokens.
    5. Collapse runs of whitespace into one space and strip both ends.
    6. Replace missing values with the empty string.

    Args:
        col: Series of strings (missing values allowed).

    Returns:
        A new Series of the same length containing the cleaned text, with no
        missing values.
    """
    # Remove any urls or emails for more normalization
    col = col.str.replace(_URL_EMAIL_PATTERN, '', regex=True)
    # Replace apostrophes with empty string
    col = col.str.replace("'", '', regex=False)
    # Remove email prefixes (Fw/Re); the lookbehind keeps words that merely
    # end in "re"/"fw" (e.g. "are") intact
    col = col.str.replace(r'(?<!\w)(?:fw|re)\s*:?\s+', '', case=False, regex=True)
    # Remove punctuation - replace with space instead of empty string to account for stemming.
    # "_" counts as a word character for \w, so it has to be listed explicitly.
    col = col.str.replace(r'[^\w\s]|_', ' ', regex=True)
    # Replace multiple spaces, tabs(\t), newlines(\n) with single space and strip surrounding white space
    col = col.str.replace(r'\s+', ' ', regex=True).str.strip()
    # Convert all nulls to empty string for easier feature extraction
    return col.fillna('')

def is_english(text: object) -> bool:
    """Return True when langdetect classifies *text* as English.

    This is a best-effort check: any value that cannot be classified
    (missing, non-string, blank, or text on which the detector raises)
    is reported as not English instead of raising.

    NOTE: langdetect's documentation describes detection as non-deterministic
    for short or ambiguous text unless ``DetectorFactory.seed`` is set before
    the first call; set it once if the filtering must be reproducible.

    Args:
        text: The value to classify; normally a string.

    Returns:
        True if the detected language code is ``'en'``, otherwise False.
    """
    # Nothing to classify: skip the detector for missing/non-string/blank input.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Deliberately broad so one undetectable row does not abort a whole
        # DataFrame.apply, but (unlike a bare ``except:``) KeyboardInterrupt
        # and SystemExit still propagate so a long run can be interrupted.
        return False

# Load the raw Nigerian phishing emails (3332 lines in total)
nigerian_df = pd.read_csv(r'data/raw_nigerian_phishing_emails.csv')

# One pipeline from the raw frame to the cleaned, English-only frame
nigerian_df = (
    nigerian_df
    # Clean both text columns and expose the urls column as url_label
    .assign(
        subject=clean_text(nigerian_df['subject']),
        body=clean_text(nigerian_df['body']),
        url_label=nigerian_df['urls'],
    )
    # Keep required columns only
    [['subject', 'body', 'url_label', 'label']]
    # Turn empty strings into nulls so that dropna() discards them as well
    .replace('', np.nan)
    .dropna()
    # Discard duplicate rows
    .drop_duplicates()
    # Keep only the rows whose email body is detected as English
    .pipe(lambda frame: frame[frame['body'].apply(is_english)])
)

# Save cleaned dataframe
# nigerian_df.to_csv('data/clean_nigerian_phishing_emails.csv', index=False)

# Nigerian dataframe preview
nigerian_df

Unnamed: 0,subject,body,url_label,label
0,URGENT BUSINESS ASSISTANCE AND PARTNERSHIP,FROM MR JAMES NGOLA CONFIDENTIAL TEL 233 27 58...,0,1
1,URGENT ASSISTANCE RELATIONSHIP P,Dear Friend I am Mr Ben Suleman a custom offic...,0,1
2,GOOD DAY TO YOU,FROM HIS ROYAL MAJESTY HRM CROWN RULER OF ELEM...,0,1
3,GOOD DAY TO YOU,FROM HIS ROYAL MAJESTY HRM CROWN RULER OF ELEM...,0,1
4,I Need Your Assistance,Dear sir It is with a heart full of hope that ...,0,1
...,...,...,...,...
3327,CONTACT GLOBAL MAX SHIPING COMPANY,Atten My Dear I have Paid the fee for your Che...,0,1
3328,TREAT AS URGENT,From Mr Ali Sherif African Development Bank AD...,1,1
3329,From Dr Usman Ibrahim Mr Wahid Yoffe property,FROM DR USMAN IBRAHIM DANKO AUDITING AND ACCOU...,1,1
3330,My Beloved In Christ,Beloved in the Lord Jesus Christ PLEASE ENDEAV...,1,1
