In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import os

# UVIC Dataset: load the raw CSV export
file_path = '/content/CaptstoneProjectData_2024.csv'
uvicData = pd.read_csv(file_path)

# Remove unnecessary columns (errors='ignore' keeps this safe if they are absent)
uvicData_cleaned = uvicData.drop(columns=['Unnamed: 2', 'Unnamed: 3'], errors='ignore')

# Replace empty 'Subject' with space
uvicData_cleaned['Subject'] = uvicData_cleaned['Subject'].fillna(' ')

# Remove rows with missing 'Body'; take an explicit copy so the column
# assignments below write to an independent frame rather than a filtered view
uvicData_cleaned = uvicData_cleaned.dropna(subset=['Body']).copy()

# Normalize text: convert to lowercase, remove special characters, and trim whitespaces.
# The pattern is a raw string so that \w and \s reach the regex engine as written
# instead of being treated as (invalid) Python string escapes.
uvicData_cleaned['Subject'] = uvicData_cleaned['Subject'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()
uvicData_cleaned['Body'] = uvicData_cleaned['Body'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()

# Confirm cleaning
print(uvicData_cleaned.head())

                                             Subject  \
0  review your shipment details  shipment notific...   
1                            υоur ассоunt іѕ оn hоld   
2  completed invoice  kz89tys2564 frombestbuycom ...   
3                              uvic important notice   
4             you have 6 suspended incoming messages   

                                                Body  
0  notice this message was sent from outside the ...  
1  votre réponse a bien été prise en compte\r\nht...  
2  notice this message was sent from outside the ...  
3  your uvic account has been filed under the lis...  
4  message generated from  uvicca source\r\n\r\n\...  


In [17]:
# Read the normal emails CSV into a DataFrame and print its column summary
file_path = '/content/emails.csv'
normData = pd.read_csv(filepath_or_buffer=file_path)
normData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   file     517401 non-null  object
 1   message  517401 non-null  object
dtypes: object(2)
memory usage: 7.9+ MB


In [18]:
# Preview the first five rows of the raw normal-email frame
normData.head(n=5)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [19]:
def parse_email(message):
    """Split a raw email message into its subject line and body text.

    The message is split on newlines. Everything before the first blank line
    is treated as the header section; everything after it is the body.

    Parameters
    ----------
    message : str
        Full raw text of one email (headers, a blank line, then the body).

    Returns
    -------
    tuple of (str, str)
        ``(subject, body)``. ``subject`` is the text after ``"Subject: "`` on
        the first matching header line (matched case-insensitively), or ``""``
        when the header section has no such line. ``body`` is ``""`` when the
        message contains no blank separator line.
    """
    lines = message.split('\n')

    # Index of the first blank line, which separates headers from body. When no
    # blank line exists the whole message is treated as headers (empty body)
    # instead of raising StopIteration.
    header_end = next(
        (i for i, line in enumerate(lines) if line.strip() == ''),
        len(lines),
    )

    # Only look for the subject inside the header section, so that a
    # "Subject: ..." line quoted in the body is never mistaken for the real one.
    subject = next(
        (
            line.split(": ", 1)[1]
            for line in lines[:header_end]
            if line.lower().startswith('subject: ')
        ),
        "",
    )

    body = "\n".join(lines[header_end + 1:])
    return subject, body

# Apply the function to the 'message' column.
# Each message is parsed once into a (subject, body) tuple and the tuples are
# expanded into two columns with a single DataFrame construction; building a
# pd.Series per row is far slower on a frame of this size.
parsed_messages = [parse_email(message) for message in normData['message']]
normData[['Subject', 'Body']] = pd.DataFrame(
    parsed_messages, index=normData.index, columns=['Subject', 'Body']
)
normData.head()

Unnamed: 0,file,message,Subject,Body
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,,Here is our forecast\n\n
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Re:,Traveling to have a business meeting takes the...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,Re: test,test successful. way to go!!!
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,,"Randy,\n\n Can you send me a schedule of the s..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Re: Hello,Let's shoot for Tuesday at 11:45.


In [20]:
# Fill missing subjects with a space, drop rows without a body, and discard the
# raw columns that are no longer needed once Subject/Body have been extracted
normData['Subject'] = normData['Subject'].fillna(' ')
normData = normData.dropna(subset=['Body'])
normData = normData.drop(columns=['file', 'message'], errors='ignore')
# Normalize text: convert to lowercase, remove special characters, and trim whitespaces.
# The pattern is a raw string so that \w and \s reach the regex engine as written
# instead of being treated as (invalid) Python string escapes.
normData['Subject'] = normData['Subject'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()
normData['Body'] = normData['Body'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()

# Showing the updated DataFrame with subject and body columns
normData.head()

Unnamed: 0,Subject,Body
0,,here is our forecast
1,re,traveling to have a business meeting takes the...
2,re test,test successful way to go
3,,randy\n\n can you send me a schedule of the sa...
4,re hello,lets shoot for tuesday at 1145


In [21]:
# Label the two sources: UVic emails get label 1, normal emails get label 0
uvicData_cleaned['label'] = 1
normData['label'] = 0

# Down-sample the normal emails to twice the number of UVic emails (seeded)
normEmailData = normData.sample(n=2*len(uvicData_cleaned), random_state=42)

# Combine both sources and shuffle the rows. The shuffle is seeded like the
# sampling above so the row order is identical on every run.
masterData = pd.concat([uvicData_cleaned, normEmailData], ignore_index=True)
masterData = masterData.sample(frac=1, random_state=42).reset_index(drop=True)
masterData.head()

Unnamed: 0,Subject,Body,label
0,good day,greetings\r\n\r\n\r\ni am sorry to encroach in...,1
1,rescheduled dates for pros 9th annual pricing ...,world events and resulting airlineindustry act...,0
2,re graph2,please find attached the q1 q2 q3 and q4 nyme...,0
3,transaction completed,hello advisinguvicca\r\n\r\npaypal ...,1
4,2022 employee benefits plan,the employee benefits program will now be supp...,1


In [22]:
# Feature Engineering: Length of the email body

def add_body_length(df):
    """Add a ``Body_Length`` column with the character count of each ``Body`` entry.

    The column is written onto ``df`` itself and that same frame is returned.
    Entries of ``Body`` that are not strings receive ``pd.NA``.
    """
    def _length_or_na(value):
        # Only real strings have a meaningful length; anything else is marked missing
        if isinstance(value, str):
            return len(value)
        return pd.NA

    df['Body_Length'] = df['Body'].apply(_length_or_na)
    return df

# Compute the length feature on a copy, then rebind masterData to the result
masterData = add_body_length(df=masterData.copy())
masterData.head(n=5)

Unnamed: 0,Subject,Body,label,Body_Length
0,good day,greetings\r\n\r\n\r\ni am sorry to encroach in...,1,517
1,rescheduled dates for pros 9th annual pricing ...,world events and resulting airlineindustry act...,0,805
2,re graph2,please find attached the q1 q2 q3 and q4 nyme...,0,109
3,transaction completed,hello advisinguvicca\r\n\r\npaypal ...,1,1433
4,2022 employee benefits plan,the employee benefits program will now be supp...,1,1267


In [23]:
def clean_email_text(text):
    """Collapse line breaks and whitespace runs in ``text`` into single spaces.

    String input is returned with every ``\\r\\n`` replaced by a space, every
    run of whitespace squeezed to one space, and leading/trailing whitespace
    removed. Any non-string value is returned unchanged.
    """
    if isinstance(text, str):
        # First turn \r\n pairs into spaces, then squeeze all whitespace runs
        without_crlf = re.sub(r'\r\n', ' ', text)
        return re.sub(r'\s+', ' ', without_crlf).strip()
    # Not a string: hand the value back as is
    return text

# Clean the whitespace of every entry in masterData's 'Body' column
masterData['Body'] = masterData['Body'].apply(func=clean_email_text)

# Print the subject and cleaned body columns for inspection
print(masterData.loc[:, ['Subject', 'Body']])

                                                Subject  \
0                                              good day   
1     rescheduled dates for pros 9th annual pricing ...   
2                                             re graph2   
3                                 transaction completed   
4                           2022 employee benefits plan   
...                                                 ...   
7708  good morning  these are the levels we will be ...   
7709                                       best regards   
7710  ups shipment notification  notification dexped...   
7711                          spam suspected hello dear   
7712                enron corp common stock 2for1 split   

                                                   Body  
0     greetings i am sorry to encroach into your pri...  
1     world events and resulting airlineindustry act...  
2     please find attached the q1 q2 q3 and q4 nymex...  
3     hello advisinguvicca paypal 25feb2022 063703 a...  
4

In [24]:
# Preview the first five rows of the combined dataset after body cleaning
masterData.head(n=5)

Unnamed: 0,Subject,Body,label,Body_Length
0,good day,greetings i am sorry to encroach into your pri...,1,517
1,rescheduled dates for pros 9th annual pricing ...,world events and resulting airlineindustry act...,0,805
2,re graph2,please find attached the q1 q2 q3 and q4 nymex...,0,109
3,transaction completed,hello advisinguvicca paypal 25feb2022 063703 a...,1,1433
4,2022 employee benefits plan,the employee benefits program will now be supp...,1,1267


In [25]:
# Persist the full combined dataset (without the index) and hold out 20% of
# its rows as a test set using a fixed seed
masterData.to_csv('./masterDataset.csv', index=False)
train_data, test_data = train_test_split(
    masterData,
    random_state=42,
    test_size=0.20,
)