In [14]:
!pip install nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import os


# UVIC Dataset: load the raw CSV, then derive a cleaned copy with normalized
# 'Subject' and 'Body' text columns.
file_path = '/content/CaptstoneProjectData_2024.csv'
uvicData = pd.read_csv(file_path)

# Remove unnecessary columns (errors='ignore' keeps this a no-op if they are absent)
uvicData_cleaned = uvicData.drop(columns=['Unnamed: 2', 'Unnamed: 3'], errors='ignore')

# Replace empty 'Subject' with space so the string operations below never see NaN
uvicData_cleaned['Subject'] = uvicData_cleaned['Subject'].fillna(' ')

# Check and remove rows with missing 'Body'
uvicData_cleaned = uvicData_cleaned.dropna(subset=['Body'])

# Normalize text: convert to lowercase, remove special characters, and trim whitespaces.
# The pattern is a raw string: '\w' and '\s' are not valid Python string escapes,
# so a non-raw literal triggers an invalid-escape-sequence warning on import.
uvicData_cleaned['Subject'] = uvicData_cleaned['Subject'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()
uvicData_cleaned['Body'] = uvicData_cleaned['Body'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()

# Confirm cleaning
print(uvicData_cleaned.head())

                                             Subject  \
0  review your shipment details  shipment notific...   
1                            υоur ассоunt іѕ оn hоld   
2  completed invoice  kz89tys2564 frombestbuycom ...   
3                              uvic important notice   
4             you have 6 suspended incoming messages   

                                                Body  
0  notice this message was sent from outside the ...  
1  votre réponse a bien été prise en compte\r\nht...  
2  notice this message was sent from outside the ...  
3  your uvic account has been filed under the lis...  
4  message generated from  uvicca source\r\n\r\n\...  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Path of the CSV that the next cell loads into normData.
# Note: this rebinds the same `file_path` name that earlier pointed at the UVic CSV.
file_path = './emails.csv'

In [52]:
# Load the normal emails dataset from `file_path` and print its schema.
# Each row carries a 'file' identifier and the raw 'message' text, which is
# split into subject and body further down.
normData = pd.read_csv(file_path)
normData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   file     517401 non-null  object
 1   message  517401 non-null  object
dtypes: object(2)
memory usage: 7.9+ MB


In [53]:
def parse_email(message):
    """Split a raw email string into its subject and body.

    The header block is every line before the first blank line; the body is
    every line after it.

    Parameters
    ----------
    message : str
        Raw email text with '\\n'-separated lines.

    Returns
    -------
    tuple of (str, str)
        ``(subject, body)``. ``subject`` is the text after ``"Subject: "`` on
        the first matching header line (case-insensitive), or ``""`` when no
        such header exists. ``body`` is ``""`` when the message contains no
        blank separator line.
    """
    lines = message.split('\n')

    # Index of the blank line separating headers from body. Falling back to
    # len(lines) treats a message without a separator as headers-only instead
    # of raising StopIteration.
    separator = next(
        (i for i, line in enumerate(lines) if line.strip() == ''),
        len(lines),
    )

    # Only look at header lines so a quoted/forwarded "Subject: ..." line
    # inside the body is never mistaken for the message's own subject.
    subject = next(
        (
            line.split(": ", 1)[1]
            for line in lines[:separator]
            if line.lower().startswith('subject: ')
        ),
        "",
    )

    body = "\n".join(lines[separator + 1:])
    return subject, body

# Apply the function to the 'message' column.
# parse_email returns a (subject, body) tuple per message; the tuples are
# collected into one DataFrame in a single step instead of creating a
# pd.Series object for every row, which is very slow on a large frame.
normData[['Subject', 'Body']] = pd.DataFrame(
    normData['message'].map(parse_email).tolist(),
    index=normData.index,
    columns=['Subject', 'Body'],
)
normData.head()

Unnamed: 0,file,message,Subject,Body
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,,Here is our forecast\n\n
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Re:,Traveling to have a business meeting takes the...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,Re: test,test successful. way to go!!!
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,,"Randy,\n\n Can you send me a schedule of the s..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Re: Hello,Let's shoot for Tuesday at 11:45.


In [54]:
# Fill missing subjects, drop rows without a body, and discard the raw columns
# that are no longer needed once 'Subject' and 'Body' have been extracted.
normData['Subject'] = normData['Subject'].fillna(' ')
normData = normData.dropna(subset=['Body'])
normData = normData.drop(columns=['file', 'message'], errors='ignore')
# Normalize text: convert to lowercase, remove special characters, and trim whitespaces.
# The pattern is a raw string: '\w' and '\s' are not valid Python string escapes,
# so a non-raw literal triggers an invalid-escape-sequence warning.
normData['Subject'] = normData['Subject'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()
normData['Body'] = normData['Body'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()

# Showing the updated DataFrame with subject and body columns
normData.head()

Unnamed: 0,Subject,Body
0,,here is our forecast
1,re,traveling to have a business meeting takes the...
2,re test,test successful way to go
3,,randy\n\n can you send me a schedule of the sa...
4,re hello,lets shoot for tuesday at 1145


In [55]:
# Label the two sources: UVic dataset rows get 1, normal-email rows get 0.
uvicData_cleaned['label'] = 1
normData['label'] = 0

# Draw twice as many normal emails as there are UVic emails (seeded draw).
normEmailData = normData.sample(n=2*len(uvicData_cleaned), random_state=42)

masterData = pd.concat([uvicData_cleaned, normEmailData], ignore_index=True)
# Shuffle with a fixed seed as well; an unseeded shuffle changes the row order
# on every run, which in turn changes the train/validation/test split below.
masterData = masterData.sample(frac=1, random_state=42).reset_index(drop=True)
masterData.head()

Unnamed: 0,Subject,Body,label
0,another document to add,i had one more document to add to the last ema...,0
1,super saturday,shelly\n\nthese are the super saturdays i can ...,0
2,fw,notice this message was sent from outside the ...,1
3,re ashland chemical,probably wont get to it until after the 1st is...,0
4,,httpsbitly3bm3pej,1


In [56]:
def clean_email_text(text):
    """Collapse line breaks and whitespace runs in ``text`` to single spaces.

    Non-string values are handed back untouched. For strings, every '\\r\\n'
    pair becomes a space, each run of whitespace is reduced to one space, and
    leading/trailing whitespace is removed.
    """
    if isinstance(text, str):
        # First turn CRLF pairs into spaces, then squeeze all whitespace runs.
        flattened = re.sub(r'\r\n', ' ', text)
        return re.sub(r'\s+', ' ', flattened).strip()
    # Anything that is not a string passes through unchanged.
    return text

# Collapse line breaks and repeated whitespace in the 'Body' column of masterData.
# Only 'Body' is processed here; 'Subject' is left as it is.
masterData['Body'] = masterData['Body'].apply(clean_email_text)

# To see the cleaned text
print(masterData[['Subject', 'Body']])

                            Subject  \
0           another document to add   
1                    super saturday   
2                                fw   
3               re ashland chemical   
4                                     
...                             ...   
7708  acsess denied for scicouvicca   
7709                       re lunch   
7710                  wire transfer   
7711                                  
7712          re friday festivities   

                                                   Body  
0     i had one more document to add to the last ema...  
1     shelly these are the super saturdays i can hel...  
2     notice this message was sent from outside the ...  
3     probably wont get to it until after the 1st is...  
4                                     httpsbitly3bm3pej  
...                                                 ...  
7708  hello scico scicouvicca has been suspended tem...  
7709  my prior plans are still on you guys go ahead ...  
7710  not

In [57]:
# Set of English stopwords (a set gives fast membership checks in remove_stopwords).
# NOTE(review): punctuation was stripped from the text in earlier cells, so
# contractions appear as e.g. "wont"; presumably apostrophe forms in the NLTK
# list no longer match them — confirm whether that is intended.
stop = set(stopwords.words('english'))

def remove_stopwords(text, stop_words=None):
    """Remove stop words from a whitespace-delimited string.

    Parameters
    ----------
    text : str or any
        Text to filter. Non-string values (e.g. NaN) are returned unchanged,
        mirroring the guard in ``clean_email_text``, instead of raising
        ``AttributeError`` on ``.split()``.
    stop_words : collection of str, optional
        Words to drop. Defaults to the notebook-level ``stop`` set.

    Returns
    -------
    str or any
        The remaining words joined by single spaces, or ``text`` itself when
        it is not a string.
    """
    if not isinstance(text, str):
        return text
    if stop_words is None:
        # Fall back to the notebook-wide stop-word set.
        stop_words = stop
    # Tokenize on whitespace and keep only the words that are not stop words
    return ' '.join(word for word in text.split() if word not in stop_words)

# Apply the function to remove stopwords from 'Body' and 'Subject'.
# Both columns are overwritten in masterData, so re-running earlier display
# cells will show the stop-word-free text.
masterData['Body'] = masterData['Body'].apply(remove_stopwords)
masterData['Subject'] = masterData['Subject'].apply(remove_stopwords)

# To see the text after removing stopwords
print(masterData[['Subject', 'Body']].head())

                Subject                                               Body
0  another document add  one document add last email please see attache...
1        super saturday  shelly super saturdays help nov 10 dec 1 dec 8...
2                    fw  notice message sent outside university victori...
3      ashland chemical  probably wont get 1st done think easy figure t...
4                                                        httpsbitly3bm3pej


In [58]:
# Preview the first rows of masterData, including the 'label' column.
masterData.head()

Unnamed: 0,Subject,Body,label
0,another document add,one document add last email please see attache...,0
1,super saturday,shelly super saturdays help nov 10 dec 1 dec 8...,0
2,fw,notice message sent outside university victori...,1
3,ashland chemical,probably wont get 1st done think easy figure t...,0
4,,httpsbitly3bm3pej,1


In [59]:
# Data Split: 70% train, then the remaining 30% is halved into validation and test.
# Both splits are stratified on 'label' because the classes are imbalanced
# (two label-0 rows for every label-1 row); without stratification the class
# ratio can drift between the train, validation and test sets.

train_data, temp_test_data = train_test_split(
    masterData,
    test_size=0.30,
    random_state=42,
    stratify=masterData['label'],
)

validation_data, test_data = train_test_split(
    temp_test_data,
    test_size=0.5,
    random_state=42,
    stratify=temp_test_data['label'],
)

# Persist the full dataset and each split next to the notebook.
masterData.to_csv('./masterDataset.csv', index=False)
train_data.to_csv('./trainData.csv', index=False)
validation_data.to_csv('./validationData.csv', index=False)
test_data.to_csv('./testData.csv', index=False)

In [63]:
# Schema and non-null counts for the full combined dataset.
masterData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7713 entries, 0 to 7712
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Subject  7713 non-null   object
 1   Body     7713 non-null   object
 2   label    7713 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 180.9+ KB


In [64]:
# Schema and non-null counts for the test split.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1157 entries, 4240 to 3781
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Subject  1157 non-null   object
 1   Body     1157 non-null   object
 2   label    1157 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 36.2+ KB


In [65]:
# Schema and non-null counts for the training split.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5399 entries, 3094 to 7270
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Subject  5399 non-null   object
 1   Body     5399 non-null   object
 2   label    5399 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 168.7+ KB


In [66]:
# Schema and non-null counts for the validation split.
validation_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1157 entries, 3720 to 2402
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Subject  1157 non-null   object
 1   Body     1157 non-null   object
 2   label    1157 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 36.2+ KB
