In [23]:
import pandas as pd

# UVIC Dataset
file_path = '/content/CaptstoneProjectData_2024.csv'
uvicData = pd.read_csv(file_path)

# Remove unnecessary columns (errors='ignore' keeps this safe if they are absent)
uvicData_cleaned = uvicData.drop(columns=['Unnamed: 2', 'Unnamed: 3'], errors='ignore')

# Replace empty 'Subject' with space
uvicData_cleaned['Subject'] = uvicData_cleaned['Subject'].fillna(' ')

# Remove rows with missing 'Body'.
# dropna() returns a NEW frame; previously that result was stored in
# `data_cleaned` and never used, so the rows with a missing Body stayed in
# `uvicData_cleaned`.  `data_cleaned` is still defined exactly as before
# (un-normalized text), and the normalization below now works on a copy of it.
data_cleaned = uvicData_cleaned.dropna(subset=['Body'])
uvicData_cleaned = data_cleaned.copy()

# Normalize text: convert to lowercase, remove special characters, and trim whitespaces.
# The pattern is a raw string so that \w and \s reach the regex engine intact
# instead of being treated as (invalid) Python string escapes.
for column in ('Subject', 'Body'):
    uvicData_cleaned[column] = (
        uvicData_cleaned[column]
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.strip()
    )

# Confirm cleaning
print(uvicData_cleaned.head())

                                             Subject  \
0  review your shipment details  shipment notific...   
1                            υоur ассоunt іѕ оn hоld   
2  completed invoice  kz89tys2564 frombestbuycom ...   
3                              uvic important notice   
4             you have 6 suspended incoming messages   

                                                Body  
0  notice this message was sent from outside the ...  
1  votre réponse a bien été prise en compte\r\nht...  
2  notice this message was sent from outside the ...  
3  your uvic account has been filed under the lis...  
4  message generated from  uvicca source\r\n\r\n\...  


In [24]:
# Normal (non-phishing) emails: read the CSV into a DataFrame.
file_path = '/content/emails.csv'
normData = pd.read_csv(filepath_or_buffer=file_path)

# Print the column names, non-null counts and dtypes of the loaded frame.
normData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517401 entries, 0 to 517400
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   file     517401 non-null  object
 1   message  517401 non-null  object
dtypes: object(2)
memory usage: 7.9+ MB


In [25]:
# Preview the first five rows of the raw email frame.
normData.head(n=5)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [27]:
def parse_email(message):
    """Split a raw email message into its subject and body.

    The header section is every line before the first blank
    (whitespace-only) line; the body is every line after it.

    Parameters
    ----------
    message : str
        Raw message text with lines separated by ``'\\n'``.

    Returns
    -------
    tuple of (str, str)
        ``(subject, body)``. ``subject`` is the text following ``"Subject: "``
        (matched case-insensitively) on the first such header line, or ``""``
        when the header section has none. ``body`` is the remaining lines
        joined with ``'\\n'``, or ``""`` when the message has no blank
        separator line.
    """
    lines = message.split('\n')

    # Index of the blank line that ends the header section.  Default to
    # len(lines) (i.e. "all header, no body") so a message without a blank
    # line no longer makes next() raise StopIteration.
    header_end = next(
        (i for i, line in enumerate(lines) if line.strip() == ''),
        len(lines),
    )

    # Search only the header lines: a body line that happens to start with
    # "Subject: " (quoted or forwarded mail) must not be taken as the subject.
    subject = next(
        (
            line.split(": ", 1)[1]
            for line in lines[:header_end]
            if line.lower().startswith('subject: ')
        ),
        "",
    )

    body = "\n".join(lines[header_end + 1:])
    return subject, body

# Apply the function to the 'message' column.
# Parse every message once into (subject, body) tuples and build the two
# columns from plain lists; wrapping each row's result in a pd.Series inside
# .apply() allocates one Series per row, which is very slow on a frame this size.
parsed = [parse_email(raw_message) for raw_message in normData['message']]
normData['Subject'] = [subject for subject, _ in parsed]
normData['Body'] = [body for _, body in parsed]
print(normData.head())

                       file  \
0     allen-p/_sent_mail/1.   
1    allen-p/_sent_mail/10.   
2   allen-p/_sent_mail/100.   
3  allen-p/_sent_mail/1000.   
4  allen-p/_sent_mail/1001.   

                                             message    Subject  \
0  Message-ID: <18782981.1075855378110.JavaMail.e...              
1  Message-ID: <15464986.1075855378456.JavaMail.e...        Re:   
2  Message-ID: <24216240.1075855687451.JavaMail.e...   Re: test   
3  Message-ID: <13505866.1075863688222.JavaMail.e...              
4  Message-ID: <30922949.1075863688243.JavaMail.e...  Re: Hello   

                                                Body  
0                          Here is our forecast\n\n   
1  Traveling to have a business meeting takes the...  
2                     test successful.  way to go!!!  
3  Randy,\n\n Can you send me a schedule of the s...  
4                Let's shoot for Tuesday at 11:45.    
