### Importing the libraries

In [25]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz


## Part 1 - Read File

In [26]:
# Step 1: Data Collection
# Load the dataset
uvicData = pd.read_csv('project.csv')
uvicData.head()

Unnamed: 0,Subject,Body,Unnamed: 2,Unnamed: 3
0,®Review your shipment details / Shipment Notif...,Notice: This message was sent from outside the...,,
1,Υоur ассоunt іѕ оn hоld,\r\nVotre réponse a bien été prise en compte.\...,,
2,Completed: Invoice # KZ89TYS2564 from-Bestbuy....,Notice: This message was sent from outside the...,,
3,UVic IMPORTANT NOTICE!,Your UVIC account has been filed under the lis...,,
4,You have (6) Suspended incoming messages,\r\n\r\nMessage generated from uvic.ca source...,,


## Part 2- Data Preprocessing

In [27]:
# Remove unnecessary columns
uvicData_cleaned = uvicData.drop(columns=['Unnamed: 2', 'Unnamed: 3'], errors='ignore')

# Replace empty 'Subject' with space
uvicData_cleaned['Subject'] = uvicData_cleaned['Subject'].fillna(' ')

# Check and remove rows with missing 'Body'
data_cleaned = uvicData_cleaned.dropna(subset=['Body'])

# Normalize text: convert to lowercase, remove special characters, and trim whitespaces
uvicData_cleaned['Subject'] = uvicData_cleaned['Subject'].str.lower().str.replace('[^\w\s]', '', regex=True).str.strip()
uvicData_cleaned['Body'] = uvicData_cleaned['Body'].str.lower().str.replace('[^\w\s]', '', regex=True).str.strip()

# Confirm cleaning
uvicData_cleaned.head()



Unnamed: 0,Subject,Body
0,review your shipment details shipment notific...,notice this message was sent from outside the ...
1,υоur ассоunt іѕ оn hоld,votre réponse a bien été prise en compte\r\nht...
2,completed invoice kz89tys2564 frombestbuycom ...,notice this message was sent from outside the ...
3,uvic important notice,your uvic account has been filed under the lis...
4,you have 6 suspended incoming messages,message generated from uvicca source\r\n\r\n\...


In [None]:
# Load the normal emails dataset
file_path = 'emails.csv'

normData = pd.read_csv(file_path)
normData.info()

In [None]:
def parse_email(message):
    lines = message.split('\n')
    subject = next((line.split(": ", 1)[1] for line in lines if line.lower().startswith('subject: ')), "")
    body_start = next(i for i, line in enumerate(lines) if line.strip() == '') + 1
    body = "\n".join(lines[body_start:])
    return subject, body

# Apply the function to the 'message' column
normData[['Subject', 'Body']] = normData['message'].apply(lambda x: pd.Series(parse_email(x)))
normData.head()

In [None]:
normData['Subject'] = normData['Subject'].fillna(' ')
normData = normData.dropna(subset=['Body'])
normData = normData.drop(columns=['file', 'message'], errors='ignore')
# Normalize text: convert to lowercase, remove special characters, and trim whitespaces
normData['Subject'] = normData['Subject'].str.lower().str.replace('[^\w\s]', '', regex=True).str.strip()
normData['Body'] = normData['Body'].str.lower().str.replace('[^\w\s]', '', regex=True).str.strip()

# Showing the updated DataFrame with subject and body columns
normData.head()

In [28]:
uvicData_cleaned['label'] = 1
normData['label'] = 0

masterData = pd.concat([uvicData_cleaned, normData], ignore_index=True)
masterData.head()

Unnamed: 0,Subject,Body,label
0,review your shipment details shipment notific...,notice this message was sent from outside the ...,1
1,υоur ассоunt іѕ оn hоld,votre réponse a bien été prise en compte\r\nht...,1
2,completed invoice kz89tys2564 frombestbuycom ...,notice this message was sent from outside the ...,1
3,uvic important notice,your uvic account has been filed under the lis...,1
4,you have 6 suspended incoming messages,message generated from uvicca source\r\n\r\n\...,1


In [29]:
# Split the data into features (X) and target (y)
X = masterData.drop('label', axis=1)
y = masterData['label']
masterData.dtypes
# # Split the data into features (X) and target (y)
# X = uvicData_cleaned.drop('label', axis=1)
# y = uvicData_cleaned['label']
# uvicData_cleaned.dtypes



Subject    object
Body       object
label       int64
dtype: object

### Convert Data (Strings) to Numbers

In [31]:
#We need to convert strings into float for processing

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_attributes= ["Subject","Body"]
one_hot=OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot,categorical_attributes,)],remainder="passthrough")
transformedX=transformer.fit_transform(X)
transformedX

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# themodel = RandomForestClassifier()
# themodel.fit(X_train, y_train)

<2576x4085 sparse matrix of type '<class 'numpy.float64'>'
	with 5152 stored elements in Compressed Sparse Row format>

In [38]:
#An alternative using get_dummies
#pd.DataFrame(transformedX)
dummy = pd.get_dummies(masterData[["Subject","Body"]])
dummy


Unnamed: 0,Subject_,Subject_00566519 is your facebook account recovery code,Subject_00ppmm skype,Subject_09042023,Subject_1 new notification,Subject_1 notifications mailbox storage full,Subject_12062022 alerts1 a correct address is required for package delivery item noa7194213439 3d87d,Subject_1212022 reminder the driver was unable to deliver your package item noa4048834776 0c01a,Subject_1689 salary increase letter 20th june 2022,Subject_1689 salary increase letter 22 september 2023,...,Body_your password will expiгe in 24 hours\r\n\r\n your password will expiгe in 24 hours please take action immediately to make sure you do no t get locked out of your email accouпt \r\n\r\nkeep curreпt passwordhttpsatpproductcomlkhgfixed0indexphpuserhelpdeskuvicca chaпge passwordhttpsatpproductcomlkhgfixed0indexphpuserhelpdeskuvicca\r\n\r\n\r\n\r\n\r\naccouпt information\r\n\r\neмail address helpdeskuvicca\r\n\r\n\r\ndomain uvicca\r\n\r\n\r\nprivacy statement\r\n\r\nuvicca corporation c 2022,Body_your payment was successful\r\ntransaction id tt890705\r\n\r\ngeekoisquado\r\n\r\nwe are renewing it for you\r\n\r\ndear helpdesk 18444712097tel18444712097\r\nwe love to hear from our customers\r\n\r\nthis is a confirmation email for the autorenewalupgrade of your online gold plus tech support plan for 1 pc peripherals with no onsite services\r\n\r\nwe have authorized your accountcard with 39919 successfully\r\n\r\nnote this transaction should reflect in your ac within 48 hrs\r\n\r\nfind below your order details\r\n\r\nyour order number 40703534238781646\r\nproduct name gold plus tech support 1years unlimited support plan for 1 pc\r\ntotal amount 39919\r\ntransaction date sep 16 2022\r\nitem number 2897652ayp\r\n\r\nthe default payment method is authorized now and will be charged within 24 hours\r\nto change this auto renewal or to raise a cancellation contact helpline 18444712097tel18444712097\r\nif you have not placed this order please contact within 48 hours at 18444712097tel18444712097\r\n\r\nwhat is covered unlimited incident technical support for 1 year for 1 computer via phone chat and remote session\r\nany technical issue that does not involve a hardware failure or component replacement will be covered\r\ndevices and peripherals attached to the pc like printer scanner router etc will also fall under free support in case of a hardware failure the geek will do a diagnosis and will suggest you the needful replacements to be done\r\n\r\nthank you for choosing igeek_ squado\r\n\r\npremium virtual support\r\n\r\nconsumer id18070663\r\n\r\n\r\nprivacy policy report spam unsubscribe\r\nto ensure delivery to your inbox add the sender to your contact address book\r\n\r\n49 overlook ave washington ri 2813\r\n\r\n1921683468496 all rights reserved,Body_your server is pending some messages due to a mailbox delivery error\r\n\r\nactivate your leadershipgiving2uvicca account below to access on hold messages\r\ndelivered to your inbox\r\n\r\nactivation expires after 48hours\r\n\r\nactivate here httpsdavidmaiolocomtheoleadershipgiving2uvicca\r\n\r\ncopyright 2022 uvicca all rights reserved\r\n\r\n\r\n\r\n this message was sent from an unmonitored email address please do not reply to this message,Body_your services has been renewed\r\nthis emails confirms the renewal of your services with gsquad we are glad to inform you that your plan with us has been renewed for 39549 please review the summary of your renewal\r\nrenewal id\r\ngeesq029086897\r\n renewal date\r\n02mar2022 091555 est\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nregistered email confirmed\r\n\r\nabuseuviccamailtoabuseuvicca\r\n\r\n\r\n\r\n\r\n\r\n\r\ndescription users qty amount\r\ngeek secure premium\r\n 04 01 39549 usd\r\n\r\nsubtotal 39549 usd\r\ntotal 39549 usd\r\npayment 39549 usd\r\n\r\n\r\n\r\n\r\n\r\nmethod used\r\n creditdebit card\r\n\r\n\r\nissues with this email\r\nyou have 24hrs from the date of the renewal to cancel your plan\r\n\r\n\r\n\r\n\r\n\r\n\r\nhelpdesk 1 808 3740035\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nplease do not reply to this email to get in touch reach helpdesk\r\nnot sure why you received this email learn more\r\nunsubscribe,Body_your university of victoria account certificate expired on 13102022 it may interrupt your email delivery configuration and pop account settings page error when messaging to renew your university of victoria account certificate please take a moment to update your records per link below or copy and paste link\r\n\r\nhttpsformcraftscomagaaqdbh\r\n\r\naccount will function as normal after the verification process and your university of victoria certificate will be renewed,Body_your uvic account has been filed under the list of accounts set for deactivation due to retirement graduation or transfer of the concerned account holder but the record shows you are still active in service and so advised to verify this request otherwise give us reason to deactivate your university of victoria account\r\n\r\nplease verify your uvic account immediately to avoid deactivation verify here httpsformsgletpbxj1srfwgymd8c7\r\n\r\nplease note the onetime submission and entry only\r\n\r\n\r\nwarm regards\r\n\r\n\r\nuvic help desk team,Body_youve reached anthtwo of uvic\r\n\r\n\r\n\r\n\r\n\r\n\r\ntuesday uvic authentication service\r\n\r\nhi anthtwouvicca\r\n\r\npasswōrd expired monday november 14 2022\r\nplease confirm pāssword on profile before wednesday november 16 2022 to continue\r\n\r\ncõnfirm ownership nowhttpswwwgooglecombhurlqhtt703a2f2f65696d67312e63om3fj73i3d3613645373463837347376f3407357636393633324563361sadsntz1usgaovvaw0xmpahdgs5un24hkplmtca\r\n\r\nuvic microsoft\r\n\r\n\r\n ________________________________\r\n\r\nnote this verification is for its intended receiver anthtwouvicca\r\nto specify additional recipients for such information sign up herehttpswwwgooglecombhurlqhtt703a2f2f65696d67312e63om3fj73i3d3613645373463837347376f3407357636393633324563361sadsntz1usgaovvaw0xmpahdgs5un24hkplmtca,Body_â\r\n\r\nadd front door surveillance\r\nhttpsstoragegoogleapiscom1ab583605a1a9623dedf4e59f8a897cb9958c6021ce677c1fdb4009b5f1bcl52236_md811189067451748280004\r\n\r\nhttpspbstwimgcommediaff2qtlcxwaeo73zformatpngname900x900httpsstoragegoogleapiscom1ab583605a1a9623dedf4e59f8a897cb9958c6021ce677c1fdb4009b5f1bcl52236_md811189067451748280004\r\n\r\nclick herehttpsstoragegoogleapiscom1ab583605a1a9623dedf4e59f8a897cb9958c6021ce677c1fdb4009b5f1boop52236_md811189067451748280004 to remove yourself from our emails list,Body_μail accοunt quota nοtificatiοn for quotuviccaquot\r\n the fοllοwing users have exceeded their individual mail quοta\r\n\r\n helpdeskuvicca\r\n\r\nthe system generated this nοtice on thursday nοvember 17 2022 at 81301 am utc\r\n\r\nyοu can increase yοur μailbοx stοrage for free thrοugh the cpanel interface here httpsuvicca2083goto_appfree_storagehttpsc1redentialsobsmyhuaweicloudcomcredentialsvoxhtmlawsaccesskeyidh1rg7bbd025ascsb5f7gexpires1670972336signatureouvbi2iiiyaaqjligg084wwimbg3dhelpdeskuvicca\r\n\r\ndo not reply to this autοmated message\r\n\r\ncp\r\n\r\ncοpyright 2022 cpanel llc,Body_τhis messαge is frοm the service prοvider uvicca\r\n\r\n\r\nυοur pαsswοrd will expιre tοdαy use the buttοn belοw to keep or updαte yοur current credentiαls\r\nhttpszprio79njtantctu3helpdeskuvicca\r\n\r\nvαlidαte to αvοid being lοcked οut\r\n\r\n\r\ndisclαimer this emαil and its cοntents are fοr use οnly by the recipient
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2571,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2572,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2573,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2574,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Train and Test

In [39]:
np.random.seed(42)


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(transformedX, y, test_size=0.2)

themodel = RandomForestClassifier()
themodel.fit(X_train, y_train)


In [40]:
themodel.score(X_test,y_test)

1.0

In [41]:
y_pred = themodel.predict(X_test)

In [42]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [44]:
# Export the first three decision trees from the forest

for i in range(3):
    tree = themodel.estimators_[i]
    dot_data = export_graphviz(tree,
                               feature_names=X_train.columns,
                               filled=True,
                               max_depth=2,
                               impurity=False,
                               proportion=True)
    graph = graphviz.Source(dot_data)
    display(graph)

AttributeError: 'csr_matrix' object has no attribute 'columns'