In [111]:
# import packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re 

In [112]:
# import and analyse dataset
labels = pd.read_csv('labels_and_ids.csv')
labels.groupby('jericho_human_classification').size()

jericho_human_classification
ham      590
phish     22
spam     539
dtype: int64

In [113]:
# flatten inbound_event_ids
labels.loc[:, 'inbound_event_ids'] = labels['inbound_event_ids'].apply(lambda x: x.removeprefix('[').removesuffix(']').split(','))
labels = labels.explode('inbound_event_ids')
# remove duplicates
labels = labels.drop_duplicates(subset='inbound_event_ids')
# convert inbound_event_ids to int
labels.loc[:, 'inbound_event_ids'] = labels['inbound_event_ids'].astype(int)
# down sample to match phish
ham_labels = labels[labels['jericho_human_classification'] == 'ham']
spam_labels = labels[labels['jericho_human_classification'] == 'spam']
phish_labels = labels[labels['jericho_human_classification'] == 'phish']
print(ham_labels.shape, spam_labels.shape, phish_labels.shape)
# ham_labels = ham_labels.sample(n=phish_labels.shape[0], random_state=42)
# spam_labels = spam_labels.sample(n=phish_labels.shape[0], random_state=42)
labels = pd.concat([ham_labels, spam_labels, phish_labels])
labels.info()
# save to ids to text
for i in labels['inbound_event_ids']:
    with open('ids.txt', 'a') as f:
        f.write(f'{i},')


(728, 3) (556, 3) (25, 3)
<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 1 to 1148
Data columns (total 3 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   id                            1309 non-null   int64 
 1   jericho_human_classification  1309 non-null   object
 2   inbound_event_ids             1309 non-null   object
dtypes: int64(1), object(2)
memory usage: 40.9+ KB


In [114]:
obj_df = pd.read_csv('email_data.csv')
# merge with labels
obj_df = obj_df.merge(labels, left_on='id', right_on='inbound_event_ids')
obj_df = obj_df.drop(columns=['id_x', 'id_y'])
obj_df['processed_body'] = obj_df['processed_body'].astype(str)
obj_df['processed_subject'] = obj_df['processed_subject'].astype(str)
obj_df['jericho_human_classification'] = obj_df['jericho_human_classification'].astype(str)
obj_df['inbound_event_ids'] = obj_df['inbound_event_ids'].astype(int)
obj_df.head()

Unnamed: 0,from,processed_subject,processed_body,payload,jericho_human_classification,inbound_event_ids
0,no-reply@rentaequipospiedra.cl,9192-Jerichosecurity: Accept the proposal and ...,Docusign He‌llo sage@jerichosecurity.com\nYo‌u...,"{""cc"":"""",""to"":""sage@jerichosecurity.com"",""bcc""...",phish,208
1,deep@beeleads.com,Life of a Prospect- A Journey of Multiple Purc...,Life of a Prospect- A Journey of Multiple Purc...,"{""cc"":"""",""to"":""alyssa.davis@jerichosecurity.co...",spam,402
2,no-reply@zoom.us,Ahmad Anderson has joined your meeting - Sales...,"Hi Madison Martin, Ahmad Anderson has joined...","{""cc"":"""",""to"":""madison@jerichosecurity.com"",""b...",ham,501
3,telmalogistics@gmail.com,Sage Wohns,Chris\n---\n\nI need you to kindly leave your ...,"{""cc"":"""",""to"":""chris@jerichosecurity.com"",""bcc...",phish,509
4,attendurgent@gmail.com,Sage Wohns,"Hello Chris,\n\n I have a task for you to comp...","{""cc"":"""",""to"":""chris@jerichosecurity.com"",""bcc...",phish,638


In [115]:
# preparation of data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
le = LabelEncoder()
cv = CountVectorizer()
tfidf = TfidfTransformer()
def simplify_text(text):
    text = text.lower()
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    text = text.replace('\t', ' ')
    # remove non text characters
    text = ''.join([i for i in text if i.isalnum() or i == ' '])
    return text
def spf_pass(text):
    if 'spf=fail' in text:
        return -1
    elif 'spf=pass' in text:
        return 1
    else:
        return 0
def dkim_pass(text):
    if 'dkim=fail' in text:
        return 0
    elif 'dkim=pass' in text:
        return 1
    else:
        return 0
def dmarc_pass(text):
    if 'dmarc=fail' in text:
        return 0
    elif 'dmarc=pass' in text:
        return 1
    else:
        return 0

simplified_body = obj_df['processed_body'].apply(simplify_text)
simplified_subject = obj_df['processed_subject'].apply(simplify_text)
vectorized_body = cv.fit_transform(simplified_body)
vectorized_subject = cv.fit_transform(simplified_subject)
tfidf_body = tfidf.fit_transform(vectorized_body)
tfidf_subject = tfidf.fit_transform(vectorized_subject)

spf = obj_df['payload'].apply(spf_pass)
dkim = obj_df['payload'].apply(dkim_pass)
dmarc = obj_df['payload'].apply(dmarc_pass)
# encode labels
encoder = LabelEncoder()
y = encoder.fit_transform(obj_df['jericho_human_classification'])

In [116]:
# using count vectorizer with GaussianNB
combined_features = np.concatenate([vectorized_subject.toarray(), vectorized_body.toarray(), spf.to_numpy().reshape(-1, 1), dkim.to_numpy().reshape(-1, 1), dmarc.to_numpy().reshape(-1, 1)], axis=1)
X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.3, random_state=42)

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(y_pred)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

[1 1 2 2 2 1 2 0 1 1 2 2 2 2 2 1 0 1 1 0 0 0 2]
              precision    recall  f1-score   support

           0       0.80      0.50      0.62         8
           1       0.88      0.88      0.88         8
           2       0.60      0.86      0.71         7

    accuracy                           0.74        23
   macro avg       0.76      0.74      0.73        23
weighted avg       0.77      0.74      0.73        23

[[4 1 3]
 [0 7 1]
 [1 0 6]]


In [117]:
# using tfidf with MultinomialNB
from sklearn.naive_bayes import MultinomialNB
combined_features = np.concatenate([tfidf_subject.toarray(), tfidf_body.toarray(), spf.to_numpy().reshape(-1, 1), dkim.to_numpy().reshape(-1, 1), dmarc.to_numpy().reshape(-1, 1)], axis=1)
X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.3, random_state=42)
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(y_pred)
print(classification_report(y_test, y_pred))

[1 1 1 0 2 1 2 1 1 1 2 2 1 2 1 1 0 1 1 0 0 0 0]
              precision    recall  f1-score   support

           0       0.67      0.50      0.57         8
           1       0.58      0.88      0.70         8
           2       0.40      0.29      0.33         7

    accuracy                           0.57        23
   macro avg       0.55      0.55      0.53        23
weighted avg       0.56      0.57      0.54        23

