In [11]:
# import packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re 

In [12]:
# import and analyse dataset
labels = pd.read_csv('labels_and_ids.csv')
labels.groupby('jericho_human_classification').size()

jericho_human_classification
ham      590
phish     22
spam     539
dtype: int64

In [13]:
# flatten inbound_event_ids
labels.loc[:, 'inbound_event_ids'] = labels['inbound_event_ids'].apply(lambda x: x.removeprefix('[').removesuffix(']').split(','))
labels = labels.explode('inbound_event_ids')
# remove duplicates
labels = labels.drop_duplicates(subset='inbound_event_ids')
# convert inbound_event_ids to int
labels.loc[:, 'inbound_event_ids'] = labels['inbound_event_ids'].astype(int)
# down sample to match phish
ham_labels = labels[labels['jericho_human_classification'] == 'ham']
spam_labels = labels[labels['jericho_human_classification'] == 'spam']
phish_labels = labels[labels['jericho_human_classification'] == 'phish']
ham_labels = ham_labels.sample(n=phish_labels.shape[0], random_state=42)
spam_labels = spam_labels.sample(n=phish_labels.shape[0], random_state=42)
labels = pd.concat([ham_labels, spam_labels])
labels.info()
# save to ids to text
for i in labels['inbound_event_ids']:
    with open('ids.txt', 'a') as f:
        f.write(f'{i},')


<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, 114 to 1047
Data columns (total 3 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   id                            50 non-null     int64 
 1   jericho_human_classification  50 non-null     object
 2   inbound_event_ids             50 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.6+ KB


In [14]:
obj_df = pd.read_csv('email_data.csv')
# merge with labels
obj_df = obj_df.merge(labels, left_on='id', right_on='inbound_event_ids')
obj_df = obj_df.drop(columns=['id_x', 'id_y'])
obj_df['processed_body'] = obj_df['processed_body'].astype(str)
obj_df['processed_subject'] = obj_df['processed_subject'].astype(str)
obj_df['jericho_human_classification'] = obj_df['jericho_human_classification'].astype(str)
obj_df['inbound_event_ids'] = obj_df['inbound_event_ids'].astype(int)
obj_df.head()

Unnamed: 0,from,processed_subject,processed_body,payload,jericho_human_classification,inbound_event_ids
0,deep@beeleads.com,Life of a Prospect- A Journey of Multiple Purc...,Life of a Prospect- A Journey of Multiple Purc...,"{""cc"":"""",""to"":""alyssa.davis@jerichosecurity.co...",spam,402
1,no-reply@zoom.us,Ahmad Anderson has joined your meeting - Sales...,"Hi Madison Martin, Ahmad Anderson has joined...","{""cc"":"""",""to"":""madison@jerichosecurity.com"",""b...",ham,501
2,notifications@github.com,Re: [JerichoSecurity/jericho-security] Fix | a...,@bitnovus approved this pull request.\n\nLGTM ...,"{""cc"":""\""Rodrigo Assis\"" <rodrigo@jerichosecur...",ham,642
3,nick@getkoala.com,Invitation: Sage Wohns and Nick Ruggieri @ Tue...,"Sage Wohns and Nick Ruggieri\nTuesday May 21, ...","{""cc"":"""",""to"":""sage@jerichosecurity.com"",""bcc""...",ham,728
4,adam.mckenzie@utlimatetrios.com,RE: Cyber Insurance,"Hi,\n\n\n\nDid you have an opportunity to revi...","{""cc"":"""",""to"":""ahmad@jerichosecurity.com"",""bcc...",spam,1105


In [18]:
# preparation of data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
le = LabelEncoder()
cv = CountVectorizer()
tfidf = TfidfTransformer()
def simplify_text(text):
    text = text.lower()
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    text = text.replace('\t', ' ')
    # remove non text characters
    text = ''.join([i for i in text if i.isalnum() or i == ' '])
    return text
def spf_pass(text):
    if 'spf=fail' in text:
        return -1
    elif 'spf=pass' in text:
        return 1
    else:
        return 0
def dkim_pass(text):
    if 'dkim=fail' in text:
        return 0
    elif 'dkim=pass' in text:
        return 1
    else:
        return 0
def dmarc_pass(text):
    if 'dmarc=fail' in text:
        return 0
    elif 'dmarc=pass' in text:
        return 1
    else:
        return 0

simplified_body = obj_df['processed_body'].apply(simplify_text)
simplified_subject = obj_df['processed_subject'].apply(simplify_text)
vectorized_body = cv.fit_transform(simplified_body)
vectorized_subject = cv.fit_transform(simplified_subject)
tfidf_body = tfidf.fit_transform(vectorized_body)
tfidf_subject = tfidf.fit_transform(vectorized_subject)

spf = obj_df['payload'].apply(spf_pass)
dkim = obj_df['payload'].apply(dkim_pass)
dmarc = obj_df['payload'].apply(dmarc_pass)
# encode labels
encoder = LabelEncoder()
y = encoder.fit_transform(obj_df['jericho_human_classification'])


0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9    -1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22   -1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
30    1
31    0
32    1
33    1
34    1
35    1
36    1
37    1
38    1
39    1
40    1
41    1
42    1
43    1
44    1
45    1
46    1
47    1
48    1
49    1
Name: payload, dtype: int64


In [20]:
# using count vectorizer
combined_features = np.concatenate([vectorized_subject.toarray(), vectorized_body.toarray(), spf.to_numpy().reshape(-1, 1), dkim.to_numpy().reshape(-1, 1), dmarc.to_numpy().reshape(-1, 1)], axis=1)
X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.2, random_state=42)

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.57      0.73         7
           1       0.50      1.00      0.67         3

    accuracy                           0.70        10
   macro avg       0.75      0.79      0.70        10
weighted avg       0.85      0.70      0.71        10

[[4 3]
 [0 3]]


In [21]:
combined_features = np.concatenate([tfidf_subject.toarray(), tfidf_body.toarray(), spf.to_numpy().reshape(-1, 1), dkim.to_numpy().reshape(-1, 1), dmarc.to_numpy().reshape(-1, 1)], axis=1)
X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.2, random_state=42)
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.71      0.83         7
           1       0.60      1.00      0.75         3

    accuracy                           0.80        10
   macro avg       0.80      0.86      0.79        10
weighted avg       0.88      0.80      0.81        10

