In [None]:
import pandas as pd
import pytest
from pathlib import Path
from sklearn.model_selection import train_test_split

import numpy as np
from email.parser import BytesParser
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

In [None]:
# Mapping of dataset subfolder -> (subcategory, category, binary label).
# NOTE: some folder names below contain an en dash ("–") rather than a hyphen;
# they must match the directory names on disk exactly.
label_map = {
    # Malicious
    "CEO_Fraud_-_Gift_Cards": ("gift_cards", "ceo_fraud", "malicious"),
    "CEO_Fraud_-_Payroll_Update": ("payroll_update", "ceo_fraud", "malicious"),
    "CEO_Fraud_-_Wire_Transfers": ("wire_transfers", "ceo_fraud", "malicious"),
    "Phishing_-_3rd_Party": ("third_party", "phishing", "malicious"),
    "Phishing_-_Outbound": ("outbound", "phishing", "malicious"),
    "Phishing_–_UBC": ("ubc", "phishing", "malicious"),
    "Phishing_UBC_-_Outbound": ("ubc_outbound", "phishing", "malicious"),
    "Self-Phishing": ("self_phishing", "phishing", "malicious"),
    "Spearphishing": ("spearphishing", "phishing", "malicious"),
    "Reply_Chain_Attack": ("reply-chain-attack", "reply-chain-attack", "malicious"),

    # Benign
    "Legitimate_Email_Confirmed": ("legitimate_email_confirmed", "legitimate", "benign"),
    "Spam_-_False_Positives": ("spam_false_positive", "legitimate", "benign"),
    "Spam_–_Inbound": ("inbound", "spam", "benign"),
    "Spam_–_Outbound": ("outbound", "spam", "benign"),
}

dataset_root = Path("/data/dataset")

# Collect one row per .eml file found under each labelled subfolder.
rows = []
for subfolder, (subcategory, category, binary_label) in label_map.items():
    # sorted(): rglob yields files in filesystem order, which differs between
    # machines and would make the seeded stratified sample irreproducible.
    for eml in sorted((dataset_root / subfolder).rglob("*.eml")):
        rows.append({
            "path": eml.as_posix(),
            "target_1": binary_label,
            "target_2": category,
            "target_3": subcategory
        })

# Build the full DataFrame. Columns are given explicitly so the frame keeps its
# schema even when no files were found.
df = pd.DataFrame(rows, columns=["path", "target_1", "target_2", "target_3"])

In [None]:
# Draw a 1000-row subsample stratified on the finest-grained label (target_3);
# the other half of the split is discarded. Seeded for repeatability.
sample_small, _ = train_test_split(
    df,
    train_size=1000,
    stratify=df["target_3"],
    random_state=42
)

In [None]:
# File paths of the sampled emails; the cell displays how many were drawn.
paths = sample_small["path"].tolist()
len(paths)

In [None]:
# Example paths for a quick manual run:
# paths = [
#     '/data/dataset/Phishing_-_3rd_Party/0a0e0cab473ff110072fbf12516d43c9/0_message.eml',
#     '/data/dataset/Phishing_–_UBC/0ad4904a1bda559024c255b62a4bcbc3/0_message.eml',
#     '/data/dataset/CEO_Fraud_-_Wire_Transfers/3a55d04d473bc290efc6767b416d43db/0_message.eml',
#     '/data/dataset/CEO_Fraud_-_Gift_Cards/b9ed6f671bc6155024c255b62a4bcb1b/0_message.eml',
#     '/data/dataset/Phishing_UBC_-_Outbound/0aeaad25938502105a9f30edfaba102e/0_message.eml'
# ]


def _decode_text(raw, charset):
    """Decode a text part with its declared charset, falling back to UTF-8."""
    if raw is None:
        return ""
    try:
        return raw.decode(charset or "utf-8", errors="replace")
    except LookupError:
        # The header names a charset Python does not know.
        return raw.decode("utf-8", errors="replace")


emails = []      # parsed message objects
payloads = []    # per email: {content type: decoded payload bytes (None for containers)}
text_html = []   # raw text/html bytes, or None when the email has no HTML part
text_plain = []  # plain-text body (decoded text/plain, else text extracted from HTML)
text_clean = []  # text_plain with all whitespace runs collapsed to single spaces

for path in paths:
    with open(path, 'rb') as fp:
        msg = BytesParser().parse(fp)
    emails.append(msg)

    payload = {}
    charsets = {}
    for part in msg.walk():
        ctype = part.get_content_type()
        # Keep the FIRST part of each content type: walk() also descends into
        # attachments and forwarded messages, which would otherwise overwrite
        # the top-level body.
        if ctype not in payload:
            payload[ctype] = part.get_payload(decode=True)
            charsets[ctype] = part.get_content_charset()

    payloads.append(payload)

    html = payload.get('text/html')
    text_html.append(html)

    if payload.get('text/plain') is not None:
        body = _decode_text(payload['text/plain'], charsets.get('text/plain'))
    elif html is not None:
        # Explicit parser so the result does not depend on which optional
        # parsers happen to be installed.
        body = BeautifulSoup(html, "html.parser", from_encoding=charsets.get('text/html')).get_text()
    else:
        # Neither a plain-text nor an HTML part: keep the row with an empty body
        # instead of raising KeyError.
        body = ""
    text_plain.append(body)

    text_clean.append(' '.join(body.split()))

In [None]:
# One row per email, indexed by file path. Besides the text columns, the frame
# also carries the parsed message object ('email') and the payload dict.
data_df = pd.DataFrame({
    'path': paths,
    'email': emails,
    'payload': payloads,
    'text_html': text_html,
    'text_plain': text_plain,
    'text_clean': text_clean,
}).set_index('path')

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
import spacy
import numpy as np
import re

In [None]:
# Keep the NER component enabled: the token filters in this notebook drop tokens
# whose ent_type_ is "PERSON", and ent_type_ is always empty when "ner" is
# disabled. Only the dependency parser is switched off.
nlp = spacy.load("en_core_web_md", disable=["parser"])

def clean_text(text, min_token_len=2, irrelevant_pos=["ADV", "PRON", "CCONJ", "PUNCT", "PART", "DET", "ADP"]):
    """
    Reduce one text string to a space-separated string of lower-cased lemmas.

    The literal tag ``[CAUTION: Non-UBC Email]`` and clock times (``9:30``,
    ``9:30-10:45``) are removed first; the remainder is run through the
    module-level spaCy pipeline ``nlp`` and filtered token by token.

    Parameters
    ----------
    text : str
        The input text to be cleaned.
    min_token_len : int, optional
        Length threshold; only tokens strictly longer than this are kept
        (default is 2, i.e. tokens of 3+ characters survive).
    irrelevant_pos : list of str, optional
        POS tags whose tokens are discarded.

    Returns
    -------
    str
        The surviving lemmas, lower-cased and joined by single spaces.
    """
    # Drop the caution tag, then blank out timestamps / time ranges.
    without_tag = text.replace("[CAUTION: Non-UBC Email]", "")
    without_times = re.sub(r"\b\d{1,2}:\d{2}(?:\s*[–-]\s*\d{1,2}:\d{2})?\b", " ", without_tag)

    # Matches bare numbers and CSS-like sizes such as '10', '0px', '100%'.
    numeric_like = re.compile(r"^\d+(px|em|%)?$")

    kept = []
    for tok in nlp(without_times):
        lowered = tok.lemma_.lower()

        # Guard clauses: a token is dropped as soon as any filter rejects it.
        if tok.is_stop or len(tok) <= min_token_len or tok.pos_ in irrelevant_pos:
            continue
        if tok.is_space or tok.like_email or tok.like_url or tok.like_num:
            continue
        if tok.is_oov or tok.is_punct or tok.is_digit:
            continue
        if tok.ent_type_ == "PERSON":
            continue
        if numeric_like.match(lowered):
            continue

        kept.append(lowered)

    return " ".join(kept).strip()



def preprocess_text(input_data, min_token_len=2, irrelevant_pos=["ADV", "PRON", "CCONJ", "PUNCT", "PART", "DET", "ADP"]):
    """
    Apply ``clean_text`` to a single string or to every element of a Series.

    Parameters
    ----------
    input_data : str or pandas.Series
        One text, or a Series of texts.
    min_token_len : int, optional
        Forwarded to ``clean_text`` (default is 2).
    irrelevant_pos : list of str, optional
        Forwarded to ``clean_text``.

    Returns
    -------
    str or pandas.Series
        A cleaned string for string input; a Series of cleaned strings (same
        index) for Series input.

    Raises
    ------
    TypeError
        If input_data is neither a string nor a pandas Series.
    """
    def _clean_one(text):
        return clean_text(text, min_token_len, irrelevant_pos)

    if isinstance(input_data, str):
        return _clean_one(input_data)
    if isinstance(input_data, pd.Series):
        return input_data.apply(_clean_one)
    raise TypeError("Input must be a string or a pandas Series of strings.")


In [None]:
def preprocess_spacy(
    input_data,
    min_token_len=2,
    irrelevant_pos=["ADV", "PRON", "CCONJ", "PUNCT", "PART", "DET", "ADP"],
):
    """
    Lemmatize and filter a single text or a pandas Series of texts with spaCy.

    Note: a function with the same name but a different signature (taking a
    spaCy doc) is defined later in this notebook and replaces this one once
    that cell runs.

    Parameters:
    - input_data: str or pd.Series
    - min_token_len: tokens must be strictly longer than this to be kept
    - irrelevant_pos: POS tags whose tokens are discarded

    Returns:
    - str for string input, pd.Series (same index) for Series input

    Raises:
    - TypeError if input_data is neither a str nor a pd.Series
    """

    def _rejected(tok):
        # True as soon as any single filter rules the token out.
        return (
            tok.is_stop
            or len(tok) <= min_token_len
            or tok.pos_ in irrelevant_pos
            or tok.is_space
            or tok.like_email
            or tok.like_url
            or tok.like_num
            or tok.is_oov
            or tok.is_punct
            or tok.is_digit
            or tok.ent_type_ == "PERSON"
        )

    def _clean(text):
        parsed = nlp(text.replace("[CAUTION: Non-UBC Email]", ""))
        lemmas = [tok.lemma_.lower() for tok in parsed if not _rejected(tok)]
        return " ".join(lemmas).strip()

    if isinstance(input_data, str):
        return _clean(input_data)
    if isinstance(input_data, pd.Series):
        return input_data.apply(_clean)
    raise TypeError("Input must be a string or a pandas Series of strings.")

In [None]:
def extract_keywords(text_series, top_n = 5):
    """
    Extract the top N TF-IDF keywords from each document in a text series.

    Only terms that actually occur in a document are returned, so a document
    with fewer than ``top_n`` distinct terms yields a shorter list (an empty
    document yields ``[]``) instead of being padded with zero-weight terms.

    Parameters
    ----------
    text_series : pandas.Series
        Series of preprocessed text documents.
    top_n : int
        Maximum number of keywords to extract per document (>= 0).

    Returns
    -------
    pandas.Series
        Series of lists of str (same index as ``text_series``), each holding up
        to ``top_n`` keywords ordered by decreasing TF-IDF weight.

    Raises
    ------
    ValueError
        If ``top_n`` is negative, or (from scikit-learn) if the corpus yields
        an empty vocabulary.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative.")
    if len(text_series) == 0:
        return pd.Series([], index=text_series.index, dtype=object)

    # Vectorize using TF-IDF; keep the matrix sparse.
    tfidf = TfidfVectorizer()
    X_tfidf = tfidf.fit_transform(text_series).tocsr()
    X_tfidf.sort_indices()
    feature_names = np.asarray(tfidf.get_feature_names_out())

    top_keywords_list = []
    for row_idx in range(X_tfidf.shape[0]):
        start, end = X_tfidf.indptr[row_idx], X_tfidf.indptr[row_idx + 1]
        term_ids = X_tfidf.indices[start:end]   # vocabulary ids present in this doc
        weights = X_tfidf.data[start:end]       # their TF-IDF weights
        # Stable sort on negated weights: descending weight, ties broken by
        # vocabulary order. Slicing with [:top_n] is also correct for top_n == 0
        # (the former [-top_n:] slice returned the whole vocabulary).
        best = np.argsort(-weights, kind="stable")[:top_n]
        top_keywords_list.append(feature_names[term_ids[best]].tolist())

    return pd.Series(top_keywords_list, index=text_series.index)


In [None]:
# Lemmatized / filtered version of every email body (Series aligned with data_df).
pp_text = preprocess_spacy(data_df.text_clean)
pp_text

In [None]:
# Top TF-IDF keywords per email, computed on the text cleaned by preprocess_text.
keywords = extract_keywords(preprocess_text(data_df.text_clean))
keywords

In [None]:
# Side-by-side view of each email's text and its extracted keywords.
# .copy() so that adding the column does not write into a slice of data_df.
df_kw = data_df[['text_clean']].copy()
df_kw['keywords'] = keywords
df_kw

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

def generate_bow(keyword):
    """
    Build a per-document bag-of-words dictionary from a pandas Series.

    Parameters
    ----------
    keyword : pandas.Series
        Text data; each element is either a string or a list of keywords
        (lists are joined with spaces before vectorizing).

    Returns
    -------
    pandas.Series
        Same index as ``keyword``. Each element is a dict mapping every term of
        the fitted vocabulary to its count in that document (absent terms map
        to 0). English stop words are excluded from the vocabulary.
    """
    def _as_text(item):
        # Keyword lists become a single space-separated string.
        if isinstance(item, list):
            return " ".join(item)
        return item

    documents = keyword.apply(_as_text)

    vectorizer = CountVectorizer(stop_words="english")
    counts = vectorizer.fit_transform(documents)
    vocabulary = vectorizer.get_feature_names_out()

    # One dict per document, built from the dense form of its sparse row.
    per_document = []
    for sparse_row in counts:
        dense_row = sparse_row.toarray().flatten()
        per_document.append(dict(zip(vocabulary, dense_row)))

    return pd.Series(per_document, index=keyword.index)


In [None]:
# Reuse the `keywords` Series computed earlier instead of re-running the spaCy
# preprocessing and TF-IDF extraction over the whole corpus a second time.
generated_bow = generate_bow(keywords)
generated_bow

<br>

# Clustering

`conda install conda-forge::sentence-transformers`

In [None]:
# Working frame for the clustering results. .copy() makes it independent of
# data_df, so the cluster-label columns added below are written to a real frame
# rather than a slice (avoids SettingWithCopyWarning).
df = data_df[['text_clean']].copy()


In [None]:
# Sentence-embedding model used for all clustering experiments below.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Embed every cleaned email body; emb_sent_df is the same data wrapped in a
# DataFrame that shares data_df's index.
emb_sents = embedder.encode(data_df["text_clean"].tolist())
emb_sent_df = pd.DataFrame(emb_sents, index=data_df.index)
emb_sent_df

### Method 1: DBSCAN
- No need to specify the number of clusters; DBSCAN determines how many clusters are present in the dataset.

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Sweep eps for DBSCAN (cosine distance) and report the resulting clustering.
for eps in np.arange(0.5, 0.7, 0.01):
    # Two decimals hide float noise such as 0.5100000000000001.
    print("\neps={:.2f}".format(eps))
    dbscan = DBSCAN(eps=eps, min_samples=2, metric="cosine")
    labels = dbscan.fit_predict(emb_sents)
    # DBSCAN marks noise points with the label -1; noise is not a cluster, so
    # it is excluded from the cluster count and reported separately.
    n_noise = int(np.sum(labels == -1))
    n_clusters = len(np.unique(labels[labels != -1]))
    print("Number of clusters: {}".format(n_clusters))
    print("Noise points: {}".format(n_noise))
    # Index 0 of the bincount is the noise bucket (label -1 shifted by +1).
    print("Cluster sizes: {}".format(np.bincount(labels + 1)))
    print("Cluster memberships:{}".format(labels))


In [None]:
# Final DBSCAN run; the cluster label of each email is stored in df["dbscan"].
# NOTE(review): this uses min_samples=3 while the eps sweep above used
# min_samples=2 — confirm that eps=0.64 was chosen for this setting.
dbscan = DBSCAN(eps=0.64, min_samples=3, metric="cosine")
df["dbscan"] = dbscan.fit_predict(emb_sents)
df

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, colorConverter, LinearSegmentedColormap

In [None]:
# Colour palette indexed by cluster label in discrete_scatter (21 entries).
colors = ['xkcd:azure', 'yellowgreen', 'tomato', 'teal', 'indigo', 'aqua', 'orangered', 'orchid', 'black', 'xkcd:turquoise', 'xkcd:violet', 'aquamarine', 'chocolate', 'darkgreen', 'sienna', 'pink', 'lightblue', 'yellow', 'lavender', 'wheat', 'linen']


def discrete_scatter(x1, x2, y=None, markers=None, s=8, ax=None,
                     labels=None, padding=.2, alpha=1, c=None, markeredgewidth=0.6,
                     label_points=False, x1_annot=-0.1, x2_annot=0.2):
    """Adaption of matplotlib.pyplot.scatter to plot classes or clusters.

    Each distinct value in ``y`` is drawn as its own series, coloured from the
    module-level ``colors`` palette. Palette and marker lookups wrap around, so
    any number of clusters (and the label -1) can be plotted.

    Parameters
    ----------
    x1 : nd-array
        input data, first axis
    x2 : nd-array
        input data, second axis
    y : nd-array, optional
        input data, discrete labels. If None, all points form one group.
    markers : list of string or str, optional
        Markers to use, one per group; None selects a built-in cycle. A single
        marker is reused for every group.
    s : int or float
        Size of the marker
    ax : matplotlib Axes, optional
        Axes to draw on; defaults to the current axes.
    labels : sequence, optional
        Legend label per group; defaults to the unique values of ``y``.
    padding : float
        Currently unused; kept for interface compatibility.
    alpha : float
        Alpha value for all points.
    c : sequence, optional
        Only consulted when it has exactly one element, which then replaces the
        palette indices.
    markeredgewidth : float
        Edge width of the markers.
    label_points : bool
        If True, annotate every point with its positional index.
    x1_annot, x2_annot : float
        Offsets of the annotation text relative to each point.

    Returns
    -------
    list
        The matplotlib line objects created, one per plotted group.
    """
    if ax is None:
        ax = plt.gca()

    if y is None:
        y = np.zeros(len(x1))

    unique_y = np.unique(y)

    if markers is None:
        markers = ['o', '^', 'v', 'D', 's', '*', 'p', 'h', 'H', '8', '<', '>'] * 10

    if len(markers) == 1:
        markers = markers * len(unique_y)

    if labels is None:
        labels = unique_y

    # Palette index per group: a single group uses the last palette entry,
    # otherwise the (sorted) label values themselves are the indices.
    if len(unique_y) == 1:
        color_ids = [-1]
    else:
        color_ids = list(unique_y)

    if c is not None and len(c) == 1:
        color_ids = c

    # lines in the matplotlib sense, not actual lines
    lines = []
    for i, (yy, color_ind) in enumerate(zip(unique_y, color_ids)):
        mask = y == yy
        # Wrap around the palette so labels >= len(colors) do not raise
        # IndexError; -1 still maps to the last colour. int() also accepts
        # integral float labels.
        color = colors[int(color_ind) % len(colors)]
        # use light edge for dark markers
        if np.mean(colorConverter.to_rgb(color)) < .2:
            markeredgecolor = "grey"
        else:
            markeredgecolor = "black"

        lines.append(ax.plot(x1[mask], x2[mask], markers[i % len(markers)], markersize=s,
                             label=labels[i], alpha=alpha, c=color,
                             markeredgewidth=markeredgewidth,
                             markeredgecolor=markeredgecolor)[0])

    if label_points:
        font_size = 9
        for i in range(len(x1)):
            ax.annotate(str(i), (x1[i]+0.2, x2[i]+0.2),
                        xytext=(x1[i]+x1_annot, x2[i]+x2_annot), c='k', size=font_size)

    return lines
    
def plot_original_clustered(X, model, labels):
    """Plot the first two columns of X twice: unlabelled, and coloured by cluster.

    Parameters
    ----------
    X : nd-array
        Data matrix; only columns 0 and 1 are plotted.
    model : estimator
        The clustering model; its class name is used in the title. For a
        KMeans model the cluster centres are drawn as stars.
    labels : nd-array
        Cluster label of every row of X.
    """
    n_labels = np.unique(labels).shape[0]
    model_name = type(model).__name__

    _, (raw_ax, clustered_ax) = plt.subplots(1, 2, figsize=(12, 4))
    panel_titles = ((raw_ax, "Original dataset"), (clustered_ax, f"{model_name} clusters"))
    for axis, title in panel_titles:
        axis.set_title(title)
        axis.set_xlabel("Feature 0")
        axis.set_ylabel("Feature 1")

    # Left panel: raw points. Right panel: the same points grouped by label.
    discrete_scatter(X[:, 0], X[:, 1], ax=raw_ax)
    discrete_scatter(X[:, 0], X[:, 1], labels, c=labels, markers='o', ax=clustered_ax)

    if model_name == "KMeans":
        centers = model.cluster_centers_
        discrete_scatter(
            centers[:, 0], centers[:, 1], y=np.arange(0, n_labels), s=15,
            markers='*', markeredgewidth=1.0, ax=clustered_ax)

In [None]:
# Visualise the DBSCAN result. Only embedding dimensions 0 and 1 are plotted
# (plot_original_clustered uses X[:, 0] and X[:, 1]), not a 2-D projection.
plot_original_clustered(emb_sents, dbscan, dbscan.labels_)

### Method 2: KMeans
- Need to specify number of clusters

In [None]:
from sklearn.cluster import KMeans

# Fit KMeans with 12 clusters on the sentence embeddings.
# Despite its name, `kmeans_emb_labels` is the fitted estimator; the cluster
# labels are read from its `.labels_` attribute.
kmeans_emb_labels = KMeans(n_clusters=12, n_init='auto', random_state=42)
kmeans_emb_labels.fit(emb_sent_df)

In [None]:
# Store each email's KMeans cluster label next to its text.
df["emb_kmeans"] = kmeans_emb_labels.labels_
df

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Elbow plot over k = 1..19 to choose the number of KMeans clusters.
# Seeded so the curve is reproducible between runs, like the other KMeans fits.
model = KMeans(n_init='auto', random_state=42)
visualizer = KElbowVisualizer(model, k=(1, 20))

visualizer.fit(emb_sents)  # Fit the data to the visualizer
visualizer.show();

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

# Silhouette plot for the 12-cluster KMeans solution.
model = KMeans(12, n_init='auto', random_state=42)
visualizer = SilhouetteVisualizer(model, colors="yellowbrick")
visualizer.fit(emb_sents)  # Fit the data to the visualizer
visualizer.show();  # Finalize and render the figure

### Method 3: BERTopic
- No need to specify the number of clusters; it also generates keywords for each topic.

In [None]:
from bertopic import BERTopic

In [None]:
# BERTopic's fit_transform is documented to take a list of strings, so pass a
# plain list rather than a path-indexed pandas Series.
docs = data_df['text_clean'].tolist()
len(docs)

In [None]:
from bertopic.representation import KeyBERTInspired

# Fine-tune your topic representations
representation_model = KeyBERTInspired()
topic_model = BERTopic(representation_model=representation_model)

#topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Summary table of the topics found by BERTopic.
topic_model.get_topic_info()

In [None]:
# Keywords of topic -1 (presumably BERTopic's outlier topic — confirm).
topic_model.get_topic(-1)


# Topic modelling
`conda install conda-forge::spacy`

`python -m spacy download en_core_web_md`

 Data cleaning with spacy

In [None]:
def preprocess_spacy(
    doc,
    min_token_len=2,
    irrelevant_pos=["ADV", "PRON", "CCONJ", "PUNCT", "PART", "DET", "ADP"],
):
    """
    Turn an already-parsed spaCy doc into a string of lower-cased lemmas.

    Note: this redefines the earlier ``preprocess_spacy`` in this notebook,
    which took raw text (str or Series) instead of a doc.

    Parameters
    -------------
    doc : (spaCy doc object)
        the spacy doc object of the text
    min_token_len : (int)
        tokens must be strictly longer than this to be kept
    irrelevant_pos : (list)
        a list of irrelevant pos tags

    Returns
    -------------
    (str) the kept lemmas, lower-cased and joined by single spaces
    """
    # If the caution tag is present, strip it and re-parse the remaining text.
    caution_text = "[CAUTION: Non-UBC Email]"
    if caution_text in doc.text:
        doc = nlp(doc.text.replace(caution_text, ""))

    def _is_noise(tok):
        # A token is noise if any single filter rejects it.
        return (
            tok.is_stop
            or len(tok) <= min_token_len
            or tok.pos_ in irrelevant_pos
            or tok.is_space
            or tok.like_email
            or tok.like_url
            or tok.like_num
            or tok.is_oov
            or tok.is_punct
            or tok.is_digit
            or tok.ent_type_ == "PERSON"  # tokens identified as names
        )

    lemmas = (tok.lemma_.lower() for tok in doc if not _is_noise(tok))
    return " ".join(lemmas).strip()

In [None]:
# Start a fresh working frame for topic modelling. This rebinds `df`, so the
# clustering columns added to the previous `df` are no longer on it.
df = data_df[['text_clean']].copy()
df

In [None]:
# Parse all texts in a stream with nlp.pipe, then reduce each doc to its
# filtered lemmas using the doc-based preprocess_spacy defined above.
df["text_pp"] = [preprocess_spacy(text) for text in nlp.pipe(df["text_clean"])]
df

LDA Model

`pip install scipy gensim`

In [None]:
import gensim
from gensim.corpora import Dictionary

# Tokenised corpus: one list of tokens per preprocessed email.
corpus = [doc.split() for doc in df["text_pp"].tolist()]
dictionary = Dictionary(corpus)  # Create a vocabulary for the lda model
# Optional vocabulary pruning, currently disabled:
#dictionary.filter_extremes(no_below=5, no_above=0.5)
# Bag-of-words representation of every document.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus]

In [None]:
# Compute Coherence Score
from gensim.models import CoherenceModel

K = [6,8,10,12,14]

coherence_scores = []

for num_topics in K:
    lda = LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=10,
    )
    coherence_model_lda = CoherenceModel(
        model=lda, texts=corpus, dictionary=dictionary, coherence="c_v"
    )
    coherence_scores.append(coherence_model_lda.get_coherence())

cs_df = pd.DataFrame(coherence_scores, index=K, columns=["Coherence score"])
cs_df

In [None]:
# Coherence score against the number of topics.
cs_df.plot(title="Coherence scores", xlabel="num_topics", ylabel="Coherence score");

In [None]:
from gensim.models import LdaModel

# Number of topics of the final model, set explicitly. Previously this cell
# read the leftover loop variable `num_topics` from the coherence sweep, which
# silently trained 14 topics (the last candidate) while only 12 were printed.
# NOTE(review): 12 matches the number of topics printed below — confirm it
# against the coherence scores.
N_TOPICS = 12

lda = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=N_TOPICS,
    random_state=42,
    passes=10,
)

# Top 5 words of every topic.
topics = lda.print_topics(num_topics=N_TOPICS, num_words=5)
topics

In [None]:
# Top 5 (word, weight) pairs of topic 0.
lda.show_topic(0, topn=5)  # Topic 0

In [None]:
# df["topic"] = [lda.get_document_topics(bow) for bow in doc_term_matrix]
# df["topic_keywords"] = [", ".join([word for word, _ in lda.show_topic(max(doc, key=lambda x: x[1])[0], topn=15)]) for doc in df["topic"]]
# df

In [None]:
def _dominant_topic(bow):
    """Return the (topic_id, probability) pair with the highest probability,
    or (None, 0) when the model assigns no topic to the document."""
    # Run topic inference once per document (it used to be run twice).
    doc_topics = lda.get_document_topics(bow)
    if not doc_topics:
        return (None, 0)
    return max(doc_topics, key=lambda pair: pair[1])


df["topic"] = [_dominant_topic(bow) for bow in doc_term_matrix]

# Build the keyword string once per distinct topic instead of once per row.
_keywords_by_topic = {
    topic_id: ", ".join(word for word, _ in lda.show_topic(topic_id, topn=15))
    for topic_id in {tid for tid, _ in df["topic"] if tid is not None}
}
df["topic_keywords"] = [
    _keywords_by_topic[topic_id] if topic_id is not None else ""
    for topic_id, _ in df["topic"]
]
df

### Top2Vec modelling
`conda install conda-forge::top2vec`

In [None]:
from top2vec import Top2Vec

In [None]:
# `documents` was never defined in this notebook; build it from the cleaned
# email bodies as a plain list of strings.
documents = data_df["text_clean"].tolist()
model = Top2Vec(documents, embedding_model='distiluse-base-multilingual-cased', min_count=5)

### Sentiment analysis

In [None]:
from transformers import pipeline

In [None]:
# Sentiment classifier used by process_text_in_chunks below.
# NOTE(review): no model name is given, so the library's default model for this
# task is used — consider pinning it for reproducibility.
sentiment_pipeline = pipeline("sentiment-analysis")




In [None]:
def process_text_in_chunks(texts, chunk_size=512, merge_fn=None, classifier=None):
    """
    Classify each text chunk by chunk and optionally merge the chunk results.

    Every text is cut into consecutive slices of at most ``chunk_size``
    characters (not tokens); all slices of one text are passed to the
    classifier in a single call.

    Parameters:
    - texts: list of str, the input texts to process.
    - chunk_size: int, the maximum number of characters per chunk (>= 1).
    - merge_fn: callable, a function to merge the processed chunks (e.g., averaging scores).
      It receives the list of per-chunk results (empty for an empty text).
    - classifier: callable, maps a list of chunks to a list of results. Defaults to the
      module-level ``sentiment_pipeline``.

    Returns:
    - list, one entry per text: the merged result, or the raw list of chunk
      results when no merge_fn is given.

    Raises:
    - ValueError if chunk_size is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer.")
    if classifier is None:
        classifier = sentiment_pipeline

    results = []
    for text in texts:
        # Split the text into fixed-width character chunks.
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

        # An empty text has no chunks; skip the classifier call rather than
        # sending it an empty batch.
        chunk_results = classifier(chunks) if chunks else []

        # Merge the results using the provided merge function (default: no merging).
        results.append(merge_fn(chunk_results) if merge_fn else chunk_results)
    return results

# Example merge function: averaging sentiment scores
def merge_sentiment_scores(chunk_results):
    """Average the chunk scores separately for each sentiment label.

    Parameters:
    - chunk_results: list of dicts with 'label' and 'score' keys.

    Returns:
    - dict with keys 'POSITIVE' and 'NEGATIVE'; each value is the mean score of
      the chunks carrying that label, or 0 when no chunk has it. Chunks with
      any other label are ignored.
    """
    scores_by_label = {'POSITIVE': [], 'NEGATIVE': []}
    for res in chunk_results:
        bucket = scores_by_label.get(res['label'])
        if bucket is not None:
            bucket.append(res['score'])

    merged = {}
    for label, scores in scores_by_label.items():
        if scores:
            merged[label] = sum(scores) / len(scores)
        else:
            merged[label] = 0
    return merged

# Process the preprocessed text (the text_pp column) in 512-character chunks and
# average the chunk scores per label.
chunked_sentiment_results = process_text_in_chunks(df["text_pp"].tolist(), chunk_size=512, merge_fn=merge_sentiment_scores)
chunked_sentiment_results

In [None]:
# Attach the merged sentiment scores (one dict per email) to the frame.
df["sentiment"] = chunked_sentiment_results
df