# Generate result data and graphs for each finalist BERTopic sub-model after tuning

## Preparation

In [1]:
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm
from bertopic import BERTopic

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


Read the raw CSV file in which each CWE is described by text attributes

In [2]:
# Specify the file path. Alternative inputs that can be swapped in:
#file_path_input = '../Input/simulated_data_AllSoftware.csv'
#file_path_input = '../Input/simulated_data_AllHardware.csv'
file_path_input = '../Input/simulated_data_AllSoftwareHardware.csv'

# Open the file. newline='' hands line-ending handling to the csv module, so
# quoted fields that contain embedded newlines are parsed as a single field.
with open(file_path_input, mode='r', encoding='utf-8', newline='') as file:
    # Every line is kept as a data row (no header row is skipped); each row
    # is stored as a list of strings.
    raw_data = list(csv.reader(file))

Join all attributes (features) of each row into a single text document

In [3]:
# Split every raw row into its first column (kept as a string in cwe_list)
# and the remaining columns (the attribute texts).
cwe_list = [str(record[0]) for record in raw_data]
feature_data = [record[1:] for record in raw_data]

# One document per row: the attribute texts joined with single spaces.
docs = [" ".join(attributes) for attributes in feature_data]

Load the saved text embeddings

In [4]:
# Locations of the two saved embedding matrices (comma-separated text files).
embed_filepathname1 = "Embeddings/SBERT_raw.csv"
embed_filepathname2 = "Embeddings/SBERT_whole_doc.csv"

# Parse both files into NumPy arrays, in the same order as the paths above.
text_embeddings_sep, text_embeddings_whole = (
    np.genfromtxt(embedding_path, delimiter=',')
    for embedding_path in (embed_filepathname1, embed_filepathname2)
)

Functions for Dimension Reduction method

In [5]:
from umap import UMAP

def get_umap(n_components, n_neighbors=15, min_dist=0.0, metric='cosine', random_state=42, verbose=False):
    """Create an unfitted UMAP reducer configured with the given settings.

    All arguments are forwarded unchanged, by keyword, to the UMAP constructor.
    """
    settings = {
        'n_components': n_components,
        'n_neighbors': n_neighbors,
        'min_dist': min_dist,
        'metric': metric,
        'random_state': random_state,
        'verbose': verbose,
    }
    return UMAP(**settings)

In [6]:
from sklearn.decomposition import PCA

def get_pca(n_components=None):
    """Create an unfitted PCA reducer with a fixed random_state of 0.

    n_components is forwarded unchanged to the PCA constructor.
    """
    return PCA(n_components=n_components, random_state=0)

Functions for main Clustering algorithm

In [7]:
from hdbscan import HDBSCAN

def get_hdbscan(min_cluster_size=2, min_samples=None, metric='euclidean', cluster_selection_method='eom', prediction_data=True):
    """Create an unfitted HDBSCAN clusterer configured with the given settings.

    All arguments are forwarded unchanged, by keyword, to the HDBSCAN constructor.
    """
    settings = {
        'min_cluster_size': min_cluster_size,
        'min_samples': min_samples,
        'metric': metric,
        'cluster_selection_method': cluster_selection_method,
        'prediction_data': prediction_data,
    }
    return HDBSCAN(**settings)

In [8]:
from sklearn.cluster import KMeans

def get_kmeans(n_clusters, random_state=0):
    """Create an unfitted KMeans clusterer with n_init="auto".

    n_clusters is passed as the first positional argument and random_state
    by keyword to the KMeans constructor.
    """
    return KMeans(n_clusters, n_init="auto", random_state=random_state)

In [9]:
from sklearn.cluster import AgglomerativeClustering

def get_aggloClus(distance_threshold=0, n_clusters=None):
    """Create an unfitted AgglomerativeClustering model.

    Both arguments are forwarded unchanged, by keyword, to the constructor.
    """
    return AgglomerativeClustering(
        distance_threshold=distance_threshold,
        n_clusters=n_clusters,
    )

Setup for Topic Representation

In [10]:
from sklearn.feature_extraction import text

# Words excluded from topic representations in addition to the built-in
# English stop words.
query_words = [
    "cwe", "vulnerability", "vulnerabilities", "method", "methods",
    "technical", "impact", "security", "property", "properties",
    "severity", "likelihood", "relevant", "asset", "assets",
    "attack", "vector", "attacker", "type", "motive",
    "cyber", "controls", "countermeasure", "countermeasures", "detection",
]
stop_words_ori = text.ENGLISH_STOP_WORDS

# Built-in stop words + query words + every entry of cwe_list, then
# materialised as a list for the vectorizer.
stop_words = stop_words_ori.union(query_words).union(cwe_list)
stop_words_cust = list(stop_words)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

def get_Vectorizer(ngram_range=(1, 2), stop_words=stop_words_cust): 
    """Create an unfitted CountVectorizer.

    ngram_range and stop_words are forwarded unchanged to the constructor;
    stop_words defaults to the custom list built in this notebook.
    """
    return CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)

## Load parameters table of finalist sub-models

In [12]:
import pandas as pd
# Parameter table of the finalist sub-models (one row per sub-model).
df = pd.read_csv(filepath_or_buffer='Parameters/SBERT.csv')

In [13]:
# Placeholder columns; the main run stores each sub-model's results in them.
for result_column in ('labels', 'probs'):
    df[result_column] = None

In [14]:
# Display the parameter table, including the still-empty result columns.
df

Unnamed: 0,SN,mode,embedder,dim_red,clus,n_clusters,n_neighbors,n_components,labels,probs
0,1,SEP,SBERT,UMAP,HDBSCAN,,15.0,20,,
1,2,SEP,SBERT,UMAP,HDBSCAN,,15.0,60,,
2,3,WHOLE,SBERT,UMAP,HDBSCAN,,25.0,55,,
3,4,WHOLE,SBERT,UMAP,HDBSCAN,,30.0,35,,
4,5,WHOLE,SBERT,UMAP,K-MEANS,10.0,25.0,35,,
5,6,WHOLE,SBERT,UMAP,K-MEANS,10.0,25.0,55,,
6,7,SEP,SBERT,UMAP,K-MEANS,19.0,15.0,30,,
7,8,SEP,SBERT,UMAP,K-MEANS,19.0,20.0,45,,
8,9,WHOLE,SBERT,PCA,HDBSCAN,,,80,,
9,10,SEP,SBERT,PCA,HDBSCAN,,,50,,


## Main Run for result data and graphs generation

In [15]:
import os
import warnings

# Root folder for the generated artefacts; each sub-model gets its own
# sub-folder. Files are written via explicit paths instead of os.chdir so the
# kernel's working directory is never left changed if an iteration fails.
results_root = os.path.join('results', 'SBERT')

# Embedding matrix to feed BERTopic for each value of the 'mode' column.
embeddings_by_mode = {"SEP": text_embeddings_sep, "WHOLE": text_embeddings_whole}

# 2-D projections used only for the document plots. They depend on the
# embeddings alone (not on the sub-model), so they are computed once per mode.
doc_projections = {}


def _optional_int(value):
    """Return ``value`` as an int, or None when the parameter cell is NaN."""
    return None if np.isnan(value) else int(value)


# Loop for each sub-model
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    # read the parameters
    sn = row['SN']
    mode = row['mode']
    embedder = row['embedder']
    dim_red = row['dim_red']
    clus = row['clus']
    n_clusters = _optional_int(row['n_clusters'])
    n_neighbors = _optional_int(row['n_neighbors'])
    n_components = _optional_int(row['n_components'])
    model_str = f"{sn}-{mode}-{embedder}-{dim_red}-{clus}-{n_clusters}-{n_neighbors}-{n_components}"

    # pick the corresponding model/component; fail loudly on an unknown value
    # instead of silently reusing an object left over from a previous iteration
    if mode not in embeddings_by_mode:
        raise ValueError(f"Unknown mode {mode!r} for sub-model {model_str}")
    text_embeddings = embeddings_by_mode[mode]

    if dim_red == "PCA":
        dim_red_model = get_pca(n_components=n_components)
    elif dim_red == "UMAP":
        dim_red_model = get_umap(n_components=n_components, n_neighbors=n_neighbors)
    else:
        raise ValueError(f"Unknown dim_red {dim_red!r} for sub-model {model_str}")

    if clus == "K-MEANS":
        cluster_model = get_kmeans(n_clusters=n_clusters, random_state=0)
    elif clus == "HDBSCAN":
        cluster_model = get_hdbscan(min_cluster_size=2, min_samples=None)
    else:
        raise ValueError(f"Unknown clus {clus!r} for sub-model {model_str}")

    # BERTopic pipeline
    topic_model = BERTopic(

        # Pipeline models (embeddings are precomputed, so no embedding model)
        embedding_model=None,
        umap_model=dim_red_model,
        hdbscan_model=cluster_model,
        vectorizer_model=get_Vectorizer(ngram_range=(1, 2), stop_words=stop_words_cust),
        representation_model=None,

        # Hyperparameters
        top_n_words=20,
        verbose=False
    )

    # Train model
    topics, probs = topic_model.fit_transform(docs, text_embeddings)
    df.at[index, 'labels'] = topics
    df.at[index, 'probs'] = probs
    # The label file is written to the current working directory, one label per row
    with open(f'Labels_{model_str}.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        for item in topics:
            writer.writerow([item])

    # Create the output directory for this sub-model
    out_dir = os.path.join(results_root, model_str)
    os.makedirs(out_dir, exist_ok=True)

    # Generate topic info csv
    topic_model.get_topic_info().to_csv(os.path.join(out_dir, f'TopicInfo_{model_str}.csv'), index=False)
    # Generate barchart
    topic_model.visualize_barchart(top_n_topics=12, n_words=10, height=400, width=250).write_html(
        os.path.join(out_dir, f'Top12TopicsBarchart_{model_str}.html'))
    # Generate topics visualize graph
    topic_model.visualize_topics().write_html(os.path.join(out_dir, f'VisualizeTopic_{model_str}.html'))

    # Reset per iteration so a failure below can never leave the hierarchy of
    # a previous sub-model in place for the hierarchical documents plot.
    hierarchical_topics = None
    try:
        # Generate hierarchical topics visualize graph
        hierarchical_topics = topic_model.hierarchical_topics(docs)
        topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).write_html(
            os.path.join(out_dir, f'HierarchicalTopics_{model_str}.html'))

        # Generate hierarchical topics tree text file; written as UTF-8
        # because the tree text may contain non-ASCII drawing characters
        tree = topic_model.get_topic_tree(hierarchical_topics)
        with open(os.path.join(out_dir, f'HierarchicalTree_{model_str}.txt'), 'w', encoding='utf-8') as f:
            f.write(tree)
    except Exception as exc:
        # Best effort: keep going with the other outputs, but report the failure
        warnings.warn(f"Hierarchical topic outputs failed for {model_str}: {exc!r}")

    # Generate topics heatmap
    topic_model.visualize_heatmap().write_html(os.path.join(out_dir, f'Heatmap_{model_str}.html'))

    # Generate visualize documents graph (seeded projection, cached per mode)
    if mode not in doc_projections:
        doc_projections[mode] = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine',
                                     random_state=42).fit_transform(text_embeddings)
    reduced_embeddings = doc_projections[mode]
    topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings).write_html(
        os.path.join(out_dir, f'VisualizeDoc_{model_str}.html'))

    # Generate visualize hierarchical documents graph, only when the hierarchy
    # of THIS sub-model was computed successfully
    if hierarchical_topics is not None:
        try:
            topic_model.visualize_hierarchical_documents(
                docs, hierarchical_topics, reduced_embeddings=reduced_embeddings
            ).write_html(os.path.join(out_dir, f'VisualizeHierDoc_{model_str}.html'))
        except Exception as exc:
            warnings.warn(f"Hierarchical documents plot failed for {model_str}: {exc!r}")

# Save the dataframe with additional labels and probabilities list
df.to_csv('resultevals_updated.csv', index=False)

  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/54 [00:00<?, ?it/s][A
 35%|███▌      | 19/54 [00:00<00:00, 181.76it/s][A
100%|██████████| 54/54 [00:00<00:00, 189.31it/s][A
  8%|▊         | 1/12 [00:14<02:44, 14.91s/it]
  0%|          | 0/51 [00:00<?, ?it/s][A
 39%|███▉      | 20/51 [00:00<00:00, 193.18it/s][A
100%|██████████| 51/51 [00:00<00:00, 193.49it/s][A
 17%|█▋        | 2/12 [00:23<01:53, 11.35s/it]
  0%|          | 0/54 [00:00<?, ?it/s][A
 35%|███▌      | 19/54 [00:00<00:00, 189.00it/s][A
100%|██████████| 54/54 [00:00<00:00, 158.53it/s][A
 25%|██▌       | 3/12 [00:32<01:30, 10.01s/it]
  0%|          | 0/56 [00:00<?, ?it/s][A
 34%|███▍      | 19/56 [00:00<00:00, 189.00it/s][A
100%|██████████| 56/56 [00:00<00:00, 195.66it/s][A
 33%|███▎      | 4/12 [00:39<01:12,  9.09s/it]
100%|██████████| 9/9 [00:00<00:00, 169.78it/s]
 42%|████▏     | 5/12 [00:46<00:57,  8.17s/it]
100%|██████████| 9/9 [00:00<00:00, 169.77it/s]
 50%|█████     | 6/12 [00:52<00:45,  7.51s/it]
1