## SIMILARITY MEASURE
The measure is a weighted average of similarity scores that takes into account both the topological features of the graphs and the content of the node labels.

**Similarity scores** compare every node of G1 with every node of G2 and assign a score based on:
- type of nodes (need to define some rules)
- content of the labels (contextual similarity using an SBERT model)
- distance from closest start/end nodes

In [1]:
import os
import random 

from similarity_utils import *
from parser_with_lane import get_edge_df_from_bpmn

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the SentenceTransformer model that is later handed to the similarity
# functions (it is the `model` argument of check_file_similarity).
# NOTE(review): SentenceTransformer is not imported explicitly in this notebook;
# presumably it arrives through `from similarity_utils import *` - confirm.
# Alternative checkpoint kept for reference:
# model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')  # or all-mpnet-base-v2
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")



In [3]:
def check_file_similarity(file1, file2, model, verbose=False):
    """Return the similarity measure between two BPMN files.

    Both files are parsed into edge dataframes with ``get_edge_df_from_bpmn``,
    each dataframe is turned into a graph with ``obtain_graph``, and the two
    graphs are compared through ``get_similarity_matrix`` followed by
    ``get_similarity_measure``.

    Args:
        file1: Path of the first BPMN file.
        file2: Path of the second BPMN file.
        model: Model object forwarded unchanged to ``get_similarity_matrix``.
        verbose: When true, print the node and edge counts of both graphs.

    Returns:
        The value produced by ``get_similarity_measure`` for the last element
        returned by ``get_similarity_matrix``.
    """
    # Parse both files first, then build both graphs (same order as before).
    edge_frames = [get_edge_df_from_bpmn(path) for path in (file1, file2)]
    first_graph, second_graph = (obtain_graph(frame) for frame in edge_frames)

    if verbose:
        for index, graph in enumerate((first_graph, second_graph), start=1):
            print(f"Graph {index} has:", graph.number_of_nodes(), "nodes and", graph.number_of_edges(), "edges")

    # Only the fifth returned value is used here; the first four are discarded.
    _, _, _, _, node_scores = get_similarity_matrix(first_graph, second_graph, model)
    return get_similarity_measure(node_scores)

In [4]:
# Load the files of a random subset of the folders found under the base path
def load_files_from_folders(base_path, subset_size=4, seed=None):
    """Collect the file paths of a random subset of sub-folders of ``base_path``.

    Args:
        base_path: Directory whose immediate sub-directories are candidates.
        subset_size: Maximum number of sub-directories to select. When fewer
            sub-directories exist, all of them are selected.
        seed: Optional seed for the selection. With ``None`` (the default) the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call is still honoured; with any other value
            a dedicated ``random.Random(seed)`` makes the selection repeatable.

    Returns:
        dict mapping each selected folder name to the sorted list of paths of
        the regular files directly inside it (sub-directories are skipped).
    """
    # os.listdir returns entries in arbitrary order; sorting makes a seeded
    # selection independent of the filesystem's listing order.
    all_folders = sorted(
        name
        for name in os.listdir(base_path)
        if os.path.isdir(os.path.join(base_path, name))
    )

    # Pick at most `subset_size` folders without replacement.
    rng = random if seed is None else random.Random(seed)
    selected_folders = rng.sample(all_folders, min(subset_size, len(all_folders)))

    folders = {}
    for folder_name in selected_folders:
        folder_path = os.path.join(base_path, folder_name)
        candidates = (
            os.path.join(folder_path, entry)
            for entry in sorted(os.listdir(folder_path))
        )
        # Keep regular files only.
        folders[folder_name] = [path for path in candidates if os.path.isfile(path)]

    return folders

In [17]:
# Build the base path with os.path.join so it resolves on every OS: the literal
# "dati\\bpmn" is a single directory name outside Windows.
folders = load_files_from_folders(os.path.join("dati", "bpmn"), subset_size=4)
# Flatten the per-folder lists and keep only the paths ending in '.xml'.
all_files = [file for folder in folders.values() for file in folder if file.endswith('.xml')]

In [18]:
all_files

['dati\\bpmn\\M_j01\\0.bpmn2.xml',
 'dati\\bpmn\\M_j01\\1.bpmn2.xml',
 'dati\\bpmn\\M_j01\\2.bpmn2.xml',
 'dati\\bpmn\\M_j01\\3.bpmn2.xml',
 'dati\\bpmn\\M_j01\\4.bpmn2.xml',
 'dati\\bpmn\\M_j01\\5.bpmn2.xml',
 'dati\\bpmn\\M_j01\\6.bpmn2.xml',
 'dati\\bpmn\\M_j01\\7.bpmn2.xml',
 'dati\\bpmn\\M_j01\\8.bpmn2.xml',
 'dati\\bpmn\\M_j01\\9.bpmn2.xml',
 'dati\\bpmn\\R_j04\\0.bpmn2.xml',
 'dati\\bpmn\\R_j04\\1.bpmn2.xml',
 'dati\\bpmn\\R_j04\\2.bpmn2.xml',
 'dati\\bpmn\\R_j04\\3.bpmn2.xml',
 'dati\\bpmn\\R_j04\\4.bpmn2.xml',
 'dati\\bpmn\\R_j04\\5.bpmn2.xml',
 'dati\\bpmn\\R_j04\\6.bpmn2.xml',
 'dati\\bpmn\\R_j04\\7.bpmn2.xml',
 'dati\\bpmn\\R_g01\\0.bpmn2.xml',
 'dati\\bpmn\\R_g01\\1.bpmn2.xml',
 'dati\\bpmn\\R_g01\\10.bpmn2.xml',
 'dati\\bpmn\\R_g01\\11.bpmn2.xml',
 'dati\\bpmn\\R_g01\\2.bpmn2.xml',
 'dati\\bpmn\\R_g01\\3.bpmn2.xml',
 'dati\\bpmn\\R_g01\\4.bpmn2.xml',
 'dati\\bpmn\\R_g01\\5.bpmn2.xml',
 'dati\\bpmn\\R_g01\\6.bpmn2.xml',
 'dati\\bpmn\\R_g01\\7.bpmn2.xml',
 'dati\\bpmn\\R_g0

In [7]:
from joblib import Parallel, delayed

In [19]:
# def compute_similarity_matrix(all_files, model, n_jobs=-1):
#     n = len(all_files)
#     similarity_matrix = np.zeros((n, n))

#     Parallel computation of the upper triangular matrix
#     def compute_similarity(i, j):
#         similarity = check_file_similarity(all_files[i], all_files[j], model)
#         similarity_matrix[i, j] = similarity
#         similarity_matrix[j, i] = similarity  # Exploit symmetry

#     Parallel(n_jobs=n_jobs)(
#         delayed(compute_similarity)(i, j) for i in range(n) for j in range(i, n)
#     )
#     return similarity_matrix

def compute_similarity_matrix(all_files, model):
    """Build the symmetric file-to-file similarity matrix.

    Args:
        all_files: Sequence of file paths to compare pairwise.
        model: Model object forwarded to ``check_file_similarity``.

    Returns:
        A square NumPy array with one row and one column per file, where
        entry ``[i, j]`` is the similarity of ``all_files[i]`` and
        ``all_files[j]``.
    """
    size = len(all_files)
    scores = np.zeros((size, size))

    for row, left in enumerate(all_files):
        # Visit the diagonal and the upper triangle only; each score is
        # mirrored into the lower triangle.
        for col in range(row, size):
            right = all_files[col]
            print(f"Computing similarity between {left} and {right}")
            score = check_file_similarity(left, right, model)
            scores[row, col] = scores[col, row] = score

    return scores

In [20]:
# Compute the pairwise similarity of every selected file; the result is a
# square matrix with one row/column per entry of all_files. A progress line
# is printed for each pair.
similarity_matrix = compute_similarity_matrix(all_files, model)

Computing similarity between dati\bpmn\M_j01\0.bpmn2.xml and dati\bpmn\M_j01\0.bpmn2.xml
Detected 1 processes in the BPMN file
Detected 1 processes in the BPMN file
Computing similarity between dati\bpmn\M_j01\0.bpmn2.xml and dati\bpmn\M_j01\1.bpmn2.xml
Detected 1 processes in the BPMN file
Detected 1 processes in the BPMN file


  shortest_path_distance = 1 - (diff / max_val)


Computing similarity between dati\bpmn\M_j01\0.bpmn2.xml and dati\bpmn\M_j01\2.bpmn2.xml
Detected 1 processes in the BPMN file
Detected 2 processes in the BPMN file
Computing similarity between dati\bpmn\M_j01\0.bpmn2.xml and dati\bpmn\M_j01\3.bpmn2.xml
Detected 1 processes in the BPMN file
Detected 1 processes in the BPMN file
Computing similarity between dati\bpmn\M_j01\0.bpmn2.xml and dati\bpmn\M_j01\4.bpmn2.xml
Detected 1 processes in the BPMN file
Detected 1 processes in the BPMN file
Computing similarity between dati\bpmn\M_j01\0.bpmn2.xml and dati\bpmn\M_j01\5.bpmn2.xml
Detected 1 processes in the BPMN file
Detected 1 processes in the BPMN file
Computing similarity between dati\bpmn\M_j01\0.bpmn2.xml and dati\bpmn\M_j01\6.bpmn2.xml
Detected 1 processes in the BPMN file
Detected 1 processes in the BPMN file
Computing similarity between dati\bpmn\M_j01\0.bpmn2.xml and dati\bpmn\M_j01\7.bpmn2.xml
Detected 1 processes in the BPMN file
Detected 1 processes in the BPMN file
Computing 

TypeError: 'float' object is not subscriptable

In [None]:
# Plot the pairwise similarity matrix as an annotated heatmap
import seaborn as sns
import matplotlib.pyplot as plt

# Use an explicit figure/axes pair so the heatmap is drawn on a known target
# instead of whatever figure happens to be current in the pyplot state.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(similarity_matrix, annot=True, xticklabels=all_files, yticklabels=all_files, cmap='YlGn', ax=ax)
# Title the figure so it is understandable when the notebook is skimmed.
ax.set_title("Pairwise BPMN file similarity")
plt.show()
