# In [None]:  (Jupyter cell marker left over from the notebook export)
import json
import networkx as nx
import matplotlib.pyplot as plt
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import string
import pandas as pd
import spacy
import os

# Load the spaCy English pipeline once at import time; visualize_graph uses it
# to pull verbs out of node descriptions.
nlp = spacy.load("en_core_web_sm")

# Load the BERT tokenizer and model once at import time; get_embeddings and
# json_to_text share these module-level instances.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Create the directory that receives the generated output files
# (exist_ok=True keeps re-runs from failing when it already exists).
output_dir = 'output_files'
os.makedirs(output_dir, exist_ok=True)

# Default similarity threshold; main() overwrites this global with the value
# entered by the user.
threshold = 0.8

def preprocess_text(text):
    """Return *text* lowercased with all ASCII punctuation characters removed."""
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    return lowered.translate(strip_punctuation)

def get_embeddings(text):
    """Embed *text* with the module-level BERT tokenizer and model.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the model with gradient tracking disabled, and the last hidden state is
    averaged over dimension 1.

    Returns:
        The averaged last hidden state converted to a NumPy array.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )
    with torch.no_grad():
        model_output = model(**encoded)
    # Mean over dimension 1 of the last hidden state.
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.numpy()

def save_tokenized_texts_combined(tokenized_texts, file_label):
    """Write tokenized texts to ``combined_tokenized_output_<file_label>.txt``.

    The file is created inside the module-level ``output_dir``; every token
    sequence becomes one line of space-separated tokens.

    Args:
        tokenized_texts: Iterable of token sequences (each an iterable of str).
        file_label: Suffix used in the output file name.
    """
    out_path = os.path.join(output_dir, f"combined_tokenized_output_{file_label}.txt")
    # Explicit UTF-8: tokens may contain non-ASCII characters, and relying on
    # the platform default encoding can raise UnicodeEncodeError.
    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(" ".join(tokens) + "\n" for tokens in tokenized_texts)

def save_embeddings_combined(embeddings, file_label):
    """Stack the embeddings row-wise and store them in a single ``.npy`` file.

    The file is written as ``combined_embeddings_<file_label>.npy`` inside the
    module-level ``output_dir``.
    """
    stacked = np.vstack(embeddings)
    target_path = os.path.join(output_dir, f"combined_embeddings_{file_label}.npy")
    np.save(target_path, stacked)

def json_to_text(data, file_label):
    """Convert JSON entries to text, tokens and embeddings for analysis.

    Every entry in *data* is modified in place: its 'id' key is removed, a
    'file' key holding *file_label* is added, and a 'node_or_edge' key is set
    to "edge" when the entry has both 'source_ref' and 'target_ref', otherwise
    to "node".

    Args:
        data: Iterable of dict entries (mutated in place).
        file_label: Label identifying the source file; stored under 'file'.

    Returns:
        A 6-tuple ``(texts, embeddings, tokenized_texts, json_objects,
        original_ids, labels)``:
        - texts: preprocessed text of each entry,
        - embeddings: ``get_embeddings`` result for each entry,
        - tokenized_texts: tokenizer output for each entry,
        - json_objects: dict mapping each text to a dict with the keys
          'entry', 'first_key_value' and 'second_key_value',
        - original_ids: dict mapping each removed 'id' to its entry,
        - labels: one label per entry built from its first two key-value
          pairs.
    """
    def format_item(key, value):
        # Lists are flattened into a comma-separated string.
        if isinstance(value, list):
            return f"{key}: " + ", ".join(map(str, value))
        return f"{key}: {value}"

    texts = []
    embeddings = []
    tokenized_texts = []
    json_objects = {}
    original_ids = {}
    labels = []
    for i, entry in enumerate(data):
        original_id = entry.pop('id', None)  # Remove 'id' key and store it
        # Compare against None so falsy but valid ids (0, "") are still mapped.
        if original_id is not None:
            original_ids[original_id] = entry  # Map original 'id' to entry

        entry['file'] = file_label  # Add 'file' key
        entry_items = list(entry.items())  # Get all key-value pairs
        first_key_value = entry_items[0] if len(entry_items) > 0 else ("None", "None")
        second_key_value = entry_items[1] if len(entry_items) > 1 else ("None", "None")
        labels.append(f"{first_key_value[0]}: {first_key_value[1]} ({file_label}), {second_key_value[0]}: {second_key_value[1]} ({file_label})")
        text = ". ".join(format_item(key, value) for key, value in entry.items())
        text = preprocess_text(text)
        texts.append(text)

        # Tokenization
        tokenized_text = tokenizer.tokenize(text)
        print(f"Tokenized {file_label} object {i + 1}: {tokenized_text}")
        tokenized_texts.append(tokenized_text)

        # Get and collect embeddings
        embedding = get_embeddings(text)
        print(f"Embedding shape for {file_label} object {i + 1}: {embedding.shape}")
        embeddings.append(embedding)

        # Determine node_or_edge (set after the text was built, so it is not
        # part of the text)
        if 'source_ref' in entry and 'target_ref' in entry:
            entry['node_or_edge'] = "edge"
        else:
            entry['node_or_edge'] = "node"

        # NOTE(review): keyed by text, so two entries that produce the same
        # text overwrite each other here while both stay in `texts`.
        json_objects[text] = {'entry': entry, 'first_key_value': first_key_value, 'second_key_value': second_key_value}
    return texts, embeddings, tokenized_texts, json_objects, original_ids, labels


def load_json(file_path):
    """Load JSON data from a file, reporting failures instead of raising.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed JSON value, or None when the file is missing, is not valid
        UTF-8, or does not contain valid JSON. A message describing the
        failure is printed in each of those cases.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = file.read()
    except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
        return None
    except UnicodeDecodeError as e:
        print(f"Error: File is not valid UTF-8 - {file_path}")
        print(f"Error details: {e}")
        return None

    try:
        return json.loads(data)
    except json.JSONDecodeError as e:
        print(f"Error: Failed to decode JSON from file - {file_path}")
        print(f"Error details: {e}")
        print(f"Faulty JSON content:\n{data}")
        return None

def assign_encode_ids(texts1, embeddings1, texts2, embeddings2, json_objects1, json_objects2, original_ids1, original_ids2, threshold):
    """Assign a shared 'encodeID' to similar objects from the two datasets.

    Cosine similarity is computed between all combined embeddings. Two texts
    receive the same new encodeID when their similarity is above *threshold*
    and their entries carry different 'file' values. Texts without any match
    get IDs counted up from 10000. Afterwards the 'source_ref'/'target_ref'
    values of edge entries are replaced by the encodeID of the entry that
    originally had that 'id' (left unchanged when the id is unknown).

    Args:
        texts1, texts2: Texts of the first/second dataset.
        embeddings1, embeddings2: Lists of embeddings aligned with the texts.
        json_objects1, json_objects2: Dicts mapping text to a dict with the
            keys 'entry', 'first_key_value' and 'second_key_value'.
        original_ids1, original_ids2: Dicts mapping original 'id' to entry.
        threshold: Similarity must be strictly greater than this to match.

    Returns:
        A 4-tuple ``(json_objects1, json_objects2, comparison_matrix,
        combined_embeddings)``. The returned json_objects dicts map text
        directly to the entry dict (no 'entry' wrapper), selected by
        'file' == 'json1' / 'json2'. comparison_matrix holds the pairwise
        similarities with a zero diagonal.
    """
    combined_texts = texts1 + texts2
    combined_embeddings = np.vstack(embeddings1 + embeddings2)
    combined_json_objects = {**json_objects1, **json_objects2}

    # Diagnostic pass only: prints objects that lack one of the expected keys.
    # It does not modify or filter anything.
    for text in combined_texts:
        if text not in combined_json_objects:
            continue
        if 'first_key_value' not in combined_json_objects[text] or 'second_key_value' not in combined_json_objects[text] or 'entry' not in combined_json_objects[text]:
            print(f"Missing 'first_key_value', 'second_key_value' or 'entry' in combined_json_objects for text: {text}")
            print(combined_json_objects[text])
            continue

    # Compute similarity matrix for combined texts
    similarity_matrix = cosine_similarity(combined_embeddings)
    encode_id = 1
    # NOTE(review): unmatched IDs start at 10000; matched IDs counted up from 1
    # would overlap with them if that many were ever handed out - confirm the
    # expected data size.
    unmatched_encode_id = 10000
    encode_id_mapping = {}
    assigned_encode_ids = set()
    max_similarity = {}

    # Create comparison matrix and identify matches
    num_texts = len(combined_texts)
    comparison_matrix = np.zeros((num_texts, num_texts))
    for i in range(num_texts):
        for j in range(num_texts):
            if i != j:
                similarity = similarity_matrix[i, j]
                comparison_matrix[i, j] = similarity
                # Only pairs coming from different files may be matched.
                if similarity > threshold and combined_json_objects[combined_texts[i]]['entry']['file'] != combined_json_objects[combined_texts[j]]['entry']['file']:
                    # (Re)assign when text i has no ID yet or this pair beats
                    # the best similarity recorded for text i so far. A
                    # previous partner of i or j keeps the ID it was given.
                    if combined_texts[i] not in assigned_encode_ids or similarity > max_similarity.get(combined_texts[i], 0):
                        encode_id_mapping[combined_texts[i]] = encode_id
                        encode_id_mapping[combined_texts[j]] = encode_id
                        assigned_encode_ids.add(combined_texts[i])
                        assigned_encode_ids.add(combined_texts[j])
                        max_similarity[combined_texts[i]] = similarity
                        max_similarity[combined_texts[j]] = similarity
                        encode_id += 1

    # Every text that found no cross-file match gets its own ID.
    for text in combined_texts:
        if text not in encode_id_mapping:
            encode_id_mapping[text] = unmatched_encode_id
            unmatched_encode_id += 1

    # Store the ID on the entry itself.
    for text, obj in combined_json_objects.items():
        if text in encode_id_mapping:
            obj['entry']['encodeID'] = encode_id_mapping[text]

    # Split back per file and drop the wrapper dict. The labels 'json1' and
    # 'json2' are hard-coded here and must match the file labels that were
    # written into the entries.
    json_objects1 = {text: obj['entry'] for text, obj in combined_json_objects.items() if obj['entry']['file'] == 'json1'}
    json_objects2 = {text: obj['entry'] for text, obj in combined_json_objects.items() if obj['entry']['file'] == 'json2'}

    # Create a dictionary to map original IDs to encodeIDs
    # NOTE(review): raises KeyError if an entry in original_ids never received
    # an 'encodeID' above (e.g. its text was overwritten by an identical text
    # in the json_objects dict) - verify against the input data.
    id_to_encodeID = {original_id: obj['encodeID'] for original_id, obj in original_ids1.items()}
    id_to_encodeID.update({original_id: obj['encodeID'] for original_id, obj in original_ids2.items()})

    # Update source_ref and target_ref in relationship objects
    for obj in json_objects1.values():
        if obj['node_or_edge'] == 'edge':
            obj['source_ref'] = id_to_encodeID.get(obj['source_ref'], obj['source_ref'])
            obj['target_ref'] = id_to_encodeID.get(obj['target_ref'], obj['target_ref'])

    for obj in json_objects2.values():
        if obj['node_or_edge'] == 'edge':
            obj['source_ref'] = id_to_encodeID.get(obj['source_ref'], obj['source_ref'])
            obj['target_ref'] = id_to_encodeID.get(obj['target_ref'], obj['target_ref'])

    return json_objects1, json_objects2, comparison_matrix, combined_embeddings

def identify_node_or_edge_and_add_key(json_objects):
    """Tag every object as "node" or "edge" under the 'node_or_edge' key.

    An object is tagged "edge" when at least two of its distinct values are
    encodeIDs that occur in *json_objects* (its own 'encodeID' value counts
    toward that total); otherwise it is tagged "node".

    Args:
        json_objects: Dict mapping a key to an entry dict. Every entry must
            have an 'encodeID'. Entries are modified in place.

    Returns:
        The same *json_objects* dict.
    """
    def _is_hashable(value):
        try:
            hash(value)
        except TypeError:
            return False
        return True

    encode_ids = {entry['encodeID'] for entry in json_objects.values()}

    for obj in json_objects.values():
        # Lists and dicts coming from JSON are unhashable: they cannot be
        # members of encode_ids, so leave them out instead of letting set
        # construction raise TypeError.
        values = {value for value in obj.values() if _is_hashable(value)}
        obj['node_or_edge'] = "edge" if len(values & encode_ids) >= 2 else "node"

    return json_objects

def save_json(data, file_path):
    """Serialize *data* as JSON with a 4-space indent into *file_path*.

    An existing file at *file_path* is overwritten.
    """
    with open(file_path, 'w') as out_file:
        json.dump(data, out_file, indent=4)


def build_graph(json_objects, suffix, color):
    """Build an undirected graph from normalized JSON objects.

    Objects whose 'node_or_edge' is not "edge" become nodes (except those
    named 'Unknown Name'); objects tagged "edge" become edges between the
    nodes referenced by 'source_ref' and 'target_ref'.

    Args:
        json_objects: Dict mapping a key to an entry dict; every entry needs a
            'node_or_edge' key.
        suffix: String appended to every encodeID to form the node id.
        color: Color stored on the nodes and on newly added edges.

    Returns:
        nx.Graph whose nodes carry 'label', 'color', 'type' and 'description'
        attributes and whose edges carry 'style', 'color' and 'label'.
    """
    G = nx.Graph()

    # Add nodes, ensuring unique IDs
    for obj in json_objects.values():
        if obj['node_or_edge'] == 'edge':
            continue  # Only entities become nodes
        if obj.get('name') == 'Unknown Name':
            continue  # Avoid adding unknown name nodes
        unique_id = str(obj.get('encodeID')) + suffix
        G.add_node(
            unique_id,
            label=f"{obj.get('name', 'Unknown Entity')} ({color})",
            color=color,
            # visualize_graph reads the 'type' node attribute for its labels.
            type=obj.get('type', 'Unknown'),
            description=obj.get('description', 'No description provided'),
        )

    # Add edges, using unique IDs
    for obj in json_objects.values():
        if obj['node_or_edge'] != 'edge':
            continue
        source = str(obj.get('source_ref')) + suffix
        target = str(obj.get('target_ref')) + suffix
        # Both endpoints must be known nodes, and self-loops are skipped.
        if source == target or source not in G or target not in G:
            continue
        # The graph is undirected, so ask the graph itself: this also catches
        # a duplicate given in reverse direction, which would otherwise
        # overwrite the attributes of the existing edge.
        if G.has_edge(source, target):
            G[source][target]['style'] = 'solid'
            G[source][target]['color'] = 'black'
        else:
            G.add_edge(source, target, style='dotted', color=color, label=obj.get('relationship_type', 'similar-to'))
    return G

def visualize_graph(G):
    """Draw a networkx graph with boxed multi-line node labels.

    Nodes are colored by their 'color' attribute, edges are drawn with their
    'style' and 'color' attributes, and edge 'label' attributes are shown.
    Every node label consists of the node's 'label', its 'type' in
    parentheses and up to two verbs that spaCy finds in its 'description'.

    Args:
        G: The graph to draw. The figure is shown with ``plt.show()``.
    """
    num_nodes = len(G.nodes)

    plt.figure(figsize=get_optimal_figsize(num_nodes))  # Dynamically set figure size
    pos = nx.spring_layout(G, k=get_optimal_k(num_nodes))  # Dynamically set layout parameter

    # Draw nodes with their corresponding colors
    node_colors = [data.get('color', 'grey') for _, data in G.nodes(data=True)]
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=300)

    # Draw edges with specified styles and colors
    for u, v, data in G.edges(data=True):
        # .get() instead of data['color']: the fallback is evaluated even when
        # 'style' is present, so a missing 'color' must not raise KeyError.
        fallback_style = 'dotted' if data.get('color') == 'gray' else 'solid'
        nx.draw_networkx_edges(
            G, pos, edgelist=[(u, v)],
            style=data.get('style', fallback_style),
            edge_color=data.get('color', 'black'),
            width=2
        )

    # Draw edge labels
    edge_labels = {(u, v): data.get('label', 'similar-to') for u, v, data in G.edges(data=True)}
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)

    # Draw labels for nodes
    placeholder = 'No description provided'
    for node, data in G.nodes(data=True):
        label = f"{data.get('label', 'Unknown')}\n({data.get('type', 'Unknown')})"
        description = data.get('description', placeholder)
        # Skip the placeholder text: analysing it would add a verb to the
        # label that has nothing to do with the node.
        if description and description != placeholder:
            verbs = [token.text for token in nlp(description) if token.pos_ == 'VERB']
            if verbs:
                label += f"\n{' '.join(verbs[:2])}"
        x, y = pos[node]
        plt.text(x, y, label, fontsize=9, ha='center', va='center',
                 bbox=dict(boxstyle="round,pad=0.5", facecolor='white', edgecolor='gray', alpha=0.6))

    plt.title('Graph Visualization')
    plt.axis('off')
    plt.show()

def get_optimal_figsize(num_nodes):
    """Return a square ``(width, height)`` figure size that grows with the node count."""
    base_size = 10  # Size of the figure for an empty graph
    scale_factor = 0.5  # Growth per node
    side = base_size + num_nodes * scale_factor
    return (side, side)

def get_optimal_k(num_nodes):
    """Return the ``k`` value for ``spring_layout``, growing linearly with the node count."""
    base_k, scale_factor = 0.5, 0.1  # Value for an empty graph, growth per node
    return base_k + num_nodes * scale_factor

def calculate_distances(json_objects1, json_objects2):
    """Print node, key, value and graph distances between two datasets.

    For each category a "simple" distance (difference of the set sizes) and a
    Jaccard distance (1 - |intersection| / |union|) are printed.

    Args:
        json_objects1: Dict mapping text to entry dict for the first dataset.
        json_objects2: Dict mapping text to entry dict for the second dataset.

    Returns:
        None; the results are only printed.
    """
    def jaccard_distance(set1, set2):
        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))
        return 1 - intersection / union if union != 0 else 1

    def set_distance(set1, set2):
        return abs(len(set1) - len(set2))

    def object_ids(json_objects):
        # Identify objects by encodeID (as the Excel summary does). The dict
        # keys are texts that contain the per-file 'file' value, so keys of the
        # two datasets can never be equal. Fall back to the key when an entry
        # has no encodeID.
        return {obj.get('encodeID', key) for key, obj in json_objects.items()}

    # Node distance
    nodes1 = object_ids(json_objects1)
    nodes2 = object_ids(json_objects2)
    node_distance = set_distance(nodes1, nodes2)
    node_jaccard = jaccard_distance(nodes1, nodes2)

    # Key distance
    keys1 = {k for obj in json_objects1.values() for k in obj}
    keys2 = {k for obj in json_objects2.values() for k in obj}
    key_distance = set_distance(keys1, keys2)
    key_jaccard = jaccard_distance(keys1, keys2)

    # Value distance (string values only)
    values1 = {v for obj in json_objects1.values() for v in obj.values() if isinstance(v, str)}
    values2 = {v for obj in json_objects2.values() for v in obj.values() if isinstance(v, str)}
    value_distance = set_distance(values1, values2)
    value_jaccard = jaccard_distance(values1, values2)

    # Graph distance (using node and edge comparison). Both graphs are built
    # with the same (empty) suffix so that equal encodeIDs yield equal node
    # ids; with different suffixes the edge sets could never overlap. The node
    # and edge counts do not depend on the suffix.
    G1 = build_graph(json_objects1, '', 'blue')
    G2 = build_graph(json_objects2, '', 'red')
    graph_distance = abs(G1.number_of_nodes() - G2.number_of_nodes()) + abs(G1.number_of_edges() - G2.number_of_edges())
    # frozenset makes (u, v) and (v, u) the same undirected edge.
    edges1 = {frozenset(edge) for edge in G1.edges()}
    edges2 = {frozenset(edge) for edge in G2.edges()}
    graph_jaccard = jaccard_distance(edges1, edges2)

    print(f"Node Distance (Simple): {node_distance}")
    print(f"Node Distance (Jaccard): {node_jaccard:.2f}")
    print(f"Key Distance (Simple): {key_distance}")
    print(f"Key Distance (Jaccard): {key_jaccard:.2f}")
    print(f"Value Distance (Simple): {value_distance}")
    print(f"Value Distance (Jaccard): {value_jaccard:.2f}")
    print(f"Graph Distance (Simple): {graph_distance}")
    print(f"Graph Distance (Jaccard): {graph_jaccard:.2f}")

def create_comparison_matrix(texts1, embeddings1, texts2, embeddings2, json_objects1, json_objects2, threshold):
    """Create a binary match matrix between the texts of two datasets.

    Args:
        texts1, texts2: Texts of the first/second dataset.
        embeddings1, embeddings2: Lists of embeddings aligned with the texts.
        json_objects1, json_objects2: Dicts mapping text to entry dict; every
            entry needs a 'file' key.
        threshold: A pair counts as a match (1) when its cosine similarity is
            strictly greater than this value, otherwise 0.

    Returns:
        A 4-tuple ``(comparison_df, similarity_matrix, label_to_text1,
        label_to_text2)``. comparison_df has one row per text in *texts1* and
        one column per text in *texts2*; similarity_matrix is the full cosine
        similarity matrix of all combined embeddings; the label_to_text dicts
        map every row/column label back to its text.
    """
    def build_labels(texts, json_objects):
        """Return the labels for *texts* (same order) and a label -> text map."""
        labels = []
        label_to_text = {}
        # Follow the order of `texts`, which is the order of the matrix
        # rows/columns, instead of relying on the dict's iteration order.
        for text in texts:
            obj = json_objects[text]
            # Dynamically get the first and second key-value pairs
            entry_items = list(obj.items())
            first_key_value = entry_items[0] if len(entry_items) > 0 else ("None", "None")
            second_key_value = entry_items[1] if len(entry_items) > 1 else ("None", "None")
            label = f"{first_key_value[0]}: {first_key_value[1]} ({obj['file']}), {second_key_value[0]}: {second_key_value[1]} ({obj['file']})"
            labels.append(label)
            label_to_text[label] = text
        return labels, label_to_text

    combined_embeddings = np.vstack(embeddings1 + embeddings2)

    # Compute similarity matrix for combined texts
    similarity_matrix = cosine_similarity(combined_embeddings)

    # Only the json1-vs-json2 block of the matrix is compared.
    num_texts1 = len(texts1)
    num_texts2 = len(texts2)
    cross_block = similarity_matrix[:num_texts1, num_texts1:num_texts1 + num_texts2]
    comparison_data = [
        [1 if similarity > threshold else 0 for similarity in row]
        for row in cross_block
    ]

    labels1, label_to_text1 = build_labels(texts1, json_objects1)
    labels2, label_to_text2 = build_labels(texts2, json_objects2)

    comparison_df = pd.DataFrame(comparison_data, index=labels1, columns=labels2)

    return comparison_df, similarity_matrix, label_to_text1, label_to_text2

def save_comparison_matrix_to_excel(comparison_df, similarity_matrix, file_path, labels1, labels2):
    """Write the match matrix and the similarity values to one Excel workbook.

    The workbook is created at *file_path* inside the module-level
    ``output_dir``. Sheet 'Comparison Matrix' holds *comparison_df*; sheet
    'Similarity Values' holds the block of *similarity_matrix* whose rows are
    the first ``len(labels1)`` entries and whose columns are the following
    ``len(labels2)`` entries.
    """
    row_count = len(labels1)
    col_end = row_count + len(labels2)
    workbook_path = os.path.join(output_dir, file_path)

    with pd.ExcelWriter(workbook_path) as writer:
        comparison_df.to_excel(writer, sheet_name='Comparison Matrix')

        # Cut out the block that compares the first dataset with the second.
        cross_block = similarity_matrix[:row_count, row_count:col_end]
        cross_df = pd.DataFrame(cross_block, index=labels1, columns=labels2)
        cross_df.to_excel(writer, sheet_name='Similarity Values')

def save_embeddings_to_excel(embeddings1, embeddings2, labels1, labels2, file_path, threshold):
    """Write the cross-dataset similarity and best-match matrices to Excel.

    Every embedding is averaged over axis 0, cosine similarity is computed
    between the two sets, and a cell of the match matrix is set to 1 when it
    is the maximum of its row or of its column and exceeds *threshold*. Both
    matrices are written to *file_path* inside the module-level
    ``output_dir`` (sheets 'Similarity Matrix' and 'Match Matrix').
    """
    # Average each embedding over its first axis, separately per dataset
    pooled1 = np.array([emb.mean(axis=0) for emb in embeddings1])
    pooled2 = np.array([emb.mean(axis=0) for emb in embeddings2])

    # Similarity of the first dataset (rows) against the second (columns)
    similarity_matrix = cosine_similarity(pooled1, pooled2)

    match_matrix = np.zeros_like(similarity_matrix)
    n_rows, n_cols = similarity_matrix.shape

    # Best column per row
    for row in range(n_rows):
        best_col = np.argmax(similarity_matrix[row])
        if similarity_matrix[row, best_col] > threshold:
            match_matrix[row, best_col] = 1

    # Best row per column
    for col in range(n_cols):
        best_row = np.argmax(similarity_matrix[:, col])
        if similarity_matrix[best_row, col] > threshold:
            match_matrix[best_row, col] = 1

    similarity_df = pd.DataFrame(similarity_matrix, index=labels1, columns=labels2)
    match_df = pd.DataFrame(match_matrix, index=labels1, columns=labels2)

    workbook_path = os.path.join(output_dir, file_path)
    with pd.ExcelWriter(workbook_path) as writer:
        similarity_df.to_excel(writer, sheet_name='Similarity Matrix')
        match_df.to_excel(writer, sheet_name='Match Matrix')

    print(f"Embeddings and similarity matrices saved to '{file_path}'.")

def save_jaccard_distances_to_excel(similarity_matrix, labels1, labels2, json_objects1, json_objects2, label_to_text1, label_to_text2, file_path, G1, G2):
    """
    Save per-pair distances and set-based summaries to an Excel file.

    The workbook gets the sheets 'Cosine Similarity', 'Nodes Details',
    'Edges Details', 'Jaccard Distances' and 'Summary'. The per-pair
    'Jaccard Distance' column is computed as 1 - cosine similarity (the
    union is fixed to 1).

    Args:
        similarity_matrix (np.ndarray): Either the JSON1-vs-JSON2 similarity
            block of shape (len(labels1), len(labels2)), or the full square
            similarity matrix of the combined embeddings, from which that
            block is cut out.
        labels1 (list): Labels of the texts/embeddings from JSON 1.
        labels2 (list): Labels of the texts/embeddings from JSON 2.
        json_objects1 (dict): Maps text to entry dict for the first dataset;
            entries need 'node_or_edge' and 'encodeID'.
        json_objects2 (dict): Same for the second dataset.
        label_to_text1 (dict): Mapping of labels to texts for JSON 1.
        label_to_text2 (dict): Mapping of labels to texts for JSON 2.
        file_path (str): The file path to save the Excel file (used as given).
        G1 (nx.Graph): The graph created from json_objects1.
        G2 (nx.Graph): The graph created from json_objects2.

    Raises:
        ValueError: If no block of the expected shape can be taken from
            *similarity_matrix*.
    """
    num_texts1 = len(labels1)
    num_texts2 = len(labels2)
    combined = num_texts1 + num_texts2

    # Select the JSON1 (rows) vs JSON2 (columns) block.
    if similarity_matrix.shape == (num_texts1, num_texts2):
        similarity_submatrix = similarity_matrix
    elif similarity_matrix.shape == (combined, combined):
        # Full combined matrix: the cross block starts after the JSON1 columns.
        similarity_submatrix = similarity_matrix[:num_texts1, num_texts1:]
    else:
        similarity_submatrix = similarity_matrix[:num_texts1, :num_texts2]
        if similarity_submatrix.shape != (num_texts1, num_texts2):
            raise ValueError(f"Expected submatrix shape ({num_texts1}, {num_texts2}), but got {similarity_submatrix.shape}")

    # Look up the node/edge type of every label once.
    types1 = [json_objects1[label_to_text1[label]]['node_or_edge'] for label in labels1]
    types2 = [json_objects2[label_to_text2[label]]['node_or_edge'] for label in labels2]

    jaccard_distances = []
    nodes_details = []
    edges_details = []

    for i, (label1, type1) in enumerate(zip(labels1, types1)):
        for j, (label2, type2) in enumerate(zip(labels2, types2)):
            intersection = similarity_submatrix[i, j]
            union = 1  # In the context of cosine similarity, the union can be considered as 1.
            jaccard_distance = 1 - intersection / union if union != 0 else 1

            jaccard_distances.append({
                'Text 1 Index': i,
                'Text 2 Index': j,
                'Text 1 Label': label1,
                'Text 2 Label': label2,
                'Type 1': type1,
                'Type 2': type2,
                'Intersection (Cosine Similarity)': intersection,
                'Union': union,
                'Jaccard Distance': jaccard_distance
            })

            # Pairs of equal type are additionally listed on their own sheet.
            detail = {
                'Text 1 Index': i,
                'Text 2 Index': j,
                'Text 1 Label': label1,
                'Text 2 Label': label2,
                'Cosine Similarity': intersection
            }
            if type1 == 'node' and type2 == 'node':
                nodes_details.append(detail)
            elif type1 == 'edge' and type2 == 'edge':
                edges_details.append(detail)

    jaccard_df = pd.DataFrame(jaccard_distances)
    nodes_df = pd.DataFrame(nodes_details)
    edges_df = pd.DataFrame(edges_details)

    node_summary = jaccard_summary(
        {obj['encodeID'] for obj in json_objects1.values()},
        {obj['encodeID'] for obj in json_objects2.values()},
        'Node Distance'
    )

    key_summary = jaccard_summary(
        {k for obj in json_objects1.values() for k in obj},
        {k for obj in json_objects2.values() for k in obj},
        'Key Distance'
    )

    value_summary = jaccard_summary(
        {v for obj in json_objects1.values() for v in obj.values() if isinstance(v, str)},
        {v for obj in json_objects2.values() for v in obj.values() if isinstance(v, str)},
        'Value Distance'
    )

    # NOTE(review): if G1 and G2 were built with different node-id suffixes,
    # their edge sets cannot intersect and this distance is always 1 -
    # confirm how the caller builds the graphs.
    graph_summary = jaccard_summary(
        set(G1.edges()),
        set(G2.edges()),
        'Graph Distance'
    )

    summary_df = pd.DataFrame([node_summary, key_summary, value_summary, graph_summary])

    # Cosine similarity between JSON1 and JSON2 objects
    cosine_df = pd.DataFrame(similarity_submatrix, index=labels1, columns=labels2)

    with pd.ExcelWriter(file_path) as writer:
        cosine_df.to_excel(writer, sheet_name='Cosine Similarity')
        nodes_df.to_excel(writer, sheet_name='Nodes Details', index=False)
        edges_df.to_excel(writer, sheet_name='Edges Details', index=False)
        jaccard_df.to_excel(writer, sheet_name='Jaccard Distances', index=False)
        summary_df.to_excel(writer, sheet_name='Summary', index=False)

    print(f"Jaccard distances saved to '{file_path}'.")

def jaccard_summary(set1, set2, label):
    """Return a summary row with the Jaccard distance between two sets.

    The distance is ``1 - |intersection| / |union|``, or 1 when both sets are
    empty. The row has the keys 'Metric', 'Distance', 'Intersection' and
    'Union'.
    """
    shared_count = len(set1.intersection(set2))
    total_count = len(set1.union(set2))
    if total_count != 0:
        distance = 1 - shared_count / total_count
    else:
        distance = 1
    return {
        'Metric': f'{label} (Jaccard)',
        'Distance': distance,
        'Intersection': shared_count,
        'Union': total_count
    }

def calculate_detailed_distances(json_objects1, json_objects2, threshold=0.5):
    """Count insertions, deletions and substitutions between two datasets.

    Nodes and edges of the first dataset are matched against those of the
    second one by the Jaccard similarity of their key-value pairs. Key and
    value differences are only evaluated when the edge distance is 0, and only
    for objects that share an encodeID; otherwise they are reported as 0.

    Args:
        json_objects1: Dict mapping text to entry dict for the first dataset;
            entries need 'encodeID' and 'node_or_edge'.
        json_objects2: Same for the second dataset.
        threshold: Minimum Jaccard similarity (inclusive) for two objects to
            count as matched.

    Returns:
        A list with a single dict holding ``<kind>_distance``, ``<kind>_i``,
        ``<kind>_d`` and ``<kind>_s`` for the kinds node, edge, key and value.
    """
    def jaccard_similarity(set1, set2):
        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))
        return intersection / union if union != 0 else 0

    def calculate_set_distance(set1, set2):
        """Calculate insertion, deletion, and substitution distances."""
        insertions = len(set2 - set1)
        deletions = len(set1 - set2)
        # One insertion paired with one deletion counts as a substitution.
        substitutions = min(insertions, deletions)
        return insertions - substitutions, deletions - substitutions, substitutions

    def hashable_items(obj):
        """Return the key-value pairs of *obj* as a set.

        Unhashable values (lists/dicts from JSON) are represented by their
        repr() so that building the set does not raise TypeError.
        """
        items = set()
        for key, value in obj.items():
            try:
                items.add((key, value))
            except TypeError:
                items.add((key, repr(value)))
        return items

    def first_by_encode_id(json_objects):
        """Index the objects by encodeID, keeping the first one per ID."""
        index = {}
        for obj in json_objects.values():
            index.setdefault(obj['encodeID'], obj)
        return index

    def count_unmatched(ids1, ids2):
        """Return (insertions, deletions, substitutions) for two ID sets."""
        matched_ids2 = set()
        deletions = 0
        for id1 in ids1:
            for id2 in ids2:
                if jaccard_similarity(items1[id1], items2[id2]) >= threshold:
                    matched_ids2.add(id2)
                    break
            else:
                # Nothing in the second dataset is similar enough.
                deletions += 1
        insertions = sum(1 for id2 in ids2 if id2 not in matched_ids2)
        substitutions = min(insertions, deletions)
        return insertions - substitutions, deletions - substitutions, substitutions

    # Build the lookups once instead of scanning all objects per comparison.
    by_id1 = first_by_encode_id(json_objects1)
    by_id2 = first_by_encode_id(json_objects2)
    items1 = {encode_id: hashable_items(obj) for encode_id, obj in by_id1.items()}
    items2 = {encode_id: hashable_items(obj) for encode_id, obj in by_id2.items()}

    # Nodes and Edges
    nodes1 = {obj['encodeID'] for obj in json_objects1.values() if obj['node_or_edge'] == 'node'}
    nodes2 = {obj['encodeID'] for obj in json_objects2.values() if obj['node_or_edge'] == 'node'}
    edges1 = {obj['encodeID'] for obj in json_objects1.values() if obj['node_or_edge'] == 'edge'}
    edges2 = {obj['encodeID'] for obj in json_objects2.values() if obj['node_or_edge'] == 'edge'}

    node_i, node_d, node_s = count_unmatched(nodes1, nodes2)
    node_distance = node_i + node_d + node_s

    edge_i, edge_d, edge_s = count_unmatched(edges1, edges2)
    edge_distance = edge_i + edge_d + edge_s

    key_i, key_d, key_s = 0, 0, 0
    value_i, value_d, value_s = 0, 0, 0

    # Keys and values are only compared when the edges agree.
    if edge_distance == 0:
        for obj1 in json_objects1.values():
            obj2 = by_id2.get(obj1['encodeID'])
            if obj2:
                keys1 = set(obj1.keys())
                keys2 = set(obj2.keys())
                ki, kd, ks = calculate_set_distance(keys1, keys2)
                key_i += ki
                key_d += kd
                key_s += ks

                # Debugging
                print(f"Comparing objects with encodeID {obj1['encodeID']}:")
                print(f"Keys1: {keys1}")
                print(f"Keys2: {keys2}")
                print(f"Key insertions: {ki}, deletions: {kd}, substitutions: {ks}")

                # If there are key differences, do not check for value differences
                if ki > 0 or kd > 0 or ks > 0:
                    continue

                for key in keys1 & keys2:
                    if obj1[key] != obj2[key]:
                        value_i += 1
                        if isinstance(obj1[key], str) or isinstance(obj2[key], str):
                            value_d += 1
                        # Debugging
                        print(f"Value difference found in key '{key}': {obj1[key]} != {obj2[key]}")

        # Pair up remaining insertions/deletions across objects and ADD them
        # to the substitutions already counted per object; assigning here
        # would discard the per-object substitutions.
        paired_keys = min(key_i, key_d)
        key_s += paired_keys
        key_i -= paired_keys
        key_d -= paired_keys

        value_s = min(value_i, value_d)
        value_i -= value_s
        value_d -= value_s

    key_distance = key_i + key_d + key_s
    value_distance = value_i + value_d + value_s

    detailed_distance = {
        "node_distance": node_distance,
        "node_i": node_i,
        "node_d": node_d,
        "node_s": node_s,
        "edge_distance": edge_distance,
        "edge_i": edge_i,
        "edge_d": edge_d,
        "edge_s": edge_s,
        "key_distance": key_distance,
        "key_i": key_i,
        "key_d": key_d,
        "key_s": key_s,
        "value_distance": value_distance,
        "value_i": value_i,
        "value_d": value_d,
        "value_s": value_s
    }

    return [detailed_distance]


def main():
    """Run the interactive comparison of two JSON files.

    Asks for two file paths and a similarity threshold, loads both files,
    embeds their entries, assigns shared encodeIDs, writes the normalized
    data, matrices and distance reports into ``output_dir``, and shows one
    graph per file. Returns early (None) when either file cannot be loaded.
    """
    # The entered threshold replaces the module-level default.
    global threshold
    # Surrounding double quotes are stripped from the entered paths.
    path1 = input("Enter the file path for JSON 1: ").strip('"')
    path2 = input("Enter the file path for JSON 2: ").strip('"')

    # NOTE(review): float() raises ValueError on non-numeric input; it is not
    # handled here.
    threshold = float(input("Enter the similarity threshold (e.g., 0.95): ").strip())

    data1 = load_json(path1)
    if data1 is None:
        return

    data2 = load_json(path2)
    if data2 is None:
        return

    # The labels 'json1'/'json2' are what assign_encode_ids filters on.
    texts1, embeddings1, tokenized_texts1, json_objects1, original_ids1, labels1 = json_to_text(data1, 'json1')
    texts2, embeddings2, tokenized_texts2, json_objects2, original_ids2, labels2 = json_to_text(data2, 'json2')

    combined_tokenized_texts = tokenized_texts1 + tokenized_texts2

    save_tokenized_texts_combined(combined_tokenized_texts, 'combined')
    save_embeddings_combined(embeddings1 + embeddings2, 'combined')

    # Save embeddings and similarity for JSON1 vs JSON2
    save_embeddings_to_excel(embeddings1, embeddings2, labels1, labels2, 'embeddings_and_similarity.xlsx', threshold)

    # From here on json_objects1/2 map text directly to the entry dict.
    json_objects1, json_objects2, comparison_matrix, combined_embeddings = assign_encode_ids(texts1, embeddings1, texts2, embeddings2, json_objects1, json_objects2, original_ids1, original_ids2, threshold)

    # Re-tags every entry as node or edge now that encodeIDs are known.
    json_objects1 = identify_node_or_edge_and_add_key(json_objects1)
    json_objects2 = identify_node_or_edge_and_add_key(json_objects2)

    # Save normalized data to JSON files
    save_json(json_objects1, os.path.join(output_dir, 'normalized_data1.json'))
    save_json(json_objects2, os.path.join(output_dir, 'normalized_data2.json'))

    comparison_df, similarity_matrix, label_to_text1, label_to_text2 = create_comparison_matrix(texts1, embeddings1, texts2, embeddings2, json_objects1, json_objects2, threshold)
    save_comparison_matrix_to_excel(comparison_df, similarity_matrix, 'comparison_matrix.xlsx', labels1, labels2)

    print("Comparison matrix saved to 'output_files/comparison_matrix.xlsx'.")

    print("Embedding Details:")
    for i, text in enumerate(texts1 + texts2):
        print(f"Text {i + 1}: {text}")
        print(f"Embedding: {combined_embeddings[i]}")
        print(f"Dimensions: {combined_embeddings[i].shape}")

    # One graph per file; the suffix keeps the node ids of the two graphs apart.
    G1 = build_graph(json_objects1, '_blue', 'blue')
    G2 = build_graph(json_objects2, '_red', 'red')

    visualize_graph(G1)
    visualize_graph(G2)

    calculate_distances(json_objects1, json_objects2)

    # Pass only the JSON1 (rows) vs JSON2 (columns) block of the full matrix.
    sub_similarity_matrix = similarity_matrix[:len(labels1), len(labels1):len(labels1)+len(labels2)]
    save_jaccard_distances_to_excel(sub_similarity_matrix, labels1, labels2, json_objects1, json_objects2, label_to_text1, label_to_text2, os.path.join(output_dir, 'jaccard_distances.xlsx'), G1, G2)

    # Calculate detailed distances and save to JSON
    # (called without a threshold, so the function's own default applies,
    # not the threshold entered above)
    detailed_distances = calculate_detailed_distances(json_objects1, json_objects2)
    save_json(detailed_distances, os.path.join(output_dir, 'detailed_distances.json'))
    print("Detailed distances saved to 'output_files/detailed_distances.json'.")


# Run the interactive workflow only when the file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()