# In [None]:
import json
import networkx as nx
import matplotlib.pyplot as plt
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np

# Load the spaCy NLP model once at import time; extract_verb_centered_snippet
# uses this module-level `nlp` pipeline to find verbs in node descriptions.
nlp = spacy.load('en_core_web_sm')

# Ensure necessary resources are downloaded
# NOTE(review): presumably these back the `word_tokenize` tokenizer and the
# English stop-word list used in calculate_similarity_matrix - confirm the
# resource names against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

def load_json(file_path):
    """Load JSON data from a file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file does not exist,
        cannot be decoded as UTF-8, or does not contain valid JSON. An error
        message is printed in each failure case so callers only need to check
        for ``None``.
    """
    try:
        # Decode explicitly instead of relying on the locale default;
        # 'utf-8-sig' also accepts files that start with a UTF-8 BOM.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        print(f"Error: Invalid JSON in {file_path} - {exc}")
    return None

def json_to_text(data):
    """Flatten each JSON entry into one text string for similarity analysis.

    Every field becomes a ``"key: value"`` fragment (list values are joined
    with ``", "``) and the fragments of one entry are joined with ``". "``.

    Args:
        data: Iterable of dict entries; each entry must contain an ``'id'`` key.

    Returns:
        A tuple ``(texts, ids, json_objects)``: the flattened text per entry,
        the entry ids in input order, and a dict mapping each id to its
        original entry object.
    """
    texts, ids, json_objects = [], [], {}

    for entry in data:
        fragments = []
        for key, value in entry.items():
            if isinstance(value, list):
                # Lists are rendered element by element rather than via repr().
                fragments.append(f"{key}: " + ", ".join(map(str, value)))
            else:
                fragments.append(f"{key}: {value}")
        texts.append(". ".join(fragments))

        entry_id = entry['id']
        ids.append(entry_id)
        json_objects[entry_id] = entry

    return texts, ids, json_objects

def preprocess_json_objects(json_objects):
    """Fill in placeholder values for the fields every object must carry.

    Objects are modified in place: a missing ``'name'``, ``'type'`` or
    ``'description'`` key receives a placeholder; existing values are kept.

    Args:
        json_objects: Dict mapping ids to entry dicts.

    Returns:
        The same ``json_objects`` dict that was passed in.
    """
    placeholders = (
        ('name', 'Unknown Name'),
        ('type', 'Unknown Type'),
        ('description', 'No description provided'),
    )
    for record in json_objects.values():
        for field, fallback in placeholders:
            record.setdefault(field, fallback)
    return json_objects

def extract_verb_centered_snippet(description, window=2, max_gap=4):
    """Return a short excerpt of *description* built around its first verbs.

    The text is parsed with the module-level ``nlp`` pipeline and only the
    first two tokens tagged ``VERB`` are considered.

    Args:
        description: Text to excerpt.
        window: Number of tokens kept on each side of a verb.
        max_gap: Largest token distance between the two verbs for which a
            single contiguous excerpt is returned.

    Returns:
        The unchanged description when no verb is found; otherwise one
        excerpt, or two excerpts joined by ``" ... "`` when the verbs are
        more than ``max_gap`` tokens apart.
    """
    doc = nlp(description)

    # Token positions of at most the first two verbs.
    verb_positions = [token.i for token in doc if token.pos_ == 'VERB'][:2]
    if not verb_positions:
        return description

    token_count = len(doc)
    first = verb_positions[0]
    left_edge = max(0, first - window)

    if len(verb_positions) == 1:
        return doc[left_edge:min(token_count, first + window + 1)].text

    second = verb_positions[1]
    if second - first > max_gap:
        # Verbs are far apart: show each neighbourhood separately.
        head = doc[left_edge:first + window + 1].text
        tail = doc[max(0, second - window):second + window + 1].text
        return f"{head} ... {tail}"

    # Verbs are close together: one span covering both neighbourhoods.
    return doc[left_edge:min(token_count, second + window + 1)].text

def build_graph(json_objects1, json_objects2, threshold, similarity_matrix, ids1, ids2):
    """Build a similarity graph linking entities of two JSON datasets.

    Entities of the first dataset become nodes suffixed ``_blue`` and entities
    of the second dataset nodes suffixed ``_red``. Objects whose type is
    ``'relationship'`` or whose name is ``'Unknown Name'`` are not added.
    Each node stores ``label``, ``color``, ``type`` and ``description``
    attributes; ``type`` is included because the visualiser renders it in the
    node label.

    Args:
        json_objects1: Dict mapping ids to objects of the first dataset.
        json_objects2: Dict mapping ids to objects of the second dataset.
        threshold: An edge is added only when the similarity is strictly
            greater than this value.
        similarity_matrix: 2-D structure indexed ``[i][j]`` where ``i`` is a
            position in ``ids1`` and ``j`` a position in ``ids2``.
        ids1: Ids of the first dataset, in matrix row order.
        ids2: Ids of the second dataset, in matrix column order.

    Returns:
        The populated ``networkx.Graph``.
    """
    G = nx.Graph()
    valid_nodes = set()

    # Add nodes from both JSON datasets, ensuring unique IDs
    datasets = (
        (json_objects1, '_blue', 'blue'),
        (json_objects2, '_red', 'red'),
    )
    for json_objects, dataset_suffix, dataset_color in datasets:
        for key, obj in json_objects.items():
            # Fall back to the same placeholder preprocess_json_objects uses so
            # objects without a 'type' key do not raise KeyError here.
            obj_type = obj.get('type', 'Unknown Type')
            if obj_type == 'relationship':  # Only add entities as nodes
                continue
            if obj.get('name') == 'Unknown Name':  # Avoid adding unknown name nodes
                print(f"Filtered out node with id: {key} due to 'Unknown Name'")
                continue

            unique_id = key + dataset_suffix
            node_name = f"{obj.get('name', 'Unknown Entity')} ({dataset_color})"
            G.add_node(
                unique_id,
                label=node_name,
                color=dataset_color,
                type=obj_type,
                description=obj.get('description', 'No description provided'),
            )
            valid_nodes.add(unique_id)
            print(f"Added node: {unique_id} with name: {node_name}")

    # Add edges from similarity matrix
    for i, row in enumerate(similarity_matrix):
        source = ids1[i] + "_blue"
        if source not in valid_nodes:
            # No edge can start at a filtered-out node; skip the whole row.
            continue
        for j, similarity in enumerate(row):
            if similarity > threshold:
                target = ids2[j] + "_red"
                if target in valid_nodes:
                    G.add_edge(source, target, weight=similarity, style='solid', color='black', label='similar-to')
                    print(f"Added edge: {source} -> {target} with similarity {similarity}")

    return G

def get_optimal_figsize(num_nodes):
    """Return a square ``(width, height)`` figure size that grows with the node count.

    The side length is 10 plus 0.5 per node.
    """
    base_size = 10  # Side length used for an empty graph
    scale_factor = 0.5  # Extra side length contributed by each node
    side = base_size + num_nodes * scale_factor
    return (side, side)

def get_optimal_k(num_nodes):
    """Return the ``k`` value passed to ``spring_layout`` for a graph of this size.

    The value is 0.5 plus 0.1 per node.
    """
    base_k = 0.5  # Value used for an empty graph
    scale_factor = 0.1  # Increment contributed by each node
    optimal_k = base_k + num_nodes * scale_factor
    return optimal_k

def visualize_similarity_graph(G, unmatched_nodes):
    """Visualize a networkx graph with enhanced label formatting for readability.

    Node attributes read: ``color``, ``label``, ``type``, ``description``.
    Edge attributes read: ``style``, ``color``, ``label``. Missing attributes
    fall back to defaults instead of raising.

    Args:
        G: The ``networkx`` graph to draw.
        unmatched_nodes: Iterable of node ids considered unmatched. Nodes
            without a ``color`` attribute are drawn red when listed here and
            grey otherwise. Ids that are not present in ``G`` are ignored when
            the unmatched summary is printed.
    """
    num_nodes = len(G.nodes)
    figsize = get_optimal_figsize(num_nodes)
    k = get_optimal_k(num_nodes)

    plt.figure(figsize=figsize)  # Dynamically set figure size
    pos = nx.spring_layout(G, k=k)  # Dynamically set layout parameter

    # Build the lookup once; membership is tested for every colourless node.
    unmatched_lookup = set(unmatched_nodes)

    # Draw nodes with their corresponding colors
    node_colors = [
        data.get('color', 'red' if node in unmatched_lookup else 'grey')
        for node, data in G.nodes(data=True)
    ]
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=300)

    # Draw edges with specified styles and colors
    for (u, v, data) in G.edges(data=True):
        # Resolve the colour first: the style default depends on it, and
        # indexing data['color'] directly would raise for colourless edges.
        edge_color = data.get('color', 'black')
        edge_style = data.get('style', 'dotted' if edge_color == 'gray' else 'solid')
        nx.draw_networkx_edges(
            G, pos, edgelist=[(u, v)],
            style=edge_style,
            edge_color=edge_color,
            width=2
        )

    # Draw edge labels
    edge_labels = {(u, v): data.get('label', 'similar-to') for u, v, data in G.edges(data=True)}
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)

    # Draw labels for nodes: name, type and a short verb-centred excerpt
    labels = {}
    for node, data in G.nodes(data=True):
        label = f"{data.get('label', 'Unknown')}\n({data.get('type', 'Unknown')})"
        description = data.get('description', 'No description provided')
        if description:
            snippet = extract_verb_centered_snippet(description)
            label += f"\n{snippet}"
        labels[node] = label

    for node, label in labels.items():
        x, y = pos[node]
        plt.text(x, y, label, fontsize=9, ha='center', va='center',
                 bbox=dict(boxstyle="round,pad=0.5", facecolor='white', edgecolor='gray', alpha=0.6))

    plt.title('Graph Visualization')
    plt.axis('off')
    plt.show()

    # Print unmatched nodes
    print("Unmatched Nodes:")
    for node in unmatched_nodes:
        # Callers may list ids that were filtered out of (or removed from) the
        # graph; looking those up in G.nodes would raise KeyError.
        if node in G:
            print(f"{node}: {G.nodes[node].get('label', 'Unknown')}")

def plot_threshold_analysis(similarity_matrix):
    """Plot four graph metrics against candidate similarity thresholds.

    For each of 50 evenly spaced thresholds in ``[0, 1]`` a graph is built
    from the matrix entries strictly greater than the threshold, and the edge
    count, silhouette score, connected-component count and average clustering
    coefficient are recorded. The four curves are shown side by side.

    Args:
        similarity_matrix: 2-D array of similarity scores. One label per row
            is derived for the silhouette computation.
    """
    thresholds = np.linspace(0, 1, 50)
    num_edges = []
    silhouette_scores = []
    num_components = []
    avg_clustering = []
    n_samples = len(similarity_matrix)

    for threshold in thresholds:
        G = nx.Graph()
        labels = np.zeros(n_samples)  # One label per matrix row; 0 means "unassigned"
        label_idx = 1
        # NOTE(review): row index i and column index j are used as node ids in
        # the same graph, and a row's label is copied to position j of the
        # row-label array - confirm this heuristic is intended.
        for i, row in enumerate(similarity_matrix):
            for j, similarity in enumerate(row):
                if similarity > threshold:
                    G.add_edge(i, j)
                    if labels[i] == 0:
                        labels[i] = label_idx
                        label_idx += 1
                    if j < len(labels):
                        labels[j] = labels[i]
        num_edges.append(len(G.edges))
        num_components.append(nx.number_connected_components(G))
        if len(G) > 0:
            avg_clustering.append(nx.average_clustering(G))
        else:
            avg_clustering.append(0)
        # silhouette_score raises ValueError unless the number of distinct
        # labels is between 2 and n_samples - 1, so both bounds are checked.
        if 1 < len(set(labels)) < n_samples:
            silhouette_scores.append(silhouette_score(similarity_matrix, labels))
        else:
            silhouette_scores.append(-1)

    # (values, y-axis label, title) for each panel, in display order.
    panels = [
        (num_edges,
         'Number of Edges (Find the Elbow Point)',
         'Elbow Method for Optimal Threshold'),
        (silhouette_scores,
         'Silhouette Score (Higher is Better)',
         'Silhouette Analysis for Optimal Threshold'),
        (num_components,
         'Number of Components (Stable Plateau)',
         'Number of Connected Components'),
        (avg_clustering,
         'Average Clustering Coefficient (Higher is Better)',
         'Average Clustering Coefficient'),
    ]

    plt.figure(figsize=(20, 5))
    for position, (values, ylabel, title) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), position)
        plt.plot(thresholds, values, marker='o')
        plt.xlabel('Threshold')
        plt.ylabel(ylabel)
        plt.title(title)

    plt.tight_layout()
    plt.show()

def calculate_similarity_matrix(texts1, texts2):
    """Compute pairwise cosine similarity between two collections of texts.

    A TF-IDF vectorizer (NLTK ``word_tokenize`` tokenizer, English stop words)
    is fitted on ``texts1`` only and then applied to ``texts2``.

    Args:
        texts1: Texts that define the vocabulary; one matrix row per text.
        texts2: Texts compared against ``texts1``; one matrix column per text.

    Returns:
        The result of ``cosine_similarity`` for the two TF-IDF matrices,
        indexed ``[i][j]`` for ``texts1[i]`` versus ``texts2[j]``.
    """
    english_stop_words = stopwords.words('english')
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize, stop_words=english_stop_words)

    # The vocabulary and IDF weights come from the first collection only.
    row_vectors = vectorizer.fit_transform(texts1)
    column_vectors = vectorizer.transform(texts2)

    return cosine_similarity(row_vectors, column_vectors)

def recommend_optimal_threshold(similarity_matrix):
    """Pick the similarity threshold with the highest aggregated graph score.

    For each of 50 evenly spaced thresholds in ``[0, 1]`` a graph is built
    from the matrix entries strictly greater than the threshold, and the score
    ``silhouette * average_clustering / number_of_components`` is computed
    (0 when the graph has no components). The scores are plotted with the
    chosen threshold marked by a dashed red line.

    Args:
        similarity_matrix: 2-D array of similarity scores. One label per row
            is derived for the silhouette computation.

    Returns:
        The threshold with the highest aggregated score; on ties the smallest
        such threshold (``np.argmax`` returns the first maximum).
    """
    thresholds = np.linspace(0, 1, 50)
    aggregated_scores = []
    n_samples = len(similarity_matrix)

    for threshold in thresholds:
        G = nx.Graph()
        labels = np.zeros(n_samples)  # One label per matrix row; 0 means "unassigned"
        label_idx = 1
        # NOTE(review): row index i and column index j are used as node ids in
        # the same graph, and a row's label is copied to position j of the
        # row-label array - confirm this heuristic is intended.
        for i, row in enumerate(similarity_matrix):
            for j, similarity in enumerate(row):
                if similarity > threshold:
                    G.add_edge(i, j)
                    if labels[i] == 0:
                        labels[i] = label_idx
                        label_idx += 1
                    if j < len(labels):
                        labels[j] = labels[i]

        num_components = nx.number_connected_components(G)
        avg_clustering = nx.average_clustering(G) if len(G) > 0 else 0
        # silhouette_score raises ValueError unless the number of distinct
        # labels is between 2 and n_samples - 1, so both bounds are checked.
        if 1 < len(set(labels)) < n_samples:
            silhouette = silhouette_score(similarity_matrix, labels)
        else:
            silhouette = -1

        aggregated_score = silhouette * avg_clustering / num_components if num_components > 0 else 0
        aggregated_scores.append(aggregated_score)

    optimal_threshold_idx = np.argmax(aggregated_scores)
    optimal_threshold = thresholds[optimal_threshold_idx]

    plt.figure(figsize=(12, 6))
    plt.plot(thresholds, aggregated_scores, marker='o')
    plt.axvline(optimal_threshold, color='red', linestyle='--')
    plt.xlabel('Threshold')
    plt.ylabel('Aggregated Score')
    plt.title('Recommended Threshold Analysis')
    plt.show()

    return optimal_threshold

def main():
    """Interactively compare two JSON datasets and draw their similarity graph.

    Prompts for two file paths, loads both files, computes the TF-IDF cosine
    similarity between their entries, plots the threshold analysis, picks the
    recommended threshold, builds the graph and visualises it together with
    the entries that have no counterpart above the threshold.

    Returns early (after an error message has been printed) when a file
    cannot be loaded or a dataset is empty.
    """
    # Strip whitespace first so quotes around a pasted path are still removed
    # when the input carries leading or trailing spaces.
    path1 = input("Enter the file path for JSON 1: ").strip().strip('"')
    path2 = input("Enter the file path for JSON 2: ").strip().strip('"')

    data1 = load_json(path1)
    if data1 is None:
        return

    data2 = load_json(path2)
    if data2 is None:
        return

    texts1, ids1, json_objects1 = json_to_text(data1)
    texts2, ids2, json_objects2 = json_to_text(data2)

    # Nothing can be vectorised or compared when either dataset is empty.
    if not texts1 or not texts2:
        print("Error: Both JSON files must contain at least one entry.")
        return

    json_objects1 = preprocess_json_objects(json_objects1)
    json_objects2 = preprocess_json_objects(json_objects2)

    similarity_matrix = calculate_similarity_matrix(texts1, texts2)

    # Plot threshold analysis to determine the optimal threshold
    plot_threshold_analysis(similarity_matrix)

    # Choose a threshold value based on the analysis plots
    threshold = recommend_optimal_threshold(similarity_matrix)
    print(f"Recommended Threshold: {threshold}")

    G = build_graph(json_objects1, json_objects2, threshold, similarity_matrix, ids1, ids2)
    unmatched_nodes = []

    # Rows: entries of dataset 1 with no score above the threshold.
    # enumerate keeps the row index aligned even when ids repeat.
    for row_idx, id1 in enumerate(ids1):
        if any(score > threshold for score in similarity_matrix[row_idx]):
            continue
        node_id = id1 + "_blue"
        unmatched_nodes.append(node_id)
        obj = json_objects1[id1]
        if node_id not in G.nodes and obj['name'] != 'Unknown Name':
            G.add_node(node_id, label=obj['name'], type=obj['type'], description=obj['description'])

    # Columns: entries of dataset 2 with no score above the threshold.
    for col_idx, id2 in enumerate(ids2):
        if any(row[col_idx] > threshold for row in similarity_matrix):
            continue
        node_id = id2 + "_red"
        unmatched_nodes.append(node_id)
        obj = json_objects2[id2]
        if node_id not in G.nodes and obj['name'] != 'Unknown Name':
            G.add_node(node_id, label=obj['name'], type=obj['type'], description=obj['description'])

    relationship_nodes = [node for node in G.nodes if 'relationship' in node]
    G.remove_nodes_from(relationship_nodes)

    # Keep only ids that are still in the graph: entries filtered for
    # 'Unknown Name' or removed as relationships have no node to look up.
    unmatched_nodes = [node for node in unmatched_nodes if node in G]

    visualize_similarity_graph(G, unmatched_nodes)

# Run the interactive comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()
