In [2]:
import json
import networkx as nx
import matplotlib.pyplot as plt
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the spaCy 'en_core_web_sm' pipeline once at import time; the module-level
# `nlp` object is what extract_verb_centered_snippet() calls to tag tokens.
nlp = spacy.load('en_core_web_sm')

# Download the NLTK data packages at import time: 'stopwords' backs the
# stopwords.words('english') call in main(); 'punkt' is presumably the
# tokenizer model needed by word_tokenize (confirm against the NLTK version in use).
nltk.download('punkt')
nltk.download('stopwords')

def load_json(file_path):
    """Load and parse a JSON document from disk.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file does not exist or
        does not contain valid JSON. An error message is printed in both
        failure cases so callers only need to check for ``None``.
    """
    try:
        # Decode explicitly instead of relying on the platform default
        # encoding (e.g. cp1252 on Windows). 'utf-8-sig' reads plain UTF-8
        # and also tolerates a leading byte-order mark, which json.load
        # would otherwise reject.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
        return None
    except json.JSONDecodeError as exc:
        # Malformed content is reported the same way as a missing file
        # rather than escaping as a traceback.
        print(f"Error: Invalid JSON in {file_path} - {exc}")
        return None

def json_to_text(data):
    """Flatten every JSON entry into a single "key: value" sentence string.

    Each field is rendered as ``"key: value"``; list values are rendered as a
    comma-separated sequence of their items. The fields of one entry are
    joined with ``". "``.

    Args:
        data: Iterable of dict entries; every entry must contain an ``'id'`` key.

    Returns:
        A tuple ``(texts, ids, json_objects)``: ``texts`` and ``ids`` are
        parallel lists in input order, and ``json_objects`` maps each entry's
        id to the entry itself.
    """
    def render(field, content):
        # Lists are expanded item by item; everything else uses str formatting.
        if isinstance(content, list):
            return f"{field}: " + ", ".join(map(str, content))
        return f"{field}: {content}"

    texts, ids, json_objects = [], [], {}
    for record in data:
        texts.append(". ".join(render(field, content) for field, content in record.items()))
        record_id = record['id']
        ids.append(record_id)
        json_objects[record_id] = record
    return texts, ids, json_objects

def preprocess_json_objects(json_objects):
    """Fill in placeholders for missing 'name', 'type' and 'description' fields.

    The objects are updated in place (existing values are never overwritten)
    and the same mapping that was passed in is returned.
    """
    placeholders = (
        ('name', 'Unknown Name'),
        ('type', 'Unknown Type'),
        ('description', 'No description provided'),
    )
    for record in json_objects.values():
        for field, fallback in placeholders:
            record.setdefault(field, fallback)
    return json_objects

def extract_verb_centered_snippet(description, window=2, max_gap=4):
    """Return a short excerpt of *description* built around its first two verbs.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and the
    first two tokens tagged ``VERB`` are located.

    Args:
        description: Text to excerpt.
        window: Number of tokens kept on each side of a verb.
        max_gap: Largest token distance between the two verbs for which a
            single contiguous excerpt is produced.

    Returns:
        * the unchanged ``description`` when no verb is found;
        * one excerpt around the verb when only one is found;
        * one excerpt spanning both verbs when they are at most ``max_gap``
          tokens apart;
        * otherwise two excerpts joined by ``" ... "``.
    """
    doc = nlp(description)
    verb_positions = [token.i for token in doc if token.pos_ == 'VERB'][:2]

    # Nothing to centre on: hand the text back untouched.
    if not verb_positions:
        return description

    first = verb_positions[0]
    last = verb_positions[-1]  # same as `first` when only one verb was found
    left = max(0, first - window)

    # A single verb, or two verbs close together, yield one contiguous span.
    if len(verb_positions) == 1 or last - first <= max_gap:
        right = min(len(doc), last + window + 1)
        return doc[left:right].text

    # Verbs far apart: excerpt each one separately.
    head = doc[left:first + window + 1].text
    tail = doc[max(0, last - window):last + window + 1].text
    return f"{head} ... {tail}"

def build_graph(json_objects, suffix, color):
    """Build an undirected graph from entity and relationship objects.

    Every object whose ``'type'`` is not ``'relationship'`` and whose name is
    not the ``'Unknown Name'`` placeholder becomes a node. Every relationship
    whose two endpoints are both such nodes (and are distinct) becomes an edge.
    Progress and filtering decisions are printed.

    Args:
        json_objects: Mapping of object id -> object dict.
        suffix: String appended to every id (and to ``source_ref`` /
            ``target_ref``) to form the node id, so graphs built from
            different datasets do not share node ids.
        color: Colour stored on every node and on first-seen edges; it is
            also appended to each node label.

    Returns:
        An ``nx.Graph``. Nodes carry ``label``, ``color``, ``type`` and
        ``description`` attributes; edges carry ``style``, ``color`` and
        ``label``. An edge listed more than once (same source/target order)
        is restyled as solid black.
    """
    G = nx.Graph()
    valid_nodes = set()

    # Add nodes, ensuring unique IDs
    for key, obj in json_objects.items():
        obj_type = obj.get('type', 'Unknown Type')
        if obj_type == 'relationship':  # Only entities become nodes
            continue
        if obj.get('name') == 'Unknown Name':  # Skip placeholder-named entities
            print(f"Filtered out node with id: {key} due to 'Unknown Name'")
            continue
        unique_id = key + suffix
        node_name = f"{obj.get('name', 'Unknown Entity')} ({color})"
        # Store the entity type on the node: visualize_graph() reads the
        # node's 'type' attribute for its label and would otherwise always
        # fall back to 'Unknown'.
        G.add_node(unique_id, label=node_name, color=color, type=obj_type,
                   description=obj.get('description', 'No description provided'))
        valid_nodes.add(unique_id)
        print(f"Added node: {unique_id} with name: {node_name}")

    # Add edges, using unique IDs
    # NOTE(review): (source, target) is order-sensitive, so a reversed
    # duplicate is not recognised as a repeat even though the graph is
    # undirected - confirm whether that is intended.
    unique_edges = set()
    for obj in json_objects.values():
        if obj.get('type', 'Unknown Type') != 'relationship':
            continue
        source_ref = obj.get('source_ref')
        target_ref = obj.get('target_ref')
        if source_ref is None or target_ref is None:
            # A malformed relationship is skipped instead of raising KeyError.
            print(f"Filtered out relationship {obj.get('id', '<no id>')} due to missing source_ref/target_ref")
            continue
        source = source_ref + suffix
        target = target_ref + suffix
        edge = (source, target)
        if source in valid_nodes and target in valid_nodes and source != target:  # Both endpoints valid and distinct
            if edge in unique_edges:
                G[source][target]['style'] = 'solid'
                G[source][target]['color'] = 'black'
            else:
                G.add_edge(source, target, style='dotted', color=color, label=obj.get('relationship_type', 'similar-to'))
                unique_edges.add(edge)
            print(f"Added edge: {source} -> {target}")
        else:
            print(f"Filtered out edge from {source} to {target} due to invalid nodes")

    return G

def get_optimal_figsize(num_nodes):
    """Return a square ``(width, height)`` figure size that grows with the node count.

    Each side starts at 10 and gains 0.5 per node.
    """
    side = 10 + num_nodes * 0.5
    return (side, side)

def get_optimal_k(num_nodes):
    """Return the ``k`` value passed to ``spring_layout`` for a graph of this size.

    The value starts at 0.5 and increases by 0.1 per node.
    """
    k_floor, k_per_node = 0.5, 0.1
    return k_floor + k_per_node * num_nodes

def visualize_graph(G):
    """Draw a networkx graph with matplotlib, using boxed multi-line node labels.

    Figure size and the spring-layout ``k`` parameter are scaled with the
    number of nodes. Each node is labelled with its ``label`` attribute, its
    ``type`` attribute (``'Unknown'`` when the node has none) and, when it has
    a non-empty description, a short verb-centred snippet of that description.

    Args:
        G: Graph whose nodes may carry ``color``, ``label``, ``type`` and
            ``description`` attributes and whose edges may carry ``style``,
            ``color`` and ``label`` attributes. All of them are optional.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    num_nodes = len(G.nodes)
    figsize = get_optimal_figsize(num_nodes)
    k = get_optimal_k(num_nodes)

    plt.figure(figsize=figsize)  # Dynamically set figure size
    pos = nx.spring_layout(G, k=k)  # Dynamically set layout parameter

    # Draw nodes with their corresponding colors
    node_colors = [data.get('color', 'grey') for node, data in G.nodes(data=True)]
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=300)

    # Draw edges one at a time so each can have its own style and colour
    for u, v, data in G.edges(data=True):
        # Look the colour up with .get(): indexing data['color'] while
        # computing the default style raised KeyError for any edge without a
        # 'color' attribute, even though colour is optional below.
        edge_color = data.get('color', 'black')
        fallback_style = 'dotted' if edge_color == 'gray' else 'solid'
        nx.draw_networkx_edges(
            G, pos, edgelist=[(u, v)],
            style=data.get('style', fallback_style),
            edge_color=edge_color,
            width=2
        )

    # Draw edge labels
    edge_labels = {(u, v): data.get('label', 'similar-to') for u, v, data in G.edges(data=True)}
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)

    # Build the multi-line label text for every node
    labels = {}
    for node, data in G.nodes(data=True):
        label = f"{data.get('label', 'Unknown')}\n({data.get('type', 'Unknown')})"
        description = data.get('description', 'No description provided')
        if description:
            snippet = extract_verb_centered_snippet(description)
            label += f"\n{snippet}"
        labels[node] = label

    # Labels are placed with plt.text so each gets a semi-transparent box
    for node, label in labels.items():
        x, y = pos[node]
        plt.text(x, y, label, fontsize=9, ha='center', va='center',
                 bbox=dict(boxstyle="round,pad=0.5", facecolor='white', edgecolor='gray', alpha=0.6))

    plt.title('Graph Visualization')
    plt.axis('off')
    plt.show()

def _compute_threshold_metrics(similarity_matrix, thresholds):
    """Compute graph and clustering metrics for every candidate threshold.

    For each threshold a graph is built with an edge ``(i, j)`` for every
    matrix cell whose similarity exceeds the threshold, and a label is
    assigned to the rows involved.

    NOTE(review): row index ``i`` and column index ``j`` are used as node ids
    in the same graph, and labels are only tracked for row indices. main()
    passes a cross-dataset matrix, so confirm this is the intended reading.

    Args:
        similarity_matrix: 2-D array of pairwise similarities.
        thresholds: Iterable of threshold values to evaluate.

    Returns:
        Four parallel lists, one value per threshold:
        ``(num_edges, silhouette_scores, num_components, avg_clustering)``.
        The silhouette score is ``-1`` where it cannot be computed and the
        average clustering is ``0`` for an empty graph.
    """
    num_edges = []
    silhouette_scores = []
    num_components = []
    avg_clustering = []
    n_samples = len(similarity_matrix)

    for threshold in thresholds:
        G = nx.Graph()
        labels = np.zeros(n_samples)  # 0 means "no label assigned yet"
        label_idx = 1
        for i, row in enumerate(similarity_matrix):
            for j, similarity in enumerate(row):
                if similarity > threshold:
                    G.add_edge(i, j)
                    if labels[i] == 0:
                        labels[i] = label_idx
                        label_idx += 1
                    # Columns beyond the number of rows have no label slot.
                    if j < n_samples:
                        labels[j] = labels[i]

        num_edges.append(len(G.edges))
        num_components.append(nx.number_connected_components(G))
        avg_clustering.append(nx.average_clustering(G) if len(G) > 0 else 0)

        # silhouette_score only accepts 2 <= n_labels <= n_samples - 1; the
        # upper bound matters when every sample ends up with its own label,
        # which would otherwise raise ValueError.
        n_labels = len(np.unique(labels))
        if 1 < n_labels < n_samples:
            silhouette_scores.append(silhouette_score(similarity_matrix, labels))
        else:
            silhouette_scores.append(-1)

    return num_edges, silhouette_scores, num_components, avg_clustering


def plot_threshold_analysis(similarity_matrix):
    """Plot four diagnostics against 50 thresholds evenly spaced in [0, 1].

    The panels show the number of edges, the silhouette score, the number of
    connected components and the average clustering coefficient of the graph
    obtained at each threshold.

    Args:
        similarity_matrix: 2-D array of pairwise similarities.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    thresholds = np.linspace(0, 1, 50)
    num_edges, silhouette_scores, num_components, avg_clustering = (
        _compute_threshold_metrics(similarity_matrix, thresholds)
    )

    # (series, y-axis label, panel title) for each of the four subplots.
    panels = [
        (num_edges, 'Number of Edges (Find the Elbow Point)',
         'Elbow Method for Optimal Threshold'),
        (silhouette_scores, 'Silhouette Score (Higher is Better)',
         'Silhouette Analysis for Optimal Threshold'),
        (num_components, 'Number of Components (Stable Plateau)',
         'Number of Connected Components'),
        (avg_clustering, 'Average Clustering Coefficient (Higher is Better)',
         'Average Clustering Coefficient'),
    ]

    plt.figure(figsize=(20, 5))
    for position, (series, ylabel, title) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), position)
        plt.plot(thresholds, series, marker='o')
        plt.xlabel('Threshold')
        plt.ylabel(ylabel)
        plt.title(title)

    plt.tight_layout()
    plt.show()


def recommend_optimal_threshold(similarity_matrix):
    """Recommend a threshold by aggregating the four min-max scaled metrics.

    The aggregate is the mean of: scaled edge count, scaled silhouette score,
    one minus the scaled component count, and scaled average clustering. The
    threshold with the highest aggregate is plotted, printed and returned.

    Args:
        similarity_matrix: 2-D array of pairwise similarities.

    Returns:
        The recommended threshold, one of 50 values evenly spaced in [0, 1].
    """
    thresholds = np.linspace(0, 1, 50)
    num_edges, silhouette_scores, num_components, avg_clustering = (
        _compute_threshold_metrics(similarity_matrix, thresholds)
    )

    def scaled(values):
        # Min-max scale one metric to [0, 1] so the four are comparable.
        column = np.array(values).reshape(-1, 1)
        return MinMaxScaler().fit_transform(column).flatten()

    aggregate_score = (
        scaled(num_edges)
        + scaled(silhouette_scores)
        + (1 - scaled(num_components))  # fewer components scores higher
        + scaled(avg_clustering)
    ) / 4
    optimal_threshold = thresholds[np.argmax(aggregate_score)]

    plt.figure(figsize=(10, 5))
    plt.plot(thresholds, aggregate_score, marker='o')
    plt.xlabel('Threshold')
    plt.ylabel('Aggregated Score')
    plt.title('Recommended Threshold Analysis')
    plt.axvline(x=optimal_threshold, color='r', linestyle='--')
    plt.tight_layout()
    plt.show()

    print(f"Recommended threshold based on aggregated metrics: {optimal_threshold:.2f}")
    return optimal_threshold

def calculate_distances(json_objects1, json_objects2):
    """Print and return four dissimilarity measures between two JSON datasets.

    * Node distance  - Jaccard distance between the two sets of object ids
      (the mapping keys).
    * Key distance   - Jaccard distance between the sets of field names used
      by any object in each dataset.
    * Value distance - Jaccard distance between the sets of string field
      values in each dataset; non-string values are ignored.
    * Graph distance - ``nx.graph_edit_distance`` between the graphs that
      ``build_graph`` produces for each dataset.

    The Jaccard distance measures how dissimilar two sets are::

        distance = 1 - |A intersection B| / |A union B|

    Example: for string values ``{"a", "b", "c"}`` and ``{"b", "c", "d"}``
    the intersection is ``{"b", "c"}`` (2 elements) and the union is
    ``{"a", "b", "c", "d"}`` (4 elements), so the distance is
    ``1 - 2 / 4 = 0.5``, i.e. 50% dissimilarity. Two empty sets are given a
    distance of 1.

    Args:
        json_objects1: Mapping of object id -> object dict (first dataset).
        json_objects2: Mapping of object id -> object dict (second dataset).

    Returns:
        A dict with the keys ``'node_distance'``, ``'key_distance'``,
        ``'value_distance'`` and ``'graph_distance'``.
    """
    def jaccard_distance(set1, set2):
        union = len(set1 | set2)
        if union == 0:
            return 1
        return 1 - len(set1 & set2) / union

    # Node distance: which object ids the datasets share
    node_distance = jaccard_distance(set(json_objects1), set(json_objects2))

    # Key distance: which field names the datasets share
    keys1 = {k for obj in json_objects1.values() for k in obj}
    keys2 = {k for obj in json_objects2.values() for k in obj}
    key_distance = jaccard_distance(keys1, keys2)

    # Value distance: which string values the datasets share
    values1 = {v for obj in json_objects1.values() for v in obj.values() if isinstance(v, str)}
    values2 = {v for obj in json_objects2.values() for v in obj.values() if isinstance(v, str)}
    value_distance = jaccard_distance(values1, values2)

    # Graph distance (using node and edge comparison)
    # NOTE(review): exact graph edit distance can be slow on larger graphs -
    # check the networkx documentation if this call appears to hang.
    G1 = build_graph(json_objects1, '_blue', 'blue')
    G2 = build_graph(json_objects2, '_red', 'red')
    graph_distance = nx.graph_edit_distance(G1, G2)

    print(f"Node Distance: {node_distance:.2f}")
    print(f"Key Distance: {key_distance:.2f}")
    print(f"Value Distance: {value_distance:.2f}")
    print(f"Graph Distance: {graph_distance:.2f}")

    return {
        'node_distance': node_distance,
        'key_distance': key_distance,
        'value_distance': value_distance,
        'graph_distance': graph_distance,
    }




def main():
    """Interactive entry point: compare two JSON datasets.

    Prompts for two JSON file paths, builds a TF-IDF cosine-similarity matrix
    between their entries, shows the threshold analysis and recommendation
    plots, prompts for a threshold, visualizes one graph per dataset and
    prints the distance measures.

    Returns early (after printing an error) when a file cannot be loaded,
    when either dataset is empty, or when the threshold is not a number.
    """
    def prompt_path(message):
        # Drop surrounding whitespace and the quotes that copy-pasted paths
        # often carry; either would otherwise cause a spurious
        # "File not found".
        return input(message).strip().strip('"\'')

    path1 = prompt_path("Enter the file path for JSON 1: ")
    path2 = prompt_path("Enter the file path for JSON 2: ")

    data1 = load_json(path1)
    if data1 is None:
        return

    data2 = load_json(path2)
    if data2 is None:
        return

    texts1, ids1, json_objects1 = json_to_text(data1)
    texts2, ids2, json_objects2 = json_to_text(data2)

    # TF-IDF cannot be fitted on an empty corpus; stop with a clear message
    # instead of a vectorizer traceback.
    if not texts1 or not texts2:
        print("Error: Both JSON files must contain at least one entry")
        return

    json_objects1 = preprocess_json_objects(json_objects1)
    json_objects2 = preprocess_json_objects(json_objects2)

    # The vocabulary is fitted on dataset 1 only; dataset 2 is projected onto it.
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'))
    tfidf1 = vectorizer.fit_transform(texts1)
    tfidf2 = vectorizer.transform(texts2)

    similarity_matrix = cosine_similarity(tfidf1, tfidf2)

    # Plot threshold analysis to determine the optimal threshold
    plot_threshold_analysis(similarity_matrix)

    # Recommend the optimal threshold value based on aggregated metrics
    recommend_optimal_threshold(similarity_matrix)

    # Choose a threshold value based on the analysis plots
    raw_threshold = input("Enter the chosen threshold value: ")
    try:
        # NOTE(review): the parsed threshold is not consumed by any later
        # step in this function - confirm whether a thresholded similarity
        # graph was meant to be built from it.
        threshold = float(raw_threshold)
    except ValueError:
        print(f"Error: Invalid threshold value - {raw_threshold!r}")
        return

    # Build separate graphs for each JSON dataset
    G1 = build_graph(json_objects1, '_blue', 'blue')
    G2 = build_graph(json_objects2, '_red', 'red')

    visualize_graph(G1)
    visualize_graph(G2)

    # Calculate and print distances
    calculate_distances(json_objects1, json_objects2)

# Run the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omar2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\omar2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Error: File not found - 
