In [None]:
from pymongo import MongoClient
import networkx as nx
from collections import Counter
import numpy as np

In [None]:
def get_mongo_client(mongo_uri):
    """Return a new ``MongoClient`` for the given connection URI."""
    return MongoClient(mongo_uri)

In [None]:
# Show the databases whose name contains 'triplets' but not 'hotpot'.
mongo_client = get_mongo_client("mongodb://localhost:27018/?directConnection=true")
[
    name
    for name in mongo_client.list_database_names()
    if 'triplets' in name and 'hotpot' not in name
]

In [None]:
# Select the "musique_gpt4_1_mini_onto_triplets" database and read each of its
# three collections fully into memory.
mongo_client = MongoClient("mongodb://localhost:27018/?directConnection=true")
db = mongo_client["musique_gpt4_1_mini_onto_triplets"]

triplets = list(db["triplets"].find({}))
filtered_triplets = list(db["ontology_filtered_triplets"].find({}))
entity_aliases = list(db["entity_aliases"].find({}))

In [None]:
# Per-sample graph statistics for the database `db` currently points to.
# One directed graph is built per `sample_id` from the sample's `triplets` plus
# its `ontology_filtered_triplets`; the per-sample values are averaged at the end.
mean_degree = []
mean_clustering = []
largest_component_size = []

for sample_id in db.triplets.distinct("sample_id"):
    # Loop-local names: do not overwrite the notebook-level `triplets` /
    # `filtered_triplets` lists with the rows of the last sample.
    sample_triplets = list(db.triplets.find({"sample_id": sample_id}))
    sample_triplets += list(db.ontology_filtered_triplets.find({"sample_id": sample_id}))

    G = nx.DiGraph()
    for t in sample_triplets:
        G.add_edge(t["subject"], t["object"], relation=t["relation"])

    # Build the undirected copy once; clustering and components both need it.
    undirected = G.to_undirected()

    mean_degree.append(sum(dict(G.degree()).values()) / G.number_of_nodes())
    mean_clustering.append(nx.average_clustering(undirected))
    # Only the size of the biggest component is needed, so take the max
    # directly instead of sorting every component.
    largest_component_size.append(max(len(c) for c in nx.connected_components(undirected)))

# Each printed value is the mean of the per-sample statistic.
print("Mean degree:", np.mean(mean_degree))
print("Mean clustering:", np.mean(mean_clustering))
print("Largest component size:", np.mean(largest_component_size))


In [None]:
# Peek at one document to inspect the schema of `ontology_filtered_triplets`.
db["ontology_filtered_triplets"].find_one({})

In [None]:
# NOTE(review): this cell repeats the per-sample statistics computed two cells
# above (same collections, same metrics); consider keeping only one of them.
#
# Per-sample graph statistics for the database `db` currently points to.
# One directed graph is built per `sample_id` from the sample's `triplets` plus
# its `ontology_filtered_triplets`; the per-sample values are averaged at the end.
mean_degree = []
mean_clustering = []
largest_component_size = []

for sample_id in db.triplets.distinct("sample_id"):
    # Loop-local names: do not overwrite the notebook-level `triplets` /
    # `filtered_triplets` lists with the rows of the last sample.
    sample_triplets = list(db.triplets.find({"sample_id": sample_id}))
    sample_triplets += list(db.ontology_filtered_triplets.find({"sample_id": sample_id}))

    G = nx.DiGraph()
    for t in sample_triplets:
        G.add_edge(t["subject"], t["object"], relation=t["relation"])

    # Build the undirected copy once; clustering and components both need it.
    undirected = G.to_undirected()

    mean_degree.append(sum(dict(G.degree()).values()) / G.number_of_nodes())
    mean_clustering.append(nx.average_clustering(undirected))
    # Only the size of the biggest component is needed, so take the max
    # directly instead of sorting every component.
    largest_component_size.append(max(len(c) for c in nx.connected_components(undirected)))

# Each printed value is the mean of the per-sample statistic.
print("Mean degree:", np.mean(mean_degree))
print("Mean clustering:", np.mean(mean_clustering))
print("Largest component size:", np.mean(largest_component_size))


In [None]:
# Switch `db` to the "musique_gpt4_1_mini_non_onto_triplets" database and read
# its triplets and entity aliases fully into memory.
mongo_client = MongoClient("mongodb://localhost:27018/?directConnection=true")
db = mongo_client["musique_gpt4_1_mini_non_onto_triplets"]

triplets = list(db["triplets"].find({}))
entity_aliases = list(db["entity_aliases"].find({}))

In [None]:
# Per-sample graph statistics for the database `db` currently points to.
# One directed graph is built per `sample_id` from the sample's `triplets`
# only; the per-sample values are averaged at the end.
mean_degree = []
mean_clustering = []
largest_component_size = []

for sample_id in db.triplets.distinct("sample_id"):
    # Loop-local name: do not overwrite the notebook-level `triplets` list
    # with the rows of the last sample.
    sample_triplets = list(db.triplets.find({"sample_id": sample_id}))

    G = nx.DiGraph()
    for t in sample_triplets:
        G.add_edge(t["subject"], t["object"], relation=t["relation"])

    # Build the undirected copy once; clustering and components both need it.
    undirected = G.to_undirected()

    mean_degree.append(sum(dict(G.degree()).values()) / G.number_of_nodes())
    mean_clustering.append(nx.average_clustering(undirected))
    # Only the size of the biggest component is needed, so take the max
    # directly instead of sorting every component.
    largest_component_size.append(max(len(c) for c in nx.connected_components(undirected)))

# Each printed value is the mean of the per-sample statistic.
print("Mean degree:", np.mean(mean_degree))
print("Mean clustering:", np.mean(mean_clustering))
print("Largest component size:", np.mean(largest_component_size))


In [None]:
def basic_stats(db):
    """Compute whole-database graph and alias statistics.

    Builds a single directed graph from every document in ``db.triplets``
    (one ``subject -> object`` edge per document, carrying the ``relation``
    as an edge attribute) and summarizes it, then summarizes
    ``db.entity_aliases`` grouped by ``label``.

    Args:
        db: Database handle exposing ``triplets`` and ``entity_aliases``
            collections that support ``find({})``. Triplet documents are
            read through their ``subject``, ``object`` and ``relation``
            keys; alias documents through their ``label`` key.

    Returns:
        dict with the keys ``node_count``, ``edge_count``, ``mean_degree``,
        ``mean_clustering``, ``largest_comp`` and
        ``avg_aliases_per_canonical``. Graph metrics are 0 when there are
        no triplets; the alias average is 0 when there are no aliases.
    """
    G = nx.DiGraph()
    for t in db.triplets.find({}):
        G.add_edge(t['subject'], t['object'], relation=t['relation'])

    node_count = G.number_of_nodes()
    # A DiGraph keeps at most one edge per (subject, object) pair, so this can
    # be smaller than the number of triplet documents.
    edge_count = G.number_of_edges()

    if node_count:
        # Build the undirected copy once; clustering and components both need it.
        undirected = G.to_undirected()
        mean_degree = sum(degree for _, degree in G.degree()) / node_count
        mean_clust = nx.average_clustering(undirected)
        largest_comp = max(len(c) for c in nx.connected_components(undirected))
    else:
        # Empty graph: the averages would divide by zero, so report zeros,
        # matching the existing empty-case handling of the other metrics.
        mean_degree = 0
        mean_clust = 0
        largest_comp = 0

    # Number of alias documents per distinct `label`.
    # NOTE(review): `label` is presumably the canonical entity name — confirm
    # against the `entity_aliases` schema.
    canonical_counts = Counter(a['label'] for a in db.entity_aliases.find({}))
    avg_aliases_per_canonical = sum(canonical_counts.values())/len(canonical_counts) if canonical_counts else 0

    return {
        'node_count': node_count,
        'edge_count': edge_count,
        'mean_degree': mean_degree,
        'mean_clustering': mean_clust,
        'largest_comp': largest_comp,
        'avg_aliases_per_canonical': avg_aliases_per_canonical,
    }

# Report whole-database statistics for both extraction runs.
for label, db_name in (
    ("Ontology:", "musique_gpt4_1_mini_onto_triplets"),
    ("No-ontology:", "musique_gpt4_1_mini_non_onto_triplets"),
):
    print(label, basic_stats(mongo_client.get_database(db_name)))

In [None]:
# count the number of leaves in the graph
# compute the vertex degree for the non-leaf vertices