In [70]:
import os
import networkx as nx
import itertools

def load_hypergraph(path, model):
    """Read a hypergraph from a text file with one hyperedge per line.

    Args:
        path: Path of the text file to read.
        model: Name of the generator that produced the file. Files written by
            'HyperDK00', 'HyperDK11' and 'HyperPLR' are parsed as
            whitespace-separated integers; files of every other model are
            parsed as comma-separated integers.

    Returns:
        A list of hyperedges, each a list of integer node ids.

    Raises:
        ValueError: If a non-blank line contains a token that is not an integer.
    """
    # str.split(None) splits on runs of whitespace; every other model uses commas.
    separator = None if model in ('HyperDK00', 'HyperDK11', 'HyperPLR') else ','
    hyperedges = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline at the end of the file):
            # they would otherwise crash int('') in the comma format or produce
            # a spurious empty hyperedge in the whitespace format.
            if not line:
                continue
            hyperedges.append([int(token) for token in line.split(separator)])
    return hyperedges

def hg_projection(hg):
    """Build the clique projection of a hypergraph as an undirected networkx graph.

    Args:
        hg: Iterable of hyperedges, each an iterable of node ids.

    Returns:
        An nx.Graph containing every node that occurs in any hyperedge, with an
        edge between each pair of nodes taken from the same hyperedge.
    """
    projected = nx.Graph()
    # Register the nodes up front so that nodes appearing only in
    # single-node hyperedges are still present (as isolated nodes).
    projected.add_nodes_from({v for hyperedge in hg for v in hyperedge})
    # Each hyperedge contributes all of its node pairs, i.e. a clique.
    projected.add_edges_from(
        itertools.chain.from_iterable(
            itertools.combinations(hyperedge, 2) for hyperedge in hg
        )
    )
    return projected

def display_result(property, datasets, models, result):
    """Print the results as a Markdown table (one row per model, one column per dataset).

    Args:
        property: Label printed in parentheses in the top-left header cell.
        datasets: Dataset names, used as column headers.
        models: Model names, used as row labels.
        result: Mapping from (dataset, model) to a number; values are printed
            with three decimals.

    Raises:
        KeyError: If result lacks an entry for some (dataset, model) pair.
    """
    print(f'({property}) | ' + ' | '.join(datasets))
    # One separator per column: the label column plus one per dataset
    # (previously hard-coded to 6, which is only right for exactly 5 datasets).
    print(' | '.join(['---'] * (len(datasets) + 1)))
    for model in models:
        print(model + ' | ' + ' | '.join([f'{result[(dataset, model)]:.3f}' for dataset in datasets]))


# Root directory of the generated hypergraphs; the analysis cells read
# files from <graph_path>/<model>/<dataset>/.
graph_path = './generate_graphs'

# Datasets to evaluate (columns of the result tables).
datasets = [
    'contact-high-school',
    'contact-primary-school',
    'email-Enron',
    'email-Eu',
    'NDC-classes',
]

# Generators to compare (rows of the result tables).
models = [
    'HyperDK00',
    'HyperDK11',
    'Hyperlap',
    'Hyperlap+',
    'TheRA',
    'HyperPLR',
]


In [69]:
import networkx as nx
import numpy as np
from collections import Counter

# Label passed to display_result for the table header.
# NOTE(review): this name shadows the builtin `property` for the rest of the kernel session.
property = 'degree_distribution'
# Filled by the loop below: (dataset, model) -> average cross entropy.
result = {}

def get_degree_distribution_pdf(G):
    """Return the empirical degree distribution of G.

    Args:
        G: Graph exposing degree() (yielding (node, degree) pairs) and
            number_of_nodes().

    Returns:
        A dict mapping each observed degree to the fraction of nodes that
        have that degree.
    """
    node_count = G.number_of_nodes()
    histogram = Counter(degree for _, degree in G.degree())
    # Dividing each count by the number of nodes turns the histogram into probabilities.
    return {degree: freq / node_count for degree, freq in histogram.items()}

def _degree_cross_entropy(P_pdf, Q_pdf, epsilon=1e-10):
    """Cross entropy H(P, Q) between two degree distributions.

    Args:
        P_pdf: Reference distribution as a {degree: probability} dict.
        Q_pdf: Compared distribution as a {degree: probability} dict.
        epsilon: Value substituted for zero probabilities in Q to avoid log(0).

    Returns:
        -sum_k P(k) * log(Q(k)) over the union of degrees seen in P or Q.
    """
    # Align both distributions on the union of observed degrees.
    all_degrees = sorted(set(P_pdf) | set(Q_pdf))
    P_probs = np.array([P_pdf.get(k, 0) for k in all_degrees], dtype=float)
    Q_probs = np.array([Q_pdf.get(k, 0) for k in all_degrees], dtype=float)
    # Degrees missing from Q get epsilon instead of 0 so the log stays finite.
    Q_probs = np.where(Q_probs > 0, Q_probs, epsilon)
    # Ensure that P sums to 1.
    P_probs = P_probs / P_probs.sum()
    return -np.sum(P_probs * np.log(Q_probs))


for dataset in datasets:
    # The ground truth depends only on the dataset, so load, project and
    # summarise it once per dataset instead of once per model and per graph.
    hg_gt = load_hypergraph(f'./data/{dataset}/unique.txt', model='HyperPLR')
    projection_gt = hg_projection(hg_gt)
    P_pdf = get_degree_distribution_pdf(projection_gt)

    for model in models:
        model_dir = f'{graph_path}/{model}/{dataset}'
        hgs_names = os.listdir(model_dir)

        total_diff = 0
        for name in hgs_names:
            G2 = hg_projection(load_hypergraph(f'{model_dir}/{name}', model))
            total_diff += _degree_cross_entropy(P_pdf, get_degree_distribution_pdf(G2))

        # An empty directory has no average; report NaN instead of raising ZeroDivisionError.
        avg_diff = total_diff/len(hgs_names) if hgs_names else float('nan')
        print(f"{dataset}, {model}, Cross Entropy: {avg_diff}")
        result[(dataset, model)] = avg_diff

display_result(property, datasets, models, result)

contact-high-school, HyperDK00, Cross Entropy: 23.025850929940457
contact-high-school, HyperDK11, Cross Entropy: 21.001819721539057
contact-high-school, Hyperlap, Cross Entropy: 5.958404326476858
contact-high-school, Hyperlap+, Cross Entropy: 5.335822199029654
contact-high-school, TheRA, Cross Entropy: 6.633860483399639
contact-high-school, HyperPLR, Cross Entropy: 5.065288272409231
contact-primary-school, HyperDK00, Cross Entropy: 23.02585092994046
contact-primary-school, HyperDK11, Cross Entropy: 22.634531379112225
contact-primary-school, Hyperlap, Cross Entropy: 8.07223821458973
contact-primary-school, Hyperlap+, Cross Entropy: 8.811819888628994
contact-primary-school, TheRA, Cross Entropy: 10.530116711406675
contact-primary-school, HyperPLR, Cross Entropy: 7.656663380108346
email-Enron, HyperDK00, Cross Entropy: 23.025850929940457
email-Enron, HyperDK11, Cross Entropy: 12.405930719752595
email-Enron, Hyperlap, Cross Entropy: 9.29385832998112
email-Enron, Hyperlap+, Cross Entropy: 9

In [72]:
# singular value distribution analysis
import networkx as nx
import numpy as np

# Label passed to display_result for the table header.
# NOTE(review): this name shadows the builtin `property` for the rest of the kernel session.
property = 'singular_value_distribution'
# Reset for this cell; filled by the loop below: (dataset, model) -> average cross entropy.
result = {}

def get_singular_value_distribution_pdf(G, num_singular_values=None):
    """Return the singular values of G's adjacency matrix, scaled to sum to 1.

    Args:
        G: A networkx graph.
        num_singular_values: If truthy, keep only the first this-many singular
            values (in the order returned by np.linalg.svd) before scaling.
            None or 0 keeps all of them.

    Returns:
        A 1-D numpy array: the kept singular values divided by their sum.
    """
    adjacency = nx.to_numpy_array(G)
    # compute_uv=False: only the singular values are needed, not the factor matrices.
    spectrum = np.linalg.svd(adjacency, compute_uv=False)
    if num_singular_values:
        spectrum = spectrum[:num_singular_values]
    return spectrum / np.sum(spectrum)


def _singular_value_cross_entropy(P_pdf, Q_pdf, epsilon=1e-10):
    """Cross entropy H(P, Q) between two normalised singular-value arrays.

    Args:
        P_pdf: Reference array of normalised singular values.
        Q_pdf: Compared array of normalised singular values.
        epsilon: Value substituted for non-positive entries of Q to avoid log(0).

    Returns:
        -sum_i P[i] * log(Q[i]) over the first min(len(P), len(Q)) entries,
        with P renormalised over that prefix.
    """
    # Align the singular values by index (use the minimum length).
    min_length = min(len(P_pdf), len(Q_pdf))
    P_probs = P_pdf[:min_length]
    Q_probs = Q_pdf[:min_length]
    Q_probs = np.where(Q_probs > 0, Q_probs, epsilon)
    # Renormalise the truncated P. Deliberately NOT in place: P_probs is a
    # view of P_pdf, and `/=` would corrupt the caller's cached ground truth.
    P_probs = P_probs / P_probs.sum()
    return -np.sum(P_probs * np.log(Q_probs))


for dataset in datasets:
    # The ground truth depends only on the dataset; its SVD is the most
    # expensive step, so compute it once per dataset rather than once per
    # model and per generated graph.
    hg_gt = load_hypergraph(f'./data/{dataset}/unique.txt', model='HyperPLR')
    projection_gt = hg_projection(hg_gt)
    P_pdf = get_singular_value_distribution_pdf(projection_gt)

    for model in models:
        model_dir = f'{graph_path}/{model}/{dataset}'
        hgs_names = os.listdir(model_dir)

        total_diff = 0
        for name in hgs_names:
            G2 = hg_projection(load_hypergraph(f'{model_dir}/{name}', model))
            Q_pdf = get_singular_value_distribution_pdf(G2)
            total_diff += _singular_value_cross_entropy(P_pdf, Q_pdf)

        # An empty directory has no average; report NaN instead of raising ZeroDivisionError.
        avg_diff = total_diff/len(hgs_names) if hgs_names else float('nan')
        print(f"{dataset}, {model}, Cross Entropy: {avg_diff}")
        result[(dataset, model)] = avg_diff


display_result(property, datasets, models, result)

contact-high-school, HyperDK00, Cross Entropy: 6.212359937364999
contact-high-school, HyperDK11, Cross Entropy: 5.46224002332582
contact-high-school, Hyperlap, Cross Entropy: 5.369444566812628
contact-high-school, Hyperlap+, Cross Entropy: 5.371690772501404
contact-high-school, TheRA, Cross Entropy: 5.376621650374456
contact-high-school, HyperPLR, Cross Entropy: 5.346510020945931
contact-primary-school, HyperDK00, Cross Entropy: 5.73030801342421
contact-primary-school, HyperDK11, Cross Entropy: 5.1469741701475815
contact-primary-school, Hyperlap, Cross Entropy: 5.042727748436525
contact-primary-school, Hyperlap+, Cross Entropy: 5.056472053334813
contact-primary-school, TheRA, Cross Entropy: 5.053760216450511
contact-primary-school, HyperPLR, Cross Entropy: 5.031801361220007
email-Enron, HyperDK00, Cross Entropy: 4.652814173793677
email-Enron, HyperDK11, Cross Entropy: 4.51341482728036
email-Enron, Hyperlap, Cross Entropy: 4.469817096210736
email-Enron, Hyperlap+, Cross Entropy: 4.52013

In [73]:
# Label passed to display_result for the table header.
# NOTE(review): this name shadows the builtin `property` for the rest of the kernel session.
property = 'triangles_num'
# Reset for this cell; filled by the loop below: (dataset, model) -> average relative triangle-count difference.
result = {}


def get_triangles_num(G):
    """Return the number of triangles in G (as a float, since it is computed by division).

    Args:
        G: A networkx graph.
    """
    per_node_counts = nx.triangles(G)
    # Summing the per-node counts and dividing by 3 gives the graph-wide total.
    return sum(per_node_counts.values()) / 3


for dataset in datasets:
    # The ground truth depends only on the dataset, so count its triangles
    # once per dataset instead of once per model.
    hg_gt = load_hypergraph(f'./data/{dataset}/unique.txt', model='HyperPLR')
    projection_gt = hg_projection(hg_gt)
    num_triangles_g1 = get_triangles_num(projection_gt)

    for model in models:
        model_dir = f'{graph_path}/{model}/{dataset}'
        hgs_names = os.listdir(model_dir)

        total_diff = 0
        for name in hgs_names:
            G2 = hg_projection(load_hypergraph(f'{model_dir}/{name}', model))
            total_diff += abs(num_triangles_g1 - get_triangles_num(G2))

        # Average absolute difference relative to the ground-truth count.
        # It is undefined when the ground truth has no triangles or the
        # directory is empty; report NaN instead of raising ZeroDivisionError.
        if hgs_names and num_triangles_g1:
            avg_diff = total_diff/num_triangles_g1/len(hgs_names)
        else:
            avg_diff = float('nan')
        print(f"{dataset}, {model}, triangle different: {avg_diff}")
        result[(dataset, model)] = avg_diff

display_result(property, datasets, models, result)

contact-high-school, HyperDK00, triangle different: 167.1174926943308
contact-high-school, HyperDK11, triangle different: 84.44020455873758
contact-high-school, Hyperlap, triangle different: 0.5990531852717709
contact-high-school, Hyperlap+, triangle different: 0.9419345412039742
contact-high-school, TheRA, triangle different: 0.4681180596142607
contact-high-school, HyperPLR, triangle different: 0.31552308591466977
contact-primary-school, HyperDK00, triangle different: 21.456141094834233
contact-primary-school, HyperDK11, triangle different: 15.98765034695451
contact-primary-school, Hyperlap, triangle different: 0.40897648419429455
contact-primary-school, Hyperlap+, triangle different: 0.28384155744024675
contact-primary-school, TheRA, triangle different: 0.1984194294525829
contact-primary-school, HyperPLR, triangle different: 0.3538454124903624
email-Enron, HyperDK00, triangle different: 41.426700353714
email-Enron, HyperDK11, triangle different: 10.867367357251137
email-Enron, Hyperl

In [74]:
# Label passed to display_result for the table header.
# NOTE(review): this name shadows the builtin `property` for the rest of the kernel session.
property = 'diameter'
# Reset for this cell; filled by the loop below: (dataset, model) -> average relative diameter difference.
result = {}

def get_diameter_value(G):
    """Return the diameter of G, or the largest component diameter if G is disconnected.

    Args:
        G: A networkx graph.

    Returns:
        nx.diameter(G) when G is connected; otherwise the maximum of
        nx.diameter over the subgraphs induced by G's connected components.
    """
    if nx.is_connected(G):
        return nx.diameter(G)
    # Disconnected graph: measure each component separately and keep the largest value.
    return max(
        (nx.diameter(G.subgraph(nodes)) for nodes in nx.connected_components(G)),
        default=0,
    )


for dataset in datasets:
    # The ground truth depends only on the dataset, so compute its diameter
    # once per dataset instead of once per model.
    hg_gt = load_hypergraph(f'./data/{dataset}/unique.txt', model='HyperPLR')
    projection_gt = hg_projection(hg_gt)
    diameter_value_g1 = get_diameter_value(projection_gt)

    for model in models:
        model_dir = f'{graph_path}/{model}/{dataset}'
        hgs_names = os.listdir(model_dir)

        total_diff = 0
        for name in hgs_names:
            G2 = hg_projection(load_hypergraph(f'{model_dir}/{name}', model))
            total_diff += abs(diameter_value_g1 - get_diameter_value(G2))

        # Average absolute difference relative to the ground-truth diameter.
        # It is undefined when that diameter is 0 or the directory is empty;
        # report NaN instead of raising ZeroDivisionError.
        if hgs_names and diameter_value_g1:
            avg_diff = total_diff/diameter_value_g1/len(hgs_names)
        else:
            avg_diff = float('nan')
        print(f"{dataset}, {model}, diameter diff: {avg_diff}")
        result[(dataset, model)] = avg_diff

display_result(property, datasets, models, result)

contact-high-school, HyperDK00, diameter diff: 0.5
contact-high-school, HyperDK11, diameter diff: 0.25
contact-high-school, Hyperlap, diameter diff: 0.2
contact-high-school, Hyperlap+, diameter diff: 0.25
contact-high-school, TheRA, diameter diff: 0.25
contact-high-school, HyperPLR, diameter diff: 0.25
contact-primary-school, HyperDK00, diameter diff: 0.33333333333333337
contact-primary-school, HyperDK11, diameter diff: 0.33333333333333337
contact-primary-school, Hyperlap, diameter diff: 0.06666666666666667
contact-primary-school, Hyperlap+, diameter diff: 0.33333333333333337
contact-primary-school, TheRA, diameter diff: 0.0
contact-primary-school, HyperPLR, diameter diff: 0.0
email-Enron, HyperDK00, diameter diff: 0.5
email-Enron, HyperDK11, diameter diff: 0.25
email-Enron, Hyperlap, diameter diff: 0.25
email-Enron, Hyperlap+, diameter diff: 0.05
email-Enron, TheRA, diameter diff: 0.25
email-Enron, HyperPLR, diameter diff: 0.0
email-Eu, HyperDK00, diameter diff: 0.6666666666666667
ema