In [1]:
import json

import numpy as np
import pandas as pd
import xgi
from scipy.stats import pearsonr, spearmanr

from sod import *

# Global and local correlation measures

### Global correlation measures

First, we quantify the correlation between different measures of simpliciality on the corpus of higher-order datasets that we consider. 

In [None]:
def load_data(filename="Data/empirical_simpliciality.json"):
    """Read a JSON file and return its parsed contents.

    Parameters
    ----------
    filename : str, optional
        Path of the JSON file to read. Defaults to
        ``"Data/empirical_simpliciality.json"``.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file contents
        (a ``dict`` when the file holds a JSON object).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding argument, open() falls
    # back to the platform locale, so non-ASCII keys (e.g. dataset names)
    # could be decoded differently from one machine to another.
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)


# Read the simpliciality values from the default JSON file
data = load_data()

# Build a DataFrame with one row per top-level key of the nested dictionary
df = pd.DataFrame.from_dict(data, orient="index")

# Measures that are compared pairwise
columns = ["es", "fes", "sf"]

# Maps (first column, second column, method) -> (coefficient, p-value).
# Each unordered pair of columns is visited once; for every pair the
# Spearman entry is stored before the Pearson entry.
results = {}
for idx, first in enumerate(columns):
    for second in columns[idx + 1 :]:
        x, y = df[first], df[second]
        for method, statistic in (("spearman", spearmanr), ("pearson", pearsonr)):
            coefficient, p_value = statistic(x, y)
            results[(first, second, method)] = (coefficient, p_value)

# Report every coefficient together with its p-value
for (col1, col2, method), (corr, p_val) in results.items():
    print(
        f"Correlation ({method}) between {col1} and {col2}: {corr:.4f}, p-value: {p_val:.4f}"
    )

### Local correlations

Next, we examine a single higher-order dataset, looking at the simplicial assortativity and the correlations between the local (node-level) simpliciality measures.

In [None]:
# Name of the dataset handed to xgi.load_xgi_data below
dataset = "email-enron"
# Forwarded as the max_order keyword of the loader.
# NOTE(review): presumably restricts the loaded hyperedges to order <= 2 — confirm against the xgi docs
max_order = 2

# Load the dataset through xgi and bind the result to H, which later cells read
H = xgi.load_xgi_data(dataset, max_order=max_order)
# The return value is discarded, so cleanup() is relied on to act on H itself.
# NOTE(review): confirm which elements cleanup() removes or relabels by default
H.cleanup()

In [4]:
# Three node-level statistics of H, each converted with .asnumpy().
# NOTE(review): these statistics are presumably registered on xgi node views by the `sod` import — confirm
# NOTE(review): the next cell assumes all three arrays follow the same node ordering — confirm
sf = H.nodes.local_simplicial_fraction.asnumpy()
es = H.nodes.local_edit_simpliciality.asnumpy()
fes = H.nodes.local_face_edit_simpliciality.asnumpy()

In [None]:
# Local measures to compare, keyed by the label used in the printed output
ls = {"sf": sf, "es": es, "fes": fes}
# Maps (first measure, second measure, method) -> (coefficient, p-value)
results = {}

for i, m1 in enumerate(ls):
    for j, m2 in enumerate(ls):
        # Visit each unordered pair of measures exactly once
        if i < j:
            # Keep only the positions where BOTH measures are defined.
            # Filtering each array with its own NaN mask is only correct if
            # the NaNs coincide exactly; otherwise the two samples end up
            # with different lengths or with misaligned entries. A joint mask
            # gives the same result when the NaNs do coincide and stays
            # correct when they do not.
            valid = ~(np.isnan(ls[m1]) | np.isnan(ls[m2]))
            s1 = ls[m1][valid]
            s2 = ls[m2][valid]

            # Spearman's correlation
            spearman_corr, spearman_p = spearmanr(s1, s2)
            results[(m1, m2, "spearman")] = (spearman_corr, spearman_p)

            # Pearson's correlation
            pearson_corr, pearson_p = pearsonr(s1, s2)
            results[(m1, m2, "pearson")] = (pearson_corr, pearson_p)

# Print results
for key, (corr, p_val) in results.items():
    col1, col2, method = key
    print(
        f"Correlation ({method}) between {col1} and {col2}: {corr:.4f}, p-value: {p_val:.4f}"
    )