In [1]:
from asymptotic_significative_interactions import *

In [2]:
##############################
# This is a simple script that explains how to infer a simplicial complex or a hypergraph
# from a presence/absence data table, i.e., a binary matrix of dimension (p, n), where p
# is the population size (number of random variables) and n is the sample size (number of
# data points).
#
# NOTE(review): `os` and `np` are not imported explicitly in the visible cells; presumably
# they are provided by the wildcard import of asymptotic_significative_interactions.
# Consider importing them explicitly.

##############################
# Step 0: Data preparation
#
# Option to decide if we use the step method (recommended) or the systematic method, which is
# longer and does not return a simplicial complex. Use step_method = False for the systematic method.
# NOTE(review): step_method is not read by any of the cells shown below -- confirm where it is used.

step_method = True

# Choose the name of the directory (dir_name) where to save the files and the 'prefix' name of each
# created file (data_name)

dir_name = 'Directory'
data_name = 'Data'

# Create the target directory if it doesn't exist.
# os.makedirs also creates missing parent directories, so a nested dir_name (e.g. 'results/run_1')
# works. Testing with isdir (rather than exists) means that a regular file sitting at dir_name is
# reported immediately by makedirs (FileExistsError) instead of being announced as an existing
# directory and failing later when the first result file is written.

if not os.path.isdir(dir_name):
    os.makedirs(dir_name, exist_ok=True)
    print("Directory ", dir_name, " Created ")
else:
    print("Directory ", dir_name, " already exists")

# From here on, data_name is the full path prefix (directory + file prefix) used to build
# the name of every file written by the following steps.
data_name = os.path.join(dir_name, data_name)

# Choose the significance level alpha to use throughout the analysis.
alpha = 0.01

# Load data
data_matrix = np.load('sample_data.npy')

# Validate before casting: astype(np.int64) would silently truncate non-binary values
# (e.g. 0.9 -> 0), which would corrupt the presence/absence table without any warning.
if data_matrix.ndim != 2:
    raise ValueError("Expected a 2-dimensional (p, n) data matrix, got an array with "
                     + str(data_matrix.ndim) + " dimension(s)")
if not ((data_matrix == 0) | (data_matrix == 1)).all():
    raise ValueError("Expected a binary presence/absence matrix containing only 0 and 1")

data_matrix = data_matrix.astype(np.int64)
p, n = data_matrix.shape
print("Preparing to analyze a data set with " + str(p) + " variables and " + str(n) + " samples")

Directory  Directory  already exists
Preparing to analyze a data set with 70 variables and 185 samples


In [3]:
# ---------------------------------------------------------------
# Step 1 -- extract all the unique contingency tables
# ---------------------------------------------------------------
print('Step 1: Extract all the unique tables')

# Delegates to the library helper, called with the data matrix and the output path prefix.
# Presumably it writes <data_name>_table_list.json, as the message printed below states.
find_unique_tables(data_matrix, data_name)

table_list_path = data_name + "_table_list.json"
print("Unique contingency tables saved in file " + table_list_path)

2415it [00:00, 12701.63it/s]

Step 1: Extract all the unique tables
How many different tables :  1090
Unique contingency tables saved in file Directory/Data_table_list.json





In [4]:
# ---------------------------------------------------------------
# Step 2 -- p-values of all unique tables (asymptotic distribution)
# ---------------------------------------------------------------
print('Step 2: Extract pvalues for all tables with an asymptotic distribution')

# Only the path prefix is passed; presumably the helper reads the table list written in
# step 1 and stores its results in the file named in the message below.
pvalues_for_tables(data_name)

table_pvalues_path = data_name + "_asymptotic_pval_dictio.json"
print("Resulting p-values saved in file " + table_pvalues_path)

  0%|          | 0/1090 [00:00<?, ?it/s]

Step 2: Extract pvalues for all tables with an asymptotic distribution


100%|██████████| 1090/1090 [00:01<00:00, 715.50it/s]

Resulting p-values saved in file Directory/Data_asymptotic_pval_dictio.json





In [5]:
# ---------------------------------------------------------------
# Step 3 -- find the table of every link and its associated p-value
# ---------------------------------------------------------------
print('Step 3 : Find table for all links and their associated pvalue')

# Load the table -> p-value dictionary produced in step 2.
table_pvalues_path = data_name + '_asymptotic_pval_dictio.json'
with open(table_pvalues_path) as jsonfile:
    dictio = json.load(jsonfile)

# The dictionary is fully in memory at this point, so the JSON file no longer needs to be open.
pairwise_prefix = data_name + '_asymptotic_pvalues'
save_pairwise_p_values_phi_dictionary(data_matrix, dictio, pairwise_prefix)

print("Results saved in file " + pairwise_prefix + ".csv")

2415it [00:00, 30907.66it/s]

Step 3 : Find table for all links and their associated pvalue
Results saved in file Directory/Data_asymptotic_pvalues.csv





In [6]:
# ---------------------------------------------------------------
# Step 4 -- build the network for the chosen alpha and export its edge list
# ---------------------------------------------------------------
print('Step 4 : Generate network and extract edge_list for a given alpha')

pairwise_csv = data_name + '_asymptotic_pvalues.csv'
g = read_pairwise_p_values(pairwise_csv, alpha)

# str(alpha)[2:] drops the leading "0." so that alpha = 0.01 yields the file suffix "01".
alpha_tag = str(alpha)[2:]
edge_list_path = data_name + '_asymptotic_edge_list_' + alpha_tag + '.txt'
nx.write_edgelist(g, edge_list_path, data=True)

print('Number of nodes : ', g.number_of_nodes())
print('Number of links : ', g.number_of_edges())
print("Edge list saved in file " + edge_list_path)

2415it [00:00, 517484.63it/s]

Step 4 : Generate network and extract edge_list for a given alpha
Number of nodes :  66
Number of links :  149
Edge list saved in file Directory/Data_asymptotic_edge_list_01.txt





In [7]:
######### Fifth step : Extract all the unique cubes
print('Step 5 : Extract all the unique valid cubes')

# Delegates to the library helper, called with the data matrix and the output path prefix.
# Presumably it writes <data_name>_cube_list.json, as the message printed below states.
find_unique_cubes(data_matrix, data_name)

# The message previously lacked a separator before the path ("saved inDirectory/...");
# it now uses the same "saved in file " wording as the other steps.
print("Unique contingency cubes saved in file " + data_name + "_cube_list.json")


0it [00:00, ?it/s]

Step 5 : Extract all the unique valid cubes


54740it [00:00, 55694.72it/s]

How many different valid cubes :  7372
Unique contingency cubes saved inDirectory/Data_cube_list.json





In [8]:
######### Sixth step: Extract pvalues for all cubes with an asymptotic distribution

print('Step 6: Extract pvalues for all cubes with an asymptotic distribution')

# Only the path prefix is passed; presumably the helper reads the cube list written in
# step 5 -- confirm against the library.
pvalues_for_cubes(data_name)

# Report the output location, as step 2 does for the tables. The file name is the one
# opened by step 8 when it loads the cube p-value dictionary.
print("Resulting p-values saved in file " + data_name + "_asymptotic_cube_pval_dictio.json")

  0%|          | 1/7372 [00:00<19:26,  6.32it/s]

Step 6: Extract pvalues for all cubes with an asymptotic distribution


100%|██████████| 7372/7372 [00:03<00:00, 2124.97it/s]


In [9]:
# ---------------------------------------------------------------
# Step 7 -- find all triangles in the network obtained in step 4
# ---------------------------------------------------------------
print('Step 7: Find all empty triangles in the network')

# Rebuild the network from the pairwise p-values so this cell does not rely on `g`
# still holding the graph from an earlier cell.
pairwise_csv = data_name + '_asymptotic_pvalues.csv'
g = read_pairwise_p_values(pairwise_csv, alpha)

# Common prefix of the triangle files; str(alpha)[2:] drops the leading "0." of alpha.
triangles_prefix = data_name + '_asymptotic_triangles_' + str(alpha)[2:]
save_all_triangles(g, triangles_prefix)

triangle_count = count_triangles_csv(triangles_prefix + '.csv')
print('Number of triangles : ', triangle_count)

2415it [00:00, 535570.46it/s]
97it [00:00, 320100.31it/s]

Step 7: Find all empty triangles in the network
Number of triangles :  96





In [10]:
# ---------------------------------------------------------------
# Step 8 -- p-values of the triangles under the hypothesis of homogeneity
# ---------------------------------------------------------------
print('Step 8: Find all the p-values for the triangles under the hypothesis of homogeneity')

# Common prefix of the triangle files; str(alpha)[2:] drops the leading "0." of alpha.
triangles_prefix = data_name + '_asymptotic_triangles_' + str(alpha)[2:]
triangles_csv = triangles_prefix + '.csv'
triangles_pvalues_csv = triangles_prefix + '_pvalues.csv'

# Load the cube -> p-value dictionary (the file opened here is the one step 6 is expected to produce).
with open(data_name + "_asymptotic_cube_pval_dictio.json") as jsonfile:
    dictio = json.load(jsonfile)

# The dictionary is fully in memory, so the call does not need the JSON file to stay open.
triangles_p_values_tuple_dictionary(triangles_csv, triangles_pvalues_csv, dictio, data_matrix)

96it [00:00, 65632.14it/s]

Step 8: Find all the p-values for the triangles under the hypothesis of homogeneity





In [11]:
# ---------------------------------------------------------------
# Last step -- extract all 2-simplices
# ---------------------------------------------------------------
print('Extract 2-simplices')

# str(alpha)[2:] drops the leading "0." of alpha and is used as the file-name suffix.
alpha_tag = str(alpha)[2:]
triangles_pvalues_csv = data_name + '_asymptotic_triangles_' + alpha_tag + '_pvalues.csv'
two_simplices_prefix = data_name + '_asymptotic_2-simplices_' + alpha_tag

# Arguments: the triangle p-values file from step 8, the significance level, and the output prefix.
significant_triplet_from_csv(triangles_pvalues_csv, alpha, two_simplices_prefix)


80it [00:00, 175861.80it/s]

Extract 2-simplices



