### Test distribution similarities between real and simulated data

In [1]:

import pickle
import torch

from create_synthetic_with_confounders import create_dag_from_ppi, topological_sort, reindex_dag_nodes, logistic_function, generate_gene_expression, get_parents_for_each_node, simulate_observational_data, intervene_on_gene
import networkx as nx
import numpy as np
import pandas as pd
from time import time

# Load the A549 edge index from disk, count the distinct node ids it holds,
# and convert it into a plain Python list of edge pairs.
# NOTE(review): assumes the stored tensor is laid out as (2, num_edges), so the
# transpose yields one [source, target] row per edge — confirm.
tic = time()
edge_tensor = torch.load('../../processed/torch_data/chemical/real_lognorm/edge_index_A549.pt')
node_ids = edge_tensor.unique()
num_nodes = len(node_ids)
edge_index = edge_tensor.transpose(0, 1).tolist()
toc = time()
elapsed_secs = toc - tic
print('Time loading data: {:.3f} secs'.format(elapsed_secs))

# Build an undirected networkx graph whose edges are the pairs in edge_index.
tic = time()
ppi_network = nx.Graph()
ppi_network.add_edges_from(edge_index)
toc = time()
elapsed_secs = toc - tic
print('Time creating PPI network: {:.3f} secs'.format(elapsed_secs))


# Turn the PPI network into a DAG, then pass the DAG's edges and the node count
# to topological_sort to obtain the node ordering used for reindexing below.
tic = time()
dag = create_dag_from_ppi(ppi_network)
dag_edges = dag.edges()
sorted_nodes = topological_sort(dag_edges, num_nodes)
toc = time()
elapsed_secs = toc - tic
print('Time converting PPI to DAG: {:.3f} secs'.format(elapsed_secs))


# Relabel the DAG nodes according to sorted_nodes so the indices are consistent
# with the hierarchical ordering, then collect the parents of every node.
tic = time()
dag = reindex_dag_nodes(dag, sorted_nodes)
parents_dict = get_parents_for_each_node(dag)
toc = time()
elapsed_secs = toc - tic
print('Time relabeling nodes: {:.3f} secs'.format(elapsed_secs))



# Simulate 100 observational samples from the parent structure of the DAG.
tic = time()
observational_data = simulate_observational_data(parents_dict, num_samples=100)
toc = time()
elapsed_secs = toc - tic
print('Time simulating observational data: {:.3f} secs'.format(elapsed_secs))



Time loading data: 0.331 secs
Time creating PPI network: 0.419 secs


In [None]:


# Real observational data: load it so its distribution can be compared with the
# simulated data. If the two do not match, the causal mechanisms need tweaking.
real_samples = torch.load('../../processed/torch_data/chemical/real_lognorm/data_forward_A549.pt')
# Stack the `diseased` attribute of every loaded sample into a single array.
real_observational_data = np.stack([sample.diseased for sample in real_samples])

# TODO: visual inspection using histograms





# TODO: simulate interventional data


# TODO: load the real interventional data and compare its distribution with the simulated one; if they differ, the causal mechanisms need tweaking.



In [None]:
import random

# Report the shapes of both datasets before the real data is subsampled.
print(f'Simulated observational data: {observational_data.shape}')
print(f'Real observational data: {real_observational_data.shape}')

print('Subsampling real data')
# Keep at most 100 real rows, the same count requested from the simulator
# (num_samples = 100). Rows are drawn WITHOUT replacement: drawing indices with
# np.random.randint could pick the same row several times, so the "subsample"
# would contain duplicates and skew the distribution comparison. The size is
# capped at the number of available rows so the draw cannot fail when fewer
# than 100 real samples exist.
num_real_rows = len(real_observational_data)
subsample_size = min(100, num_real_rows)
subsample_idx = np.random.choice(num_real_rows, size=subsample_size, replace=False)
real_observational_data = real_observational_data[subsample_idx]
print(f'Real observational data: {real_observational_data.shape}')