### Test distribution similarities between real and simulated data

In [2]:
# ! ml Anaconda3/2021.05 GCC CUDA/11.4.1   
# ! export LD_LIBRARY_PATH=/apps/rocs/2020.08/sandybridge/software/CUDA/11.4.1/lib:$LD_LIBRARY_PATH

In [7]:
# import torch
# torch.cuda.is_available()

False

In [4]:

import pickle
import torch

from create_synthetic_with_confounders import create_dag_from_ppi, topological_sort, reindex_dag_nodes, logistic_function, generate_gene_expression, get_parents_for_each_node, simulate_observational_data, intervene_on_gene
import networkx as nx
import numpy as np
import pandas as pd
from time import time

# ---------------------------------------------------------------------------
# Pipeline: load real data -> build PPI graph -> derive DAG -> reindex nodes
# -> simulate observational data. Each stage prints its wall-clock time.
# ---------------------------------------------------------------------------

tic = time()
# map_location='cpu' forces every tensor stored in the files onto the CPU, so
# the load also works on machines without a usable CUDA runtime. The later
# steps (`.tolist()`, `np.stack`) need CPU tensors anyway.
edge_index = torch.load(
    '../../processed/torch_data/chemical/real_lognorm/edge_index_A549.pt',
    map_location='cpu',
)
real_observational_data = torch.load(
    '../../processed/torch_data/chemical/real_lognorm/data_forward_A549.pt',
    map_location='cpu',
)
# Number of distinct node ids that appear in at least one edge.
# NOTE(review): presumably ids are contiguous 0..num_nodes-1 — confirm against
# what `topological_sort` expects.
num_nodes = len(edge_index.unique())
# Swap the two axes and convert to nested Python lists for networkx.
# assumes edge_index is (2, num_edges) so this yields [source, target] pairs — TODO confirm
edge_index = edge_index.transpose(1, 0).tolist()
toc = time()
print('Time loading data: {:.3f} secs'.format(toc - tic))

tic = time()
# Undirected graph built from the edge list.
ppi_network = nx.Graph()
ppi_network.add_edges_from(edge_index)
toc = time()
print('Time creating PPI network: {:.3f} secs'.format(toc - tic))

tic = time()
# Convert the PPI network to a DAG, then compute an ordering of its nodes.
dag = create_dag_from_ppi(ppi_network)
sorted_nodes = topological_sort(dag.edges(), num_nodes)
toc = time()
print('Time converting PPI to DAG: {:.3f} secs'.format(toc - tic))

tic = time()
# Reindex the nodes so that their ids are consistent with the hierarchical
# ordering, then collect the parents of every node.
dag = reindex_dag_nodes(dag, sorted_nodes)
parents_dict = get_parents_for_each_node(dag)
toc = time()
print('Time relabeling nodes: {:.3f} secs'.format(toc - tic))

tic = time()
# Simulate observational data from the parent structure.
# NOTE(review): no random seed is set anywhere in this notebook; if the
# simulator is stochastic, results will differ between runs — confirm and seed.
observational_data = simulate_observational_data(parents_dict, num_samples = 100)
toc = time()
print('Time simulating observational data: {:.3f} secs'.format(toc - tic))



OSError: libcusparse.so.11: cannot open shared object file: No such file or directory

In [None]:


# Real observational data: take the `diseased` attribute of every loaded sample
# and stack them into one array, so its distribution can be compared with the
# simulated data (if they differ, the causal mechanisms need tweaking).
# NOTE(review): this rebinds `real_observational_data` from the loaded sample
# collection to the stacked array, so the cell cannot be re-run without
# reloading the data first.
_diseased_per_sample = [sample.diseased for sample in real_observational_data]
real_observational_data = np.stack(_diseased_per_sample)

    #visual inspection using histograms





#Simulate interventional data


#Real interventional data (let's load it and compare distributions - if not the same as real data, then I need to tweak the causal mechanisms)



In [None]:
import random

# `real_observational_data` must remain the real data here. Rebinding it to
# `observational_data` would make every comparison below (PCA, t-SNE, Gaussian
# moments) compare the simulation against itself.
print(f'Simulated observational data: {observational_data.shape}')
print(f'Real observational data: {real_observational_data.shape}')

print('Subsampling real data')
# Keep as many real rows as there are simulated rows (capped by what is
# available). Rows are drawn without replacement so no real sample is
# duplicated, and the generator is seeded so the subset is reproducible.
_n_real = len(real_observational_data)
_n_keep = min(len(observational_data), _n_real)
_row_idx = np.random.default_rng(0).choice(_n_real, size=_n_keep, replace=False)
real_observational_data = real_observational_data[_row_idx]
print(f'Real observational data: {real_observational_data.shape}')

In [None]:
##Let's check the distribution similarity
#Simple PCA

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Fit a 2-component PCA on the real data only, then project the simulated data
# onto the same components so both point clouds share one coordinate system.
pca = PCA(n_components=2)
X1_pca = pca.fit_transform(real_observational_data)
X2_pca = pca.transform(observational_data)

# Overlay both projections on one axes and write the figure to disk
# (the figure is saved and closed without being shown).
fig, ax = plt.subplots()
ax.scatter(X1_pca[:, 0], X1_pca[:, 1], label='Real observational data')
ax.scatter(X2_pca[:, 0], X2_pca[:, 1], label='Simulated observational data')
ax.legend()
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA of Datasets')
fig.savefig('scatter_plot_pca.png')
plt.close(fig)


In [None]:
## t-SNE
from sklearn.manifold import TSNE

# t-SNE cannot project new points into an existing embedding, and two
# independent fits produce coordinates that are not comparable with each other.
# Both datasets are therefore embedded together in a single fit and the joint
# embedding is split back into its real / simulated parts afterwards.
_n_real = len(real_observational_data)
_stacked = np.vstack([real_observational_data, observational_data])

# Fixed random_state so the embedding is reproducible across runs.
tsne = TSNE(n_components=2, random_state=0)
_embedding = tsne.fit_transform(_stacked)
X1_tsne = _embedding[:_n_real]   # rows that came from the real data
X2_tsne = _embedding[_n_real:]   # rows that came from the simulated data

plt.scatter(X1_tsne[:, 0], X1_tsne[:, 1], label='Real observational data')
plt.scatter(X2_tsne[:, 0], X2_tsne[:, 1], label='Simulated observational data')
plt.legend()
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.title('t-SNE of Datasets')
plt.savefig('scatter_plot_tsne.png')
plt.close()


In [None]:
## Fit a Gaussian distribution to each dataset and compare the means and covariances
from scipy.stats import multivariate_normal

# Empirical mean vector and covariance matrix of each dataset.
# axis=0 / rowvar=False: rows are treated as samples, columns as variables.
mean1 = np.mean(real_observational_data, axis=0)
cov1 = np.cov(real_observational_data, rowvar=False)

mean2 = np.mean(observational_data, axis=0)
cov2 = np.cov(observational_data, rowvar=False)


In [None]:
# Scatter of the per-variable means: `mean1` (real data) on the x-axis against
# `mean2` (simulated data) on the y-axis.
fig, ax = plt.subplots(figsize=(8, 8))

ax.scatter(mean1, mean2, alpha=0.6)
ax.set_title('Scatter Plot of Vector 1 vs Vector 2')
ax.set_xlabel('Vector 1')
ax.set_ylabel('Vector 2')
ax.grid(True)
# Save BEFORE showing: depending on the backend, show() can finalise/close the
# figure, in which case a later savefig() writes an empty canvas.
fig.savefig('scatter_plot_mean_gauss.png')
plt.show()
plt.close(fig)


# Overlaid histograms of the two mean vectors (`mean1`, then `mean2`), written
# to disk without being shown.
fig, ax = plt.subplots(figsize=(12, 6))

for _values, _tag in ((mean1, 'Vector 1'), (mean2, 'Vector 2')):
    ax.hist(_values, bins=50, alpha=0.5, label=_tag)

ax.set_title('Histogram of Vectors')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.legend()
ax.grid(True)
# NOTE(review): the file name looks like a typo for "histogram"; it is kept
# as-is in case something downstream expects this exact path.
fig.savefig('histogra_gauss.png')
plt.close(fig)

import seaborn as sns
import numpy as np

# Two-row matrix for the heatmap: row 0 is `mean1`, row 1 is `mean2`.
data = np.vstack([mean1, mean2])

# Draw the heatmap on an explicit wide, short axes and write it to disk.
fig, ax = plt.subplots(figsize=(10, 2))
sns.heatmap(
    data,
    annot=False,
    cmap='viridis',
    cbar=True,
    yticklabels=['Vector 1', 'Vector 2'],
    ax=ax,
)
ax.set_title('Heatmap of Vectors')
ax.set_xlabel('Index')
fig.savefig('heatmap_gauss.png')
plt.close(fig)


In [None]:
# ## Modified graphs: apply various edge-removal strategies
# from create_synthetic_with_confounders import remove_random_edges, remove_high_betweenness_edges, remove_bridge_edges

# dag_random_edges_removed = remove_random_edges(dag.copy(), remove_fraction=0.1)
# dag_high_betweenness_edges_removed = remove_high_betweenness_edges(dag.copy(), fraction=0.1)
# dag_bridge_edges_removed = remove_bridge_edges(dag.copy(), fraction=0.1)

# dag_random_edges_removed_parents_dict = get_parents_for_each_node(dag_random_edges_removed)
# dag_high_betweenness_edges_removed_parents_dict = get_parents_for_each_node(dag_high_betweenness_edges_removed)
# dag_bridge_edges_removed_parents_dict = get_parents_for_each_node(dag_bridge_edges_removed)


# data_random_edges = simulate_observational_data(dag_random_edges_removed_parents_dict, num_samples = 100)
# data_high_betweenness_edges = simulate_observational_data(dag_high_betweenness_edges_removed_parents_dict, num_samples = 100)
# data_bridge_edges = simulate_observational_data(dag_bridge_edges_removed_parents_dict, num_samples = 100)
