## 1. Setup

In [1]:
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
import pandas as pd
import re
from glob import glob
import os

In [2]:
# Import the functions

from functions_py_file import *

In [3]:
# Read in data

adjacency_matrix = pd.read_csv("adjacency_matrix2.csv", header=0, index_col=0)

multilevel = pd.read_csv("multilevel2.csv", header=0, index_col=0) 

In [4]:
# Clean the adjacency matrix
adjacency_matrix = clean_adjacency_mat(adjacency_matrix)

# Clean the multilevel lookup table
multilevel = clean_multilevel(multilevel, adjacency_matrix)

In [5]:
# Create the subset adjacency matrix

subset_leaf_list = ["Amyg_L_73_1", "Hippo_L_75_1"]
subset = subset_matrix_creator(subset_leaf_list, adjacency_matrix, multilevel)

In [6]:
# Create the descendants matrix
descendants = adjacency_descendants(subset, N=20, mu=3.0)

# Create the ancestors matrix
ancestors = adjacency_ancestors(subset, N=20, mu=3.0)

## 2. Generate data for calculating Ds and Cs

In [7]:
# Generate simulated data

generate_simulated_data(filename="test1_data.npz", subset=subset, case=1, n_repeats=10, N=20, mu=3.0)
generate_simulated_data(filename="test2_data.npz", subset=subset, case=2, n_repeats=10, N=20, mu=3.0)
generate_simulated_data(filename="test3_data.npz", subset=subset, case=3, n_repeats=10, N=20, mu=3.0)
generate_simulated_data(filename="test4_data.npz", subset=subset, case=4, n_repeats=10, N=20, mu=3.0)

## 3. Calculating Ds (our version)

In [8]:
# Modify our current permutation testing function to have outputs 

def permutation_testing(filename_old, filename_new, subset, n_repeats, nperm, N, mu, niter, clip, initial_prob):
    ''' Function that conducts permutation testing
    
    This function conducts permutation testing using the generated data
    
    Parameters
    ----------
    filename_old: string
        The pattern of the filenames of the generated data; example: "test1_data_repeat_*"
    
    filename_new: string
        The user-specified filename pattern for the permutation testing results (choose a different name from 
        filename_old, i.e. say "results" instead of "data" if you don't want generated data to get overwritten 
        by permutation testing results); example: "results.npz"
    
    subset: pandas.DataFrame
        The subset adjacency matrix
    
    n_repeats: int
        The number of repeats. We generated a random dataset with the same parameters but `n_repeats` 
        different realizations of the random variables. `n_repeats` must be the same value as `n_repeats`
        when we generated data earlier. n_repeats should be 1 in practice for external users, but in our case, 
        since we simulated a lot of data, n_repeats is greater than 1.
    
    nperm: int
        The number of permutations for permutation testing 
    
    N: int
        The number of samples. N must be the same value as N from generating data earlier.
    
    mu: float
        The difference in means (generally unknown). mu should be the same value as mu from generating data
        earlier in order to get meaningful results. But, mu doesn't have to be the same if you don't want to
        make it the same.
    
    niter: int
        The number of iterations of the EM algorithm
    
    clip: float
        Number that clips probabilities away from 0 or 1
    
    initial_prob: float
        The intial probability
    
    Returns 
    ----------
    npz file (written to disk, not explicitly returned)
        The 1st array contains p-values, 2nd array contains the names of the structures in the subset, 3rd 
        array contains the posterior probabilities, and 4th array contains the information from the prior 3 
        arrays saved in 1 string per structure.
    
    '''
    
    M = subset.shape[0] # Number of total unique structures
    S = np.array(subset, dtype = bool)
    names_subset = subset.columns # List of the 8 structures' names
    Descendants = adjacency_descendants(subset, N=N, mu=mu)
    Descendants_and_self = np.logical_or(Descendants, np.eye(M))
    
    basename, extension = os.path.splitext(filename_new)
    filename_old = glob(filename_old)
    filename_old = sorted(filename_old)
    
    for j in range(n_repeats):
        
        # Load the generated data for each repeat
        data = np.load(filename_old[j])
        X = data["X"]
        Z = data["Z"]
        G = data["G"]
        
        outputs = [] # Empty list for each iteration
        
        ### PARAMETER ESTIMATION ###
    
        P_subset = np.ones(M) * 0.5 # Array of 8 copies of 0.5
        Q = Q_from_P(P_subset, S)

        P0 = np.ones(M) * initial_prob
        P_subset = estimate_P(X[G], mu, S, Descendants_and_self, draw=0, P0=P0, niter=niter, names=names_subset, clip=clip)
        # Set draw = 0 to prevent drawing the graphs
        
        ### GENERATING PERMUTED DATA ###
    
        Ps = []
        for n in range(nperm):
            Xp = X[np.random.permutation(N)[G]]
            P_ = estimate_P(Xp,mu,S,Descendants_and_self,draw=0,niter=niter,P0=P0)
            Ps.append(P_)

        Ps_sort = np.array([np.sort(Pi)[::-1] for Pi in Ps])
        
        ### PERMUTATION TESTING ###
    
        inds = np.argsort(P_subset)[::-1]
        pval = np.zeros_like(P_subset)
        alpha = 0.05
        
        pval_list = [] # Empty list to be filled
        names_list = [] # Empty list to be filled
        posterior_list = [] # Empty list to be filled
        
        for i in range(M):    
            pval[inds[i]] = np.mean(Ps_sort[:,i] >= P_subset[inds[i]])
            outputs.append(f"{names_subset[inds[i]]}, P[Z=1|X]={P_subset[inds[i]]}, p={pval[inds[i]]}")
            # Every structure that gets rejected gets an entry
            
            pval_list.append(pval[inds[i]])
            names_list.append(names_subset[inds[i]])
            posterior_list.append(P_subset[inds[i]])
        
        ### SORT THE POSTERIOR VALUES ###
        
        # Use the subset adjacency matrix to create a dictionary
        columns = np.array(subset.columns)
        dictionary = dict(enumerate(columns.flatten(), 1))
        dictionary = dict((value, key) for key, value in dictionary.items()) # Swap the keys and values
        outputs = sorting_function(outputs, dictionary)
        
        ### SAVE DATA ### 
        
        filename_new_this_repeat = basename + f'_repeat_{j:06d}' + extension
        np.savez(filename_new_this_repeat, pval = pval_list, names = names_list, posterior = posterior_list, strings = outputs)
        
        ### OUTPUTS ###
        
        return np.quantile(Ps_sort, 0.95, axis=0)

In [9]:
# Permutation testing

print("\033[1m" + "Ds for test 1:" + "\033[0m")
print(permutation_testing(filename_old="test1_data_repeat_*", filename_new = "test1_results.npz", subset=subset, n_repeats=10, nperm=10, N=20, mu=3.0, niter=5, initial_prob = 0.5, clip=0.001))
print("\n")

print("\033[1m" + "Ds for test 2:" + "\033[0m")
print(permutation_testing(filename_old="test2_data_repeat_*", filename_new = "test2_results.npz", subset=subset, n_repeats=10, nperm=10, N=20, mu=3.0, niter=5, clip=0.0001, initial_prob = 0.5))
print("\n")

print("\033[1m" + "Ds for test 3:" + "\033[0m")
print(permutation_testing(filename_old="test3_data_repeat_*", filename_new = "test3_results.npz", subset=subset, n_repeats=10, nperm=10, N=20, mu=3.0, niter=5, initial_prob = 0.25, clip=0.01))
print("\n")

print("\033[1m" + "Ds for test 4:" + "\033[0m")
print(permutation_testing(filename_old="test4_data_repeat_*", filename_new = "test4_results.npz", subset=subset, n_repeats=10, nperm=10, N=20, mu=3.0, niter=5, initial_prob = 0.75, clip=0.001))

[1mDs for test 1:[0m
[0.1894531  0.1894531  0.1894531  0.1894531  0.08984659 0.08984659
 0.00126238 0.00126238]


[1mDs for test 2:[0m
[5.99138136e-01 5.99138136e-01 5.99138136e-01 5.99138136e-01
 5.72739619e-01 5.72739619e-01 1.72202632e-04 1.72202632e-04]


[1mDs for test 3:[0m
[0.85154346 0.85154346 0.85154346 0.85154346 0.78849059 0.78849059
 0.7298553  0.7298553 ]


[1mDs for test 4:[0m
[0.86350713 0.86350713 0.86350713 0.86350713 0.37892168 0.37892168
 0.27892163 0.27892163]


## 4. Calculating Cs (based on the paper)

In [61]:
# Modified permutation testing function

def permutation_testing2(filename_old, filename_new, subset, n_repeats, nperm, N, mu, niter, clip, initial_prob):
    ''' Function that conducts permutation testing
    
    This function conducts permutation testing using the generated data
    
    Parameters
    ----------
    filename_old: string
        The pattern of the filenames of the generated data; example: "test1_data_repeat_*"
    
    filename_new: string
        The user-specified filename pattern for the permutation testing results (choose a different name from 
        filename_old, i.e. say "results" instead of "data" if you don't want generated data to get overwritten 
        by permutation testing results); example: "results.npz"
    
    subset: pandas.DataFrame
        The subset adjacency matrix
    
    n_repeats: int
        The number of repeats. We generated a random dataset with the same parameters but `n_repeats` 
        different realizations of the random variables. `n_repeats` must be the same value as `n_repeats`
        when we generated data earlier. n_repeats should be 1 in practice for external users, but in our case, 
        since we simulated a lot of data, n_repeats is greater than 1.
    
    nperm: int
        The number of permutations for permutation testing 
    
    N: int
        The number of samples. N must be the same value as N from generating data earlier.
    
    mu: float
        The difference in means (generally unknown). mu should be the same value as mu from generating data
        earlier in order to get meaningful results. But, mu doesn't have to be the same if you don't want to
        make it the same.
    
    niter: int
        The number of iterations of the EM algorithm
    
    clip: float
        Number that clips probabilities away from 0 or 1
    
    initial_prob: float
        The intial probability
    
    Returns 
    ----------
    npz file (written to disk, not explicitly returned)
        The 1st array contains p-values, 2nd array contains the names of the structures in the subset, 3rd 
        array contains the posterior probabilities, and 4th array contains the information from the prior 3 
        arrays saved in 1 string per structure.
    
    '''
    
    M = subset.shape[0] # Number of total unique structures
    S = np.array(subset, dtype = bool)
    names_subset = subset.columns # List of the 8 structures' names
    Descendants = adjacency_descendants(subset, N=N, mu=mu)
    Descendants_and_self = np.logical_or(Descendants, np.eye(M))
    
    basename, extension = os.path.splitext(filename_new)
    filename_old = glob(filename_old)
    filename_old = sorted(filename_old)
    
    for j in range(n_repeats):
        
        # Load the generated data for each repeat
        data = np.load(filename_old[j])
        X = data["X"]
        Z = data["Z"]
        G = data["G"]
        
        outputs = [] # Empty list for each iteration
        
        ### PARAMETER ESTIMATION ###
    
        P_subset = np.ones(M) * 0.5 # Array of 8 copies of 0.5
        Q = Q_from_P(P_subset, S)

        P0 = np.ones(M) * initial_prob
        P_subset = estimate_P(X[G], mu, S, Descendants_and_self, draw=0, P0=P0, niter=niter, names=names_subset, clip=clip)
        # Set draw = 0 to prevent drawing the graphs
        
        ### GENERATING PERMUTED DATA ###
    
        Ps = []
        for n in range(nperm):
            Xp = X[np.random.permutation(N)[G]]
            P_ = estimate_P(Xp,mu,S,Descendants_and_self,draw=0,niter=niter,P0=P0)
            Ps.append(P_)

        Ps_sort = np.array([np.sort(Pi)[::-1] for Pi in Ps])
        
        ### PERMUTATION TESTING ###
    
        inds = np.argsort(P_subset)[::-1]
        pval = np.zeros_like(P_subset)
        alpha = 0.05
        
        pval_list = [] # Empty list to be filled
        names_list = [] # Empty list to be filled
        posterior_list = [] # Empty list to be filled
        
        for i in range(1):    
            pval[inds[i]] = np.mean(Ps_sort[:,i] >= P_subset[inds[i]])
            outputs.append(f"{names_subset[inds[i]]}, P[Z=1|X]={P_subset[inds[i]]}, p={pval[inds[i]]}")
            # Every structure that gets rejected gets an entry
            
            pval_list.append(pval[inds[i]])
            names_list.append(names_subset[inds[i]])
            posterior_list.append(P_subset[inds[i]])
        
        ### SORT THE POSTERIOR VALUES ###
        
        # Use the subset adjacency matrix to create a dictionary
        columns = np.array(subset.columns)
        dictionary = dict(enumerate(columns.flatten(), 1))
        dictionary = dict((value, key) for key, value in dictionary.items()) # Swap the keys and values
        outputs = sorting_function(outputs, dictionary)
        
        ### SAVE DATA ### 
        
        filename_new_this_repeat = basename + f'_repeat_{j:06d}' + extension
        np.savez(filename_new_this_repeat, pval = pval_list, names = names_list, posterior = posterior_list, strings = outputs)
        
        ### OUTPUTS ###
        return Ps_sort

In [62]:
# Permutation testing

print("\033[1m" + "Cs for test 1:" + "\033[0m")
print(permutation_testing2(filename_old="test1_data_repeat_*", filename_new = "test1_results.npz", subset=subset, n_repeats=10, nperm=10, N=20, mu=3.0, niter=5, initial_prob = 0.5, clip=0.001))
print("\n")

print("\033[1m" + "Cs for test 2:" + "\033[0m")
print(permutation_testing2(filename_old="test2_data_repeat_*", filename_new = "test2_results.npz", subset=subset, n_repeats=10, nperm=10, N=20, mu=3.0, niter=5, clip=0.0001, initial_prob = 0.5))
print("\n")

print("\033[1m" + "Cs for test 3:" + "\033[0m")
print(permutation_testing2(filename_old="test3_data_repeat_*", filename_new = "test3_results.npz", subset=subset, n_repeats=10, nperm=10, N=20, mu=3.0, niter=5, initial_prob = 0.25, clip=0.01))
print("\n")

print("\033[1m" + "Cs for test 4:" + "\033[0m")
print(permutation_testing2(filename_old="test4_data_repeat_*", filename_new = "test4_results.npz", subset=subset, n_repeats=10, nperm=10, N=20, mu=3.0, niter=5, initial_prob = 0.75, clip=0.001))

[1mCs for test 1:[0m
[[3.31485862e-02 3.31485862e-02 3.31485862e-02 3.31485862e-02
  7.77698973e-05 7.77698973e-05 6.90450538e-05 6.90450538e-05]
 [1.76153473e-01 1.76153473e-01 1.76153473e-01 1.76153473e-01
  8.69987684e-02 8.69987684e-02 1.00338451e-03 1.00338451e-03]
 [4.53238447e-02 4.53238447e-02 4.53238447e-02 4.53238447e-02
  1.71727766e-03 1.71727766e-03 5.83183538e-05 5.83183538e-05]
 [1.32299497e-01 1.32299497e-01 1.32299497e-01 1.32299497e-01
  9.13059261e-02 9.13059261e-02 5.78283307e-05 5.78283307e-05]
 [3.46030816e-02 3.46030816e-02 3.46030816e-02 3.46030816e-02
  1.21281973e-03 1.21281973e-03 4.40000426e-05 4.40000426e-05]
 [1.21444688e-01 1.21444688e-01 1.21444688e-01 1.21444688e-01
  8.92345499e-02 8.92345499e-02 4.50403046e-05 4.50403046e-05]
 [1.26739395e-01 1.26739395e-01 1.26739395e-01 1.26739395e-01
  8.77826605e-02 8.77826605e-02 5.74340612e-05 5.74340612e-05]
 [1.74058708e-01 1.74058708e-01 1.74058708e-01 1.74058708e-01
  8.99656452e-02 8.99656452e-02 7.025930