# 1. Setup

In [1]:
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
import pandas as pd
import re
from glob import glob
import os

In [2]:
# Import the functions

from functions_py_file import *

In [3]:
# Load the two input tables.
# For both CSV files the first row supplies the column labels (header=0)
# and the first column supplies the row index (index_col=0).

multilevel = pd.read_csv("multilevel2.csv", index_col=0, header=0)

adjacency_matrix = pd.read_csv("adjacency_matrix2.csv", index_col=0, header=0)

In [4]:
# Clean the adjacency matrix
# (the name is rebound to the cleaned result, so re-running this cell cleans already-cleaned data)
adjacency_matrix = clean_adjacency_mat(adjacency_matrix)

# Clean the multilevel lookup table
# This must run after the line above: the cleaned adjacency matrix is passed in as the second argument.
multilevel = clean_multilevel(multilevel, adjacency_matrix)

In [5]:
# Build the subset adjacency matrix from the listed structure names

subset_leaf_list = [
    "Amyg_L_73_1",
    "Hippo_L_75_1",
]
subset = subset_matrix_creator(subset_leaf_list, adjacency_matrix, multilevel)

In [6]:
# Create the descendants matrix
# NOTE(review): mu=3.0 is used here, whereas data generation and permutation testing
# further down use mu=2.0 — confirm the difference is intentional.
# NOTE(review): `descendants` and `ancestors` are not referenced by the later cells shown here;
# permutation_testing2 recomputes its own descendants matrix from `subset`.
descendants = adjacency_descendants(subset, N=20, mu=3.0)

# Create the ancestors matrix
ancestors = adjacency_ancestors(subset, N=20, mu=3.0)

# 2. Generating data

In [7]:
# Simulate the datasets used in the rest of the notebook
# Settings: case 4, 100 repeats, mu = 2

# Seed the global NumPy random state immediately before generating the data
np.random.seed(5)
generate_simulated_data(
    filename="test4_data.npz",
    subset=subset,
    case=4,
    n_repeats=100,
    N=20,
    mu=2.0,
)

# 3. Cs (paper) method for permutation testing and stepdown procedure

In [12]:
# Modified permutation testing function

def permutation_testing2(filename_old, filename_new, subset, ignore, n_repeats, nperm, N, mu, niter, clip, initial_prob):
    ''' Function that conducts one step of permutation testing

    For every repeat of the generated data this function estimates the posterior
    probabilities with `estimate_P`, picks the non-ignored structure with the largest
    posterior, and compares that value against the permutation distribution of the
    maximum posterior (taken over the non-ignored structures) to get a p-value.
    The result for each repeat is written to its own npz file.

    Parameters
    ----------
    filename_old: string
        The glob pattern of the filenames of the generated data; example: "test1_data_repeat_*".
        The matching files are sorted by name and the first `n_repeats` of them are used.

    filename_new: string
        The user-specified filename for the permutation testing results; "_repeat_XXXXXX" is
        inserted before the extension for each repeat (choose a different name from
        filename_old, i.e. say "results" instead of "data", if you don't want generated data
        to get overwritten by permutation testing results); example: "results.npz"

    subset: pandas.DataFrame
        The subset adjacency matrix

    ignore: list
        A list of integer positions to ignore (structures that were selected in previous
        function calls); or it could be an empty list to not ignore anything

    n_repeats: int
        The number of repeats (at least 1). We generated a random dataset with the same
        parameters but `n_repeats` different realizations of the random variables.
        `n_repeats` must not exceed the number of files matching `filename_old`.

    nperm: int
        The number of permutations for permutation testing (at least 1)

    N: int
        The number of samples. N must be the same value as N from generating data earlier.

    mu: float
        The difference in means (generally unknown). mu should be the same value as mu from
        generating data earlier in order to get meaningful results.

    niter: int
        The number of iterations of the EM algorithm

    clip: float
        Number that clips probabilities away from 0 or 1. It is applied to both the observed
        and the permuted estimates.

    initial_prob: float
        The initial probability given to every structure

    Returns
    ----------
    tuple (pval, ind, critical_value, names_list)
        pval is the permutation p-value, ind is the position of the selected structure,
        critical_value is the 0.95 quantile of the permutation maxima and names_list is a
        one-element list holding the selected structure's name.
        NOTE: these four values describe the LAST repeat only.

    Side effects
    ----------
    One npz file per repeat is written to disk with the arrays `pval`, `names`, `posterior`
    and `strings` (each holds a single entry, for the structure selected in that repeat).
    Calling the function again with the same `filename_new` overwrites these files.
    The global NumPy random state is re-seeded with 5 before the permutations of every repeat.

    Raises
    ----------
    ValueError
        If `n_repeats` or `nperm` is smaller than 1, or if fewer than `n_repeats` files
        match `filename_old`.

    '''

    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")
    if nperm < 1:
        raise ValueError("nperm must be at least 1")

    M = subset.shape[0] # Number of total unique structures
    S = np.array(subset, dtype = bool)
    names_subset = subset.columns # Names of the structures in the subset
    Descendants = adjacency_descendants(subset, N=N, mu=mu)
    Descendants_and_self = np.logical_or(Descendants, np.eye(M))

    basename, extension = os.path.splitext(filename_new)
    data_files = sorted(glob(filename_old))
    if len(data_files) < n_repeats:
        raise ValueError(
            f"n_repeats={n_repeats} but only {len(data_files)} file(s) match {filename_old!r}"
        )

    # Boolean mask of the structures that are still under test (True = not ignored).
    # Built once instead of testing `i not in ignore` for every element of every permutation.
    keep = np.ones(M, dtype = bool)
    keep[ignore] = False

    for j in range(n_repeats):

        # Load the generated data for this repeat; the context manager closes the file
        # handle, which np.load otherwise leaves open for npz archives
        with np.load(data_files[j]) as data:
            X = data["X"]
            G = data["G"]

        ### PARAMETER ESTIMATION ###

        P0 = np.ones(M) * initial_prob
        # Set draw = 0 to prevent drawing the graphs
        P_subset = estimate_P(X[G], mu, S, Descendants_and_self, draw=0, P0=P0, niter=niter, names=names_subset, clip=clip)
        P_subset[ignore] = -1 # Set the ignored structures to -1 so they'll never be the maximum
        # Position of the largest posterior; on ties np.argmax returns the first position
        ind = np.argmax(P_subset)

        ### GENERATING PERMUTED DATA ###

        Ps = []
        np.random.seed(5) # Same permutations for every repeat and every call
        for _ in range(nperm):
            Xp = X[np.random.permutation(N)[G]]
            # Use the same clip as for the observed data so that the null distribution
            # is built from the same estimator as the observed statistic
            P_ = estimate_P(Xp, mu, S, Descendants_and_self, draw=0, niter=niter, P0=P0, clip=clip)
            Ps.append(np.asarray(P_)[keep]) # Drop the ignored structures

        Ps = np.array(Ps)
        Ps_sort = np.max(Ps, axis = 1) # Maximum over structures, one value per permutation

        ### PERMUTATION TESTING ###

        # Fraction of permutations whose maximum is at least the observed maximum
        pval = np.mean(Ps_sort >= P_subset[ind])

        # One entry per array: the structure selected in this repeat
        outputs = [f"{names_subset[ind]}, P[Z=1|X]={P_subset[ind]}, p={pval}"]
        pval_list = [pval]
        names_list = [names_subset[ind]]
        posterior_list = [P_subset[ind]]

        ### SORT THE POSTERIOR VALUES ###

        # Map each structure name to its 1-based position among the subset columns
        columns = np.array(subset.columns)
        dictionary = {name: position for position, name in enumerate(columns.flatten(), 1)}
        outputs = sorting_function(outputs, dictionary)

        ### SAVE DATA ###

        filename_new_this_repeat = basename + f'_repeat_{j:06d}' + extension
        np.savez(filename_new_this_repeat, pval = pval_list, names = names_list, posterior = posterior_list, strings = outputs)

    ### OUTPUTS ###
    # Values from the last repeat only (see the docstring)
    return pval, ind, np.quantile(Ps_sort, 0.95), names_list

In [13]:
# STEPDOWN PROCEDURE (case 4)
# Each pass tests the structure with the largest posterior among those not yet ignored,
# records its p-value and critical value, and then adds it to the ignore list.

ignore_list, critical_values, p_values = [], [], [] # Filled in by the loop below
M = subset.shape[0] # Number of total unique structures

while True:
    results = permutation_testing2(
        filename_old="test4_data_repeat_*",
        filename_new="test4_results.npz",
        ignore=ignore_list,
        subset=subset,
        n_repeats=100,
        nperm=100,
        N=20,
        mu=2.0,
        niter=5,
        initial_prob=0.5,
        clip=0.001,
    )
    step_pval, step_ind, step_critical, step_names = results
    print("Structure: ", step_names)

    # A true stepdown procedure stops at the first hypothesis that is not rejected (p >= 0.05).
    # For this study all M structures are wanted, so "or True" keeps the loop going regardless.
    if not ((step_pval < 0.05) or True):
        break

    ignore_list.append(step_ind) # The structure just tested is skipped from now on
    critical_values.append(step_critical)
    p_values.append(step_pval)

    # Stop once every structure has been tested
    if len(ignore_list) == M:
        break

Structure:  ['Everything']
Structure:  ['Telencephalon_L_501_5']
Structure:  ['CerebralCortex_L_482_4']
Structure:  ['Limbic_L_434_3']
Structure:  ['Amyg_L_336_2']
Structure:  ['Amyg_L_73_1']
Structure:  ['Hippo_L_338_2']
Structure:  ['Hippo_L_75_1']


In [14]:
# p-values collected by the stepdown loop: one per call to permutation_testing2,
# in the order in which the structures were selected
p_values

[0.0, 0.0, 0.0, 0.0, 0.06, 0.06, 0.05, 0.05]

# 4. How often do we detect nothing at child level and something at parent level?

In [15]:
# Viewing the saved results

test4_filenames = sorted(glob("test4_results_repeat_*"))

for result_path in test4_filenames:
    # Open the npz archive as "test", a dictionary-like object; the context manager
    # closes the underlying file once this file's arrays have been printed
    with np.load(result_path) as test:
        print("\033[1m" + result_path + "\033[0m")

        # Print the values corresponding to each key, under a bold label
        for key, label in (
            ("pval", "p-values:"),
            ("names", "names:"),
            ("posterior", "posterior:"),
            ("strings", "strings:"),
        ):
            print("\033[1m" + label + "\033[0m")
            print(test[key])
    print("\n")

[1mtest4_results_repeat_000000.npz[0m
[1mp-values:[0m
[0.04]
[1mnames:[0m
['Hippo_L_75_1']
[1mposterior:[0m
[0.4539879]
[1mstrings:[0m
['Hippo_L_75_1, P[Z=1|X]=0.45398789622528735, p=0.04']


[1mtest4_results_repeat_000001.npz[0m
[1mp-values:[0m
[0.44]
[1mnames:[0m
['Hippo_L_75_1']
[1mposterior:[0m
[0.2738127]
[1mstrings:[0m
['Hippo_L_75_1, P[Z=1|X]=0.2738126961097244, p=0.44']


[1mtest4_results_repeat_000002.npz[0m
[1mp-values:[0m
[0.04]
[1mnames:[0m
['Hippo_L_75_1']
[1mposterior:[0m
[0.69611634]
[1mstrings:[0m
['Hippo_L_75_1, P[Z=1|X]=0.696116338936419, p=0.04']


[1mtest4_results_repeat_000003.npz[0m
[1mp-values:[0m
[0.07]
[1mnames:[0m
['Hippo_L_75_1']
[1mposterior:[0m
[0.3894126]
[1mstrings:[0m
['Hippo_L_75_1, P[Z=1|X]=0.3894126008270704, p=0.07']


[1mtest4_results_repeat_000004.npz[0m
[1mp-values:[0m
[0.01]
[1mnames:[0m
['Hippo_L_75_1']
[1mposterior:[0m
[0.36347559]
[1mstrings:[0m
['Hippo_L_75_1, P[Z=1|X]=0.36347559103723187, p=0

[1mtest4_results_repeat_000094.npz[0m
[1mp-values:[0m
[0.09]
[1mnames:[0m
['Hippo_L_75_1']
[1mposterior:[0m
[0.17303211]
[1mstrings:[0m
['Hippo_L_75_1, P[Z=1|X]=0.17303210832139576, p=0.09']


[1mtest4_results_repeat_000095.npz[0m
[1mp-values:[0m
[0.]
[1mnames:[0m
['Hippo_L_75_1']
[1mposterior:[0m
[0.96570268]
[1mstrings:[0m
['Hippo_L_75_1, P[Z=1|X]=0.965702678402009, p=0.0']


[1mtest4_results_repeat_000096.npz[0m
[1mp-values:[0m
[0.]
[1mnames:[0m
['Hippo_L_75_1']
[1mposterior:[0m
[0.5874723]
[1mstrings:[0m
['Hippo_L_75_1, P[Z=1|X]=0.5874722984189769, p=0.0']


[1mtest4_results_repeat_000097.npz[0m
[1mp-values:[0m
[0.01]
[1mnames:[0m
['Hippo_L_75_1']
[1mposterior:[0m
[0.6831758]
[1mstrings:[0m
['Hippo_L_75_1, P[Z=1|X]=0.6831757998437109, p=0.01']


[1mtest4_results_repeat_000098.npz[0m
[1mp-values:[0m
[0.03]
[1mnames:[0m
['Hippo_L_75_1']
[1mposterior:[0m
[0.55309553]
[1mstrings:[0m
['Hippo_L_75_1, P[Z=1|X]=0.5530955271463276, p=0.03']
