# 1. Setup

In [1]:
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
import pandas as pd
import re
from glob import glob
import os
from functions_py_file import *

# 2. Loading and generating dummy data for X and P

X contains the observed data for each leaf structure and each subject 

P contains the marginal probabilities that structures are affected) for both leaf and non-leaf structures 

* $P_{ij}$ is the probability that a structure i is affected given $X_{ij}$
* $P_{ij}$ is the posterior probability given the data
* In our code, the variable $P_{ij}$ is called `posterior` $\rightarrow$ we must index `posterior` so that we just have the values for leaf structures

In [2]:
# Load a previously generated data file to extract X
data = np.load("test4_data_repeat_000051.npz")
X = data["X"]
X

array([[-1.65830854,  1.06862772],
       [-1.55608052,  1.32659454],
       [-1.25835328,  1.34691376],
       [ 0.27317221,  0.26482819],
       [ 4.45708111,  0.40883695],
       [-0.45648577,  3.16245117],
       [ 0.44170751,  2.28639287],
       [ 2.12004193, -0.01084918],
       [-0.40958086,  1.90966784],
       [ 2.49087637,  0.17468901],
       [ 0.65169195, -0.02742522],
       [ 0.68536846,  0.79055769],
       [-0.36241304, -0.7791171 ],
       [ 1.21656581, -0.2686083 ],
       [ 0.4315702 , -0.12492621],
       [-1.08265721, -1.12421064],
       [ 0.01726323, -0.68373408],
       [-0.67447636, -0.38647334],
       [ 1.69139926,  0.85812991],
       [ 0.91961641,  0.17195564]])

In [3]:
adjacency_matrix = pd.read_csv("adjacency_matrix2.csv", header=0, index_col=0)
adjacency_matrix = clean_adjacency_mat(adjacency_matrix)
multilevel = pd.read_csv("multilevel2.csv", header=0, index_col=0) 
multilevel = clean_multilevel(multilevel, adjacency_matrix)
subset_leaf_list = ["Amyg_L_73_1", "Hippo_L_75_1"]
subset = subset_matrix_creator(subset_leaf_list, adjacency_matrix, multilevel)
descendants = adjacency_descendants(subset, N=20, mu=3.0)
M = subset.shape[0]
N = X.shape[0]
Descendants_and_self = np.logical_or(descendants, np.eye(M))
is_leaf = np.sum(Descendants_and_self, 1) == 1
niter = 100
P = np.ones(M)*0.5
clip = 0.001
mu = 3
A = np.array(subset, dtype = bool)

for it in range(niter):
    P_ = np.maximum(P, clip) # Clip probability: if P is very small, then set it to 0.001
    P_ = np.minimum(P_, 1-clip) # Clip probability: if P_ is very big, then set it to 0.999
    P_over_one_minus_P = P_/(1.0-P_)
    leaf_log_posterior = -np.log1p( P_over_one_minus_P[is_leaf]*phi(X,mu)/phi(X) )
        
log_posterior = np.zeros((N,M))
for i in range(M):
    log_posterior[:,i] = np.sum(leaf_log_posterior[:,Descendants_and_self[i,:][is_leaf]],1)
    
Q = Q_from_P(P,A)
log_adjustment_single = np.zeros(M)
for i in range(M):
    if is_leaf[i]:
        continue
    log_adjustment_single[i] = -np.log1p(P_over_one_minus_P[i]*np.prod(1.0 - Q[A[i,:]]))
     
log_adjustment = np.ones(M)
for i in range(M):
    log_adjustment[i] = np.sum(log_adjustment_single[Descendants_and_self[i,:]])
          
log_posterior = log_posterior + log_adjustment

P = -np.sum(np.expm1(log_posterior),0)/N
posterior = np.exp(log_posterior)
posterior = posterior[:, is_leaf]

# 3. Modify the `phi()` function (Gaussian)

The current `phi()` function only takes mu (default 0) as an input, and it assumes that sigma is 1. 

Now, for dealing with real data, we want to input the mean (either $\mu_0$ or $\mu_1$, each default 0) as well as the variance (not SD anymore) $\sigma^2$ (default 1), assuming both groups have the same equal variance.

Gaussian PDF for group 0 (unaffected): $f_0^i (X_i^j) = \frac{1}{\sqrt{2 \pi v_i}} e^{-\frac{(X_i^j - \mu_i^0)^2}{2v_i}}$

Gaussian PDF for group 1 (affected): $f_1^i (X_i^j) = \frac{1}{\sqrt{2 \pi v_i}} e^{-\frac{(X_i^j - \mu_i^1)^2}{2v_i}}$

Common variance: $v = \sigma_0^2 = \sigma_1^2$

In [4]:
def phi(x, mu = 0.0, sigma2 = 1.0):
    return 1.0 / np.sqrt(2.0 * np.pi * sigma2) * np.exp(-(x - mu)**2 / (2.0 * sigma2))

# 4. Where in the code is the `phi()` function called?

Old functions .py file: https://github.com/paige-lee/ontology_fwer/blob/main/Defining%20functions/functions_py_file.py

Line 404 in the above .py file: `leaf_log_posterior = -np.log1p(P_over_one_minus_P[is_leaf]*phi(X,mu)/phi(X))`

* This is the log likelihood ratio that involves the ratio of the Gaussian for class 1 (affected) to the Gaussian for class 0 (unaffected)
* Gaussian for class 1 (affected): `phi(X,mu)`
* Gaussian for class 0 (unaffected): `phi(X)` 
    - `mu = 0` by default
    
Now, we must update the `estimate_P()` function so that it inputs two mu arguments, not just one $\rightarrow$ instead of only `mu` as an argument, we want to also input `mu0`, `mu1`, and `sigma2` as arguments

Updated line 404: `leaf_log_posterior = -np.log1p(P_over_one_minus_P[is_leaf]*phi(X,mu1, sigma2)/phi(X, mu0, sigma2))`

# 5. Write a function that calculates mu and sigma from X and P

Indices:

* i is the structure number 
* j is the patient number

MLE of $\mu_0$: $\hat{\mu_i^0} = \frac{\sum_j X_{ij} (1 - P_{ij})}{\sum_j (1 - P_{ij})}$

MLE of $\mu_1$: $\hat{\mu_i^1} = \frac{\sum_j P_{ij} X_{ij}}{\sum_j P_{ij}}$ 

MLE of $v$ (common equal variance): $\hat{v_i} = \frac{\sum_j P_{ij} (X_{ij} - \hat{\mu_i^1})^2 + \sum_j (1 - P_{ij}) (X_{ij} - \hat{\mu_i^0})^2}{N}$

In [5]:
X.shape, posterior.shape 
# Both X and P_{ij} have the same dimensions

((20, 2), (20, 2))

In [6]:
def calc_mu_sigma_from_xp(X, P_ij, M):
    mu0i = sum(X * (1 - P_ij)) / sum(1 - P_ij)
    mu1i = sum(P_ij * X) / sum(P_ij)
    vi = (sum(P_ij * (X - mu1i)**2) + sum((1 - P_ij) * (X - mu0i)**2)) / M
    return mu0i, mu1i, vi

In [7]:
calc_mu_sigma_from_xp(X, posterior, M)

(array([2.48079773, 2.02878524]),
 array([-0.15450929,  0.13749105]),
 array([2.48238184, 1.36918263]))