In [539]:
import sklearn
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import deque

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [540]:
# Import the data
tree = pd.read_csv('tree.csv')
tree['t'] = tree['t'].replace(to_replace=0, value=0.1)

vert_genes = pd.read_csv('vert_genes.csv')

## Part I: Simulation

In [541]:
# Creating the graph

def create_graph(tree, alpha, beta, sigma_sq):
    G = nx.DiGraph()
    for _, row in tree.iterrows():
        if not pd.isna(row['Parent']):
            G.add_edge(int(row["Parent"]), int(row["Child"]), time = row["t"], a = alpha*row["t"], b = beta, variance = sigma_sq*row["t"])
            
    return G

G = create_graph(tree, alpha = 0, beta = 1, sigma_sq = 2500)

In [542]:
def simulate_node_length_with_parameters(G, parent, simulated_lengths, alpha, beta, sigma_sq):
    for child in G.successors(parent):
        t = G[parent][child]['time']
        mean = alpha * t + beta * simulated_lengths[parent]
        std = np.sqrt(sigma_sq * t)
        simulated_lengths[child] = np.random.normal(mean, std)
        simulate_node_length_with_parameters(G, child, simulated_lengths, alpha, beta, sigma_sq)
    
    return simulated_lengths

In [543]:
def simulate_data_for_learning(G, n, alpha, beta, sigma_sq, alpha_0, sigma_0_sq, root, learn_params = True, only_X = True):
    X_values = []
    Y_values = []
    
    all_nodes = list(G.nodes)  # Get all nodes in the graph

    for _ in range(n):
        simulated_lengths = {}
        
        # Simulate root node first
        simulated_lengths[root] = np.random.normal(alpha_0, np.sqrt(sigma_0_sq))
        
        # Simulate all other nodes recursively
        simulated_lengths = simulate_node_length_with_parameters(G, root, simulated_lengths, alpha, beta, sigma_sq)
        if only_X:
            leaf_nodes = [node for node in all_nodes if G.out_degree(node) == 0]
            simulated_x = [simulated_lengths[node] for node in leaf_nodes]
            
            X_values.append(simulated_x)
        else:
            X_values.append([simulated_lengths[node] for node in all_nodes])
            
        if learn_params:
            Y_values.append([alpha,beta,sigma_sq])
        else:
            Y_values.append(simulated_lengths[root])

    return np.array(X_values), np.array(Y_values)

learn_parameters = False

X, y = simulate_data_for_learning(G, 1000, alpha = 0.5, beta = 1, sigma_sq = 2500, alpha_0 = 50000, sigma_0_sq = 5000, root = 407, learn_params=learn_parameters)

print(X.shape)
print(y.shape)

print(y[10])

(1000, 204)
(1000,)
49990.84514683039


In [544]:
def compute_gamma(G):
    gamma = np.eye(len(G.nodes)) # Initialize gamma as an identity matrix

    # Iterate through the nodes in the graph
    for node in G.nodes:
        parent = next(G.predecessors(node), None)
        if parent is None:
            continue
        else:
            gamma[parent-1, node -1] = -G[parent][node]['b'] # Determine the dependency between parent and child nodes with -b
    return gamma

def compute_beta(G, alpha_0):
    beta = np.zeros((len(G.nodes), 1))
    for node in G.nodes:
        parent = next(G.predecessors(node), None)
        if parent is None:
            beta[node-1] = alpha_0
            continue
        a = G[parent][node]['a'] # The constant term in the mean of the CPD
        beta[node-1] = a
        
    return beta

def compute_sigma(G, sigma_0_sq):
    sigma = np.zeros((len(G.nodes)))
    for node in G.nodes:
        parent = next(G.predecessors(node), None)
        if parent is None:
            # Assign the default value for the root node
            sigma[node - 1] = sigma_0_sq
        else:
            # Access the edge attribute 'variance' only if parent exists
            variance = G[parent][node]['variance']
            sigma[node - 1] = variance
    return sigma

def compute_J_and_h(G, alpha, beta, sigma_sq, sigma_0_sq, alpha_0):
    
    beta = compute_beta(G, alpha_0)
    sigma = compute_sigma(G, sigma_0_sq)
    gamma = compute_gamma(G)

    J = np.sum([np.outer(gamma[:, i], gamma[:, i]) / sigma[i] for i in range(len(G))], axis=0)
    h = np.sum([(beta[i] / sigma[i]) * gamma[:, i] for i in range(len(G))], axis=0)
    return J, h



In [545]:
def compute_clique_tree(G):
    C = nx.Graph()

    G_working = G.copy()
    leaves = [node for node in G_working.nodes if G_working.out_degree(node) == 0]
    G_working.remove_nodes_from(leaves)

    index = min(G_working.nodes) -1 if all(isinstance(n, int) for n in G_working.nodes) else 0

    for node in G_working.nodes:
        parent = node
        children = list(G_working.neighbors(parent))
        C.add_node(parent, variables=[parent])
        for child in children:
            pair_clique = index
            C.add_node(pair_clique, variables=[parent, child])
            C.add_edge(parent, pair_clique)
            C.add_edge(pair_clique, child)
            index = index - 1

    return C

C = compute_clique_tree(G)

In [546]:
NoV = len([node for node in C.nodes if len(C.nodes[node]['variables']) == 1]) -1
maxIndex = max([node for node in C.nodes])
away_from_zero = maxIndex - NoV
minIndex = min([node for node in C.nodes if len(C.nodes[node]['variables']) == 1])

print("Away from zero: ", away_from_zero)
print("Max index: ", maxIndex)
print("Min index: ", minIndex)
print("Number of variables in the graph: ", NoV)

def mapping_GTM(index_in_graph):
    return index_in_graph - away_from_zero 

def mapping_MTG(index_in_matrix):
    return index_in_matrix + away_from_zero

Away from zero:  205
Max index:  407
Min index:  205
Number of variables in the graph:  202


### Matrix algebra implementation

In [547]:
def get_sub_matrices(scope, X_values, J, h):
    X_indices = np.isin(scope, X_values)
    Z_indices = ~X_indices

    J_ZZ = J[Z_indices, :][:, Z_indices]
    J_ZX = J[Z_indices, :][:, X_indices]
    J_XZ = J_ZX.T
    J_XX = J[X_indices, :][:, X_indices]
    J_ZZ_inv = np.linalg.inv(J_ZZ)

    h_X = h[X_indices]
    h_Z = h[Z_indices]

    return J_ZZ, J_ZX, J_XZ, J_XX, J_ZZ_inv, h_X, h_Z

def get_conditional_distribution(J, h, X_values, X_indices):
    scope = [i for i in range(1, len(J) + 1)]
    J_ZZ, J_ZX, J_XZ, J_XX, J_ZZ_inv, h_X, h_Z = get_sub_matrices(scope, X_indices, J, h)

    J_reduced = J_ZZ
    h_reduced = h_Z- J_ZX @ X_values

    return J_reduced, h_reduced

alpha = 0.5
beta = 1
sigma_sq = 2500

X_values = simulate_data_for_learning(G, 1, alpha=alpha, beta=beta, sigma_sq=sigma_sq, alpha_0=50000, sigma_0_sq=5000, root=407, learn_params=False, only_X=True)[0]

### Testing matrix algebra

In [548]:
leaves = [node for node in G.nodes if G.out_degree(node) == 0]
X_indices = leaves

J, h = compute_J_and_h(G, alpha, beta, sigma_sq, 5000, 50000)

J_reduced, h_reduced = get_conditional_distribution(J, h, X_values[0], X_indices)


Sigma = np.linalg.inv(J_reduced)
mu = Sigma @ h_reduced

random_index = np.random.choice(range(len(X_values[0])))
z = random_index
print(f"Predicted value for node 407: {mu[z]}")
print(f"Variance for node 407: {Sigma[z,z]}")
print(f"Actual value for node 407: {X_values[0][0]}")
print(f"True variance for node 407: {sigma_sq}")

Predicted value for node 407: 50612.42638834081
Variance for node 407: 2297.8328343106095
Actual value for node 407: 50449.29596389175
True variance for node 407: 2500


## Part II : inference

In [549]:
def marginalize_out(J, h, X_indices):
    scope = [i for i in range(1, len(J) + 1)]
    J_ZZ, J_ZX, J_XZ, J_XX, J_ZZ_inv, h_X, h_Z = get_sub_matrices(scope, X_indices, J, h)

    J_marg = J_ZZ - (J_ZX @ np.linalg.inv(J_XX) @ J_XZ)
    h_marg = h_Z - (J_ZX @ np.linalg.inv(J_XX) @ h_X)

    return J_marg, h_marg

### Rekuriv løsning

In [550]:
def single_clique(G):
    leaves = [node for node in G.nodes if G.out_degree(node) == 0]
    H = G.copy()
    H.remove_nodes_from(leaves)
    H = H.to_undirected()
    return H

def compute_J_i_arrow_j(clique_tree, i, j, J, h, J_messages, h_messages):
    
    neighbors = list(clique_tree.neighbors(i))
    neighbors.remove(j)

    i_idx = mapping_GTM(i)
    j_idx = mapping_GTM(j)

    if not neighbors:
        J_messages[i_idx][j_idx] = -J[i_idx, j_idx] * J[j_idx, i_idx] / J[i_idx, i_idx]
    else:
        J_sum = sum(compute_J_i_arrow_j(clique_tree, k, i, J, h, J_messages, h_messages) for k in neighbors)
        J_messages[i_idx][j_idx] = -J[i_idx, j_idx] * J[j_idx, i_idx] / (J[i_idx, i_idx] + J_sum)

    return J_messages[i_idx][j_idx]


def compute_h_i_arrow_j(clique_tree, i, j, J, h, J_messages, h_messages):

    neighbors = list(clique_tree.neighbors(i))
    neighbors.remove(j)

    i_idx = mapping_GTM(i)
    j_idx = mapping_GTM(j)

    if not neighbors:
        h_messages[i_idx][j_idx] = -J[i_idx, j_idx] * h[i_idx] / J[i_idx, i_idx]
    else:
        J_sum = sum(compute_J_i_arrow_j(clique_tree, k, i, J, h, J_messages, h_messages) for k in neighbors)
        h_sum = sum(compute_h_i_arrow_j(clique_tree, k, i, J, h, J_messages, h_messages) for k in neighbors)
        Ji_backslash_j = J[i_idx, i_idx] + J_sum
        hi_backslash_j = h[i_idx] + h_sum
        h_messages[i_idx][j_idx] = (-J[i_idx, j_idx] * hi_backslash_j) / (Ji_backslash_j)

    return h_messages[i_idx][j_idx]

def inference_algorithm(G, alpha, beta, sigma_sq, alpha_0, X_indices, observed_X, Z):

    clique_tree = single_clique(G)
    J, h = compute_J_and_h(G, alpha, beta, sigma_sq, 5000, alpha_0)

    J_reduced, h_reduced = get_conditional_distribution(J, h, observed_X, X_indices)
        
    J_messages = np.full(J_reduced.shape, np.nan)
    h_messages = np.full(J_reduced.shape, np.nan)

    Z_neighbors = list(clique_tree.neighbors(Z))
    Z_idx = mapping_GTM(Z)

    J_zz = J_reduced[Z_idx, Z_idx]

    sum_J = sum([compute_J_i_arrow_j(clique_tree, k, Z, J_reduced, h_reduced, J_messages, h_messages) for k in Z_neighbors])
    sum_h = sum([compute_h_i_arrow_j(clique_tree, k, Z, J_reduced, h_reduced, J_messages, h_messages) for k in Z_neighbors])

    J_hat_Z = J_zz + sum_J
    h_hat_Z = h_reduced[Z_idx] + sum_h


    return J_hat_Z, h_hat_Z

In [551]:
def information_to_standard(clique_j, clique_h):

    z_hats = np.zeros_like(clique_h)
    z_vars = np.zeros_like(clique_h)

    for i in range(len(clique_j)):
        z_hats[i] = clique_h[i] / clique_j[i]
        z_vars[i] = 1 / clique_j[i]

    return z_hats, z_vars


def J_msg(clique_tree, i, j, J, h, J_messages, h_messages, sent):
    neighbors = list(clique_tree.neighbors(i))
    neighbors.remove(j)

    i_idx = mapping_GTM(i)
    j_idx = mapping_GTM(j)

    if not neighbors:
        # Leaf node — base case
        J_messages[i_idx][j_idx] = -J[i_idx, j_idx] * J[j_idx, i_idx] / J[i_idx, i_idx]
        sent[i_idx][j_idx] = True
        return

    # Collect required messages
    incoming_msgs = [J_messages[mapping_GTM(k)][i_idx] for k in neighbors]

    if any(np.isnan(m) for m in incoming_msgs):
        print("Attempted to compute message with missing dependencies")
        return  # Dependencies not ready — skip

    J_sum = sum(incoming_msgs)
    J_messages[i_idx][j_idx] = -J[i_idx, j_idx] * J[j_idx, i_idx] / (J[i_idx, i_idx] + J_sum)
    sent[i_idx][j_idx] = True

def h_msg(clique_tree, i, j, J, h, J_messages, h_messages, sent):
    neighbors = list(clique_tree.neighbors(i))
    neighbors.remove(j)

    i_idx = mapping_GTM(i)
    j_idx = mapping_GTM(j)

    if not neighbors:
        # Leaf node — base case
        h_messages[i_idx][j_idx] = -J[i_idx, j_idx] * h[i_idx] / J[i_idx, i_idx]
        sent[i_idx][j_idx] = True
        return

    J_incoming = [J_messages[mapping_GTM(k)][i_idx] for k in neighbors]
    h_incoming = [h_messages[mapping_GTM(k)][i_idx] for k in neighbors]

    if any(np.isnan(x) for x in J_incoming + h_incoming):
        print("Attempted to compute message with missing dependencies")
        return  # Dependencies not ready — skip

    J_sum = sum(J_incoming)
    h_sum = sum(h_incoming)

    Ji_backslash_j = J[i_idx, i_idx] + J_sum
    hi_backslash_j = h[i_idx] + h_sum

    h_messages[i_idx][j_idx] = -J[j_idx, i_idx] * hi_backslash_j / Ji_backslash_j
    sent[i_idx][j_idx] = True

def upward_pass(clique_tree, J, h, J_messages, h_messages, sent):
    queue = deque()

    # Start from leaves (nodes with degree 1)
    leaves = [node for node in clique_tree.nodes if clique_tree.degree[node] == 1]
    for leaf in leaves:
        queue.append(leaf)

    while queue:
        current = queue.popleft()
        for neighbor in clique_tree.neighbors(current):
            i = current
            j = neighbor
            i_idx = mapping_GTM(i)
            j_idx = mapping_GTM(j)

            if not sent[i_idx][j_idx]:
                # Check if all incoming messages to i (except from j) are ready
                deps_ready = all(
                    sent[mapping_GTM(k)][i_idx]
                    for k in clique_tree.neighbors(i) if k != j
                )
                if deps_ready:
                    J_msg(clique_tree, i, j, J, h, J_messages, h_messages, sent)
                    h_msg(clique_tree, i, j, J, h, J_messages, h_messages, sent)
                    queue.append(j)

    return J_messages, h_messages

def downward_pass(clique_tree, J, h, J_messages, h_messages, sent, root):
    queue = deque([root])

    while queue:
        current = queue.popleft()
        for neighbor in clique_tree.neighbors(current):
            i = current
            j = neighbor
            i_idx = mapping_GTM(i)
            j_idx = mapping_GTM(j)

            if not sent[i_idx][j_idx]:
                deps_ready = all(
                    sent[mapping_GTM(k)][i_idx]
                    for k in clique_tree.neighbors(i) if k != j
                )
                if deps_ready:
                    J_msg(clique_tree, i, j, J, h, J_messages, h_messages, sent)
                    h_msg(clique_tree, i, j, J, h, J_messages, h_messages, sent)
                    queue.append(j)

    return J_messages, h_messages

def compute_clique_beliefs(clique_tree, J, h, J_messages, h_messages):
    
    clique_j = np.zeros(shape=h.shape)
    clique_h = np.zeros(shape=h.shape)
    
    number_of_variables = len(h)
    for i in range(number_of_variables):
        J_i = J[i][i]
        h_i = h[i]

        graph_i = mapping_MTG(i)

        for neighbor in clique_tree.neighbors(graph_i):
            J_message = J_messages[mapping_GTM(neighbor)][i]
            h_message = h_messages[mapping_GTM(neighbor)][i]
            J_i += J_message
            h_i += h_message
        
        clique_j[i] = J_i
        clique_h[i] = h_i

    return clique_j, clique_h

def message_passing(G, alpha, beta, sigma_sq, alpha_0, sigma_0_sq, X_indices, observed_X):

    single_clique_tree = single_clique(G)
    J, h = compute_J_and_h(G, alpha, beta, sigma_sq, sigma_0_sq, alpha_0)

    J_reduced, h_reduced = get_conditional_distribution(J, h, observed_X, X_indices)

    J_messages = np.full(J_reduced.shape, np.nan)
    h_messages = np.full(J_reduced.shape, np.nan)

    sent = np.zeros_like(J_messages, dtype=bool)

    J_msg_up, h_msg_up = upward_pass(single_clique_tree, J_reduced, h_reduced, J_messages, h_messages, sent)
    J_msg_down, h_msg_down = downward_pass(single_clique_tree, J_reduced, h_reduced, J_msg_up, h_msg_up, sent, 407)
    clique_j, clique_h = compute_clique_beliefs(single_clique_tree, J_reduced, h_reduced, J_msg_down, h_msg_down)
    print("Messages sent:", np.sum(sent), "/", sent.size)

    z_hats, z_vars = information_to_standard(clique_j, clique_h)
    return z_hats, z_vars

alpha_0 = 50000
sigma_0_sq = 5000
z_hats, z_vars = message_passing(G, alpha, beta, sigma_sq, alpha_0, sigma_0_sq, X_indices, X_values[0])


prnt = True
if prnt:
    print("Maximum value: ", np.max(z_hats))
    print("Minimum value: ", np.min(z_hats))

    for i in range(len(z_hats)):
        print(f"Node {mapping_MTG(i)}:")
        print(f"Predicted value: {z_hats[i]}")
        print(f"Predicted variance: {z_vars[i]}")
        print(f"True value: {X_values[0][i]}")
        print(f"True variance: {sigma_sq}")
        print()

    

Messages sent: 404 / 41209
Maximum value:  51119.67442554878
Minimum value:  49797.69869461732
Node 205:
Predicted value: 50064.14437218458
Predicted variance: 5669.433309434267
True value: 50449.29596389175
True variance: 2500

Node 206:
Predicted value: 50078.81295898286
Predicted variance: 7994.384765674203
True value: 50668.64383555307
True variance: 2500

Node 207:
Predicted value: 50095.76473970306
Predicted variance: 8740.563259965546
True value: 51001.22679029449
True variance: 2500

Node 208:
Predicted value: 50145.020314119414
Predicted variance: 8421.377349952001
True value: 50759.756940043895
True variance: 2500

Node 209:
Predicted value: 50165.837386342704
Predicted variance: 8061.681958844391
True value: 50636.5500197688
True variance: 2500

Node 210:
Predicted value: 50188.23007685436
Predicted variance: 8202.000244228886
True value: 49985.341719029224
True variance: 2500

Node 211:
Predicted value: 50274.60995924309
Predicted variance: 7954.361268096818
True value: 502

## Part III: Learning

In [552]:
def simulate_full_data(G, n_samples, alpha, beta, sigma_sq, alpha_0, sigma_0_sq):
    rows = []

    for _ in range(n_samples):
        simulated = {}
        simulated[root] = np.random.normal(alpha_0, np.sqrt(sigma_0_sq))
        simulate_node_length_with_parameters(G, root, simulated, alpha, beta, sigma_sq)

        for parent, child in G.edges:
            t = G[parent][child]['time']
            y = simulated[child]
            z = simulated[parent]
            rows.append({
                'Y': y,
                'Z': z,
                't': t
            })

    return pd.DataFrame(rows)

def estimate_alpha_beta_sigma2(data):
    X = data[['t', 'Z']]
    y = data['Y']

    weights = 1 / data['t']
    model = LinearRegression().fit(X, y, sample_weight=weights)
    alpha_hat = model.coef_[0]
    beta_hat = model.coef_[1]

    # Residual variance estimate of σ²

    residuals = y - model.predict(X)
    sigma_sq_hat = np.sum(weights * residuals**2) / len(y)

    return alpha_hat, beta_hat, sigma_sq_hat

root = 407
n = 1000
df = simulate_full_data(G, n, alpha=0.5, beta=1, sigma_sq=2500, alpha_0=50000, sigma_0_sq=5000)
alpha_hat, beta_hat, sigma_sq_hat = estimate_alpha_beta_sigma2(df)

print("Estimated alpha:", alpha_hat)
print("Estimated beta:", beta_hat)
print("Estimated sigma_sq:", sigma_sq_hat)

Estimated alpha: 0.4798322412422627
Estimated beta: 0.9999945685704371
Estimated sigma_sq: 2506.5862992248617


In [553]:
z0_id = 407

def make_train_data(di):
    X_i = np.array([[di['t'], di[z0_id]]])     # predictors: [t, z_parent]
    y_i = np.array([di['target']])             # target: Z_i or X_i
    return X_i, y_i

def learn_parameters(xs_lst, zs_list):
    '''
    xs_lst is a list of dictionaries for x
    zs_list is a list of dictionaries for z
    '''

    y_list = []
    x_list = []

    for i in range(len(xs_lst)):
        di = {**xs_lst[i], **zs_list[i]}
        X_i, y_i = make_train_data(di)
        y_list.append(y_i)
        x_list.append(X_i)
      
    y = np.concatenate(y_list)
    X = np.concatenate(x_list)

    model = LinearRegression().fit(X, y)
    mean_ys = model.predict(X)

    alpha = model.coef_[0]
    beta = model.coef_[1]


    var = np.mean((y - mean_ys)**2 / X[:, 0])
    alpha_0 = np.mean([d[z0_id] for d in zs_list])

    return alpha, beta, var, alpha_0


In [554]:
def hard_assignment_EM(X_indices, n, alpha_init, beta_init, sigma_sq_init, alpha_0_init, sigma_0_sq = 5000, root=407):
    alpha = alpha_init
    beta = beta_init
    sigma_sq = sigma_sq_init
    alpha_0 = alpha_0_init

    for i in range(n):
        G = create_graph(tree, alpha, beta, sigma_sq)
        z_hats, _ = message_passing(G, alpha, beta, sigma_sq, alpha_0, sigma_0_sq, X_indices, X_values[0])

        xs = [{'t': t} for t in z_hats]  # z_hats are used as tildes
        zs = [{z0_id: z0_val, 'target': z_i_val} for z0_val, z_i_val in zip(X_values[0], z_hats)]


        alpha_0_hat, alpha_hat, beta_hat, sigma_sq_hat = learn_parameters(xs, zs)
        
        alpha = alpha_hat
        beta = beta_hat
        sigma_sq = sigma_sq_hat
        alpha_0 = alpha_0_hat
        if i % 4 == 0:
            print("Iteration:", i)
            print("Estimated alpha:", alpha)
            print("Estimated beta:", beta)
            print("Estimated sigma_sq:", sigma_sq)
            print("Estimated alpha_0:", alpha_0)  
        
    return alpha, beta, sigma_sq, alpha_0

X_indices = leaves

hard_assignment_EM(X_indices, 1, 0.5, 1, 2500, 50000)

Messages sent: 404 / 41209
Iteration: 0
Estimated alpha: -5.3376430128498033e-17
Estimated beta: 1.0523909645761267e-27
Estimated sigma_sq: 50317.14912916369
Estimated alpha_0: 0.9999999999999998


(-5.3376430128498033e-17,
 1.0523909645761267e-27,
 50317.14912916369,
 0.9999999999999998)

### Part III: Apply inference and learning algorithms to real data

In [555]:
def split_gene_data(tree, vert_genes):
    data = pd.merge(tree, vert_genes, on='species', how='inner')
    rows = []
    for orth_id, group in data.groupby('orthId'):
        child_glength = dict(zip(group['Child'], group['glength']))
        rows.append({'orthId': orth_id, 'glength_dict': child_glength})

    return pd.DataFrame(rows)

real_data = split_gene_data(tree, vert_genes)

real_data


Unnamed: 0,orthId,glength_dict
0,1CPN2,"{1: 88338, 2: 219992, 3: 89972, 4: 233175, 5: ..."
1,1CQBX,"{1: 30935, 2: 31081, 3: 40971, 4: 32508, 5: 28..."
2,1CQJ6,"{1: 21555, 2: 20662, 3: 12707, 4: 21723, 5: 22..."
3,1CR8Z,"{1: 12401, 2: 16419, 3: 11305, 4: 16328, 5: 59..."
4,1CTEU,"{1: 3125, 2: 2814, 3: 2122, 4: 2765, 5: 3457, ..."
5,1CTI9,"{1: 44314, 2: 42679, 3: 44077, 4: 36837, 5: 11..."
6,1CYBB,"{1: 104633, 2: 104930, 3: 105023, 4: 106842, 5..."
7,1D0EM,"{1: 19977, 2: 21211, 3: 24187, 4: 22395, 5: 21..."
8,1D1CF,"{1: 16494, 2: 15176, 3: 16651, 4: 15010, 5: 16..."
9,1D3F1,"{1: 29924, 2: 8341, 3: 6260, 4: 9613, 5: 6236,..."


In [558]:

def run_inference_real_data(real_data, alpha, beta, sigma_sq, alpha_0, root = 407):

    results = pd.DataFrame(columns=['orthId', 'mu', 'sigma', 'true_mean', 'NoV'])
    for _, row in real_data.iterrows():
        orth_id = row['orthId']
        data = row['glength_dict']
        X_indices = list(data.keys())
        X_values = list(data.values())
        mean = np.mean(X_values)

        print("Number of observed values:", len(X_values))

        G = create_graph(tree, alpha, beta, sigma_sq)
        if root not in G.nodes:
            raise ValueError(f"Root node {root} not in generated graph.")
        else:
            print("Root node found in the graph.")

        z_hats, z_vars = message_passing(G, alpha, beta, sigma_sq, alpha_0, 5000, X_indices, X_values)

        mu = z_hats[mapping_GTM(root)]
        sigma = z_vars[mapping_GTM(root)]
        
        results.loc[len(results)] = {'orthId': orth_id, 'mu': mu, 'sigma': sigma, 'true_mean': mean, 'NoV': len(X_indices)}

    return results

results = run_inference_real_data(real_data, 0, 1, 100, 50000)

print(results)
tree

Number of observed values: 203
Root node found in the graph.


NetworkXError: The node 408 is not in the graph.

In [None]:
# Iterate over each row in the real_data DataFrame
for _, row in real_data.iterrows():
    orth_id = row['orthId']
    glength_dict = row['glength_dict']

    # Extract observed values (X_values) and their indices (X_indices)
    X_indices = list(glength_dict.keys())
    X_values = np.array(list(glength_dict.values()))

    # Compute the reduced precision matrix and potential vector
    J_reduced, h_reduced = get_conditional_distribution(J, h, X_values, X_indices)

    print("size of J_reduced: ", J_reduced.shape)
    # Compute the posterior covariance matrix (Sigma) and mean (mu)
    Sigma = np.linalg.inv(J_reduced)
    mu = Sigma @ h_reduced

    # Map the root node index to the reduced matrix
    root_index = mapping_GTM(root)

    # Extract the posterior mean and variance for the root node
    posterior_mean = mu[root_index]
    posterior_variance = Sigma[root_index, root_index]

    print(f"Ortholog ID: {orth_id}")
    print(f"Posterior mean for root node: {posterior_mean}")
    print(f"Posterior variance for root node: {posterior_variance}")
    print("-" * 50)

size of J_reduced:  (204, 204)
Ortholog ID: 1CPN2
Posterior mean for root node: 11585.139492304294
Posterior variance for root node: 4192.0271292983725
--------------------------------------------------
size of J_reduced:  (203, 203)
Ortholog ID: 1CQBX
Posterior mean for root node: 45775.45150739554
Posterior variance for root node: 3610.232700151948
--------------------------------------------------
size of J_reduced:  (205, 205)
Ortholog ID: 1CQJ6
Posterior mean for root node: 8758.692219838955
Posterior variance for root node: 3506.7370253340227
--------------------------------------------------
size of J_reduced:  (205, 205)
Ortholog ID: 1CR8Z
Posterior mean for root node: 7490.846996389608
Posterior variance for root node: 3486.1782806976985
--------------------------------------------------
size of J_reduced:  (203, 203)
Ortholog ID: 1CTEU
Posterior mean for root node: 36951.730117071085
Posterior variance for root node: 3610.232700151948
-----------------------------------------

In [None]:
def hard_assignment_EM_real_data(real_data, n, alpha_init, beta_init, alpha0_init, sigma_sq_init, root = 407):
    
    for i in range(n):
        alpha = np.array([alpha_init] * real_data.shape[0])
        beta = np.array([beta_init] * real_data.shape[0])
        sigma_sq = np.array([sigma_sq_init] * real_data.shape[0])
        alpha_0 = np.array([alpha0_init] * real_data.shape[0])

        for i, row in real_data.iterrows():
            orth_id = row['orthId']
            glength_dict = row['glength_dict']

            X_indices = list(glength_dict.keys())
            X_values = np.array(list(glength_dict.values()))

            J_hat_z, h_hat_z = inference_algorithm(G, alpha[i], beta[i], sigma_sq[i], alpha_0[i], X_indices, X_values, root)
            alpha_0_hat, _ = information_to_standard(J_hat_z, h_hat_z, root)

            df = simulate_full_data(G, 100, alpha[i], beta[i], sigma_sq[i], alpha_0[i], 5000)
            alpha_hat, beta_hat, sigma_sq_hat = estimate_alpha_beta_sigma2(df)

            alpha[i] = alpha_hat
            beta[i] = beta_hat
            sigma_sq[i] = sigma_sq_hat
            alpha_0[i] = alpha_0_hat

    return alpha, beta, sigma_sq, alpha_0

alpha, beta, sigma_sq, alpha_0 = hard_assignment_EM_real_data(real_data, 1, 0.5, 1, 50000, 5000)

for i in range(real_data.shape[0]):
    print("Ortholog ID:", real_data.iloc[i]['orthId'])
    print("Estimated alpha:", alpha[i])
    print("Estimated beta:", beta[i])
    print("Estimated sigma_sq:", sigma_sq[i])
    print("Estimated alpha_0:", alpha_0[i])
    print("-" * 50)


Ortholog ID: 1CPN2
Estimated alpha: 0.6467585811273409
Estimated beta: 1
Estimated sigma_sq: 4963
Estimated alpha_0: 2350
--------------------------------------------------
Ortholog ID: 1CQBX
Estimated alpha: 0.5267713000991117
Estimated beta: 1
Estimated sigma_sq: 4934
Estimated alpha_0: 45775
--------------------------------------------------
Ortholog ID: 1CQJ6
Estimated alpha: 0.4601338704921266
Estimated beta: 0
Estimated sigma_sq: 4968
Estimated alpha_0: 704
--------------------------------------------------
Ortholog ID: 1CR8Z
Estimated alpha: 0.6731696578460222
Estimated beta: 0
Estimated sigma_sq: 5000
Estimated alpha_0: 653
--------------------------------------------------
Ortholog ID: 1CTEU
Estimated alpha: 0.5435080775646471
Estimated beta: 0
Estimated sigma_sq: 4976
Estimated alpha_0: 36951
--------------------------------------------------
Ortholog ID: 1CTI9
Estimated alpha: 0.6042859624534023
Estimated beta: 1
Estimated sigma_sq: 4960
Estimated alpha_0: 42606
------------