In [None]:
import numpy as np
import pandas as pd
import umap.umap_ as umap
import plotly.express as px
import os

### Load phenotype data

In [None]:
# Root folder holding the raw ZFIN download. Set the MORPHSEQ_ZFIN_DIR environment
# variable to point the notebook at a copy of the data on another machine; the
# original location is used when the variable is not set.
raw_data_dir = os.environ.get(
    "MORPHSEQ_ZFIN_DIR",
    "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/zfin/20240326/",
)

# Output directory for the tables built by this notebook (a sub-folder of the raw
# data folder). exist_ok=True replaces the check-then-create pattern.
built_data_dir = os.path.join(raw_data_dir, "built_data_py", "")
os.makedirs(built_data_dir, exist_ok=True)

# load phenotype data and stage DF
phenotype_df_cole = pd.read_csv(os.path.join(raw_data_dir, "clean_zfin_single-mut_with-ids_phenotype_df.csv"))
# stage_to_hpf_key = pd.read_csv(os.path.join(raw_data_dir, "stage_to_hpf_key.csv"))
# phenotype_df = phenotype_df_raw.merge(stage_to_hpf_key, how = "left", on="start_stage")

### Load ontology info

In [None]:
def _read_zfin_table(file_name):
    """Read one tab-separated ZFIN export from raw_data_dir, using file row 1 (0-based) as the header."""
    return pd.read_csv(os.path.join(raw_data_dir, file_name), sep='\t', header=1)


anatomy_nodes_df = _read_zfin_table("anatomy_item.txt")
anatomy_edges_df = _read_zfin_table("anatomy_relationship.txt")
anatomy_synonyms_df = _read_zfin_table("anatomy_synonyms.txt")
zfin_pheno_df_raw = _read_zfin_table("phenoGeneCleanData_fish.txt")
stage_df = _read_zfin_table("stage_ontology.txt")

In [None]:
# For each phenotype record, check whether the fish display name mentions a morpholino.
# Two matching rules are compared: a case-sensitive "MO" substring and a
# case-insensitive "+ mo" substring.
disp_vec = zfin_pheno_df_raw["Fish Display Name"].tolist()
MO_names = list(filter(lambda name: "MO" in name, disp_vec))
MO_names_alt = list(filter(lambda name: "+ mo" in name.lower(), disp_vec))

# total records, then the number of matches under each rule
for name_group in (disp_vec, MO_names, MO_names_alt):
    print(len(name_group))
# Other columns inspected while exploring:
# zfin_pheno_df_raw['Phenotype Tag']
# zfin_pheno_df_raw['Phenotype Keyword Name']
# zfin_pheno_df_raw.columns

## Build cleaned zfin dataset

In [None]:
# Flag every record whose fish display name contains "MO" (1 = morpholino, 0 = not).
disp_vec = zfin_pheno_df_raw["Fish Display Name"].tolist()
MO_flags = [int("MO" in disp) for disp in disp_vec]
zfin_pheno_df_raw["morpholino_flag"] = MO_flags

# Short column names used throughout the rest of the notebook.
zfin_column_names = {
    "Affected Structure or Process 1 superterm ID": "structure_1_ID",
    "Affected Structure or Process 1 superterm Name": "structure_1",
    "Affected Structure or Process 2 superterm ID": "structure_2_ID",
    "Affected Structure or Process 2 superterm name": "structure_2",
    "Gene Symbol" : "gene",
    "Gene ID": "gene_ID",
    "Phenotype Keyword ID": "pheno_ID",
}
zfin_keep_columns = [
    "gene", "gene_ID", "structure_1", "structure_1_ID", 'Phenotype Tag', 'Phenotype Keyword Name',
    "morpholino_flag",
    "structure_2", "structure_2_ID", "pheno_ID",
    "Start Stage ID", "End Stage ID", "Figure ID",
]
zfin_pheno_df = zfin_pheno_df_raw.rename(columns=zfin_column_names).loc[:, zfin_keep_columns]

# keep only genes considered by Cole (consider dropping this)
cole_genes = phenotype_df_cole.loc[:, "gene"].drop_duplicates()
zfin_pheno_df = zfin_pheno_df.merge(cole_genes, how="inner", on="gene")

# add staging info: start stage -> start_hpf, end stage -> end_hpf.
# The two merges leave "Stage ID_x" / "Stage ID_y" key columns behind, which the
# long-form cell below drops.
zfin_pheno_df = (
    zfin_pheno_df
    .merge(stage_df.loc[:, ["Stage ID", "Begin Hours"]], how="left",
           left_on="Start Stage ID", right_on="Stage ID")
    .rename(columns={"Begin Hours":"start_hpf"})
    .merge(stage_df.loc[:, ["Stage ID", "End Hours"]], how="left",
           left_on="End Stage ID", right_on="Stage ID")
    .rename(columns={"End Hours":"end_hpf"})
)


#### Make phenotype DF longform

In [None]:
# Build the long-form phenotype table. Only the first affected structure
# (structure_1) is used; the structure_2 columns are dropped.
zfin_pheno_temp = zfin_pheno_df.copy()

zfin_pheno1 = (
    zfin_pheno_temp
    .drop(columns=["structure_2", "structure_2_ID", "Stage ID_x", "Stage ID_y"])
    .rename(columns={"structure_1":"structure", "structure_1_ID":"ID"})
)

# One row per distinct record with a known structure; the morpholino flag is
# removed here and re-attached below after being collapsed per key.
zfin_pheno_long = (
    zfin_pheno1
    .dropna(subset=["structure", "ID"])
    .drop(columns=["morpholino_flag"])
    .drop_duplicates()
)
print(zfin_pheno_long.shape)

# Maximum morpholino flag per (gene, structure ID, start time).
morph_key_cols = ["gene", "ID", "start_hpf"]
gene_pheno_temp = (
    zfin_pheno1.loc[:, morph_key_cols + ["morpholino_flag"]]
    .drop_duplicates()
    .groupby(morph_key_cols)
    .max()
    .reset_index(drop=False)
)

zfin_pheno_long = zfin_pheno_long.merge(gene_pheno_temp, how="left", on=morph_key_cols)
print(zfin_pheno_long.shape)

# The CSV is written before the 'abnormal' filter is applied.
zfin_pheno_long.to_csv(os.path.join(built_data_dir, "zfin_phenotypes_clean.csv"), index=False)
abnormal_rows = zfin_pheno_long["Phenotype Tag"]=='abnormal'
zfin_pheno_long = zfin_pheno_long.loc[abnormal_rows, :].drop_duplicates()
zfin_pheno_long.head()

In [None]:
# Morpholino info: maximum flag per (gene, structure ID, start time).
# gene_pheno_temp is left indexed by those three keys; only the displayed
# preview has the index reset.
gene_pheno_temp = (
    zfin_pheno1.loc[:, ["gene", "ID", "start_hpf", "morpholino_flag"]]
    .drop_duplicates()
    .groupby(["gene", "ID", "start_hpf"])
    .max()
)
gene_pheno_temp.reset_index(drop=False).head()

#### Filter for desired time period and zfa identifiers 

In [None]:
# remove phenotypes that present after 72hpf
print(zfin_pheno_long.shape)
early_rows = zfin_pheno_long["start_hpf"].le(72)
zfin_pheno_ft = zfin_pheno_long.loc[early_rows, :].copy()
print(zfin_pheno_ft.shape)

# keep only rows whose structure ID contains "ZFA"
id_vec = zfin_pheno_ft["ID"].tolist()
keep_flags = np.asarray(["ZFA" in structure_id for structure_id in id_vec])
zfin_pheno_ft = zfin_pheno_ft.loc[keep_flags]
print(zfin_pheno_ft.shape)

# Keep only structures that are in the anatomy graph
anatomy_ids = anatomy_nodes_df.loc[:, ["Anatomy ID"]].drop_duplicates()
zfin_pheno_ft = (
    zfin_pheno_ft
    .merge(anatomy_ids, how="inner", left_on="ID", right_on="Anatomy ID")
    .drop(columns=["Anatomy ID"])
)
print(zfin_pheno_ft.shape)
zfin_pheno_ft.head()

## Clean up anatomy data and build an ontology graph

In [None]:
# First, construct full graph: select the ontology relationships used as edges.
edge_vec = anatomy_edges_df["Relationship Type ID"].to_list()
keep_edge_types = ["part of"]
keep_flags = np.asarray([edge_type in keep_edge_types for edge_type in edge_vec])

# filter for only desired edge types
edge_df = anatomy_edges_df.loc[
    keep_flags, ["Parent Item ID", "Child Item ID", "Relationship Type ID"]
].reset_index(drop=True)

# One row per anatomy term; the row position doubles as the integer node id.
node_df = (
    anatomy_nodes_df.loc[:, ["Anatomy ID", "Anatomy Name"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
node_df["node_id"] = node_df.index

# get num observations in the zfin database (count() skips the NaN left by
# nodes without any matching record, so those nodes get 0)
node_df_temp = (
    node_df.loc[:, ["Anatomy ID", "node_id"]]
    .merge(zfin_pheno_ft.loc[:, "ID"], how="left", left_on="Anatomy ID", right_on="ID")
    .loc[:, ["node_id", "ID"]]
)
count_df = node_df_temp.groupby("node_id").count().reset_index()
node_df = node_df.merge(count_df, how="left", on="node_id").rename(columns={"ID": "zfin_counts"})

# construct node dictionary: (node id, attribute dict) pairs for networkx
anatomy_nodes_id_vec = node_df["Anatomy ID"].to_numpy()
node_container = [
    (i, {"name": a_term, "id": anatomy_nodes_id_vec[i]})
    for i, a_term in enumerate(node_df["Anatomy Name"])
]

# join node df to edges to get integer endpoints: parent -> from_id, child -> to_id
node_id_key = node_df.loc[:, ["Anatomy ID", "node_id"]]
edge_df = (
    edge_df
    .merge(node_id_key, how="left", left_on="Parent Item ID", right_on="Anatomy ID")
    .rename(columns={"node_id":"from_id"})
    .merge(node_id_key, how="left", left_on="Child Item ID", right_on="Anatomy ID")
    .rename(columns={"node_id":"to_id"})
    .loc[:, ["Parent Item ID", "Child Item ID", "Relationship Type ID", "from_id", "to_id"]]
    .dropna(subset=["from_id", "to_id"])
    .reset_index(drop=True)
)
edge_df.head()

In [None]:
import networkx as nx

# Directed anatomy graph: nodes carry name/id attributes, edges point parent -> child.
anatomy_graph = nx.DiGraph()
anatomy_graph.add_nodes_from(node_container)

edge_container = list(zip(edge_df["from_id"].to_numpy(), edge_df["to_id"].to_numpy()))

anatomy_graph.add_edges_from(edge_container)

Identify nodes with no parent. 

If a node has no parent and no children, remove it unless it has reported observations in the zfin database.

In [None]:
node_list = anatomy_graph.nodes
root_node_list = []
rm_node_list = []
for node in node_list:
    # nodes that have a parent are neither roots nor removal candidates
    if anatomy_graph.in_degree(node) != 0:
        continue

    if anatomy_graph.out_degree(node) > 0:
        # parentless node with children: a root
        keep_as_root = True
    else:
        # isolated node: keep it only if it has reported observations
        keep_as_root = node_df.loc[node_df["node_id"]==node, "zfin_counts"].values[0] > 0

    if keep_as_root:
        root_node_list.append(node)
    else:
        rm_node_list.append(node)
    

In [None]:
# Work on a copy so anatomy_graph itself is left untouched.
a_graph_cleaned = anatomy_graph.copy()

# remove flagged nodes
a_graph_cleaned.remove_nodes_from(rm_node_list)

# add master dummy node to connect the graph; its id is one past the largest
# node id of the full graph
dummy_id = np.max(node_list) + 1
a_graph_cleaned.add_node(dummy_id, name="linker_node", id="NA")

In [None]:
# add edges from the linker node to every root node
link_edge_container = [(dummy_id, root) for root in root_node_list]

a_graph_cleaned.add_edges_from(link_edge_container)

In [None]:
import matplotlib.pyplot as plt

# Draw the cleaned anatomy graph (linker node included) with Graphviz's radial "twopi" layout.
# NOTE(review): nx.nx_agraph presumably needs the optional pygraphviz package — confirm it is installed.
pos = nx.nx_agraph.graphviz_layout(a_graph_cleaned, prog="twopi", args="")
plt.figure(figsize=(8, 8))
nx.draw(a_graph_cleaned, pos, node_size=10, alpha=0.5, node_color="blue", with_labels=False)
# equal axis scaling on x and y
plt.axis("equal")
plt.show()

## Convert to tree topology

In [None]:
# from ChatGPT
from collections import deque


def _bfs_spanning_tree(graph, root_node, tree):
    """Fill ``tree`` with a breadth-first spanning tree of ``graph`` and return it.

    Shared implementation behind ``graph_to_tree_bfs`` and
    ``graph_to_directed_tree_bfs``; the two differ only in the (empty) graph
    object they pass in as ``tree``.

    Starting from ``root_node``, every node reachable through
    ``graph.neighbors`` is added exactly once, connected to the node from which
    it was first discovered. Node attributes are copied from ``graph``; nodes
    that cannot be reached from the root are left out.
    """
    visited = {root_node}
    queue = deque([root_node])

    while queue:
        current_node = queue.popleft()
        # copy the attributes of the source node onto the tree node
        tree.add_node(current_node, **graph.nodes[current_node])

        for neighbor in graph.neighbors(current_node):
            if neighbor not in visited:
                # mark on discovery so a node never gets a second parent
                visited.add(neighbor)
                tree.add_edge(current_node, neighbor)
                queue.append(neighbor)

    return tree


def graph_to_tree_bfs(graph, root_node):
    """Return an undirected (``nx.Graph``) BFS spanning tree of ``graph``.

    Parameters
    ----------
    graph : networkx graph
        Source graph; traversal follows ``graph.neighbors``.
    root_node : node
        Node at which the traversal starts.

    Returns
    -------
    nx.Graph
        Tree over all nodes reachable from ``root_node``, with node attributes
        copied from ``graph``.
    """
    return _bfs_spanning_tree(graph, root_node, nx.Graph())


def graph_to_directed_tree_bfs(graph, root_node):
    """Return a directed (``nx.DiGraph``) BFS spanning tree of ``graph``.

    Same traversal as ``graph_to_tree_bfs``; each edge points from the node
    being expanded to the newly discovered neighbor, i.e. away from
    ``root_node``.
    """
    return _bfs_spanning_tree(graph, root_node, nx.DiGraph())

In [None]:
# Root the trees at the linker node, which has an edge to every root node of the cleaned graph
root_node = dummy_id # node_df.loc[node_df["Anatomy Name"]=="whole organism", "node_id"].to_numpy()[0]

# Build two BFS spanning trees of the cleaned directed graph from the same root:
# a_tree is undirected (nx.Graph), a_tree_dir is directed (nx.DiGraph)
a_tree = graph_to_tree_bfs(a_graph_cleaned, root_node)
a_tree_dir = graph_to_directed_tree_bfs(a_graph_cleaned, root_node)


In [None]:
import matplotlib.pyplot as plt

# Draw the undirected BFS tree with Graphviz's radial "twopi" layout.
# NOTE(review): nx.nx_agraph presumably needs the optional pygraphviz package — confirm it is installed.
pos = nx.nx_agraph.graphviz_layout(a_tree, prog="twopi", args="")
plt.figure(figsize=(8, 8))
nx.draw(a_tree, pos, node_size=10, alpha=0.5, node_color="blue", with_labels=False)
# equal axis scaling on x and y
plt.axis("equal")
plt.show()

## Calculate aggregate observations that include the node AND its children

In [None]:
# All tree nodes except the artificial linker node.
node_index = np.asarray(list(a_tree_dir.nodes))
node_index = node_index[node_index != dummy_id]

# get DF that contains only the nodes we kept (vectorised membership test
# instead of a per-row scan of node_index)
node_df_tree = node_df.loc[node_df["node_id"].isin(node_index), :].copy()
node_df_tree.reset_index(inplace=True, drop=True)

# generate count dict: node id -> number of direct zfin observations
node_count_dict = dict(zip(node_df_tree["node_id"].to_numpy(),
                           node_df_tree["zfin_counts"].to_numpy()))

# get counts that include children: for every node, summarise its descendants
# in the directed tree (the node itself is not included)
d_counts_by_node = {}   # total observations over all descendants
n_counts_by_node = {}   # number of descendants
nz_counts_by_node = {}  # number of descendants with at least one observation
for node in node_index:
    d_nodes = list(nx.descendants(a_tree_dir, node))
    descendant_counts = [node_count_dict[d] for d in d_nodes]

    d_counts_by_node[int(node)] = sum(descendant_counts)
    n_counts_by_node[int(node)] = len(d_nodes)
    nz_counts_by_node[int(node)] = sum(1 for c in descendant_counts if c > 0)

# Write each column once instead of one boolean-mask assignment per node.
# Columns are stored as float, the dtype the per-node assignment produced.
node_df_tree["d_counts"] = node_df_tree["node_id"].map(d_counts_by_node).astype(float)
node_df_tree["n_counts"] = node_df_tree["node_id"].map(n_counts_by_node).astype(float)
node_df_tree["nz_counts"] = node_df_tree["node_id"].map(nz_counts_by_node).astype(float)

In [None]:
# 95th-percentile thresholds for direct and descendant observation counts.
x_limit = np.percentile(node_df_tree["zfin_counts"], 95)
y_limit = np.percentile(node_df_tree["d_counts"], 95)

# A node is flagged when it reaches both thresholds. The flag is stored as the
# strings "0"/"1" so plotly treats it as a category. The column is assigned by
# name (not via .loc[:, col]) because writing strings into an existing integer
# column in place is an incompatible-dtype assignment in recent pandas.
i_filter = (node_df_tree["zfin_counts"] >= x_limit) & (node_df_tree["d_counts"] >= y_limit)
node_df_tree["importance_flag"] = i_filter.astype(int).astype(str)

# Marker size for the scatter plot: descendant count plus a constant offset of 25.
# Despite the column name, no logarithm is applied.
node_df_tree["log_n_counts"] = 25 + node_df_tree.loc[:, "n_counts"].to_numpy() #np.log(node_df_tree.loc[:, "n_counts"].to_numpy() + 1)

In [None]:
fig_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/zfin/figures/"
# make sure the figure folder exists, otherwise write_html below fails
os.makedirs(fig_path, exist_ok=True)

# Direct vs. descendant observation counts per node on log-log axes; marker
# size follows log_n_counts and colour follows the importance flag.
fig = px.scatter(node_df_tree, x="zfin_counts", y="d_counts", color="importance_flag", size="log_n_counts",
                 log_x=True, log_y=True,
                 labels={"zfin_counts": "direct zfin observations", 
                         "d_counts": "descendant zfin observations",
                         "importance_flag": "importance flag",
                         "n_counts": "number of descendant phenotypes"},
                hover_data={"Anatomy Name":True, "Anatomy ID":True, "log_n_counts":False, 
                            "n_counts":True, "importance_flag": False})

fig.update_layout(showlegend=False,
    yaxis_title="# descendant reports", xaxis_title="# direct reports on zfin"
)

fig.show()
fig.write_html(os.path.join(fig_path, "zfin_importance_scatter.html"))

### Now, iterate through genes and calculate the following:

1) Total importance across all phenotypes
2) TF flag
3) Top 3 (?) reported phenotypes by importance
4) Effects for those 3

In [None]:
# load gene ID table
gene_info_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/zfin/20240326/built_data/gene_set_mf.csv"
gene_info_df = pd.read_csv(gene_info_path)
gene_info_df = gene_info_df.loc[:, ["gene_symbol", "gs_name", "gs_description"]].rename(columns={"gene_symbol": "gene"})

# TF flag: 1 when the gene-set name mentions "transcription" (case-insensitive).
# Missing names are treated as "no match" instead of raising on .lower().
tf_flag_vec = (
    gene_info_df["gs_name"]
    .fillna("")
    .astype(str)
    .str.lower()
    .str.contains("transcription", regex=False)
    .astype(int)
    .tolist()
)
gene_info_df["TF_flag"] = tf_flag_vec

# Make gene phenotype DF and join on the above info
gene_pheno_df = zfin_pheno_ft.loc[:, ["gene", "gene_ID"]].drop_duplicates().reset_index(drop=True)

# Note, about 15% of genes are not in this reference table and so may be incorrectly labeled as non-TF
gene_pheno_df = gene_pheno_df.merge(gene_info_df.loc[gene_info_df["TF_flag"]==1, ["gene", "TF_flag"]].drop_duplicates(),
                                    how="left", on="gene")

# genes without a TF match come out of the left merge as NaN; label them 0
gene_pheno_df["TF_flag"] = gene_pheno_df["TF_flag"].fillna(0)

gene_pheno_df.head(5)

Iterate through each gene and calculate an aggregate importance score, as well as its top 3 "most important" phenotypes

In [None]:
from tqdm import tqdm 

# Attach the integer graph node id to every filtered phenotype record.
zfin_pheno_node = zfin_pheno_ft.merge(node_df.loc[:, ["Anatomy ID", "node_id"]].drop_duplicates(), how="left",
                                      left_on="ID", right_on="Anatomy ID").drop(labels="Anatomy ID", axis=1)

# filter out evidence from morpholino experiments and keep one row per distinct record
zfin_pheno_node = zfin_pheno_node.loc[zfin_pheno_node["morpholino_flag"]==0
                                        , ["gene", "structure", "ID", "Phenotype Keyword Name",
                                          "start_hpf", "node_id"]].drop_duplicates().reset_index(drop=True).rename(
                                        columns={"Phenotype Keyword Name":"keyword"})

# keep only phenotypes with a positive start time
zfin_pheno_node = zfin_pheno_node.loc[zfin_pheno_node["start_hpf"]>0, :]

# number of top-ranked phenotypes reported per gene (columns *_1 ... *_n)
n_top_phenotypes = 3

gene_index = list(gene_pheno_df["gene"])
gene_node_list = []
for g, gene in enumerate(tqdm(gene_index)):

    # records and distinct phenotype nodes for this gene
    gene_table = zfin_pheno_node.loc[zfin_pheno_node["gene"]==gene, :].reset_index(drop=True)
    pheno_nodes = gene_table["node_id"].to_numpy()
    dd_nodes = np.unique(pheno_nodes)

    # De-duplicated set of the gene's nodes plus all their descendants.
    # nx.descendants is already transitive, so one call per phenotype node is
    # enough; the set is built separately from the sequence being iterated.
    covered_nodes = set(dd_nodes.tolist())
    for node in dd_nodes:
        covered_nodes.update(nx.descendants(a_tree_dir, node))

    # importance score: number of covered nodes with at least one direct observation
    nz_count = sum(1 for d in covered_nodes if node_count_dict[d] > 0)

    # add to DF
    gene_pheno_df.loc[g, "importance_score"] = nz_count

    # get importance ranking for each phenotype: observed descendants, +1 for the node itself
    nz_count_list = [
        node_df_tree.loc[node_df_tree["node_id"]==node, "nz_counts"].values[0] + 1
        for node in dd_nodes
    ]
    ranked_node_ids = dd_nodes[np.argsort(nz_count_list)[::-1]]

    # add info to table: structure, effect keyword and start time of the first
    # record of each top-ranked node; blanks when the gene has fewer nodes
    for rank in range(n_top_phenotypes):
        label = rank + 1
        if len(ranked_node_ids) > rank:
            rank_filter = gene_table["node_id"]==ranked_node_ids[rank]
            phenotype = gene_table.loc[rank_filter, "structure"].values[0]
            effect = gene_table.loc[rank_filter, "keyword"].values[0]
            start_hpf = gene_table.loc[rank_filter, "start_hpf"].values[0]
        else:
            phenotype, effect, start_hpf = "", "", np.nan

        gene_pheno_df.loc[g, f"phenotype_{label}"] = phenotype
        gene_pheno_df.loc[g, f"effect_{label}"] = effect
        gene_pheno_df.loc[g, f"start_hpf_{label}"] = start_hpf


# remove genes with no phenotype
print(gene_pheno_df.shape)
gene_pheno_df = gene_pheno_df.loc[gene_pheno_df["phenotype_1"] != "", :]
print(gene_pheno_df.shape)

In [None]:
# Rank candidates: TF genes first, then by descending importance score, and save.
gene_pheno_df = (
    gene_pheno_df
    .sort_values(by=["TF_flag", "importance_score"], axis=0, ascending=False)
    .reset_index(drop=True)
)
candidates_csv = os.path.join(built_data_dir, "zfin_gene_KO_candidates.csv")
gene_pheno_df.to_csv(candidates_csv)

In [None]:
# Spot check: show all phenotype records kept for the gene "otx2b".
# NOTE(review): zfin_pheno_node is reassigned with different columns in a later cell,
# so this view depends on running the cells in order.
zfin_pheno_node.loc[zfin_pheno_node["gene"]=="otx2b", :]

## Filter the phenotype table

In [None]:
# Node filter thresholds: minimum direct observations, and minimum direct +
# descendant observations.
min_counts = 1
min_agg_counts = 10
node_df_tree["agg_counts"] = node_df_tree["zfin_counts"] + node_df_tree["d_counts"]
count_filter = (node_df_tree["zfin_counts"] >= min_counts) & (node_df_tree["agg_counts"] >= min_agg_counts)
nodes_filtered = node_df_tree.loc[count_filter, "node_id"].to_numpy()

zfin_pheno_node2 = zfin_pheno_ft.merge(node_df.loc[:, ["Anatomy ID", "node_id"]].drop_duplicates(), how="left",
                                      left_on="ID", right_on="Anatomy ID").drop(labels="Anatomy ID", axis=1)


# Binary gene x node matrix: entry is 1 when the gene has a record on that node.
gene_index = np.unique(zfin_pheno_node2["gene"])
phenotype_array = np.zeros((len(gene_index), len(nodes_filtered)), dtype=np.uint8)

for g, gene in enumerate(tqdm(gene_index)):
    p_nodes = np.unique(zfin_pheno_node2.loc[zfin_pheno_node2["gene"]==gene, "node_id"])
    # mark every retained node on which this gene has a record
    phenotype_array[g, np.isin(nodes_filtered, p_nodes)] = 1

# Drop genes with no phenotype on any retained node. The matching gene labels
# are kept in phenotype_gene_index so rows of phenotype_array stay identifiable
# (gene_index itself still lists all genes).
gene_has_phenotype = np.max(phenotype_array, axis=1) > 0
phenotype_gene_index = gene_index[gene_has_phenotype]
phenotype_array = phenotype_array[gene_has_phenotype, :]

In [None]:
# Show the dimensions of the filtered binary phenotype matrix.
phenotype_array.shape

In [None]:
from sklearn.decomposition import TruncatedSVD, PCA

# Fit a truncated SVD (fixed seed) to the transposed binary phenotype matrix.
# NOTE(review): PCA is imported but not used in this cell.
# gene_dist_mat_fit = gene_dist_mat.copy()
# gene_dist_mat_fit[gene_dist_mat_fit > 6] = 6
# pull out binary phenotype array
n_lsa_comp = 25
svd_model = TruncatedSVD(n_components=n_lsa_comp, 
                         algorithm='randomized',
                         n_iter=10, random_state=42)
# the matrix is transposed before fitting, so its rows are the phenotype_array columns
svd_model.fit(phenotype_array.T)

In [None]:
from sklearn.preprocessing import StandardScaler

# Keep the SVD components that each explain at least 0.5% of the variance.
keep_indices = np.where(svd_model.explained_variance_ratio_ >= 0.005)[0]
print(keep_indices)

n_umap_comp = 2
# fit UMAP on the standardised loadings of the retained components
svd_components = svd_model.components_.T[:, keep_indices]
scaled_svd = StandardScaler().fit_transform(svd_components)
# Fixed seed so the embedding is reproducible across kernel restarts.
reducer = umap.UMAP(n_components=n_umap_comp, n_neighbors=15, random_state=42)
embedding = reducer.fit_transform(scaled_svd)

In [None]:
# Inspect the explained-variance ratio of each fitted SVD component.
svd_model.explained_variance_ratio_

In [None]:
# Scatter plot of the two UMAP embedding dimensions.
fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1])
fig.show()

In [None]:
# Scatter plot of the first two columns of svd_components, for comparison with the UMAP view.
fig = px.scatter(x=svd_components[:, 0], y=svd_components[:, 1])
fig.show()

## Try using a weighted graph to calculate distances 

In [None]:
# make a weighted version of the graph
node_key = np.asarray(list(a_tree)).astype(int)

# calculate node depths (hop count from the root) with a single BFS from the
# root instead of one shortest-path query per node; entries for ids that are
# not tree nodes stay NaN
depth_by_node = nx.single_source_shortest_path_length(a_tree, root_node)
depth_vec = np.empty((np.max(node_key)+1,))
depth_vec[:] = np.nan
for n in node_key:
    depth_vec[n] = depth_by_node[n]

# get unique list of depths
depth_index = np.unique(depth_vec)

# generate new edge list: iterate the tree's edges directly rather than testing
# every node pair. An edge whose shallower endpoint is at depth d gets weight
# 2**(-d), so edges closer to the root are longer.
wt_edge_container = []
for node_i, node_j in a_tree.edges:
    level = np.min([depth_vec[node_i], depth_vec[node_j]])
    wt = 2**(-level)
    wt_edge_container.append(tuple([node_i, node_j, wt]))

# make tree: copy the nodes (with attributes, in a_tree's order) into the new
# graph. Assigning a_tree.nodes to the `nodes` attribute would only attach a
# view of a_tree without registering the nodes in the new graph.
a_tree_weighted = nx.Graph()
a_tree_weighted.add_nodes_from(a_tree.nodes(data=True))
a_tree_weighted.add_weighted_edges_from(wt_edge_container)

In [None]:
# Collect the "weight" attribute of every edge of the weighted tree.
# NOTE(review): `weight` does not appear to be used by any later cell — confirm before relying on it.
weight = nx.get_edge_attributes(a_tree_weighted, "weight")


In [None]:
# NOTE(review): dist_mat is first assigned in a later cell (the one that builds the
# distance matrix), so on a fresh kernel run top-to-bottom this lookup raises
# NameError. Move this cell below that one.
dist_mat[865, 1031]

In [None]:
# NOTE(review): at this point node_index is still the array of tree node ids built in the
# descendant-count cell; it is only rebuilt as a node-id -> matrix-row lookup in a later
# cell, so the value shown here depends on execution order.
node_index[1031]

In [None]:
# Distance between two nodes after translating entries of node_index into dist_mat indices.
# NOTE(review): both dist_mat and the lookup form of node_index are defined in later cells,
# so on a fresh kernel run top-to-bottom this cell fails. Move it below them.
dist_mat[int(node_index[1031]), int(node_index[865])]

In [None]:
import plotly.express as px

# a_tree_uni = anatomy_tree.to_undirected()

# All-pairs weighted path lengths on the weighted tree: {source: {target: distance}}
distance_dict = dict(nx.shortest_path_length(a_tree_weighted, weight="weight"))
node_list = list(a_tree_weighted.nodes)

# make distance matrix: row/column k corresponds to node_list[k]
n_nodes = len(distance_dict)
dist_mat = np.zeros((n_nodes, n_nodes))
for i in range(n_nodes):
    dist_from_i = distance_dict.get(node_list[i], {})
    for j in range(n_nodes):
        # Pairs with no recorded path keep the initial 0.0. The missing-key case
        # is handled explicitly so that any other error is no longer hidden by a
        # bare except.
        dist_mat[i, j] = dist_from_i.get(node_list[j], 0.0)

In [None]:
# Lookup from node id to its row/column position in dist_mat; ids that are not
# in node_list keep the sentinel value -1.
node_index = np.full((np.max(node_list)+1,), -1.0)
node_index[np.asarray(node_list)] = np.arange(len(node_list))

In [None]:
# Heat map of the pairwise node distance matrix.
fig = px.imshow(dist_mat)
fig.show()

## Next, construct a gene-level graph

In [None]:
# add node ID info
zfin_pheno_node2 = zfin_pheno_ft.merge(node_df.loc[:, ["Anatomy ID", "node_id"]].drop_duplicates(), how="left",
                                      left_on="ID", right_on="Anatomy ID").drop(labels="pheno_ID", axis=1)

zfin_pheno_node2 = zfin_pheno_node2.loc[:, ["gene", "structure", "ID", "node_id"]].drop_duplicates().reset_index(drop=True)

gene_index = np.unique(zfin_pheno_node2["gene"])
gene_node_list = []
for g, gene in enumerate(gene_index):
    gene_nodes = zfin_pheno_node2.loc[zfin_pheno_node2["gene"]==gene, "node_id"].to_numpy()
    assert len(gene_nodes) > 0
    gene_node_list.append(gene_nodes)

In [None]:
from tqdm import tqdm

# gene pairs whose distance is at most max_dist become edges of the gene graph
max_dist = 3

# Convert each gene's node ids to dist_mat row indices once, up front.
# NOTE(review): node ids missing from the lookup map to -1, which would silently
# select the last row of dist_mat — confirm every gene node is in the tree.
gene_row_list = [node_index[gene_nodes].astype(int) for gene_nodes in gene_node_list]

# generate weighted edges
gene_edge_container = []
gene_dist_mat = np.zeros((len(gene_index), len(gene_index)))
for i in tqdm(range(len(gene_index))):
    i_nodes = gene_row_list[i]

    for j in range(i+1, len(gene_index)):
        j_nodes = gene_row_list[j]

        # Pairwise block of node distances: rows follow gene j's nodes, columns
        # gene i's nodes. np.ix_ builds the (len(j_nodes), len(i_nodes)) block
        # directly; reshaping the broadcast (len(i_nodes), len(j_nodes)) result
        # instead of transposing it would mix up the entries.
        ij_array = dist_mat[np.ix_(j_nodes, i_nodes)]

        # mean distance from each node of one gene to its closest node in the other
        i_mean = np.mean(np.min(ij_array, axis=0))
        j_mean = np.mean(np.min(ij_array, axis=1))

        # the matrix is deliberately asymmetric: [i, j] is gene i -> gene j
        dist_avg = np.max([i_mean, j_mean])
        gene_dist_mat[i , j] = i_mean
        gene_dist_mat[j , i] = j_mean

        if dist_avg <= max_dist:
            gene_edge_container.append(tuple([i, j, 1 / (0.1 + dist_avg)]))

In [None]:
# Heat map of the gene-by-gene distance matrix. For i < j, entry [i, j] holds
# i_mean and entry [j, i] holds j_mean, so the matrix need not be symmetric.
fig = px.imshow(gene_dist_mat)
fig.show()

In [None]:
from sklearn.decomposition import TruncatedSVD, PCA

# Fit a truncated SVD (fixed seed) to the transposed gene distance matrix.
# NOTE(review): PCA is imported but not used in this cell.
gene_dist_mat_fit = gene_dist_mat.copy()
# gene_dist_mat_fit[gene_dist_mat_fit > 6] = 6
# pull out binary phenotype array
n_lsa_comp = 100
svd_model = TruncatedSVD(n_components=n_lsa_comp, 
                         algorithm='randomized',
                         n_iter=10, random_state=42)
svd_model.fit(gene_dist_mat_fit.T)
# keep_indices = np.where(svd_model.explained_variance_ratio_ >= 0.01)[0]
# print(keep_indices)

In [None]:
from sklearn.preprocessing import StandardScaler

# SVD components that each explain at least 0.1% of the variance. These are
# printed for inspection only; the UMAP below is fitted on the full gene
# distance matrix.
keep_indices = np.where(svd_model.explained_variance_ratio_ >= 0.001)[0]
print(keep_indices)

n_umap_comp = 2
# fit UMAP
# svd_components = svd_model.components_.T[:, keep_indices]
svd_components = gene_dist_mat
scaled_svd = StandardScaler().fit_transform(svd_components)
# Fixed seed so the embedding is reproducible across kernel restarts.
reducer = umap.UMAP(n_components=n_umap_comp, n_neighbors=5, random_state=42)
embedding = reducer.fit_transform(scaled_svd)

In [None]:
# Show the dimensions of the fitted SVD component matrix.
svd_model.components_.shape

In [None]:
# Scatter plot of the two UMAP dimensions of the gene-distance embedding.
fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1])
fig.show()

In [None]:
# Display the raw embedding coordinates.
embedding

## Try rolling up tree graph to a fixed level

In [None]:
# `anatomy_tree` is not defined anywhere in this notebook; the undirected BFS
# tree built above is `a_tree`.
node_key = np.asarray(list(a_tree)).astype(int)
# calculate node depths (hop count from the root) with a single BFS from the
# root; entries for ids that are not tree nodes stay NaN
depth_by_node = nx.single_source_shortest_path_length(a_tree, root_node)
depth_vec = np.empty((np.max(node_key)+1,))
depth_vec[:] = np.nan
for n in node_key:
    depth_vec[n] = depth_by_node[n]

In [None]:
# Distinct depths present in the tree and the number of nodes at each depth.
depth_index, d_counts = np.unique(depth_vec[~np.isnan(depth_vec)], return_counts=True)
print(d_counts)

# level_dict[d] maps every node id at depth d to its position (0, 1, ...) within that depth.
level_dict = dict({})
for d in range(len(d_counts)):
    d_nodes = np.where(depth_vec==d)[0]
    level_dict[d] = dict(zip(d_nodes, np.arange(len(d_nodes))))

Define a function that walks up or down the tree from a phenotype node to find its ancestor or descendant nodes at the desired depth level.

In [None]:
def update_phenotype_vector(root_id, target_id, level, level_dict, phenotype_vec, target_depth, tree=None):
    """Project one phenotype node onto a fixed tree depth and count the hits.

    The node ``target_id`` is mapped onto the nodes at depth ``level``:

    * same depth: the node itself,
    * deeper than ``level``: the node(s) reached by following predecessors
      upward for ``target_depth - level`` steps,
    * shallower than ``level``: every node reached by following successors
      downward for ``level - target_depth`` steps (none if the branch ends
      earlier).

    Each node found has its entry in ``phenotype_vec`` incremented by one.

    Parameters
    ----------
    root_id :
        Not used by the function; kept so existing calls keep working.
    target_id :
        Id of the phenotype node to project.
    level : int
        Depth onto which the node is projected; must be a key of ``level_dict``.
    level_dict : dict
        ``level_dict[level]`` maps node id -> position in ``phenotype_vec``.
    phenotype_vec :
        Indexable, mutable count vector; modified in place.
    target_depth :
        Depth of ``target_id``. If it compares neither equal to, greater than,
        nor less than ``level`` (e.g. NaN), nothing is counted.
    tree : optional
        Directed tree providing ``predecessors`` and ``successors``. Defaults
        to the notebook's ``a_tree_dir`` (the name ``anatomy_tree_dir``
        referenced previously is not defined in this notebook).

    Returns
    -------
    The same ``phenotype_vec`` object, after the update.
    """
    if tree is None:
        tree = a_tree_dir

    lvl_dict = level_dict[level]

    # choose walking direction and number of steps
    if target_depth == level:
        step, n_iters = None, 0
    elif target_depth > level:
        step, n_iters = tree.predecessors, int(target_depth - level)
    elif target_depth < level:
        step, n_iters = tree.successors, int(level - target_depth)
    else:
        # depth is not comparable with level (e.g. NaN): leave the vector untouched
        return phenotype_vec

    # move the whole frontier one generation per iteration
    query_nodes = [target_id]
    for _ in range(n_iters):
        query_nodes = [nbr for q in query_nodes for nbr in step(q)]

    for node in query_nodes:
        phenotype_vec[lvl_dict[node]] += 1

    return phenotype_vec


In [None]:
# Manual spot check of update_phenotype_vector on a single node.
# NOTE(review): 3080 and 2376 are hard-coded node ids; 3080 is presumably the id of the
# linker/root node (dummy_id) — confirm against the current graph.
root_id = 3080
target_id = 2376
level = 3

# initialize vector to store results (d_counts[level] rows, one column)
phenotype_vec = np.zeros((d_counts[level], 1))
# depth of the target node, read from the depth lookup
target_depth = depth_vec[target_id]

phenotype_vec = update_phenotype_vector(root_id, target_id, level, level_dict, phenotype_vec, target_depth)

In [None]:
# `anatomy_tree_dir` is not defined in this notebook; the directed tree is `a_tree_dir`.
all_nodes = a_tree_dir.nodes

# Print the attributes of every node at the chosen depth.
level = 1
node_list = list(level_dict[level].keys())
node_info = np.asarray([all_nodes[n] for n in node_list])
print(node_info)

In [None]:
# Attributes and direct children of node 1031 in the directed tree
# (`a_tree_dir`; the name `anatomy_tree_dir` is not defined in this notebook).
print(all_nodes[1031])
list(a_tree_dir.successors(1031))

In [None]:
# Show the attributes stored for node 531.
all_nodes[531]

In [None]:
# Direct children of node 5 in the directed tree (`a_tree_dir`; the name
# `anatomy_tree_dir` is not defined in this notebook).
list(a_tree_dir.successors(5))

In [None]:
# Number of direct children of every tree node, indexed by node id (ids that are
# not tree nodes keep 0). Uses `a_tree_dir`; the name `anatomy_tree_dir` is not
# defined in this notebook.
ns_vec = np.zeros((np.max(node_key)+1,))
for n in node_key:
    ns_vec[n] = len(list(a_tree_dir.successors(n)))

In [None]:
# Rank node ids by their number of direct successors and keep the 25 largest.
si = np.argsort(ns_vec)
top_nodes = si[::-1][:25]

# Print the attributes of those nodes, followed by their successor counts.
node_info = np.asarray([all_nodes[n] for n in top_nodes])
print(node_info)
print(ns_vec[top_nodes])

In [None]:
# Display the argsort order of ns_vec (ascending successor count).
si

Get the list of nodes (phenotypes) that correspond to each gene.

In [None]:
# add node ID info
zfin_pheno_node = zfin_pheno_ft.merge(node_df.loc[:, ["Anatomy ID", "node_id"]].drop_duplicates(), how="left",
                                      left_on="ID", right_on="Anatomy ID").drop(labels="pheno_ID", axis=1)

zfin_pheno_node = zfin_pheno_node.loc[:, ["gene", "structure", "ID", "node_id"]].drop_duplicates().reset_index(drop=True)

gene_index = np.unique(zfin_pheno_node["gene"])
gene_node_list = []
for g, gene in enumerate(gene_index):
    gene_nodes = zfin_pheno_node.loc[zfin_pheno_node["gene"]==gene, "node_id"].to_numpy()
    assert len(gene_nodes) > 0
    gene_node_list.append(gene_nodes)

In [None]:
from tqdm import tqdm 

# NOTE(review): 3080 is a hard-coded node id, presumably the linker/root node — confirm.
root_id = 3080
# tree depth onto which every phenotype is rolled up
level = 1

# generate array to store phenotype_vectors: one row per gene, one column per node at `level`
phenotype_array = np.zeros((len(gene_index), d_counts[level]))

# iterate through each gene to build a phenotype vector.
# tqdm wraps the sequence itself (not the enumerate generator) so the progress
# bar knows the total number of genes.
for g, gi in enumerate(tqdm(gene_index)):
    # initialize vector
    phenotype_vec = np.zeros((d_counts[level],))
    # get list of phenotypes
    gene_nodes = gene_node_list[g]

    for target_id in gene_nodes:

        # the root itself is skipped
        if target_id != root_id:
            # get depth
            target_depth = depth_vec[target_id]
            # update phenotype vector
            phenotype_vec = update_phenotype_vector(root_id, target_id, level, level_dict, phenotype_vec, target_depth)

    # add to main array
    phenotype_array[g, :] = phenotype_vec

In [None]:
# Boolean mask of the rows (genes) of phenotype_array whose entries sum to more than zero.
keep_flags = np.sum(phenotype_array, axis=1) > 0

In [None]:
# Heat map of the first ten rows of the rolled-up phenotype matrix.
px.imshow(phenotype_array[0:10, :])

## Calculate UMAP projection

In [None]:
from sklearn.decomposition import TruncatedSVD, PCA

# Fit a truncated SVD (fixed seed) to the transposed phenotype matrix, using only
# the rows selected by keep_flags.
# NOTE(review): PCA is imported but not used in this cell.
# pull out binary phenotype array
n_lsa_comp = 25
svd_model = TruncatedSVD(n_components=n_lsa_comp, 
                         algorithm='randomized',
                         n_iter=100, random_state=42)
svd_model.fit(phenotype_array[keep_flags, :].T)


In [None]:
from sklearn.preprocessing import StandardScaler

n_umap_comp = 2
# fit UMAP on the loadings of the first 14 SVD components (no standardisation here)
svd_components = svd_model.components_.T[:, :14]
# Fixed seed so the embedding is reproducible across kernel restarts.
reducer = umap.UMAP(n_components=n_umap_comp, random_state=42)
# scaled_svd = StandardScaler().fit_transform(svd_components)
embedding = reducer.fit_transform(svd_components)

In [None]:
# Scatter plot of the two UMAP dimensions fitted on the SVD loadings.
fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1])
fig.show()

In [None]:
n_umap_comp = 2
# fit UMAP directly on the rolled-up phenotype matrix (rows selected by keep_flags),
# without the SVD step
# svd_components = svd_model.components_.T
# Fixed seed so the embedding is reproducible across kernel restarts.
reducer = umap.UMAP(n_components=n_umap_comp, random_state=42)
# scaled_svd = StandardScaler().fit_transform(svd_components)
embedding_full = reducer.fit_transform(phenotype_array[keep_flags, :])

In [None]:
# Scatter plot of the two UMAP dimensions fitted on the full phenotype matrix.
fig = px.scatter(x=embedding_full[:, 0], y=embedding_full[:, 1])
fig.show()

In [None]:
# Print the explained-variance ratio of each component of the most recently fitted SVD model.
print(svd_model.explained_variance_ratio_)