**The purpose of this notebook is to recapitulate in python the phenotype clustering script that Cole wrote in R.**

**The key steps in this process are:**
1) Load and filter the raw phenotype data
2) Convert string phenotypes to wideform binary array
3) Perform UMAP compression and cluster

In [None]:
import numpy as np
import pandas as pd
import umap.umap_ as umap
import plotly.express as px
import os

### Load phenotype data

In [None]:
# Folder holding the raw ZFIN exports and the pre-cleaned phenotype table
raw_data_dir = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/zfin/20240326/"

# Folder that receives the tables written by this notebook; created if missing
built_data_dir = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/zfin/20240326/built_data_py/"
os.makedirs(built_data_dir, exist_ok=True)

# Pre-cleaned single-mutant phenotype table; later cells use only its "gene"
# column, to restrict the ZFIN data to the same set of genes
cole_phenotype_csv = os.path.join(raw_data_dir, "clean_zfin_single-mut_with-ids_phenotype_df.csv")
phenotype_df_cole = pd.read_csv(cole_phenotype_csv)

### Load ontology info

In [None]:
# Every ZFIN export read here is tab-separated; header=1 takes the second line
# of the file as the column names (the first line is skipped).
zfin_tsv_opts = {"sep": "\t", "header": 1}

# Anatomy ontology: terms, term-to-term relationships, and synonyms
anatomy_nodes_df = pd.read_csv(os.path.join(raw_data_dir, "anatomy_item.txt"), **zfin_tsv_opts)
anatomy_edges_df = pd.read_csv(os.path.join(raw_data_dir, "anatomy_relationship.txt"), **zfin_tsv_opts)
anatomy_synonyms_df = pd.read_csv(os.path.join(raw_data_dir, "anatomy_synonyms.txt"), **zfin_tsv_opts)

# Gene-level phenotype annotations and the developmental stage table
zfin_pheno_df = pd.read_csv(os.path.join(raw_data_dir, "phenoGeneCleanData_fish.txt"), **zfin_tsv_opts)
stage_df = pd.read_csv(os.path.join(raw_data_dir, "stage_ontology.txt"), **zfin_tsv_opts)
# print(anatomy_edges_df.head())
# print(anatomy_nodes_df.head())

## Build cleaned zfin dataset

In [None]:
# Short aliases for the verbose ZFIN column headers
zfin_column_aliases = {
    "Affected Structure or Process 1 superterm ID": "structure_1_ID",
    "Affected Structure or Process 1 superterm Name": "structure_1",
    "Affected Structure or Process 2 superterm ID": "structure_2_ID",
    "Affected Structure or Process 2 superterm name": "structure_2",
    "Gene Symbol": "gene",
    "Gene ID": "gene_ID",
    "Phenotype Keyword ID": "pheno_ID",
}
# Columns carried forward (aliases plus the untouched stage/figure columns)
zfin_keep_columns = [
    "gene", "gene_ID", "structure_1", "structure_1_ID", "structure_2", "structure_2_ID", "pheno_ID",
    "Start Stage ID", "End Stage ID", "Figure ID",
]
zfin_pheno_df = zfin_pheno_df.rename(columns=zfin_column_aliases)[zfin_keep_columns]

# Keep only genes that also occur in the pre-cleaned phenotype table
genes_in_cole_df = phenotype_df_cole["gene"].drop_duplicates()
zfin_pheno_df = zfin_pheno_df.merge(genes_in_cole_df, how="inner", on="gene")

# Attach the start time (hours) of the stage at which each phenotype begins.
# The join brings in a "Stage ID" column from stage_df as well.
stage_begin_key = stage_df[["Stage ID", "Begin Hours"]]
zfin_pheno_df = zfin_pheno_df.merge(
    stage_begin_key, how="left", left_on="Start Stage ID", right_on="Stage ID"
).rename(columns={"Begin Hours": "start_hpf"})

# Attach the end time (hours) of the stage at which each phenotype ends.
# This second join adds another "Stage ID" column, so pandas suffixes the two
# as "Stage ID_x" / "Stage ID_y".
stage_end_key = stage_df[["Stage ID", "End Hours"]]
zfin_pheno_df = zfin_pheno_df.merge(
    stage_end_key, how="left", left_on="End Stage ID", right_on="Stage ID"
).rename(columns={"End Hours": "end_hpf"})


### Make phenotype DF longform

In [None]:
# Work on a copy so the wide table stays available unchanged
zfin_pheno_temp = zfin_pheno_df.copy()

# Join-key leftovers from the two stage merges; not needed in the long table
stage_key_cols = ["Stage ID_x", "Stage ID_y"]

# One frame per structure slot: drop the other slot's columns and give the
# remaining pair the shared names "structure" / "ID"
zfin_pheno1 = (
    zfin_pheno_temp
    .drop(columns=["structure_2", "structure_2_ID", *stage_key_cols])
    .rename(columns={"structure_1": "structure", "structure_1_ID": "ID"})
)
zfin_pheno2 = (
    zfin_pheno_temp
    .drop(columns=["structure_1", "structure_1_ID", *stage_key_cols])
    .rename(columns={"structure_2": "structure", "structure_2_ID": "ID"})
)

# Stack both slots, discard rows without a structure, and de-duplicate
stacked_pheno = pd.concat([zfin_pheno1, zfin_pheno2], axis=0, ignore_index=True)
zfin_pheno_long = stacked_pheno.dropna(subset=["structure", "ID"]).drop_duplicates()

# Persist the long table, then report its shape before and after a second
# de-duplication pass (the frame is already de-duplicated above, so the two
# printed shapes are expected to match)
zfin_pheno_long.to_csv(os.path.join(built_data_dir, "zfin_phenotypes_clean.csv"), index=False)
print(zfin_pheno_long.shape)
zfin_pheno_long = zfin_pheno_long.drop_duplicates()
print(zfin_pheno_long.shape)

## Clean up anatomy data and build an ontology graph

In [None]:
# --- Edges: keep only the relationship types used to build the graph ---
keep_edge_types = ["is_a", "part of"]
edge_vec = anatomy_edges_df["Relationship Type ID"].to_list()
keep_flags = anatomy_edges_df["Relationship Type ID"].isin(keep_edge_types).to_numpy()

edge_columns = ["Parent Item ID", "Child Item ID", "Relationship Type ID"]
edge_df = anatomy_edges_df.loc[keep_flags, edge_columns].reset_index(drop=True)

# --- Nodes: one row per unique (ID, name) pair; the row position becomes the
# integer node id used everywhere downstream ---
node_df = anatomy_nodes_df[["Anatomy ID", "Anatomy Name"]].drop_duplicates().reset_index(drop=True)
node_df["node_id"] = node_df.index

# (node_id, attribute-dict) tuples in the form networkx's add_nodes_from accepts
anatomy_nodes_id_vec = node_df["Anatomy ID"].to_numpy()
node_container = [
    (i, {"name": a_term, "id": a_id})
    for i, (a_term, a_id) in enumerate(zip(node_df["Anatomy Name"], anatomy_nodes_id_vec))
]

# --- Translate the parent/child anatomy IDs of each edge into node ids ---
node_key = node_df[["Anatomy ID", "node_id"]]
edge_df = edge_df.merge(
    node_key, how="left", left_on="Parent Item ID", right_on="Anatomy ID"
).rename(columns={"node_id": "from_id"})
edge_df = edge_df.merge(
    node_key, how="left", left_on="Child Item ID", right_on="Anatomy ID"
).rename(columns={"node_id": "to_id"})

# Drop the join helper columns and any edge whose endpoint has no matching node
edge_df = edge_df[["Parent Item ID", "Child Item ID", "Relationship Type ID", "from_id", "to_id"]]
edge_df = edge_df.dropna(subset=["from_id", "to_id"]).reset_index(drop=True)
edge_df.head()

In [None]:
import networkx as nx

# Undirected anatomy graph: nodes carry the term name and anatomy ID
a_graph = nx.Graph()
a_graph.add_nodes_from(node_container)

# One (from_id, to_id) tuple per retained ontology relationship
edge_container = list(zip(edge_df["from_id"].to_numpy(), edge_df["to_id"].to_numpy()))

a_graph.add_edges_from(edge_container)

In [None]:
# import matplotlib.pyplot as plt

# # pos = nx.nx_agraph.graphviz_layout(a_graph, prog="twopi", args="")
# # x_vec = [pos[i][0] for i in range(len(pos))]
# # y_vec = [pos[i][1] for i in range(len(pos))]

# pos = nx.nx_agraph.graphviz_layout(a_graph, prog="twopi", args="")
# plt.figure(figsize=(8, 8))
# nx.draw(a_graph, pos, node_size=20, alpha=0.5, node_color="blue", with_labels=False)
# plt.axis("equal")
# plt.show()

### Calculate graph distance between all pairs of anatomy terms in the ontology graph

In [None]:
import plotly.express as px

# All-pairs shortest-path lengths (in edges) on the anatomy graph:
# {source node: {reachable target node: path length}}
distance_dict = dict(nx.shortest_path_length(a_graph))

# Dense distance matrix indexed by node id. Pairs that are not connected in
# the graph do not appear in distance_dict; they are left at infinity so that
# disconnected terms are never mistaken for identical ones (distance 0).
n_nodes = len(distance_dict)
dist_mat = np.full((n_nodes, n_nodes), np.inf)
for source, lengths in distance_dict.items():
    for target, path_len in lengths.items():
        # node ids can surface as floats when they came through a merge
        dist_mat[int(source), int(target)] = path_len

In [None]:
# Heatmap of the node-by-node graph distance matrix
fig = px.imshow(dist_mat)
fig.show()

### Now, use the phenotype graph to calculate pairwise distances between gene phenotypes

In [None]:
# Keep only phenotypes whose start stage begins at or before 72 hpf
print(zfin_pheno_long.shape)
starts_by_72hpf = zfin_pheno_long["start_hpf"] <= 72
zfin_pheno_ft = zfin_pheno_long[starts_by_72hpf]
print(zfin_pheno_ft.shape)

# Keep only rows whose structure ID contains "ZFA"; every other ID is dropped
id_vec = zfin_pheno_ft["ID"].tolist()
keep_flags = np.fromiter(("ZFA" in term_id for term_id in id_vec), dtype=bool, count=len(id_vec))
zfin_pheno_ft = zfin_pheno_ft.loc[keep_flags]
print(zfin_pheno_ft.shape)
zfin_pheno_ft.head(20)

#### Get node numbers for phenotypes matched to each gene

In [None]:
# Attach the integer graph node id to every (gene, structure) annotation
zfin_pheno_node = zfin_pheno_ft.merge(
    node_df.loc[:, ["Anatomy ID", "node_id"]].drop_duplicates(),
    how="left", left_on="ID", right_on="Anatomy ID",
)
zfin_pheno_node = zfin_pheno_node.loc[:, ["gene", "structure", "ID", "node_id"]].drop_duplicates()

# A left join leaves node_id missing (and the column float-typed) for any
# structure ID that is absent from node_df. Such rows cannot be placed on the
# graph, and float ids cannot be used to index the distance matrix, so drop
# them and restore an integer dtype.
n_unmatched = int(zfin_pheno_node["node_id"].isna().sum())
if n_unmatched:
    print(f"Dropping {n_unmatched} annotation(s) with no matching anatomy node")
zfin_pheno_node = zfin_pheno_node.dropna(subset=["node_id"]).reset_index(drop=True)
zfin_pheno_node["node_id"] = zfin_pheno_node["node_id"].astype(int)

# Sorted unique gene names; position in this array is the gene's index below
gene_index = np.unique(zfin_pheno_node["gene"])

# For each gene (in gene_index order), the array of node ids it is annotated
# with. Every gene in gene_index has at least one row, so no array is empty.
nodes_by_gene = zfin_pheno_node.groupby("gene")["node_id"]
gene_node_list = [nodes_by_gene.get_group(gene).to_numpy() for gene in gene_index]

#### Iterate through genes and calculate phenotypic distances using the graph

In [None]:
from tqdm import tqdm

# Gene pairs whose distance exceeds this get no edge in the weighted graph
max_dist = 5

# (i, j, weight) tuples for the weighted gene graph
gene_edge_container = []
n_genes = len(gene_index)
# Directed gene-to-gene distances; the diagonal is left at 0
gene_dist_mat = np.zeros((n_genes, n_genes))

for i in tqdm(range(n_genes)):
    i_nodes = gene_node_list[i]

    for j in range(i + 1, n_genes):
        j_nodes = gene_node_list[j]

        # Sub-matrix of graph distances with one row per node of gene j and
        # one column per node of gene i. np.ix_ builds the cross product
        # directly; reshaping a (n_i, n_j) result into (n_j, n_i) would
        # scramble the entries instead of transposing them.
        ij_array = dist_mat[np.ix_(j_nodes, i_nodes)]

        # For each node of one gene, distance to the closest node of the
        # other gene, averaged over that gene's nodes
        i_mean = np.mean(np.min(ij_array, axis=0))  # over gene i's nodes
        j_mean = np.mean(np.min(ij_array, axis=1))  # over gene j's nodes

        gene_dist_mat[i, j] = j_mean
        gene_dist_mat[j, i] = i_mean

        # Use the larger of the two directed distances for the edge test
        dist_avg = np.max([i_mean, j_mean])
        if dist_avg <= max_dist:
            # weight is a similarity: it grows as the distance shrinks;
            # the 0.1 offset keeps it finite at zero distance
            gene_edge_container.append((i, j, 1 / (0.1 + dist_avg)))

In [None]:
# Heatmap of the gene-by-gene distance matrix (shown as the cell's output)
px.imshow(gene_dist_mat)

In [None]:
# Distribution of edge weights (third element of each edge tuple)
weight_vec = np.asarray([weight for _, _, weight in gene_edge_container])
fig = px.histogram(x=weight_vec)
fig.show()

In [None]:
# Number of nearest neighbours linked to each gene
nn_keep = 5

# Unweighted (i, j) edges of the nearest-neighbour graph
edge_container_knn = []
# included_mat[i, j] == 1 once j has been visited as a neighbour of i
included_mat = np.zeros((len(gene_index), len(gene_index)))

for i in tqdm(range(len(gene_index))):
    # Exclude the gene itself from its own neighbour list.
    # NOTE: this writes inf onto the diagonal of gene_dist_mat in place, so
    # every later cell that reads gene_dist_mat sees an infinite diagonal.
    gene_dist_mat[i, i] = np.inf
    neighbor_order = np.argsort(gene_dist_mat[i, :])

    for j in neighbor_order[:nn_keep]:
        # Add the pair only if it has not been recorded from either side
        already_linked = included_mat[i, j] or included_mat[j, i]
        if not already_linked:
            edge_container_knn.append((i, j))

        included_mat[i, j] = 1

In [None]:
# One (node id, attributes) tuple per gene; the node id is the gene's
# position in gene_index
gene_node_container = [(g, {"name": gene}) for g, gene in enumerate(gene_index)]

In [None]:
# Weighted gene graph: one node per gene, one edge per (i, j, weight) tuple;
# the weight is stored under the "weight" edge attribute
gene_graph = nx.Graph()
gene_graph.add_nodes_from(gene_node_container)
gene_graph.add_weighted_edges_from(gene_edge_container, weight="weight")

In [None]:
# Unweighted nearest-neighbour gene graph
gene_graph_knn = nx.Graph()
gene_graph_knn.add_nodes_from(gene_node_container)
gene_graph_knn.add_edges_from(edge_container_knn)

# Edge count, shown as the cell's output
gene_graph_knn.number_of_edges()

In [None]:
# 2D spring layout of the nearest-neighbour graph: {node id: position}
pos = nx.spring_layout(gene_graph_knn)

In [None]:
# Pack the layout dict into an (n_nodes, 2) array ordered by node id
n_layout_nodes = len(pos)
pos_array = np.empty((n_layout_nodes, 2))
for node_id in range(n_layout_nodes):
    pos_array[node_id] = pos[node_id]

# Scatter of the layout, one point per gene, coloured by gene name
x_coords, y_coords = pos_array[:, 0], pos_array[:, 1]
fig = px.scatter(x=x_coords, y=y_coords, color=gene_index, opacity=1)
fig.show()

In [None]:
# Gene pairs farther apart than this get no edge
max_dist = 2

# (i, j, weight) tuples for the distance-thresholded gene graph
edge_container_knn_w = []
n_genes = len(gene_index)

for i in tqdm(range(n_genes)):
    # Visit each unordered pair once and never pair a gene with itself, so
    # the result does not depend on what an earlier cell left on the
    # diagonal of gene_dist_mat.
    for j in range(i + 1, n_genes):
        # Symmetric distance: the larger of the two directed distances,
        # floored at 0.1 so the reciprocal below stays finite
        dist = np.max([gene_dist_mat[i, j], gene_dist_mat[j, i], 0.1])
        if dist <= max_dist:
            # networkx layouts read the edge weight as attraction strength
            # (larger = pulled closer), so store a similarity (1 / distance)
            # rather than the raw distance.
            edge_container_knn_w.append((i, j, 1 / dist))

In [None]:
# Distance-thresholded gene graph; the third tuple element is stored under
# the "weight" edge attribute
gene_graph_kw = nx.Graph()
gene_graph_kw.add_nodes_from(gene_node_container)
gene_graph_kw.add_weighted_edges_from(edge_container_knn_w, weight="weight")

In [None]:
# Spring layout of the thresholded graph: {node id: position}; k sets the
# layout's target spacing between nodes
pos_kw = nx.spring_layout(gene_graph_kw, k=0.1)

In [None]:
# Pack the thresholded-graph layout into an (n_nodes, 2) array ordered by
# node id; sized from pos_kw itself rather than from another graph's layout
pos_array_kw = np.empty((len(pos_kw), 2))
for p in range(pos_array_kw.shape[0]):
    pos_array_kw[p, :] = pos_kw[p]

# Scatter of the layout, one point per gene, coloured by gene name
fig = px.scatter(x=pos_array_kw[:, 0], y=pos_array_kw[:, 1], color=gene_index)
fig.show()

In [None]:
# Spectral layout of the thresholded graph: {node id: position}
pos_spec = nx.spectral_layout(gene_graph_kw)


In [None]:
# Pack the spectral layout into an (n_nodes, 2) array ordered by node id
n_spec_nodes = len(pos_spec)
pos_array_spec = np.empty((n_spec_nodes, 2))
for node_id in range(n_spec_nodes):
    pos_array_spec[node_id] = pos_spec[node_id]

# Scatter of the layout, one point per gene, coloured by gene name
spec_x, spec_y = pos_array_spec[:, 0], pos_array_spec[:, 1]
fig = px.scatter(x=spec_x, y=spec_y, color=gene_index)
fig.show()

In [None]:
# Show the raw spectral-layout coordinates as the cell's output
pos_array_spec

## Generate a matrix that we can use for UMAP compression and clustering

In [None]:
# NOTE(review): `phenotype_df_long` is not defined in any cell visible in this
# notebook (nor are the "aff_struct_super_1" / "val" columns created here) --
# confirm which table is meant before running this cell.
pivot_input = phenotype_df_long[["gene", "start_hpf", "aff_struct_super_1", "val"]]

# One row per gene, one column per affected structure; cells with no
# observation are filled with 0 (pivot_table's default aggregation applies
# where a gene/structure pair occurs more than once)
pheno_df_wide = pivot_input.pivot_table(
    index=["gene"],
    columns=["aff_struct_super_1"],
    values=["val"],
    fill_value=0,
)

# Flatten the two-level column index down to the structure names; the former
# index column ends up with an empty label, which is renamed to "gene"
pheno_df_wide = pheno_df_wide.reset_index()
pheno_df_wide.columns = pheno_df_wide.columns.get_level_values(1)
pheno_df_wide = pheno_df_wide.rename(columns={"": "gene"})

## Calculate UMAP projection

In [None]:
from sklearn.decomposition import TruncatedSVD, PCA

# Numeric phenotype matrix: every column of the wide table except the first
# (the gene label), one row per gene
phenotype_columns = pheno_df_wide.iloc[:, 1:]
phen_mat = phenotype_columns.to_numpy()

# Reduce to 5 principal components before UMAP. The model is fitted on the
# transposed matrix, so each component has one entry per row of phen_mat.
transformer = PCA(n_components=5, random_state=0)
transformer.fit(phen_mat.T)

In [None]:
# Shape of the fitted component matrix, shown as the cell's output
transformer.components_.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Input to UMAP: the PCA components transposed, i.e. one row per row of the
# phenotype matrix and one column per component
svd_components = transformer.components_.T

# Project down to 2 dimensions with UMAP
n_umap_comp = 2
reducer = umap.UMAP(n_components=n_umap_comp)
embedding = reducer.fit_transform(svd_components)

In [None]:
# Scatter of the 2D UMAP embedding
umap_x, umap_y = embedding[:, 0], embedding[:, 1]
fig = px.scatter(x=umap_x, y=umap_y)
fig.show()

In [None]:
# Fraction of variance captured by each principal component. The fitted model
# is `transformer`; `svd_model` exists only in commented-out code.
print(transformer.explained_variance_ratio_)