**The purpose of this notebook is to recapitulate in python the phenotype clustering script that Cole wrote in R.**

**The key steps in this process are:**
1) Load and filter the raw phenotype data
2) Convert string phenotypes to wideform binary array
3) Perform UMAP compression and cluster

In [11]:
import numpy as np
import pandas as pd
import umap.umap_ as umap
import plotly.express as px
import os

### Load phenotype data

In [13]:
# set path to raw data
raw_data_dir = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/zfin/20240326/"

# set output directory
built_data_dir =  "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/zfin/20240326/built_data_py/" 
if not os.path.isdir(built_data_dir):
    os.makedirs(built_data_dir)
    
# load phenotype data and stage DF
phenotype_df_cole = pd.read_csv(os.path.join(raw_data_dir, "clean_zfin_single-mut_with-ids_phenotype_df.csv"))
# stage_to_hpf_key = pd.read_csv(os.path.join(raw_data_dir, "stage_to_hpf_key.csv"))
# phenotype_df = phenotype_df_raw.merge(stage_to_hpf_key, how = "left", on="start_stage")

### Load ontology info

In [14]:
anatomy_nodes_df = pd.read_csv(os.path.join(raw_data_dir, "anatomy_item.txt"), sep='\t', header=1)
anatomy_edges_df = pd.read_csv(os.path.join(raw_data_dir, "anatomy_relationship.txt"), sep='\t', header=1)
anatomy_synonyms_df = pd.read_csv(os.path.join(raw_data_dir, "anatomy_synonyms.txt"), sep='\t', header=1)
zfin_pheno_df = pd.read_csv(os.path.join(raw_data_dir, "phenoGeneCleanData_fish.txt"), sep='\t', header=1)
stage_df = pd.read_csv(os.path.join(raw_data_dir, "stage_ontology.txt"), sep='\t', header=1)
# print(anatomy_edges_df.head())
# print(anatomy_nodes_df.head())

(152927, 10)
(95889, 10)


## Build cleaned zfin dataset

In [None]:
zfin_pheno_df = zfin_pheno_df.rename(columns={
            "Affected Structure or Process 1 superterm ID": "structure_1_ID",
            "Affected Structure or Process 1 superterm Name": "structure_1",
            "Affected Structure or Process 2 superterm ID": "structure_2_ID",
            "Affected Structure or Process 2 superterm name": "structure_2",
            "Gene Symbol" : "gene",
            "Gene ID": "gene_ID",
            "Phenotype Keyword ID": "pheno_ID"
}).loc[:, ["gene", "gene_ID", "structure_1", "structure_1_ID", "structure_2", "structure_2_ID", "pheno_ID",
           "Start Stage ID", "End Stage ID", "Figure ID"]]

zfin_pheno_df = zfin_pheno_df.merge(phenotype_df_cole.loc[:, "gene"].drop_duplicates(), how="inner", on="gene")
zfin_pheno_df = zfin_pheno_df.merge(stage_df.loc[:, ["Stage ID", "Begin Hours"]], how="left", 
                                    left_on="Start Stage ID", right_on="Stage ID")

zfin_pheno_df = zfin_pheno_df.rename(columns={"Begin Hours":"start_hpf"})

zfin_pheno_df = zfin_pheno_df.merge(stage_df.loc[:, ["Stage ID", "End Hours"]], how="left", 
                                    left_on="End Stage ID", right_on="Stage ID")

zfin_pheno_df = zfin_pheno_df.rename(columns={"End Hours":"end_hpf"})


### Make phenotype DF longform

In [82]:
# zfin_pheno_long = pd.wide_to_long(zfin_pheno_df, stubnames=["structure"])
zfin_pheno_temp = zfin_pheno_df.copy()
# zfin_pheno_long["id"] = zfin_pheno_long.index
# id_key1 = zfin_pheno_long.loc[:, ["structure_1", "structure_1_ID"]].rename(columns={"structure_1":"structure", 
#                                                                                     "structure_1":"ID"})
# id_key2 = zfin_pheno_long.loc[:, ["structure_2", "structure_2_ID"]].rename(columns={"structure_2":"structure", 
#                                                                                     "structure_2":"ID"})
                                                                           
# id_key = pd.concat([id_key1, id_key2], axis=0, ignore_index=True).drop_duplicates()

zfin_pheno1 = zfin_pheno_temp.drop(labels=["structure_2", "structure_2_ID", "Stage ID_x", "Stage ID_y"], 
                                   axis=1).rename(columns={"structure_1":"structure", 
                                                                                    "structure_1_ID":"ID"})

zfin_pheno2 = zfin_pheno_temp.drop(labels=["structure_1", "structure_1_ID", "Stage ID_x", "Stage ID_y"], 
                                   axis=1).rename(columns={"structure_2":"structure", 
                                                                                    "structure_2_ID":"ID"})

zfin_pheno_long = pd.concat([zfin_pheno1, zfin_pheno2], axis=0, ignore_index=True).dropna(
    subset=["structure", "ID"]).drop_duplicates()


zfin_pheno_long.head()
zfin_pheno_long.to_csv(os.path.join(built_data_dir, "zfin_phenotypes_clean.csv"), index=False)

## Clean up anatomy data and build an ontology graph

In [49]:
# First, construct full graph
edge_vec = anatomy_edges_df["Relationship Type ID"].to_list()
keep_edge_types = ["is_a", "part of"]
keep_flags = np.asarray([e in keep_edge_types for e in edge_vec])

# filter for only desired edge types
edge_df = anatomy_edges_df.loc[keep_flags, ["Parent Item ID", "Child Item ID", "Relationship Type ID"]]
edge_df.reset_index(inplace=True, drop=True)
node_df = anatomy_nodes_df.loc[:, ["Anatomy ID", "Anatomy Name"]].drop_duplicates()
node_df.reset_index(inplace=True, drop=True)
node_df.loc[:, "node_id"] = node_df.index
# construct node dictionary
anatomy_nodes_id_vec = node_df["Anatomy ID"].to_numpy()
node_container = []
for i, a_term in enumerate(node_df["Anatomy Name"]):
    node_container.append(tuple([i, {"name": a_term, "id": anatomy_nodes_id_vec[i]}]))


# join node df to edges to get edge IDs
edge_df = edge_df.merge(node_df.loc[:, ["Anatomy ID", "node_id"]], 
                        how="left", left_on="Parent Item ID", right_on="Anatomy ID")
edge_df = edge_df.rename(columns={"node_id":"from_id"})

edge_df = edge_df.merge(node_df.loc[:, ["Anatomy ID", "node_id"]], 
                        how="left", left_on="Child Item ID", right_on="Anatomy ID")
edge_df = edge_df.rename(columns={"node_id":"to_id"})
                         
edge_df = edge_df.loc[:, ["Parent Item ID", "Child Item ID", "Relationship Type ID", "from_id", "to_id"]]
edge_df = edge_df.dropna(subset=["from_id", "to_id"])
edge_df.reset_index(inplace=True, drop=True)
edge_df.head()

Unnamed: 0,Parent Item ID,Child Item ID,Relationship Type ID,from_id,to_id
0,ZFA:0001129,ZFA:0001130,is_a,2112.0,2107
1,ZFA:0001129,ZFA:0001131,is_a,2112.0,2108
2,ZFA:0001129,ZFA:0001132,is_a,2112.0,2111
3,ZFA:0001129,ZFA:0001133,is_a,2112.0,2110
4,ZFA:0001129,ZFA:0001134,is_a,2112.0,2109


In [50]:
import networkx as nx
a_graph = nx.Graph()
a_graph.add_nodes_from(node_container)

edge_container = []
for i in range(edge_df.shape[0]):
    edge_container.append(tuple([edge_df.loc[i, "from_id"], edge_df.loc[i, "to_id"]]))
    
a_graph.add_edges_from(edge_container)

In [66]:
# import matplotlib.pyplot as plt

# # pos = nx.nx_agraph.graphviz_layout(a_graph, prog="twopi", args="")
# # x_vec = [pos[i][0] for i in range(len(pos))]
# # y_vec = [pos[i][1] for i in range(len(pos))]

# pos = nx.nx_agraph.graphviz_layout(a_graph, prog="twopi", args="")
# plt.figure(figsize=(8, 8))
# nx.draw(a_graph, pos, node_size=20, alpha=0.5, node_color="blue", with_labels=False)
# plt.axis("equal")
# plt.show()

In [59]:
pos[1][0]

2425.1

### Calculate graph distance between all genes in the phenotypes dataset

In [91]:
import plotly.express as px
distance_mat = dict(nx.shortest_path_length(a_graph))
px.imshow(distance_mat)

TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [93]:
distance_mat[20][10]

7

In [38]:
anatomy_nodes_df["flag"] = 1
phenotype_df_test = phenotype_df.merge(anatomy_nodes_df, left_on='aff_struct_super_1', 
                                       right_on= "Anatomy Name", how='left')


print(np.mean(~np.isnan(phenotype_df_test["flag"])))
phenotype_df_test.head(50)

0.3381054678312138


Unnamed: 0,gene,allele,start_stage,end_stage,phen_type,phen_tag,aff_struct_super_1,aff_struct_sub_1,aff_struct_super_2,aff_struct_sub_2,...,val,gene_id,start_hpf,end_hpf,Anatomy ID,Anatomy Name,Start Stage ID,End Stage ID,Unnamed: 4,flag
0,aldh1a2,i26/i26,Unknown,Unknown,quality,abnormal,pharyngeal_arch_3-7_skeleton,,,,...,1,ENSDARG00000053493,,,,,,,,
1,aldh1a2,i26/i26,Unknown,Unknown,quality,abnormal,pectoral_fin,,,,...,1,ENSDARG00000053493,,,,,,,,
2,kif11,hi486Tg,Unknown,Unknown,necrotic,abnormal,brain,,,,...,1,ENSDARG00000010948,,,ZFA:0000008,brain,ZDB-STAGE-020626-1,ZDB-STAGE-010723-39,,1.0
3,kif11,hi486Tg,Unknown,Unknown,decreased size,abnormal,eye,,,,...,1,ENSDARG00000010948,,,ZFA:0000107,eye,ZDB-STAGE-010723-10,ZDB-STAGE-010723-39,,1.0
4,kif11,hi486Tg,Unknown,Unknown,quality,abnormal,inner_ear,,,,...,1,ENSDARG00000010948,,,,,,,,
5,kif11,hi486Tg,Unknown,Unknown,bent,abnormal,whole_organism,,,,...,1,ENSDARG00000010948,,,,,,,,
6,sdad1,hi297Tg,Unknown,Unknown,decreased size,abnormal,eye,,,,...,1,ENSDARG00000105117,,,ZFA:0000107,eye,ZDB-STAGE-010723-10,ZDB-STAGE-010723-39,,1.0
7,sdad1,hi297Tg,Unknown,Unknown,quality,abnormal,gut,,,,...,1,ENSDARG00000105117,,,ZFA:0000112,gut,ZDB-STAGE-020626-1,ZDB-STAGE-010723-39,,1.0
8,sdad1,hi297Tg,Unknown,Unknown,quality,abnormal,heart,,,,...,1,ENSDARG00000105117,,,ZFA:0000114,heart,ZDB-STAGE-010723-31,ZDB-STAGE-010723-39,,1.0
9,sdad1,hi297Tg,Unknown,Unknown,quality,abnormal,liver,,,,...,1,ENSDARG00000105117,,,ZFA:0000123,liver,ZDB-STAGE-010723-32,ZDB-STAGE-010723-39,,1.0


### Perform some basic filtering steps

In [34]:
# remove phenotypes with late onset. Keep those with unknown onst time for now

phenotype_df.loc[np.isnan(phenotype_df["start_hpf"]), "start_hpf"] = -1
phenotype_df = phenotype_df.loc[phenotype_df["start_hpf"] < 72] # filter for phenotypes with onset prior to 3dpf

In [35]:
# remove rare phenotypes and uninformative phenotypes
min_freq = 4 # ad hoc threshold

phenotype_cat_df = phenotype_df.loc[:, ["aff_struct_super_1", "gene"]].groupby(["aff_struct_super_1"]).count()
phenotype_cat_df = phenotype_cat_df.rename(columns={"gene":"counts"})
phenotype_cat_df.reset_index(inplace=True)
# phenotype_cat_df = phenotype_cat_df.loc[phenotype_cat_df["counts"] >= min_freq, :]
# phenotype_cat_df = phenotype_cat_df.loc[phenotype_cat_df["aff_struct_super_1"] != "whole_organism"] # I don't find this useful

In [41]:
# use an inner join to filter
phenotype_df_long = phenotype_df.merge(phenotype_cat_df, how="inner", on="aff_struct_super_1")
phenotype_df_long = phenotype_df_long.loc[phenotype_df_long["phen_tag"] == "abnormal"] # this does nothing
phenotype_df_long["val"] = 1
phenotype_df_long.head()

Unnamed: 0,gene,allele,start_stage,end_stage,phen_type,phen_tag,aff_struct_super_1,aff_struct_sub_1,aff_struct_super_2,aff_struct_sub_2,pub_id,val,gene_id,start_hpf,end_hpf,counts
0,aldh1a2,i26/i26,Unknown,Unknown,quality,abnormal,pharyngeal_arch_3-7_skeleton,,,,ZDB-PUB-060503-2,1,ENSDARG00000053493,-1.0,,287
1,sec61a1,hi2839bTg,Unknown,Unknown,quality,abnormal,pharyngeal_arch_3-7_skeleton,,,,ZDB-PUB-060503-2,1,ENSDARG00000021669,-1.0,,287
2,fgf3,t21142/t21142,Unknown,Unknown,quality,abnormal,pharyngeal_arch_3-7_skeleton,,,,ZDB-PUB-060503-2,1,ENSDARG00000101540,-1.0,,287
3,mbtps1,hi1487Tg,Unknown,Unknown,quality,abnormal,pharyngeal_arch_3-7_skeleton,,,,ZDB-PUB-060503-2,1,ENSDARG00000014634,-1.0,,287
4,smad5,ta206/ta206,Unknown,Unknown,quality,abnormal,pharyngeal_arch_3-7_skeleton,,,,ZDB-PUB-060503-2,1,ENSDARG00000037238,-1.0,,287


## Generate a matrix that we can use for UMAP compression and clustering

In [59]:
pheno_df_wide = pd.pivot_table(phenotype_df_long.loc[:, ["gene", "start_hpf", "aff_struct_super_1", "val"]],
                         index=["gene"], values=["val"], columns=["aff_struct_super_1"],
                              fill_value=0)

pheno_df_wide.reset_index(inplace=True)
pheno_df_wide.columns = pheno_df_wide.columns.get_level_values(1)
pheno_df_wide = pheno_df_wide.rename(columns={"": "gene"})

## Calculate UMAP projection

In [93]:
from sklearn.decomposition import TruncatedSVD, PCA

# pull out binary phenotype array
phen_mat = pheno_df_wide.iloc[:, 1:].to_numpy()

# first, we need to use LSA to obtain lower-dim input for UMAP
# n_lsa_comp = 3
# svd_model = TruncatedSVD(n_components=n_lsa_comp, 
#                          algorithm='randomized',
#                          n_iter=10, random_state=42)
# svd_model.fit(phen_mat.T)

transformer = PCA(n_components=5, random_state=0)
transformer.fit(phen_mat.T)

In [94]:
transformer.components_.shape

(5, 3116)

In [97]:
from sklearn.preprocessing import StandardScaler

n_umap_comp = 2
# fit UMAP
svd_components = transformer.components_.T
reducer = umap.UMAP(n_components=n_umap_comp)
# scaled_svd = StandardScaler().fit_transform(svd_components)
embedding = reducer.fit_transform(svd_components)

In [98]:
fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1])
fig.show()

In [81]:
print(svd_model.explained_variance_ratio_)

[0.10683409 0.03681909 0.03628827 0.02902549 0.02681229 0.02488121
 0.02031895 0.01869439 0.01757428 0.01713488 0.01524389 0.01473215
 0.01470781 0.01455871 0.01382487 0.0132256  0.01221768 0.01207748
 0.011155   0.00940412 0.00905084 0.00859345 0.00825216 0.00814738
 0.00796898 0.00780381 0.00751939 0.00727723 0.00685554 0.00664217
 0.00649512 0.0062659  0.0061346  0.00598767 0.0057754  0.00568255
 0.00565843 0.00555319 0.00547862 0.00549845 0.00533743 0.00532395
 0.00512139 0.00506321 0.00500917 0.00487922 0.00473922 0.00455787
 0.00444749 0.00440525 0.0042735  0.00424993 0.00421682 0.00412552
 0.00408025 0.00396641 0.00391604 0.00383022 0.00380583 0.00374549
 0.00366816 0.00357325 0.00350366 0.00350549 0.00347406 0.00338995
 0.00337525 0.00328815 0.00326105 0.00317488 0.00309581 0.00305715
 0.00301199 0.00296829 0.0029368  0.00284594 0.00282532 0.00280029
 0.00274722 0.00265983 0.00261671 0.00258984 0.00254736 0.00250024
 0.00249515 0.0024062  0.00240905 0.00238574 0.00234149 0.0023