In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.offline as pyo
from os.path import join, isdir
from os import makedirs

# Root of the shared Dropbox folder that holds the ZFIN data for this project.
# NOTE: this is an absolute, machine-specific path; edit it to run on another machine.
dropbox_path = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/zfin/"

# Subdirectory (relative to the Dropbox root) containing the built data tables.
built_data_subdir = "20240326/built_data/"
zfin_data_dir = join(dropbox_path, built_data_subdir)

## Load phenotype data files

In [None]:
# Load the full phenotype tables. Each has one row per gene-phenotype pair, so a gene
# that presents multiple phenotypes appears on multiple rows.
full_gene_df = pd.read_csv(join(zfin_data_dir, "sing_mut_df.csv"))  # all genes with abnormal phenotype between 0 and 72 hpf
full_tf_df = pd.read_csv(join(zfin_data_dir, "sing_mut_TF_df.csv"))  # only TFs

# Build shortened tables (intended to hold one row per gene): keep only the columns
# below, drop rows with any missing value, drop exact duplicate rows, and order by cluster.
short_cols = ["gene", "start_hpf", "end_hpf", "start_stage", "umap1", "umap2", "group"]

tf_df_short = (
    full_tf_df[short_cols]
    .dropna()
    .drop_duplicates()
    .sort_values(by="group")
)
gene_df_short = (
    full_gene_df[short_cols]
    .dropna()
    .drop_duplicates()
    .sort_values(by="group")
)


# Load the per-cluster phenotype table. It is in long format: one row per
# (cluster, phenotype) pair, with the cluster label in "group" and the phenotype name
# in "aff_struct_super_1".
pheno_df = pd.read_csv(join(zfin_data_dir, "top_phenotypes_per_cluster.csv"))
# Reset the index after dropping incomplete rows so downstream code never depends on
# the original row labels (label-based lookups used to fail once any row was dropped).
pheno_df = pheno_df.dropna().reset_index(drop=True)
n_keep = 5  # not referenced in this cell; kept in case other cells use it

# Rank phenotypes within each cluster by their row order in the file. cumcount() is
# 0-based, so the first phenotype listed for a cluster is labelled "phenotype 0".
# Ranking per group (rather than comparing consecutive rows against a -1 sentinel)
# gives every cluster the same numbering, including a cluster labelled -1, and stays
# correct if a cluster's rows are not contiguous.
# NOTE(review): the plotting cells only show "phenotype 1".."phenotype 3" in the hover
# text, so "phenotype 0" is never displayed -- confirm that this is intended.
pheno_rank = pheno_df.groupby("group").cumcount()
pheno_df["pheno_rank"] = "phenotype " + pheno_rank.astype(str)

# switch from long to wide format: one row per cluster, one column per phenotype rank
pheno_df = pheno_df.rename(columns={"aff_struct_super_1": "phenotype"})
pheno_df = pheno_df.pivot(index=['group'], columns=['pheno_rank'], values=["phenotype"])

# pivot() with a list of value columns returns two-level column labels; keep only the
# rank level ("phenotype 0", "phenotype 1", ...)
pheno_df.columns = pheno_df.columns.get_level_values(1)


# join phenotype info onto TF and gene tables ("group" is the index of pheno_df)
tf_df_short = tf_df_short.merge(pheno_df, how="left", on="group")
gene_df_short = gene_df_short.merge(pheno_df, how="left", on="group")

## Look at distribution of phenotypes across all genes in the ZFIN database

In [None]:
import plotly.graph_objects as go

# Color palette: plotly's qualitative Light24 colors in reverse order
cmap = px.colors.qualitative.Light24[::-1]

# Add a string version of the cluster label ("group") as a new column
gene_df_short["group_str"] = [f"{int(g)}" for g in gene_df_short.loc[:, "group"].values]
gene_df_short = gene_df_short.reset_index(drop=True)

# Marker styling: points are colored by the numeric cluster label, with the palette
# passed as the colorscale
umap_marker = dict(
    size=8,
    opacity=1,
    color=gene_df_short["group"],
    colorscale=cmap,
    line=dict(width=1, color="SlateGray"),
)

# Scatter plot of all genes in UMAP space; hovering shows the gene name and three of
# the phenotype columns joined on from the cluster table
fig = px.scatter(
    gene_df_short,
    x=gene_df_short["umap1"],
    y=gene_df_short["umap2"],
    hover_data=["gene", "phenotype 1", "phenotype 2", "phenotype 3"],
)
fig.update_traces(marker=umap_marker)
fig.update_layout(showlegend=False, template="plotly")
fig.update_xaxes(title="UMAP 1")
fig.update_yaxes(title="UMAP 2")

fig.show()

In [None]:
# Workaround: replicate every TF row, because the size of the data frame affects the
# order in which the layers of the overlay figures are displayed.
# NOTE(review): this is probably plotly express's default render_mode="auto", which
# chooses between SVG and WebGL drawing from the number of rows. If the overlay cells
# pin render_mode explicitly, this replication should be unnecessary -- verify before
# removing it.
N_TF_COPIES = 10

# Collapse exact duplicate rows first so that re-running this cell does not multiply
# the table by another factor of N_TF_COPIES each time (on a first run this is a no-op
# as long as the incoming rows are unique).
tf_unique = tf_df_short.drop_duplicates()

tf_df_short = pd.concat([tf_unique] * N_TF_COPIES, axis=0, ignore_index=True)
tf_df_short = tf_df_short.sort_values(by="group")
tf_df_short = tf_df_short.reset_index(drop=True)

## Overlay TFs

In [None]:
# String version of the cluster label so plotly express treats it as a discrete category
tf_df_short["group_str"] = [str(int(g)) for g in tf_df_short.loc[:, "group"].values]
tf_df_short = tf_df_short.sort_values(by="group")
tf_df_short = tf_df_short.reset_index(drop=True)

# Pin one color per cluster, assigned in the order clusters first appear in the gene
# table (the same order plotly express uses for that table on its own). Without an
# explicit map each figure assigns colors independently, so a cluster missing from the
# TF table would shift the TF colors relative to the gene colors.
group_labels = gene_df_short["group_str"].unique()
group_colors = {label: cmap[i % len(cmap)] for i, label in enumerate(group_labels)}

hover_cols = ["gene", "phenotype 1", "phenotype 2", "phenotype 3"]

# render_mode is pinned so both layers use the same renderer and stack in trace order;
# with the default "auto", plotly express picks the renderer per figure from the row count.
scatter_kwargs = dict(
    x="umap1",
    y="umap2",
    color="group_str",
    template="plotly",
    color_discrete_sequence=cmap,  # fallback for any cluster missing from group_colors
    color_discrete_map=group_colors,
    hover_data=hover_cols,
    render_mode="webgl",
)

# Bottom layer: all genes, semi-transparent
fig = px.scatter(gene_df_short, **scatter_kwargs)
fig.update_traces(marker=dict(size=8, opacity=0.25))

# Top layer: transcription factors, drawn as "x" markers with a gray outline
fig2 = px.scatter(tf_df_short, **scatter_kwargs)
fig2.update_traces(marker_symbol="x", marker=dict(size=8, opacity=0.5, line=dict(width=1, color="SlateGray")))

# Traces are added after the gene traces so the TF markers sit on top
fig.add_traces(fig2.data)

fig.update_layout(showlegend=False)
fig.update_xaxes(title="UMAP 1")
fig.update_yaxes(title="UMAP 2")

fig.show()

**Key:** <br>
         Circles = all genes <br>
         X markers = transcription factors <br>

## Add targets from the GAP paper

In [None]:
# Load the list of targets from the GAP paper
gap_df = pd.read_csv(join(zfin_data_dir, "GAP_paper_target_key.csv"))

# Attach UMAP coordinates and cluster labels by gene name, then drop every row that has
# any missing value (this includes targets with no match in the gene table)
umap_cols = ["gene", "umap1", "umap2", "group", "group_str"]
gap_df = gap_df.merge(gene_df_short[umap_cols], on="gene", how="left")
gap_df = gap_df.dropna().sort_values(by="group")

# Unique cluster labels among the GAP targets and among all genes
# (not used further in this cell)
gap_group_index = np.unique(gap_df["group"])
all_index = np.unique(gene_df_short["group"])

# Attach the per-cluster phenotype columns
gap_df = gap_df.merge(pheno_df, how="left", on="group")


# Workaround: replicate every GAP row 50 times, because the size of the data frame
# affects the order in which the layers of the overlay figure are displayed
glist = [gap_df] * 50

gap_df = (
    pd.concat(glist, axis=0, ignore_index=True)
    .sort_values(by="group")
    .reset_index(drop=True)
)

In [None]:
import plotly.graph_objects as go

fig = go.Figure()

# Pin one color per cluster, assigned in the order clusters first appear in the gene
# table (the same order plotly express uses for that table on its own). Without an
# explicit map each px.scatter call assigns colors independently, so the same cluster
# could get different colors in the gene, TF and GAP layers.
group_labels = gene_df_short["group_str"].unique()
group_colors = {label: cmap[i % len(cmap)] for i, label in enumerate(group_labels)}

hover_cols = ["gene", "phenotype 1", "phenotype 2", "phenotype 3"]

# render_mode is pinned so all three layers use the same renderer and stack in trace
# order; with the default "auto", plotly express picks the renderer per figure from the
# row count.
scatter_kwargs = dict(
    x="umap1",
    y="umap2",
    color="group_str",
    template="plotly",
    color_discrete_sequence=cmap,  # fallback for any cluster missing from group_colors
    color_discrete_map=group_colors,
    hover_data=hover_cols,
    render_mode="webgl",
)

# Transcription factors: opaque diamonds with a black outline
fig4 = px.scatter(tf_df_short, **scatter_kwargs)
fig4.update_traces(marker_symbol="diamond", marker=dict(size=8, opacity=1, line=dict(width=1, color="black")))

# All genes: semi-transparent circles
fig1 = px.scatter(gene_df_short, **scatter_kwargs)
fig1.update_traces(marker=dict(size=8, opacity=0.25))

# GAP targets: thin black crosses
fig2 = px.scatter(gap_df, **scatter_kwargs)
fig2.update_traces(marker_symbol="cross-thin", marker=dict(size=8, opacity=1, line=dict(width=2, color="black")))

# Stack the layers bottom to top: genes, then TFs, then GAP targets
for layer in (fig1, fig4, fig2):
    fig.add_traces(layer.data)

fig.update_layout(showlegend=False)
fig.update_xaxes(title="UMAP 1")
fig.update_yaxes(title="UMAP 2")

fig.show()

**Key:** <br>
         Circles = all genes <br>
         Diamonds = transcription factors <br>
         Crosses = GAP targets