This notebook adds gene expression information to the previously built gene-phenotype table

In [1]:
import numpy as np
import pandas as pd
import umap.umap_ as umap
import plotly.express as px
import os

In [2]:
# set path to raw data
raw_data_root = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/zfin/raw_data/"
access_date = "20240326"

gene_expression_df = pd.read_csv(os.path.join(raw_data_root, access_date, "wildtype-expression_fish_2024.04.08.txt"), 
                                sep='\t', header=1)
stage_df = pd.read_csv(os.path.join(raw_data_root, access_date, "stage_ontology.txt"), sep='\t', header=1)

# load built tables
built_data_dir = "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/zfin/built_data/20240326/"
if not os.path.isdir(built_data_dir):
    os.makedirs(built_data_dir)
    
pheno_df = pd.read_csv(os.path.join(built_data_dir, "condensed_gene_phenotypes.csv"))
KO_df = pd.read_csv(os.path.join(built_data_dir, "zfin_gene_KO_candidates.csv"))

## Clean up the gene expression info

In [3]:
# clean up names
gene_df_clean = gene_expression_df.loc[:, ["Gene ID", "Gene Symbol", "Super Structure ID", "Super Structure Name",
                                          "Assay", "Start Stage", "End Stage"]].rename(columns={
                                "Gene Symbol" : "gene",
                                "Gene ID": "gene_ID",
                                "Super Structure ID": "structure_ID",
                                "Super Structure Name": "structure", "Assay": "assay"}).drop_duplicates()

# add hpf staging info
gene_df_clean = gene_df_clean.merge(stage_df.loc[:, ["Stage Name", "Begin Hours"]], how="left", 
                                    left_on="Start Stage", right_on="Stage Name").drop(columns="Stage Name")

gene_df_clean = gene_df_clean.rename(columns={"Begin Hours":"start_hpf"})

gene_df_clean = gene_df_clean.merge(stage_df.loc[:, ["Stage Name", "End Hours"]], how="left", 
                                    left_on="End Stage", right_on="Stage Name").drop(columns="Stage Name")

gene_df_clean = gene_df_clean.rename(columns={"End Hours":"end_hpf"}).drop(columns=["Start Stage", "End Stage"])

# filter for expression during relevant time period
print(gene_df_clean.shape)
gene_df_clean = gene_df_clean.loc[gene_df_clean["start_hpf"]<=72, :]
print(gene_df_clean.shape)

# remove entries with non-specific region tags
gene_df_clean = gene_df_clean.loc[gene_df_clean["structure"] != 'whole organism', :]
gene_df_clean = gene_df_clean.loc[gene_df_clean["structure"] != 'unspecified', :]
w_flags = np.asarray([1 if "WITHDRAWN" in gene else 0 for gene in list(gene_df_clean["gene"])])
print(gene_df_clean.shape)
gene_df_clean = gene_df_clean.loc[w_flags != 1, :]

print(gene_df_clean.shape)
# aggregate by gene-structure combination


gene_df_clean = gene_df_clean.sort_values(by=["gene", "structure"], axis=0, ascending=True)

# save
gene_df_clean.to_csv(os.path.join(built_data_dir, "gene_expression_cleaned.csv"), index=False)
gene_df_clean.head()

(202230, 7)
(152910, 7)
(120613, 7)
(120602, 7)


Unnamed: 0,gene_ID,gene,structure_ID,structure,assay,start_hpf,end_hpf
197192,ZDB-GENE-060824-3,a1cf,ZFA:0000123,liver,mRNA in situ hybridization,72.0,144.0
77448,ZDB-GENE-090212-1,a2ml,ZFA:0000123,liver,mRNA in situ hybridization,72.0,96.0
36272,ZDB-GENE-090212-1,a2ml,ZFA:0000084,yolk,mRNA in situ hybridization,72.0,96.0
39672,ZDB-GENE-090212-1,a2ml,ZFA:0000088,yolk syncytial layer,mRNA in situ hybridization,6.0,8.0
131005,ZDB-GENE-090212-1,a2ml,ZFA:0000088,yolk syncytial layer,mRNA in situ hybridization,8.0,9.0


## Join on gene expression info
Simplest question to answer is just whether or not a given gene has documented specific expression

In [4]:
gene_df_temp = gene_df_clean.copy()
gene_df_temp["specific_expression_flag"] = 1
spec_df = gene_df_temp.loc[:, ["gene", "specific_expression_flag"]].drop_duplicates()

KO_out = KO_df.copy()
KO_out = KO_out.merge(spec_df, how="left", on="gene")
KO_out.loc[np.isnan(KO_out["specific_expression_flag"]), "specific_expression_flag"] = 0

print(np.mean(KO_out["specific_expression_flag"]))

0.38378684807256236


In [7]:
KO_out2 = KO_out.loc[:, list(KO_df.columns[:5]) + ["specific_expression_flag"] + list(KO_df.columns[5:-1])]
KO_out2 = KO_out2.sort_values(by=["TF_flag", "specific_expression_flag", "importance_score"], axis=0, ascending=False)

KO_out2.to_csv(os.path.join(built_data_dir, "zfin_gene_KO_candidates_v2.csv"))
KO_out2.head()

Unnamed: 0,gene,gene_ID,chromatin_flag,gene_desc,TF_flag,specific_expression_flag,importance_score,n_zfin_reports,phenotype_1,effect_1,...,effect_2,start_hpf_2,phenotype_3,effect_3,start_hpf_3,phenotype_4,effect_4,start_hpf_4,phenotype_5,effect_5
0,fosl2,ZDB-GENE-070209-164,0.0,Predicted to enable DNA-binding transcription ...,1.0,1.0,973.0,10.0,anatomical structure,absent,...,"absent, increased amount",48.0,heart,"increased distribution, increased amount",48.0,cardiac ventricle,"decreased size, increased rate, decreased amou...",42.0,,
1,vrtn,ZDB-GENE-060929-700,1.0,Enables DNA-binding transcription repressor ac...,1.0,1.0,973.0,8.0,anatomical structure,"decreased amount, increased amount",...,decreased amount,36.0,head,"decreased length, decreased size",11.66,optic primordium,decreased distribution,10.0,forebrain neural rod,decreased distribution
2,tbx16,ZDB-GENE-990615-5,1.0,Enables transcription cis-regulatory region bi...,1.0,1.0,259.0,103.0,cell,accumulation,...,"aplastic, quality, aplastic, size",0.0,surface structure,quality,0.0,head,"increased height, decreased length",48.0,vasculature,spatial pattern
3,hdac1,ZDB-GENE-020419-32,0.0,Enables transcription corepressor activity. In...,1.0,1.0,163.0,131.0,somite,"U-shaped, decreased amount, decreased amount",...,quality,0.0,head,"decreased size, arrested, decreased size",48.0,pharyngeal arch 3-7,increased process quality,48.0,trunk,"curved, curved ventral, curved"
4,cdx4,ZDB-GENE-980526-330,1.0,Enables DNA binding activity and chromatin bin...,1.0,1.0,151.0,66.0,anatomical system,quality,...,condensed,16.0,trunk,"decreased length, decreased length",24.0,mesoderm,"increased distribution, spatial pattern",19.0,neuroectoderm,"spatial pattern, spatial pattern, decreased di..."
