This notebook adds gene expression information to the previously built gene-phenotype table

In [None]:
import numpy as np
import pandas as pd
import umap.umap_ as umap
import plotly.express as px
import os

In [None]:
# Root of the ZFIN data tree. The default is the original hard-coded location;
# set the ZFIN_DATA_ROOT environment variable to run the notebook elsewhere.
zfin_root = os.environ.get(
    "ZFIN_DATA_ROOT",
    "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/zfin",
)
access_date = "20240326"

# set path to raw data
# (the trailing "" makes os.path.join end the path with a separator, as before)
raw_data_root = os.path.join(zfin_root, "raw_data", "")

# header=1: the column names are taken from the second line of each file
gene_expression_df = pd.read_csv(
    os.path.join(raw_data_root, access_date, "wildtype-expression_fish_2024.04.08.txt"),
    sep='\t', header=1)
stage_df = pd.read_csv(os.path.join(raw_data_root, access_date, "stage_ontology.txt"), sep='\t', header=1)

# load built tables
# The built-data folder is keyed by the same access date as the raw data, so it
# is derived from access_date instead of repeating the date literal.
built_data_dir = os.path.join(zfin_root, "built_data", access_date, "")
os.makedirs(built_data_dir, exist_ok=True)

pheno_df = pd.read_csv(os.path.join(built_data_dir, "condensed_gene_phenotypes.csv"))
KO_df = pd.read_csv(os.path.join(built_data_dir, "zfin_gene_KO_candidates.csv"))

## Clean up the gene expression info

In [None]:
# Latest start time (in the stage table's "Begin Hours") an expression record
# may have and still be kept.
max_start_hpf = 72

# clean up names: keep only the columns used below and give them short names
keep_cols = ["Gene ID", "Gene Symbol", "Super Structure ID", "Super Structure Name",
             "Assay", "Start Stage", "End Stage"]
short_names = {
    "Gene Symbol": "gene",
    "Gene ID": "gene_ID",
    "Super Structure ID": "structure_ID",
    "Super Structure Name": "structure",
    "Assay": "assay",
}
gene_df_clean = gene_expression_df.loc[:, keep_cols].rename(columns=short_names).drop_duplicates()

# add hpf staging info: look up the begin hour of the start stage and the end
# hour of the end stage in the stage table, then drop the stage-name columns
gene_df_clean = (
    gene_df_clean
    .merge(stage_df.loc[:, ["Stage Name", "Begin Hours"]], how="left",
           left_on="Start Stage", right_on="Stage Name")
    .drop(columns="Stage Name")
    .rename(columns={"Begin Hours": "start_hpf"})
    .merge(stage_df.loc[:, ["Stage Name", "End Hours"]], how="left",
           left_on="End Stage", right_on="Stage Name")
    .drop(columns="Stage Name")
    .rename(columns={"End Hours": "end_hpf"})
    .drop(columns=["Start Stage", "End Stage"])
)

# filter for expression during relevant time period
# (rows whose start stage found no match have NaN start_hpf and are dropped
# here as well, because NaN <= max_start_hpf evaluates to False)
print(gene_df_clean.shape)
gene_df_clean = gene_df_clean.loc[gene_df_clean["start_hpf"] <= max_start_hpf, :]
print(gene_df_clean.shape)

# remove entries with non-specific region tags
is_specific = (gene_df_clean["structure"] != 'whole organism') & (gene_df_clean["structure"] != 'unspecified')
gene_df_clean = gene_df_clean.loc[is_specific, :]

# remove withdrawn gene records; na=False treats a missing gene symbol as
# "not withdrawn" instead of raising on the non-string value
is_withdrawn = gene_df_clean["gene"].str.contains("WITHDRAWN", regex=False, na=False)
print(gene_df_clean.shape)
gene_df_clean = gene_df_clean.loc[~is_withdrawn, :]
print(gene_df_clean.shape)

# order rows by gene, then structure (no aggregation is performed here)
gene_df_clean = gene_df_clean.sort_values(by=["gene", "structure"], axis=0, ascending=True)

# save
gene_df_clean.to_csv(os.path.join(built_data_dir, "gene_expression_cleaned.csv"), index=False)
gene_df_clean.head()

## Join on gene expression info
The simplest question to answer is whether a given gene has any documented region-specific expression, i.e. at least one expression record that survives the cleaning step above.

In [None]:
# Every gene still present in the cleaned expression table gets flag = 1
gene_df_temp = gene_df_clean.copy()
gene_df_temp["specific_expression_flag"] = 1
flag_cols = ["gene", "specific_expression_flag"]
spec_df = gene_df_temp[flag_cols].drop_duplicates()

# Left-join onto the KO candidates; candidates with no match come back as NaN
# and are set to 0
KO_out = KO_df.merge(spec_df, how="left", on="gene")
KO_out["specific_expression_flag"] = KO_out["specific_expression_flag"].fillna(0)

# fraction of candidates that carry the flag
print(KO_out["specific_expression_flag"].mean())

In [None]:
# Column layout: the first five candidate-table columns, then the new flag,
# then the remaining columns.
# NOTE(review): the [5:-1] slice stops before KO_df's last column, so that
# column is not carried into the output — confirm this is intended.
lead_cols = list(KO_df.columns[:5])
tail_cols = list(KO_df.columns[5:-1])
ordered_cols = lead_cols + ["specific_expression_flag"] + tail_cols

# Sort descending on TF flag, then expression flag, then importance score
sort_keys = ["TF_flag", "specific_expression_flag", "importance_score"]
KO_out2 = KO_out[ordered_cols].sort_values(by=sort_keys, ascending=False)

# NOTE(review): index=False is not passed, so the row index is written as the
# first column of the CSV — confirm downstream readers expect that.
out_path = os.path.join(built_data_dir, "zfin_gene_KO_candidates_v2.csv")
KO_out2.to_csv(out_path)
KO_out2.head()