In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import os
import plotly.offline as pyo
# Enable plotly's offline notebook mode so that figures can be rendered
# inside the notebook output cells.
pyo.init_notebook_mode()

In [None]:
# Root folder of the ZFIN download used throughout this notebook.
# Set the ZFIN_FOLDER environment variable to run on another machine;
# when it is unset the original local path is used.
zfin_folder = os.environ.get(
    "ZFIN_FOLDER",
    "/Users/nick/Cole Trapnell's Lab Dropbox/Nick Lammers/Nick/morphseq/zfin/20240312/",
)

# header=1: the second line of the file supplies the column names and the
# line before it is skipped.
gene_pheno_df = pd.read_csv(
    os.path.join(zfin_folder, "phenoGeneCleanData_fish_2024.03.11.txt"),
    sep="\t",
    header=1,
)
gene_pheno_df.head()

In [None]:
# Load the ZFIN aliases table (tab-separated, first line is the header).
aliases_path = os.path.join(zfin_folder, "aliases_2024.03.11.txt")
prev_names_df = pd.read_csv(aliases_path, sep="\t")
prev_names_df.head()


In [None]:
# Load four columns of the ZFIN gene ontology file, keyed by their position
# in the file and renamed to the labels used in the rest of the notebook.
onto_path = os.path.join(zfin_folder, "zfin_gene_ontology")
onto_columns = {1: "Gene ID", 2: "gene_name", 3: "relation", 11: "gene_type"}

# NOTE(review): with header=36 plus explicit `names`, the line at index 36 is
# consumed as the header row and replaced by `names` -- confirm that line is
# not a data record.
gene_onto_df = pd.read_csv(
    onto_path,
    sep="\t",
    header=36,
    usecols=list(onto_columns.keys()),
    names=list(onto_columns.values()),
)
gene_onto_df.head()

In [None]:
# Load the variant/allele table; the column names are taken from line 14
# of the file and everything above it is skipped.
variant_path = os.path.join(zfin_folder, "VARIANT-ALLELE_NCBITaxon7955.tsv")
variant_ncbi_df = pd.read_csv(variant_path, sep="\t", header=14)
variant_ncbi_df.head()

### Merge the phenotype DataFrame with the gene-name DataFrame and use gene names to look for paralogs

In [None]:
# Unique (Gene ID, gene_name) pairs, restricted to protein-coding genes.
is_protein_coding = gene_onto_df["gene_type"] == "protein_coding_gene"
gene_name_key = gene_onto_df.loc[is_protein_coding, ["Gene ID", "gene_name"]].drop_duplicates()

# Unique Gene IDs that appear in the phenotype table, each flagged with 1.
gene_pheno_key = gene_pheno_df.loc[:, ["Gene ID"]].drop_duplicates().assign(has_phenotype=1)

for key_frame in (gene_name_key, gene_pheno_key):
    print(key_frame.shape)

In [None]:
# Preview the phenotype key built from the phenotype table's "Gene ID" column
# (the has_phenotype flag is 1 on every row).
gene_pheno_key.head()

In [None]:
# Preview the gene IDs that get exported as the gene key. This is computed
# directly from gene_name_key because `gene_key_out` is only assigned in a
# later cell, so referencing it here raises NameError on a fresh kernel.
gene_name_key.loc[:, "Gene ID"].astype(str).head()

In [None]:
def _export_gene_ids(key_df, out_dir, file_name):
    """Write the "Gene ID" column of `key_df` (cast to str) as a one-column TSV.

    The file is written to `out_dir`/`file_name` with a header line and no
    index column. Returns the exported Series so it can be inspected afterwards.
    """
    gene_ids = key_df["Gene ID"].astype(str)
    gene_ids.to_csv(os.path.join(out_dir, file_name), sep="\t", index=False)
    return gene_ids


# Output folder for the exported key files; created if it does not exist yet.
write_dir = os.path.join(zfin_folder, "build_datasets", "")
os.makedirs(write_dir, exist_ok=True)

# IDs of all protein-coding genes, and IDs of genes with a phenotype record.
gene_key_out = _export_gene_ids(gene_name_key, write_dir, "gene_key_df.tsv")
pheno_gene_key_out = _export_gene_ids(gene_pheno_key, write_dir, "pheno_gene_key_df.tsv")

In [None]:
# Left-merge so that every gene in gene_name_key is kept; genes with no match
# in the phenotype key come back as NaN and are set to has_phenotype = 0.
gene_df = gene_name_key.merge(gene_pheno_key, how="left", on="Gene ID")
gene_df["has_phenotype"] = gene_df["has_phenotype"].fillna(0)

# Split each gene name into its last character (suffix) and everything before
# it (root). Names ending in a/b/c/d are flagged as possible paralogs.
gene_df["gene_name_root"] = gene_df["gene_name"].str[:-1]
gene_df["gene_suffix"] = gene_df["gene_name"].str[-1]
gene_df["possible_paralog_flag"] = gene_df["gene_suffix"].isin(["a", "b", "c", "d"])
gene_df.head()

In [None]:
# Sorted unique name roots, the per-row integer code into that array, and the
# number of rows sharing each root.
paralog_index, paralog_ids, counts = np.unique(
    gene_df["gene_name_root"], return_inverse=True, return_counts=True
)

valid_par_vec = gene_df.loc[:, "possible_paralog_flag"].to_numpy(dtype=bool)

# For every root, count only the rows flagged as possible paralogs.
valid_per_root = np.bincount(paralog_ids[valid_par_vec], minlength=paralog_index.size)

# Broadcast the per-root count back to the rows; unflagged rows stay NaN.
counts_long = np.where(valid_par_vec, valid_per_root[paralog_ids], np.nan)

# Both columns are float so that NaN can mark "not a paralog candidate".
# Going through uint16 would cast NaN to an integer, wrap group IDs >= 65536,
# and then require assigning NaN into an integer column.
gene_df["paralog_ID"] = np.where(valid_par_vec, paralog_ids, np.nan)
gene_df["n_paralogs"] = counts_long

gene_df.head()

In [None]:
# Keep only genes whose root group has more than one flagged member, ordered
# by root and then suffix, with a fresh 0..n-1 index.
paralog_columns = [
    "Gene ID",
    "gene_name",
    "gene_name_root",
    "gene_suffix",
    "has_phenotype",
    "paralog_ID",
    "n_paralogs",
]
in_multi_member_group = gene_df["n_paralogs"] > 1

paralog_df = (
    gene_df.loc[in_multi_member_group, paralog_columns]
    .sort_values(by=["gene_name_root", "gene_suffix"])
    .reset_index(drop=True)
)

# Number of (rows, columns), then number of distinct paralog groups.
print(paralog_df.shape)
print(len(np.unique(paralog_df["paralog_ID"])))
paralog_df.head(10)

### Now look for paralog groups with differing phenotype flags
I'm interested in cases where one paralog has a documented phenotype and the others do not.

In [None]:
# Sorted unique paralog group IDs; pheno_class_vec is aligned to this order.
paralog_index = np.unique(paralog_df["paralog_ID"])

# Per-row flags telling whether *every* member of the row's group has
# has_phenotype == 0, or *every* member has has_phenotype == 1.
group_ids = paralog_df["paralog_ID"]
none_have_pheno = paralog_df["has_phenotype"].eq(0).groupby(group_ids).transform("all")
all_have_pheno = paralog_df["has_phenotype"].eq(1).groupby(group_ids).transform("all")

# phenotype_class: 0 = no member has a phenotype, 2 = all members do,
# 1 = mixed. Computed in one pass, so the column exists even when
# paralog_df is empty.
paralog_df["phenotype_class"] = np.select(
    [none_have_pheno.to_numpy(dtype=bool), all_have_pheno.to_numpy(dtype=bool)],
    [0.0, 2.0],
    default=1.0,
)

# One class value per group, in the order of paralog_index.
pheno_class_vec = (
    paralog_df.groupby("paralog_ID")["phenotype_class"]
    .first()
    .reindex(paralog_index)
    .to_numpy(dtype=float)
)

In [None]:
# Tally how many paralog groups fall into each phenotype class.
class_index, counts = np.unique(pheno_class_vec, return_counts=True)
for tally in (class_index, counts):
    print(tally)

In [None]:
# One representative row per name root for groups with more than two members
# and a mixed phenotype class (phenotype_class == 1).
has_many_members = paralog_df["n_paralogs"] > 2
is_mixed_class = paralog_df["phenotype_class"] == 1
paralog_df.loc[has_many_members & is_mixed_class, :].drop_duplicates(subset="gene_name_root")



In [None]:
# Show every gene whose name root is exactly "dusp".
is_dusp_root = gene_df["gene_name_root"].eq("dusp")
gene_df.loc[is_dusp_root]