In [87]:
import pandas as pd
# Load the tab-separated CPM table. The file's own first line is skipped and
# the column names below are supplied instead; the sample columns are listed
# in pairs (`*_1`, `*_2`) for each prefix/condition combination.
df = pd.read_csv(
    "GSE142481_PER403_797_CPM.txt",
    sep="\t",
    skiprows=1,
    names=[
        "gene_id", "gene_name",
        "PER403_dmso_1", "PER403_dmso_2",
        "PER403_gne_1", "PER403_gne_2",
        "PER403_neo_1", "PER403_neo_2",
        "PER403_otx_1", "PER403_otx_2",
        "PER403_otx_gne_1", "PER403_otx_gne_2",
        "TC797_dmso_1", "TC797_dmso_2",
        "TC797_gne_1", "TC797_gne_2",
        "TC797_neo_1", "TC797_neo_2",
        "TC797_otx_1", "TC797_otx_2",
        "TC797_otx_gne_1", "TC797_otx_gne_2",
    ],
)
# Keep only the text before the first "." of each gene_id
# (e.g. "ENSG00000184507.15" -> "ENSG00000184507") so IDs can be matched
# against tables that carry them without that suffix.
# The vectorised `.str[0]` propagates a missing gene_id as NaN, whereas
# subscripting inside a Python lambda raised TypeError on NaN.
df["gene_id_root"] = df["gene_id"].str.split(".", n=1).str[0]

In [88]:
# Collapse the `*_dmso_1` / `*_dmso_2` column pair of each prefix into a
# single column holding their arithmetic mean.
df["PER403"] = df["PER403_dmso_1"].add(df["PER403_dmso_2"]).div(2)
df["TC797"] = df["TC797_dmso_1"].add(df["TC797_dmso_2"]).div(2)

In [101]:
# Narrow the table to the gene identifiers plus the two averaged columns.
df_lines = df.loc[
    :, ["gene_id", "gene_id_root", "gene_name", "PER403", "TC797"]
]

In [102]:
# Look up the row(s) whose gene_name is NUTM1.
df_lines.loc[df_lines["gene_name"] == "NUTM1"]

Unnamed: 0,gene_id,gene_id_root,gene_name,PER403,TC797
10371,ENSG00000184507.15,ENSG00000184507,NUTM1,6.064538,6.200746


In [141]:
# Keep only rows where BOTH averaged columns exceed 5
# (rows with a missing value in either column are excluded).
df_lines_high = df_lines.loc[df_lines[["PER403", "TC797"]].gt(5).all(axis=1)]

In [142]:
# Tab-separated per-gene, per-tissue expression table; later cells read its
# "Gene", "Gene name", "Tissue" and "nTPM" columns.
df_expr = pd.read_csv("rna_tissue_consensus.tsv", delimiter="\t")

In [143]:
# Every gene ID present in the expression table.
gene_ids = set(df_expr["Gene"])

# Collect genes whose expression is concentrated in testis: for each gene,
# the testis share of nTPM summed over all of its rows must exceed 0.85.
# NOTE(review): "CTA" presumably stands for cancer-testis antigen — confirm.
CTA_gene_ids = set()
CTA_gene_names = set()
for gene_id, sub_df in df_expr.groupby("Gene"):
    total = sub_df["nTPM"].sum()
    if total != 0:  # genes with zero summed nTPM are skipped (avoids 0/0)
        testis = sub_df.loc[sub_df["Tissue"] == "testis", "nTPM"].sum()
        frac = testis / total
        if frac > 0.85:
            CTA_gene_names.add(sub_df["Gene name"].iloc[0])
            CTA_gene_ids.add(gene_id)

In [144]:
# Show the names of the genes whose testis share of summed nTPM exceeded 0.85.
CTA_gene_names

{'AC002456.2',
 'AC007325.1',
 'AC010255.2',
 'AC010325.1',
 'AC023491.2',
 'AC026786.1',
 'AC118470.1',
 'AC134980.2',
 'AC171558.1',
 'AC171558.3',
 'AC231656.1',
 'ACSBG2',
 'ACTL7A',
 'ACTL7B',
 'ACTL8',
 'ACTL9',
 'ACTRT1',
 'ACTRT2',
 'ADAD1',
 'ADAM18',
 'ADAM2',
 'ADAM29',
 'ADAM30',
 'ADIG',
 'AKAP4',
 'AL035460.1',
 'AL772284.1',
 'ANHX',
 'ANKRD30BL',
 'ANKRD7',
 'ASB17',
 'ATXN3L',
 'BOD1L2',
 'BOLL',
 'BPIFA3',
 'BRDT',
 'BX072566.1',
 'C10orf120',
 'C10orf62',
 'C11orf94',
 'C12orf40',
 'C12orf50',
 'C16orf78',
 'C16orf82',
 'C16orf90',
 'C17orf50',
 'C17orf64',
 'C17orf98',
 'C18orf63',
 'C1orf100',
 'C1orf185',
 'C20orf141',
 'C20orf144',
 'C20orf173',
 'C2orf78',
 'C3orf22',
 'C3orf56',
 'C4orf17',
 'C4orf51',
 'C5orf47',
 'C5orf52',
 'C8orf74',
 'C9orf57',
 'CABS1',
 'CAGE1',
 'CALR3',
 'CAPZA3',
 'CATSPER1',
 'CATSPER4',
 'CATSPERD',
 'CATSPERZ',
 'CBLL2',
 'CBY2',
 'CCDC105',
 'CCDC166',
 'CCDC168',
 'CCDC179',
 'CCDC182',
 'CCDC185',
 'CCDC27',
 'CCDC42',
 'CCDC54'

In [145]:
# NOTE(review): `df_sub` is not assigned in any visible cell — the groupby
# loop earlier binds `sub_df`, not `df_sub`. This cell presumably depends on
# a variable left over from a deleted or out-of-order cell and would raise
# NameError on a fresh kernel; confirm what it was meant to inspect, or
# remove it.
df_sub.nTPM.sum()

21.2

In [146]:
# Number of distinct gene names that passed the testis-fraction filter.
len(CTA_gene_names)

501

In [147]:
# Number of distinct gene IDs that passed the testis-fraction filter.
len(CTA_gene_ids)

501

In [148]:
# Flag the rows of the >5-in-both-columns subset whose suffix-stripped gene ID
# is in the testis-enriched ID set, then count how many rows were flagged.
mask = df_lines_high["gene_id_root"].isin(CTA_gene_ids)
mask.sum()

1

In [149]:
# Display the rows that exceed 5 in both the PER403 and TC797 columns.
df_lines_high

Unnamed: 0,gene_id,gene_id_root,gene_name,PER403,TC797
0,ENSG00000225972.1,ENSG00000225972,MTND1P23,6.996814,7.294854
1,ENSG00000225630.1,ENSG00000225630,MTND2P28,6.906842,6.363584
2,ENSG00000237973.1,ENSG00000237973,MTCO1P12,9.695386,10.121753
5,ENSG00000248527.1,ENSG00000248527,MTATP6P1,9.610184,9.423643
12,ENSG00000188976.11,ENSG00000188976,NOC2L,8.257366,8.252355
...,...,...,...,...,...
15296,ENSG00000212907.2,ENSG00000212907,MT-ND4L,9.998498,9.945455
15297,ENSG00000198886.2,ENSG00000198886,MT-ND4,13.238789,12.970896
15298,ENSG00000198786.2,ENSG00000198786,MT-ND5,12.558341,12.485287
15299,ENSG00000198695.2,ENSG00000198695,MT-ND6,7.576112,7.188325


In [151]:
# Names of the rows flagged by `mask`, i.e. rows of df_lines_high whose
# gene_id_root is in CTA_gene_ids.
df_lines_high.loc[mask, "gene_name"]

10371    NUTM1
Name: gene_name, dtype: object