In [1]:
import pandas as pd
import pyensembl 
# using HPA v22 with Ensembl v103
ensembl = pyensembl.cached_release(103)

# human protein atlas normal tissue expression
df_tissue = pd.read_csv("../hpa/v22/normal_tissue.tsv", sep="\t")
# human protein atlas sub-cellular locations
df_subcellular = pd.read_csv("../hpa/v22/subcellular_location.tsv", sep="\t")

In [21]:
total = 0
success = 0
missing = set()
# load cell line RNA-seq
cell_lines = {"14169", "1015", "797"}
cell_lines_files = {name: "NUTM1-%s-ar-%s.quant.sf" % (name, name) for name in cell_lines}
cell_line_dfs = {name: pd.read_csv(filename, sep="\t") for (name, filename) in cell_lines_files.items()}
transcript_to_gene = {}
for (name, df) in cell_line_dfs.items():
    for t_id in df.Name:
        if t_id not in transcript_to_gene:
            total += 1
            try:
            
                t_id_no_version = t_id.split(".")[0]
                t = ensembl.transcript_by_id(t_id_no_version)
                if t:
                    g_id = t.gene_id
                    success += 1
                    transcript_to_gene[t_id] = transcript_to_gene[t_id_no_version] = g_id
            except:
                missing.add(t_id)
                pass
            
for (name, df) in cell_line_dfs.items():
    df["Gene"] = df.Name.map(lambda x: transcript_to_gene.get(x.split(".")[0]))

print("Found genes for %d/%d transcripts" % (success, total))

Found genes for 234485/328361 transcripts


In [47]:
concat_parts = []
transcripts = None
genes = None
lengths = None
for (name, df) in cell_line_dfs.items():
    df = df.copy()
    if transcripts is None:
        transcripts = df.Name
        genes = df.Gene 
        lengths = df.Length
    assert (transcripts == df.Name).all()
    df.pop("Name")
    df.pop("Gene")
    df.pop("Length")
    df = df.add_suffix("_" + name)
    concat_parts.append(df)
df_all = pd.concat(concat_parts, axis=1)
df_all["Gene"] = genes
df_all["Name"] = transcripts
df_all["Length"] = lengths
df_all["TPM_min"] = [min(*values) for values in zip(*[df_all['TPM_%s' % name] for name in cell_line_dfs.keys()] )]
df_all["Gene_Ensembl_Name"] = [ensembl.gene_name_of_gene_id(gene_id) if gene_id else None for gene_id in df_all.Gene]

In [48]:
df_all[df_all.Gene.isnull()]

Unnamed: 0,EffectiveLength_1015,TPM_1015,NumReads_1015,EffectiveLength_797,TPM_797,NumReads_797,EffectiveLength_14169,TPM_14169,NumReads_14169,Gene,Name,Length,TPM_min,Gene_Ensembl_Name
14,1619.585,0.0,0.0,1628.457,0.0,0.0,1633.240,0.035184,1.159,,Hsap38.chr1.74855.76774.+,1794,0.0,
180,76.743,0.0,0.0,84.432,0.0,0.0,84.512,0.000000,0.000,,Hsap38.chr1.884947.885237.-,231,0.0,
181,197.197,0.0,0.0,206.603,0.0,0.0,210.015,0.235994,1.000,,Hsap38.chr1.895601.896266.-,369,0.0,
182,13.059,0.0,0.0,13.353,0.0,0.0,13.858,0.000000,0.000,,Hsap38.chr1.896537.897049.-,105,0.0,
183,22.000,0.0,0.0,22.000,0.0,0.0,22.000,0.000000,0.000,,Hsap38.chr1.897053.897436.-,21,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265711,509.709,0.0,0.0,518.598,0.0,0.0,523.350,0.000000,0.000,,Hsap38.chrY.57161286.57162035.-,684,0.0,
265712,107.238,0.0,0.0,116.182,0.0,0.0,117.287,0.000000,0.000,,Hsap38.chrY.57161969.57162262.-,270,0.0,
265713,109.743,0.0,0.0,118.725,0.0,0.0,119.929,0.000000,0.000,,Hsap38.chrY.57162377.57162853.-,273,0.0,
265715,208.748,0.0,0.0,218.126,0.0,0.0,221.709,0.000000,0.000,,Hsap38.chrY.57171031.57171489.-,381,0.0,


In [49]:
df_all_with_subcellular = pd.merge(df_all, df_subcellular, left_on='Gene', right_on='Gene')

In [56]:
df_all_with_subcellular_and_tissue = pd.merge(
    df_all_with_subcellular, 
    df_tissue, 
    left_on='Gene', 
    right_on='Gene',
    suffixes=("_location", "_tissue"))

In [59]:
df_all_with_subcellular.columns

Index(['EffectiveLength_1015', 'TPM_1015', 'NumReads_1015',
       'EffectiveLength_797', 'TPM_797', 'NumReads_797',
       'EffectiveLength_14169', 'TPM_14169', 'NumReads_14169', 'Gene', 'Name',
       'Length', 'TPM_min', 'Gene_Ensembl_Name', 'Gene name', 'Reliability',
       'Main location', 'Additional location', 'Extracellular location',
       'Enhanced', 'Supported', 'Approved', 'Uncertain',
       'Single-cell variation intensity', 'Single-cell variation spatial',
       'Cell cycle dependency', 'GO id'],
      dtype='object')

In [55]:
df_all_with_subcellular_and_tissue.columns

Index(['EffectiveLength_1015', 'TPM_1015', 'NumReads_1015',
       'EffectiveLength_797', 'TPM_797', 'NumReads_797',
       'EffectiveLength_14169', 'TPM_14169', 'NumReads_14169', 'Gene', 'Name',
       'Length', 'TPM_min', 'Gene_Ensembl_Name', 'Gene name_x',
       'Reliability_x', 'Main location', 'Additional location',
       'Extracellular location', 'Enhanced', 'Supported', 'Approved',
       'Uncertain', 'Single-cell variation intensity',
       'Single-cell variation spatial', 'Cell cycle dependency', 'GO id',
       'Gene name_y', 'Tissue', 'Cell type', 'Level', 'Reliability_y'],
      dtype='object')

In [89]:
df_membrane = df_all_with_subcellular[df_all_with_subcellular["Main location"].str.contains("Plasma membrane", na=False)]

In [94]:
df_membrane.sort_values("TPM_min")[-50:
                                  ][["Name", "Gene", "Gene_Ensembl_Name", "TPM_min"]]

Unnamed: 0,Name,Gene,Gene_Ensembl_Name,TPM_min
38715,ENST00000607266.5,ENSG00000112514,CUTA,86.458763
285,ENST00000437146.1,ENSG00000078369,GNB1,86.792181
83111,ENST00000559176.5,ENSG00000182718,ANXA2,87.367009
68258,ENST00000396856.5,ENSG00000111640,GAPDH,89.285775
99682,ENST00000306749.4,ENSG00000169710,FASN,93.009439
118677,ENST00000360270.7,ENSG00000147065,MSN,93.678535
39016,ENST00000373365.5,ENSG00000124767,GLO1,93.723817
11119,ENST00000405333.5,ENSG00000115758,ODC1,97.556938
45596,ENST00000200457.9,ENSG00000087077,TRIP6,99.872897
56827,ENST00000302278.8,ENSG00000150093,ITGB1,100.722513


In [73]:
grouper = df_all_with_subcellular.groupby("Gene")

In [77]:
df_agg = grouper.aggregate(['sum', 'mean'])

  df_agg = grouper.aggregate(['sum', 'mean'])


Gene
ENSG00000000003    11.212423
ENSG00000000457     3.680359
ENSG00000000460    13.206462
ENSG00000000938     0.000000
ENSG00000000971     0.000000
                     ...    
ENSG00000288616     0.000000
ENSG00000288631     0.000000
ENSG00000288637     0.000000
ENSG00000288642     0.000000
ENSG00000288684     0.844765
Name: sum, Length: 13098, dtype: float64

In [87]:
df_all_with_subcellular["Main location"].value_counts()[:30]

Nucleoplasm                      31127
Cytosol                          15944
Vesicles                          8775
Mitochondria                      7307
Plasma membrane                   6536
Golgi apparatus                   4567
Cytosol;Nucleoplasm               4532
Cytosol;Plasma membrane           3102
Endoplasmic reticulum             2896
Nuclear speckles                  2711
Nucleoli                          2129
Nucleoli;Nucleoplasm              1501
Centrosome                        1394
Nuclear bodies                    1163
Nucleoplasm;Vesicles              1148
Nuclear bodies;Nucleoplasm        1107
Microtubules                      1061
Cytosol;Vesicles                  1054
Golgi apparatus;Vesicles          1020
Cell Junctions                     869
Nucleoplasm;Plasma membrane        864
Cytosol;Endoplasmic reticulum      808
Nucleoli fibrillar center          774
Actin filaments                    680
Golgi apparatus;Nucleoplasm        677
Nuclear membrane         

In [100]:
df_all_with_subcellular.sort_values("TPM_min")[["Gene_Ensembl_Name", "TPM_min", "Main location"]][-30:]

Unnamed: 0,Gene_Ensembl_Name,TPM_min,Main location
120869,MT-CYB,2075.852727,Mitochondria
49386,RPS20,2086.669875,Cytosol;Endoplasmic reticulum
102304,RPS15,2180.748135,Cytosol;Endoplasmic reticulum
120866,MT-ND3,2257.124447,Cytosol
30704,RPS3A,2280.503398,Cytosol;Endoplasmic reticulum;Nucleoli
105374,RPL18A,2290.936576,Nucleoli
93101,RPL26,2298.353805,Cytosol;Endoplasmic reticulum
38799,RPS10,2357.609619,Cytosol
17548,RPL37A,2386.336324,Cytosol;Endoplasmic reticulum
3391,RPS8,2390.517578,Cytosol;Endoplasmic reticulum


In [127]:
cta_gene_ids = set()

gene_ids_to_gene_names = {}
for i, (g, sub_df) in enumerate(sorted(df_tissue.groupby("Gene"))):
    gene_name = sub_df.iloc[0]["Gene name"]
    gene_ids_to_gene_names[sub_df.iloc[0]["Gene"]] = gene_name
    no_mask = sub_df["Level"].isin({"Not detected"})
    low_mask = sub_df["Level"].isin({"Low"})
    yes_mask = sub_df["Level"].isin({"Medium", "High"}) 
    sub_df_no = sub_df[no_mask]
    sub_df_low = sub_df[low_mask]
    sub_df_yes = sub_df[yes_mask]
    no_tissues = set(sub_df_no["Tissue"])
    low_tissues = set(sub_df_low["Tissue"])
    yes_tissues = set(sub_df_yes["Tissue"])
    target_tissues = {"testis",  "placenta"}
    if gene_name == "NUTM1":
        print(no_tissues)
        print(low_tissues)
        print(yes_tissues)
    if (len(no_tissues) > 5 and 
            low_tissues.intersection(target_tissues) == low_tissues and 
            (len(yes_tissues) > 0 and yes_tissues.intersection(target_tissues) == yes_tissues)):
        # print("%s (%s): %s (%s)" % tuple(sub_df_yes[["Gene", "Gene name", "Cell type", "Level"]].iloc[0]))
        print("%s (%s)" % tuple(sub_df_yes[["Gene", "Gene name"]].iloc[0]))
        cta_gene_ids.add(g)

ENSG00000001036 (FUCA2)
ENSG00000006047 (YBX2)
ENSG00000007350 (TKTL1)
ENSG00000010318 (PHF7)
ENSG00000033178 (UBA6)
ENSG00000039600 (SOX30)
ENSG00000042813 (ZPBP)
ENSG00000046774 (MAGEC2)
ENSG00000055957 (ITIH1)
ENSG00000064205 (CCN5)
ENSG00000064218 (DMRT3)
ENSG00000065320 (NTN1)
ENSG00000068985 (PAGE1)
ENSG00000071539 (TRIP13)
ENSG00000073146 (MOV10L1)
ENSG00000073598 (FNDC8)
ENSG00000075702 (WDR62)
ENSG00000077935 (SMC1B)
ENSG00000078403 (MLLT10)
ENSG00000079557 (AFM)
ENSG00000086288 (NME8)
ENSG00000092345 (DAZL)
ENSG00000099399 (MAGEB2)
ENSG00000100312 (ACR)
ENSG00000101251 (SEL1L2)
ENSG00000101448 (EPPIN)
ENSG00000101883 (RHOXF1)
ENSG00000101951 (PAGE4)
ENSG00000102021 (LUZP4)
ENSG00000102243 (VGLL1)
ENSG00000102387 (TAF7L)
ENSG00000102901 (CENPT)
ENSG00000103023 (PRSS54)
ENSG00000104332 (SFRP1)
ENSG00000104755 (ADAM2)
ENSG00000104818 (CGB2)
ENSG00000104827 (CGB3)
ENSG00000104901 (DKKL1)
ENSG00000104941 (RSPH6A)
ENSG00000105246 (EBI3)
ENSG00000105679 (GAPDHS)
ENSG00000105717 (PBX

ENSG00000197140 (ADAM32)
ENSG00000198021 (SPANXA1)
ENSG00000198129 (DEFB107B)
ENSG00000198573 (SPANXC)
ENSG00000198681 (MAGEA1)
ENSG00000198732 (SMOC1)
ENSG00000198759 (EGFL6)
ENSG00000198765 (SYCP1)
ENSG00000203784 (LELP1)
ENSG00000203795 (FAM24A)
ENSG00000203926 (SPANXA2)
ENSG00000203942 (C10orf62)
ENSG00000203989 (RHOXF2B)
ENSG00000204296 (TSBP1)
ENSG00000204444 (APOM)
ENSG00000204450 (TRIM64)
ENSG00000204632 (HLA-G)
ENSG00000204941 (PSG5)
ENSG00000205108 (FAM205A)
ENSG00000205238 (SPDYE2)
ENSG00000205301 (MGAT4D)
ENSG00000205359 (SLCO6A1)
ENSG00000205642 (VCX3B)
ENSG00000205777 (GAGE1)
ENSG00000205916 (DAZ4)
ENSG00000205944 (DAZ2)
ENSG00000206026 (SMIM21)
ENSG00000206538 (VGLL3)
ENSG00000206549 (AC109583.1)
ENSG00000212122 (TSSK1B)
ENSG00000212123 (PRR22)
ENSG00000213030 (CGB8)
ENSG00000213218 (CSH2)
ENSG00000213471 (TTLL13P)
ENSG00000213714 (FAM209B)
ENSG00000214107 (MAGEB1)
ENSG00000214300 (SPDYE3)
ENSG00000215029 (TCP11X2)
ENSG00000215186 (GOLGA6B)
ENSG00000215269 (GAGE12G)
ENSG

In [114]:
len(cta_gene_ids)

451

In [115]:
df_ctas = df_all_with_subcellular[df_all_with_subcellular.Gene.isin(cta_gene_ids)]

In [122]:
df_ctas.sort_values("TPM_min")[-30:][["Gene", "Gene_Ensembl_Name", "Main location", "TPM_min"]]


Unnamed: 0,Gene,Gene_Ensembl_Name,Main location,TPM_min
35043,ENSG00000170469,SPATA24,Cytosol;Nucleoplasm,1.987176
89985,ENSG00000102901,CENPT,Nuclear bodies,1.998576
56494,ENSG00000078403,MLLT10,Nucleoplasm,2.048605
92292,ENSG00000161921,CXCL16,Golgi apparatus,2.141969
56497,ENSG00000078403,MLLT10,Nucleoplasm,2.313918
92289,ENSG00000161921,CXCL16,Golgi apparatus,2.356721
28493,ENSG00000033178,UBA6,Cytosol;Nucleoplasm,2.444998
22050,ENSG00000010318,PHF7,Nucleoplasm,2.461907
89967,ENSG00000102901,CENPT,Nuclear bodies,2.61115
56392,ENSG00000152455,SUV39H2,Mitochondria,2.946696


In [125]:
print(df_tissue[df_tissue["Gene name"] == "NUTM1"].to_string())

                   Gene Gene name             Tissue                     Cell type         Level Reliability
998678  ENSG00000184507     NUTM1     adipose tissue                    adipocytes  Not detected    Enhanced
998679  ENSG00000184507     NUTM1      adrenal gland               glandular cells  Not detected    Enhanced
998680  ENSG00000184507     NUTM1           appendix               glandular cells  Not detected    Enhanced
998681  ENSG00000184507     NUTM1           appendix               lymphoid tissue  Not detected    Enhanced
998682  ENSG00000184507     NUTM1        bone marrow           hematopoietic cells  Not detected    Enhanced
998683  ENSG00000184507     NUTM1             breast                    adipocytes  Not detected    Enhanced
998684  ENSG00000184507     NUTM1             breast               glandular cells  Not detected    Enhanced
998685  ENSG00000184507     NUTM1             breast           myoepithelial cells  Not detected    Enhanced
998686  ENSG0000018