In [151]:
import pandas as pd
import pyensembl 
# using HPA v22 with Ensembl v103
ensembl = pyensembl.cached_release(103)

# human protein atlas normal tissue expression
df_tissue = pd.read_csv("../hpa/v22/normal_tissue.tsv", sep="\t")
# human protein atlas sub-cellular locations
df_subcellular = pd.read_csv("../hpa/v22/subcellular_location.tsv", sep="\t")

In [161]:
len(df_tissue.Gene.unique())

15318

In [162]:
len(df_subcellular.Gene.unique())

13105

In [164]:
df_tissue.Gene

0          ENSG00000000003
1          ENSG00000000003
2          ENSG00000000003
3          ENSG00000000003
4          ENSG00000000003
                ...       
1194474    ENSG00000288607
1194475    ENSG00000288616
1194476    ENSG00000288631
1194477    ENSG00000288637
1194478    ENSG00000288684
Name: Gene, Length: 1194479, dtype: object

In [165]:
df_subcellular.Gene

0        ENSG00000000003
1        ENSG00000000457
2        ENSG00000000460
3        ENSG00000000938
4        ENSG00000000971
              ...       
13100    ENSG00000288616
13101    ENSG00000288631
13102    ENSG00000288637
13103    ENSG00000288642
13104    ENSG00000288684
Name: Gene, Length: 13105, dtype: object

In [152]:
testis_specific_gene_ids = set()
gene_ids_to_gene_names = {}
for i, (g, sub_df) in enumerate(sorted(df_tissue.groupby("Gene"))):
    gene_ids_to_gene_names[sub_df.iloc[0]["Gene"]] = sub_df.iloc[0]["Gene name"]
    no_mask = sub_df["Level"].isin({"Not detected"})
    low_mask = sub_df["Level"].isin({"Low"})
    yes_mask = sub_df["Level"].isin({"Medium", "High"}) 
    sub_df_no = sub_df[no_mask]
    sub_df_low = sub_df[low_mask]
    sub_df_yes = sub_df[yes_mask]
    no_tissues = set(sub_df_no["Tissue"])
    low_tissues = set(sub_df_low["Tissue"])
    yes_tissues = set(sub_df_yes["Tissue"])
    only_testis = {"testis"}
    if len(no_tissues) > 5 and (len(low_tissues) == 0 or low_tissues == only_testis) and yes_tissues == only_testis:
        # print("%s (%s): %s (%s)" % tuple(sub_df_yes[["Gene", "Gene name", "Cell type", "Level"]].iloc[0]))
        print("%s (%s)" % tuple(sub_df_yes[["Gene", "Gene name"]].iloc[0]))
        testis_specific_gene_ids.add(g)

ENSG00000001036 (FUCA2)
ENSG00000006047 (YBX2)
ENSG00000007350 (TKTL1)
ENSG00000010318 (PHF7)
ENSG00000033178 (UBA6)
ENSG00000039600 (SOX30)
ENSG00000042813 (ZPBP)
ENSG00000046774 (MAGEC2)
ENSG00000055957 (ITIH1)
ENSG00000064205 (CCN5)
ENSG00000064218 (DMRT3)
ENSG00000065320 (NTN1)
ENSG00000068985 (PAGE1)
ENSG00000071539 (TRIP13)
ENSG00000073146 (MOV10L1)
ENSG00000073598 (FNDC8)
ENSG00000075702 (WDR62)
ENSG00000077935 (SMC1B)
ENSG00000078403 (MLLT10)
ENSG00000079557 (AFM)
ENSG00000086288 (NME8)
ENSG00000092345 (DAZL)
ENSG00000099399 (MAGEB2)
ENSG00000100312 (ACR)
ENSG00000101251 (SEL1L2)
ENSG00000101448 (EPPIN)
ENSG00000101883 (RHOXF1)
ENSG00000102021 (LUZP4)
ENSG00000102387 (TAF7L)
ENSG00000102901 (CENPT)
ENSG00000103023 (PRSS54)
ENSG00000104332 (SFRP1)
ENSG00000104755 (ADAM2)
ENSG00000104901 (DKKL1)
ENSG00000104941 (RSPH6A)
ENSG00000105679 (GAPDHS)
ENSG00000105717 (PBX4)
ENSG00000105982 (RNF32)
ENSG00000106304 (SPAM1)
ENSG00000106336 (FBXO24)
ENSG00000107562 (CXCL12)
ENSG00000108691 

ENSG00000271449 (CT45A2)
ENSG00000273513 (TBC1D3K)
ENSG00000273696 (CT45A7)
ENSG00000274226 (TBC1D3H)
ENSG00000274274 (GAGE13)
ENSG00000274391 (TPTE)
ENSG00000274512 (TBC1D3L)
ENSG00000274600 (RIMBP3B)
ENSG00000274808 (TBC1D3B)
ENSG00000274933 (TBC1D3I)
ENSG00000275113 (GAGE2E)
ENSG00000275722 (LYZL6)
ENSG00000275793 (RIMBP3)
ENSG00000277535 (AL772284.1)
ENSG00000278289 (CT45A6)
ENSG00000278299 (TBC1D3C)
ENSG00000278646 (AC008162.2)
ENSG00000283706 (PRSS50)
ENSG00000284701 (TMEM247)


In [166]:
print(len(testis_specific_gene_ids))
print(len(testis_specific_gene_ids.intersection(set(df_subcellular.Gene))))

404
205


In [167]:
len(testis_specific_gene_ids)

404

In [168]:
testis_specific_gene_names = {gene_ids_to_gene_names[g_id] for g_id in testis_specific_gene_ids}

In [169]:
"NUTM1" in testis_specific_gene_names

True

In [170]:
len(df_subcellular)

13105

In [171]:
df_subcellular[df_subcellular.Gene.isin(testis_specific_gene_ids)]["Main location"].value_counts()

Nucleoplasm                              42
Plasma membrane                          24
Cytosol                                  23
Vesicles                                 23
Nucleoli                                 15
Nucleoli;Nucleoplasm                     14
Cytosol;Nucleoplasm                      10
Mitochondria                              9
Golgi apparatus                           5
Nucleoplasm;Vesicles                      4
Nuclear speckles                          4
Plasma membrane;Vesicles                  3
Nuclear bodies;Nucleoplasm                3
Nucleoplasm;Plasma membrane               2
Centrosome;Nucleoplasm                    2
Nuclear bodies                            2
Nucleoli;Nucleoli rim                     2
Endoplasmic reticulum                     2
Cytosol;Nuclear bodies                    2
Cell Junctions                            2
Cell Junctions;Plasma membrane            1
Nuclear membrane;Vesicles                 1
Cytosol;Plasma membrane         

In [172]:
df_subcellular[df_subcellular.Gene.isin(testis_specific_gene_ids)]["Main location"].value_counts().sum()

205

In [173]:
df_targets = df_subcellular[df_subcellular.Gene.isin(testis_specific_gene_ids) & df_subcellular["Main location"].str.contains("Plasma membrane")]

In [174]:
df_targets.to_csv("testis-specific-membrane-targets.csv", index=False)

In [175]:
df_targets

Unnamed: 0,Gene,Gene name,Reliability,Main location,Additional location,Extracellular location,Enhanced,Supported,Approved,Uncertain,Single-cell variation intensity,Single-cell variation spatial,Cell cycle dependency,GO id
1137,ENSG00000079557,AFM,Uncertain,Plasma membrane,,Predicted to be secreted,,,,Plasma membrane,,,,Plasma membrane (GO:0005886)
3507,ENSG00000117245,KIF17,Approved,Plasma membrane,Nucleoplasm,,,,Plasma membrane,Nucleoplasm,,,,Nucleoplasm (GO:0005654);Plasma membrane (GO:0...
3755,ENSG00000120156,TEK,Supported,Plasma membrane,Centriolar satellite,Predicted to be secreted,,Plasma membrane,,Centriolar satellite,Plasma membrane,,,Centriolar satellite (GO:0034451);Plasma membr...
3758,ENSG00000120160,EQTN,Approved,Actin filaments;Plasma membrane,,Predicted to be secreted,,,Actin filaments;Plasma membrane,,,,,Actin filaments (GO:0015629);Plasma membrane (...
4679,ENSG00000130783,CCDC62,Uncertain,Plasma membrane,Nucleoplasm,,,Nucleoplasm,,Plasma membrane,,,,Nucleoplasm (GO:0005654);Plasma membrane (GO:0...
4773,ENSG00000131721,RHOXF2,Approved,Plasma membrane,Nucleoplasm,,,,Nucleoplasm;Plasma membrane,,Nucleoplasm;Plasma membrane,,,Nucleoplasm (GO:0005654);Plasma membrane (GO:0...
4792,ENSG00000131864,USP29,Supported,Plasma membrane,Cytosol,,,Plasma membrane,Cytosol,,,,,Cytosol (GO:0005829);Plasma membrane (GO:0005886)
6984,ENSG00000152670,DDX4,Approved,Cytosol;Nucleoplasm;Plasma membrane,,,,,Cytosol;Nucleoplasm;Plasma membrane,,,,,Cytosol (GO:0005829);Nucleoplasm (GO:0005654);...
7346,ENSG00000157343,ARMC12,Approved,Nucleoplasm;Plasma membrane,,,,Nucleoplasm,,Plasma membrane,,,,Nucleoplasm (GO:0005654);Plasma membrane (GO:0...
7661,ENSG00000160886,LY6K,Approved,Plasma membrane,Nucleoplasm,Predicted to be secreted,,,Plasma membrane,Nucleoplasm,,,,Nucleoplasm (GO:0005654);Plasma membrane (GO:0...


In [196]:
len(df_targets)

33

In [223]:
total = 0
success = 0
missing = set()
# load cell line RNA-seq
cell_lines = {"14169", "1015", "797"}
cell_lines_files = {name: "NUTM1-%s-ar-%s.quant.sf" % (name, name) for name in cell_lines}
cell_line_dfs = {name: pd.read_csv(filename, sep="\t") for (name, filename) in cell_lines_files.items()}
transcript_to_gene = {}
for (name, df) in cell_line_dfs.items():
    for t_id in df.Name:
        if t_id not in transcript_to_gene:
            total += 1
            try:
            
                t_id_no_version = t_id.split(".")[0]
                t = ensembl.transcript_by_id(t_id_no_version)
                if t:
                    g_id = t.gene_id
                    success += 1
                    transcript_to_gene[t_id] = transcript_to_gene[t_id_no_version] = g_id
            except:
                missing.add(t_id)
                pass
            
for (name, df) in cell_line_dfs.items():
    df["Gene"] = df.Name.map(lambda x: transcript_to_gene.get(x.split(".")[0]))

print("Found genes for %d/%d transcripts" % (success, total))

Found genes for 234485/328361 transcripts


In [194]:
cell_line_dfs

{'1015':                      Name  Length  EffectiveLength          TPM   NumReads  \
 0       ENST00000456328.2    1657         1482.585     0.000000      0.000   
 1       ENST00000450305.2     632          457.734     0.000000      0.000   
 2       ENST00000488147.1    1351         1176.585     6.592401    120.972   
 3       ENST00000619216.1      68            8.605     0.000000      0.000   
 4       ENST00000473358.1     712          537.701     0.000000      0.000   
 ...                   ...     ...              ...          ...        ...   
 265772  ENST00000361681.2     525          350.895    85.150156    465.994   
 265773  ENST00000387459.1      69            8.755     0.000000      0.000   
 265774  ENST00000361789.2    1141          966.585  3210.012500  48391.000   
 265775  ENST00000387460.2      66            8.301     0.000000      0.000   
 265776  ENST00000387461.2      68            8.605     0.000000      0.000   
 
                    Gene  
 0       ENSG00

In [195]:
for (name, df_expr) in cell_line_dfs.items():
    merged = pd.merge(df_targets, df_expr, left_on="Gene", right_on='Gene')
    print(name, merged[merged.TPM > 1][["Gene", "Gene name", "TPM"]])
    print("---")

1015 Empty DataFrame
Columns: [Gene, Gene name, TPM]
Index: []
---
797 Empty DataFrame
Columns: [Gene, Gene name, TPM]
Index: []
---
14169                Gene Gene name       TPM
5   ENSG00000117245     KIF17  1.114378
50  ENSG00000160886      LY6K  1.773026
51  ENSG00000160886      LY6K  3.531464
53  ENSG00000160886      LY6K  2.240530
99  ENSG00000274274    GAGE13  4.975998
---


In [221]:
from collections import defaultdict
gene_sets_per_threshold = defaultdict(dict)
min_tpm_threshold = 1
max_tpm_threshold = 10
tpm_thresholds = list(range(min_tpm_threshold, max_tpm_threshold + 1))
for (name, df) in cell_line_dfs.items():
    merged = df[df.Gene.isin(testis_specific_gene_ids)].copy()
    merged["Gene name"] = merged.Gene.map(lambda x: gene_ids_to_gene_names[x])
    for tpm_threshold in tpm_thresholds:
        highly_expressed = merged[merged.TPM > tpm_threshold]
        print(tpm_threshold, name, highly_expressed[["Name", "Gene", "Gene name", "TPM"]])
        print("---")
        gene_sets_per_threshold[tpm_threshold][name] = set(highly_expressed.Gene)
common_genes_per_threshold = {}
for tpm_threshold in tpm_thresholds:
    gene_sets = gene_sets_per_threshold[tpm_threshold]
    common_genes_per_threshold[tpm_threshold] = set.intersection(*gene_sets.values())

common_gene_names_per_threshold = {
    tpm_threshold: 
    {ensembl.gene_name_of_gene_id(g_id) for g_id in gene_ids} for (tpm_threshold, gene_ids) in common_genes_per_threshold.items()}

1 1015                       Name             Gene Gene name        TPM
6521     ENST00000474844.5  ENSG00000117481     NSUN4   5.462490
6522     ENST00000307089.7  ENSG00000117481     NSUN4   8.547577
6523     ENST00000486270.5  ENSG00000117481     NSUN4   7.382388
7208    ENST00000306052.12  ENSG00000157193      LRP8   5.685523
7218     ENST00000668071.1  ENSG00000157193      LRP8   1.758078
...                    ...              ...       ...        ...
265399   ENST00000426000.6  ENSG00000188120      DAZ1   6.890554
265540   ENST00000315357.9  ENSG00000187191      DAZ3   1.679501
265541   ENST00000382365.6  ENSG00000187191      DAZ3  11.339920
265544   ENST00000415508.6  ENSG00000205916      DAZ4  46.139664
265548   ENST00000440066.6  ENSG00000205916      DAZ4  28.307055

[115 rows x 4 columns]
---
2 1015                       Name             Gene Gene name         TPM
6521     ENST00000474844.5  ENSG00000117481     NSUN4    5.462490
6522     ENST00000307089.7  ENSG00000117481   

In [222]:
common_gene_names_per_threshold

{1: {'AL162231.1',
  'APOM',
  'C12orf56',
  'CABYR',
  'CENPT',
  'CXCL16',
  'LRP8',
  'LRRC37A2',
  'LYAR',
  'LYPD6',
  'MLLT10',
  'NSUN4',
  'NUTM1',
  'PBX4',
  'PHF7',
  'PRR22',
  'PSMC3IP',
  'SPATA24',
  'SUV39H2',
  'TRIP13',
  'UBA6',
  'WDR62',
  'YBX2'},
 2: {'APOM',
  'CENPT',
  'CXCL16',
  'LRP8',
  'LYAR',
  'LYPD6',
  'MLLT10',
  'NSUN4',
  'NUTM1',
  'PHF7',
  'PRR22',
  'PSMC3IP',
  'SUV39H2',
  'TRIP13',
  'UBA6',
  'WDR62'},
 3: {'APOM',
  'CENPT',
  'LRP8',
  'LYAR',
  'LYPD6',
  'MLLT10',
  'NSUN4',
  'NUTM1',
  'PSMC3IP',
  'SUV39H2',
  'TRIP13',
  'UBA6',
  'WDR62'},
 4: {'CENPT',
  'LYAR',
  'LYPD6',
  'MLLT10',
  'NSUN4',
  'NUTM1',
  'PSMC3IP',
  'SUV39H2',
  'TRIP13',
  'UBA6',
  'WDR62'},
 5: {'CENPT',
  'LYAR',
  'MLLT10',
  'NSUN4',
  'NUTM1',
  'PSMC3IP',
  'SUV39H2',
  'TRIP13',
  'UBA6',
  'WDR62'},
 6: {'CENPT',
  'LYAR',
  'MLLT10',
  'NUTM1',
  'PSMC3IP',
  'SUV39H2',
  'TRIP13',
  'UBA6',
  'WDR62'},
 7: {'CENPT',
  'LYAR',
  'MLLT10',
  'NUTM1'

In [211]:
{ensembl.gene_name_of_gene_id(g_id) for g_id in common_genes}

{'CENPT',
 'LYAR',
 'MLLT10',
 'NSUN4',
 'NUTM1',
 'PSMC3IP',
 'SUV39H2',
 'TRIP13',
 'UBA6',
 'WDR62'}