In [1]:
import os

import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from cmapPy.pandasGEXpress.parse_gct import parse

from procyon.data.data_utils import DATA_DIR

In [2]:
# Directory (under DATA_DIR) that this notebook reads its inputs from and
# writes its control-gene lists to.
_pd_dir_parts = (
    "experimental_data",
    "PD_uncharacterized",
    "control_genes_reproduction",
)
pd_dir = os.path.join(DATA_DIR, *_pd_dir_parts)

def pd_path(fn: str) -> str:
    """Return the path of the file named `fn` inside `pd_dir`.

    Args:
        fn: File name (or relative path) to append to `pd_dir`.

    Returns:
        `pd_dir` joined with `fn` via `os.path.join`.
    """
    full_path = os.path.join(pd_dir, fn)
    return full_path

In this notebook, we produce the lists of control genes used for the analysis of ProCyon's ability to predict the function of poorly characterized proteins with known associations to Parkinson's Disease. 

These control lists are used in the analyses shown in Figure 6 of our manuscript.

# Genes in > 3 diseases

Extract the gene - disease associations from OpenTargets. Here we use the OpenTargets dataset giving overall scores for direct disease associations. This dataset can be downloaded [here](https://platform.opentargets.org/downloads) as the "Associations - direct (overall score)" dataset.

In [3]:
# Path to your OpenTargets "Associations - direct (overall score)" download.
# Either edit the default below or set the OPENTARGETS_ASSOCIATIONS_DIR
# environment variable before starting the kernel, so the notebook can be run
# without modifying its source.
evidencePath = os.environ.get(
    "OPENTARGETS_ASSOCIATIONS_DIR",
    "/path/to/OpenTargets/associationByOverallDirect/",
)

In [4]:
# Start (or reuse) a Spark session running in local mode
spark_builder = SparkSession.builder.master('local[*]')
spark = spark_builder.getOrCreate()

# Load the OpenTargets association parquet files
evd = spark.read.parquet(evidencePath)

# Print the full schema so the available fields are visible
evd.printSchema()

# Keep only the identifier, label and overall-score columns
selected_columns = [
    'diseaseId',
    'targetId',
    'diseaseLabel',
    'targetName',
    'targetSymbol',
    'overallDatasourceHarmonicScore',
    'overallDatatypeHarmonicScore',
]
evdSelect = evd.select(selected_columns)

# Preview the selected columns
evdSelect.show()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/28 19:30:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


root
 |-- diseaseId: string (nullable = true)
 |-- targetId: string (nullable = true)
 |-- diseaseLabel: string (nullable = true)
 |-- targetName: string (nullable = true)
 |-- targetSymbol: string (nullable = true)
 |-- overallDatasourceHarmonicScore: double (nullable = true)
 |-- overallDatatypeHarmonicScore: double (nullable = true)
 |-- overallDatasourceHarmonicVector: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- datasourceId: string (nullable = true)
 |    |    |-- datasourceHarmonicScore: double (nullable = true)
 |    |    |-- datasourceEvidenceCount: long (nullable = true)
 |    |    |-- weight: double (nullable = true)
 |-- overallDatatypeHarmonicVector: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- datatypeId: string (nullable = true)
 |    |    |-- datatypeHarmonicScore: double (nullable = true)
 |    |    |-- datatypeEvidenceCount: long (nullable = true)
 |    |    |-- weight: double (nullabl

In [5]:
# Collect the selected Spark columns into an in-memory pandas DataFrame
dbscores = evdSelect.toPandas()
# Preview the first rows of the collected table
dbscores.head()

                                                                                

Unnamed: 0,diseaseId,targetId,diseaseLabel,targetName,targetSymbol,overallDatasourceHarmonicScore,overallDatatypeHarmonicScore
0,EFO_0000305,ENSG00000000003,breast carcinoma,tetraspanin 6,TSPAN6,0.017417,0.017417
1,EFO_0000305,ENSG00000000005,breast carcinoma,tenomodulin,TNMD,0.025026,0.025026
2,EFO_0000305,ENSG00000000971,breast carcinoma,complement factor H,CFH,0.025515,0.025515
3,EFO_0000305,ENSG00000001084,breast carcinoma,glutamate-cysteine ligase catalytic subunit,GCLC,0.022156,0.022156
4,EFO_0000305,ENSG00000001167,breast carcinoma,nuclear transcription factor Y subunit alpha,NFYA,0.004,0.004


Derive the list of targets that are associated with more than 3 diseases

In [6]:
# Number of association rows (diseaseId entries) per target; computed once
# instead of running the same groupby twice.
diseases_per_target = dbscores.groupby('targetId')['diseaseId'].size()

# Keep targets with more than 3 disease associations. The result is a
# one-column DataFrame ('diseaseId' = count) indexed by targetId.
genes_more_than_3 = diseases_per_target[diseases_per_target > 3].to_frame()

In [7]:
# Restrict the association table to targets that passed the > 3 diseases filter
in_more_than_3 = dbscores['targetId'].isin(genes_more_than_3.index)
subset = dbscores[in_more_than_3]

In [8]:
# One row per distinct (targetId, targetSymbol, targetName) combination
gene_columns = ['targetId', 'targetSymbol', 'targetName']
genes_more_than3_final = subset.loc[:, gene_columns].drop_duplicates()
print(genes_more_than3_final.shape[0])

# Save the gene list next to the other control-gene files
more_than_3_csv = pd_path("genes_more_than_3_diseases.csv")
genes_more_than3_final.to_csv(more_than_3_csv)

21345


# Neuro control genes: nervous system genes, excluding neurodegenerative and parkinsonian (PD) genes

In [9]:
# Load the three tab-separated OpenTargets gene tables from the PD data directory
nervous, neurodegenerative, park = (
    pd.read_csv(pd_path(tsv_name), sep="\t")
    for tsv_name in (
        "3-OT-EFO_0000618-nervous_system_disorders.tsv",
        "4-OT-EFO_0005772-neurodegenerative_disorders.tsv",
        "4-OT-MONDO_0021095-parkinsonian.tsv",
    )
)

In [10]:
# Symbols in the nervous-system table that appear in neither the
# neurodegenerative nor the parkinsonian table.
excluded_symbols = set(neurodegenerative.symbol) | set(park.symbol)
neuro_control_symbols = set(nervous.symbol) - excluded_symbols

# Sort before building the Series: the iteration order of a set of strings
# varies between Python processes (hash randomization), which would make the
# row order of the saved CSV differ from run to run. key=str keeps the sort
# well-defined even if a non-string value (e.g. NaN) is present.
neuro_control = pd.Series(sorted(neuro_control_symbols, key=str))
print(len(neuro_control))
neuro_control.to_csv(pd_path('NervousSystem-Neurodegenerative-PD.csv'))

5168


# Derive list of genes expressed in brain tissues

In [11]:
# Parse the GTEx median-TPM GCT file and index the expression matrix by the
# "Description" row-metadata column
gct_file = pd_path("GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct")
tissue_expr = parse(gct_file)
tissue_expr_pd = tissue_expr.data_df.copy()
tissue_expr_pd.index = tissue_expr.row_metadata_df["Description"]

# Keep only the columns whose name starts with "Brain", and mark every
# zero expression value as NaN
is_brain_column = tissue_expr_pd.columns.str.startswith("Brain")
tissue_expr_pd = tissue_expr_pd.loc[:, is_brain_column].replace(0, np.nan)
len(tissue_expr_pd)

  row_metadata = row_metadata.apply(lambda x: pd.to_numeric(x, errors="ignore"))
  col_metadata = col_metadata.apply(lambda x: pd.to_numeric(x, errors="ignore"))


56200

In [12]:
# Keep rows with at least one non-NaN brain column; rows that are NaN in every
# brain region (i.e. zero expression everywhere) are dropped
has_brain_expression = tissue_expr_pd.notna().any(axis=1)
expressed_in_brains = tissue_expr_pd.loc[has_brain_expression]
expressed_in_brains.shape[0]

33761

In [13]:
# Preview the first rows of the brain-expression table
expressed_in_brains.head()

cid,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),Brain - Caudate (basal ganglia),Brain - Cerebellar Hemisphere,Brain - Cerebellum,Brain - Cortex,Brain - Frontal Cortex (BA9),Brain - Hippocampus,Brain - Hypothalamus,Brain - Nucleus accumbens (basal ganglia),Brain - Putamen (basal ganglia),Brain - Spinal cord (cervical c-1),Brain - Substantia nigra
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
WASH7P,1.43859,1.69285,1.56605,4.99231,5.72099,2.48317,2.14667,1.68599,1.74811,1.53899,1.44167,2.73049,1.74194
MIR1302-2HG,,,0.024264,,,0.027366,0.030382,,0.024714,0.030669,0.023474,,0.019526
OR4G4P,0.049672,0.053535,0.045572,0.024643,0.037224,0.042985,0.042143,0.052914,0.039666,0.044513,0.048495,0.041605,0.046699
OR4G11P,0.063831,0.067951,0.072035,0.046864,0.049956,0.070455,0.073162,0.077502,0.063979,0.059297,0.064973,0.057124,0.06646
OR4F5,0.078869,0.076721,0.082673,0.05986,0.057719,0.084365,0.074008,0.103119,0.083142,0.081478,0.098499,0.070749,0.073194


In [14]:
# Save the brain-expression table (rows with at least one non-NaN brain column)
expressed_in_brains.to_csv(pd_path('1-genes_expressed_in_brains.csv'))

# General control genes: genes associated with more than 3 diseases, excluding genes expressed in brain tissues

In [15]:
# Symbols of genes associated with more than 3 diseases that do not appear in
# the index of the brain-expression table.
general_control_symbols = (
    set(genes_more_than3_final.targetSymbol) - set(expressed_in_brains.index)
)

# Sort before building the Series: the iteration order of a set of strings
# varies between Python processes (hash randomization), which would make the
# row order of the saved CSV differ from run to run. key=str keeps the sort
# well-defined even if a non-string value (e.g. NaN) is present.
general_control = pd.Series(sorted(general_control_symbols, key=str))
print(len(general_control))
general_control.to_csv(pd_path('Expressed3Diseases-neuraltissue.csv'))

3379
