In [1]:
import pyspark.sql.functions as f
import pyspark.sql.types as t
import statsmodels.api as sm
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import scipy.stats as stats


from gentropy.common.session import Session

# Make "vscode" the default plotly renderer for every figure shown in this notebook.
pio.renderers.default = "vscode"


In [2]:
# Extra Spark settings handed to the gentropy Session: executor and driver
# memory are both set to 10g.
spark_memory_conf = {
    "spark.executor.memory": "10g",
    "spark.driver.memory": "10g",
}
session = Session(extended_spark_conf=spark_memory_conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/20 12:20:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


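The therapeutic area hierarchy below is listed in priority order: when a disease falls under several areas, the first match in this list becomes its primary therapeutic area.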

In [3]:
# (ontology id, label) pairs for the therapeutic areas.
# The order is meaningful: `get_first_matching_therapeutic_area` walks the
# resulting dict in insertion order and returns the first id it finds, so
# earlier entries take precedence over later ones.
#
# Terms that are commented out, i.e. not part of the hierarchy:
#   EFO_0000651  phenotype
#   GO_0008150   biological process
#   EFO_0002571  medical procedure
#   EFO_0005932  animal disease
therapy_area_hierarchy = dict(
    (
        ("EFO_0001444", "measurement"),
        ("MONDO_0045024", "cancer or benign tumor"),
        ("EFO_0005741", "infectious disease"),
        ("OTAR_0000014", "pregnancy or perinatal disease"),
        ("MONDO_0024458", "disorder of visual system"),
        ("EFO_0000319", "cardiovascular disease"),
        ("EFO_0009605", "pancreas disease"),
        ("EFO_0010282", "gastrointestinal disease"),
        ("OTAR_0000017", "reproductive system or breast disease"),
        ("EFO_0010285", "integumentary system disease"),
        ("EFO_0001379", "endocrine system disease"),
        ("OTAR_0000010", "respiratory or thoracic disease"),
        ("EFO_0009690", "urinary system disease"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease"),
        ("MONDO_0021205", "disorder of ear"),
        ("EFO_0000540", "immune system disease"),
        ("EFO_0005803", "hematologic disease"),
        ("EFO_0000618", "nervous system disease"),
        ("MONDO_0002025", "psychiatric disorder"),
        ("OTAR_0000020", "nutritional or metabolic disease"),
        ("OTAR_0000018", "genetic, familial or congenital disease"),
        ("OTAR_0000009", "injury, poisoning or other complication"),
        ("EFO_0003765", "sign or symptom"),
        ("other", "other"),
    )
)

In [4]:
# Input locations (Open Targets 25.03 release bucket).
TARGET_PATH = "gs://open-targets-data-releases/25.03/output/target"
STUDY_PATH = "gs://open-targets-data-releases/25.03/output/study"
CREDIBLE_SET_PATH = "gs://open-targets-data-releases/25.03/output/credible_set"
TARGET_PRIORITISATION_PATH = (
    "gs://open-targets-data-releases/25.03/output/target_prioritisation"
)

# Load each dataset as a Spark DataFrame.
target = session.spark.read.parquet(TARGET_PATH)
studies = session.spark.read.parquet(STUDY_PATH)
cred_sets = session.spark.read.parquet(CREDIBLE_SET_PATH)

# Only the tissue specificity/distribution columns are kept from target prioritisation.
tissue_expression = session.spark.read.parquet(TARGET_PRIORITISATION_PATH).select(
    "targetId", "tissueSpecificity", "tissueDistribution"
)


25/05/20 12:20:48 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-google-hadoop-file-system.properties,hadoop-metrics2.properties
                                                                                

# Generating the study index with assigned therapeutic areas

In [5]:
# Keep only the studies whose studyType is 'gwas'.
is_gwas_study = f.col("studyType") == "gwas"
gwas = studies.where(is_gwas_study)

In [6]:
# UDF: pick the primary therapeutic area for a disease, i.e. the FIRST key of
# `therapy_area_hierarchy` (in dict order) that occurs among the supplied ids.
@f.udf(t.StringType())
def get_first_matching_therapeutic_area(therapeutic_areas_list):
    """Return the highest-priority therapeutic area id found in the input.

    Args:
        therapeutic_areas_list: collection of ontology ids to search (the
            caller in this notebook passes the disease `ancestors` column);
            may be None.

    Returns:
        The first key of `therapy_area_hierarchy` contained in
        `therapeutic_areas_list`, or None when the input is None or no key
        is present.
    """
    if therapeutic_areas_list is None:
        return None
    # Iterating the dict yields its keys in insertion (priority) order.
    return next(
        (
            area_id
            for area_id in therapy_area_hierarchy
            if area_id in therapeutic_areas_list
        ),
        None,
    )

# Build a plain Python dict mapping diseaseId -> primary therapeutic area.
disease_index = session.spark.read.parquet(
    'gs://open-targets-data-releases/25.03/output/disease/disease.parquet'
)
# One row per disease id referenced by any study; used to restrict the lookup.
study_disease_ids = studies.select(f.explode('diseaseIds').alias('efo'))

efo_ta = (
    disease_index
    .select('id', 'ancestors')
    .withColumn(
        'primaryTherapeuticArea',
        # Diseases with no ancestor in the hierarchy fall back to 'other'.
        f.coalesce(
            get_first_matching_therapeutic_area(f.col('ancestors')),
            f.lit('other'),
        ),
    )
    # Semi join: keep only the diseases that appear in `studies.diseaseIds`.
    .join(study_disease_ids, f.col('id') == f.col('efo'), 'semi')
)

# Collect the (id, primaryTherapeuticArea) rows to the driver and index them by id.
efo_ta_lookup = efo_ta.select('id', 'primaryTherapeuticArea').collect()
efo_ta_dict = {
    disease_id: primary_area for disease_id, primary_area in efo_ta_lookup
}

# UDF: map an array of diseaseIds to the array of their distinct primary
# therapeutic areas, using the `efo_ta_dict` lookup.
@f.udf(t.ArrayType(t.StringType()))
def map_efos_to_therapeutic_areas(efo_ids):
    """Translate disease ids into their distinct primary therapeutic areas.

    Args:
        efo_ids: collection of disease ids (the `diseaseIds` column); may be None.

    Returns:
        None when `efo_ids` is None; otherwise a sorted list of the distinct
        therapeutic areas found in `efo_ta_dict` for the given ids. Ids that
        are missing from the lookup are ignored, so the list may be empty.
    """
    if efo_ids is None:
        return None
    # De-duplicate once, after all ids have been looked up, instead of
    # rebuilding the list through a set on every loop iteration.
    mapped_areas = {efo_ta_dict.get(efo_id) for efo_id in efo_ids}
    mapped_areas.discard(None)
    # Sorting makes the array order reproducible (set iteration order is not).
    return sorted(mapped_areas)
        

                                                                                

In [7]:
# Output flag column -> therapeutic area id it tests for.
# Every id here is a key of `therapy_area_hierarchy`; "measurement" is handled
# separately below as a boolean column and is not part of the total.
# NOTE: immuneSystemDisease uses EFO_0000540 (the hierarchy key). The id was
# previously misspelled as EFO_000540, which can never occur in
# `mappedTherapeuticAreas`, so the flag was always 0.
therapeutic_area_flag_columns = {
    "cancerOrBenignTumor": "MONDO_0045024",
    "infectiousDisease": "EFO_0005741",
    "pregnancyOrPerinatalDisease": "OTAR_0000014",
    "disorderOfVisualSystem": "MONDO_0024458",
    "cardiovascularDisease": "EFO_0000319",
    "pancreasDisease": "EFO_0009605",
    "gastrointestinalDisease": "EFO_0010282",
    "reproductiveSystemOrBreastDisease": "OTAR_0000017",
    "integumentarySystemDisease": "EFO_0010285",
    "endocrineSystemDisease": "EFO_0001379",
    "respiratoryOrThoracicDisease": "OTAR_0000010",
    "urinarySystemDisease": "EFO_0009690",
    "musculoskeletalOrConnectiveTissueDisease": "OTAR_0000006",
    "disorderOfEar": "MONDO_0021205",
    "immuneSystemDisease": "EFO_0000540",
    "hematologicDisease": "EFO_0005803",
    "nervousSystemDisease": "EFO_0000618",
    "psychiatricDisorder": "MONDO_0002025",
    "nutritionalOrMetabolicDisease": "OTAR_0000020",
    "geneticFamilialOrCongenitalDisease": "OTAR_0000018",
    "injuryPoisoningOrOtherComplication": "OTAR_0000009",
    "signOrSymptom": "EFO_0003765",
    "other": "other",
}

gwas = (
    # Distinct primary therapeutic areas of all diseases attached to the study.
    gwas.withColumn(
        "mappedTherapeuticAreas", map_efos_to_therapeutic_areas(f.col("diseaseIds"))
    )
    # Boolean: does the study map to the "measurement" area (EFO_0001444)?
    .withColumn(
        "measurement", f.array_contains("mappedTherapeuticAreas", "EFO_0001444")
    )
    # True only when nCases < nControls; the when/otherwise form turns a null
    # comparison (missing nCases or nControls) into False rather than null.
    .withColumn(
        "binaryLessCases",
        f.when(f.col("nCases") < f.col("nControls"), True).otherwise(False),
    )
    # One 0/1 indicator column per therapeutic area.
    .withColumns(
        {
            column_name: f.when(
                f.array_contains("mappedTherapeuticAreas", area_id), 1
            ).otherwise(0)
            for column_name, area_id in therapeutic_area_flag_columns.items()
        }
    )
    # Number of therapeutic areas (excluding "measurement") the study maps to.
    .withColumn(
        "totalTherapeuticAreas",
        sum(f.col(column_name) for column_name in therapeutic_area_flag_columns),
    )
)

In [8]:
# Among studies flagged `binaryLessCases`, count the non-null studyIds
# separately for measurement and non-measurement studies.
binary_studies_by_measurement = (
    gwas.where(f.col("binaryLessCases"))
    .groupBy("measurement")
    .agg(f.count("studyId"))
)
binary_studies_by_measurement.show()

                                                                                

+-----------+--------------+
|measurement|count(studyId)|
+-----------+--------------+
|       true|          3860|
|      false|         15389|
+-----------+--------------+



In [9]:
# Inspect `binaryLessCases` studies that are not measurements, listing the
# studies that map to the most therapeutic areas first.
(
    gwas
    .where(f.col("binaryLessCases"))
    .where(~f.col("measurement"))
    .orderBy(f.col("totalTherapeuticAreas").desc())
    .show()
)

25/05/20 12:21:01 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+--------------------+------+-----------+---------+--------------------+------------------------+---------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+------------------+--------------------+-------------+--------------------+-----------+---------+--------------------+--------------------+--------------------+-----------+----------------------+-----------+---------------+-------------------+-----------------+---------------------------+----------------------+---------------------+---------------+-----------------------+---------------------------------+--------------------------+----------------------+----------------------------+--------------------+----------------------------------------+-------------+-------------------+------------------+--------------------+-------------------+--

                                                                                

In [10]:
# Persist the annotated study index as parquet, replacing any existing output.
gwas_therapeutic_areas_path = (
    'gs://genetics-portal-dev-analysis/dc16/output/gentropy_paper/gwas_therapeutic_areas'
)
gwas.write.parquet(gwas_therapeutic_areas_path, mode='overwrite')

                                                                                