In [1]:
import pyspark.sql.functions as f
import matplotlib.pyplot as plt
import pandas as pd

from gentropy.common.session import Session
from gentropy.dataset.colocalisation import Colocalisation
from gentropy.dataset.l2g_prediction import L2GPrediction
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus



IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [2]:
# Spark memory settings passed to the gentropy Session: 10g each for the
# executors and for the driver.
spark_memory_conf = {
    'spark.executor.memory': '10g',
    'spark.driver.memory': '10g',
}
session = Session(extended_spark_conf=spark_memory_conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/10 11:19:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Input locations. The three 25.03 release datasets share one root directory,
# so it is spelled once here to make switching release a one-line change.
RELEASE_DIR = "/users/dc16/data/releases/25.03"
STUDY_INDEX_TA_PATH = (
    "/users/dc16/data/gentropy_paper/gwas_study_index_therapeutic_areas"
)

# Target (gene) annotation, read as a plain Spark DataFrame.
target = session.spark.read.parquet(f"{RELEASE_DIR}/target")

# L2G predictions, reduced to the three columns used downstream.
l2g = L2GPrediction.from_parquet(
    session, f"{RELEASE_DIR}/l2g_prediction"
).df.select("studyLocusId", "geneId", "score")

# Credible sets, kept as a StudyLocus dataset (its DataFrame is `cred_sets.df`).
cred_sets = StudyLocus.from_parquet(session, f"{RELEASE_DIR}/credible_set")

# GWAS study index annotated with one column per therapeutic area.
studies = (
    session.spark.read.parquet(STUDY_INDEX_TA_PATH)
    .select(
        "studyId",
        "traitFromSource",
        "traitFromSourceMappedIds",
        "nCases",
        "nControls",
        "nSamples",
        "Haematology",
        "Metabolic",
        "Congenital",
        "Signs/symptoms",
        "Neurology",
        "Immune",
        "Psychiatry",
        "Dermatology",
        "Ophthalmology",
        "Cardiovascular",
        "Oncology",
        "Respiratory",
        "Digestive",
        "Endocrine",
        "Musculoskeletal",
        "Infection",
        "Measurement",
        # The source parquet spells this column "bianry_less_cases"; expose it
        # under the corrected name.
        f.col("bianry_less_cases").alias("binary_less_cases"),
        "Other",
    )
    # Keep only studies with Measurement == 0 ...
    .filter(f.col("Measurement") == 0)
    # ... and binary_less_cases == 1. The filter uses the aliased name that the
    # select above actually outputs; filtering on the original misspelling here
    # would only resolve through Spark's implicit lookup of columns that the
    # projection has already renamed away.
    .filter(f.col("binary_less_cases") == 1)
)


In [4]:
def _annotate_l2g_predictions(min_score):
    """Return L2G predictions with ``score >= min_score`` joined to locus and study data.

    Args:
        min_score: Inclusive lower bound applied to the L2G ``score`` column.

    Returns:
        A Spark DataFrame built from ``l2g``, inner-joined on ``studyLocusId``
        to a column subset of ``cred_sets.df`` and then inner-joined on
        ``studyId`` to ``studies``. Rows without a match in either join are
        dropped.
    """
    credible_set_columns = cred_sets.df.select(
        "studyLocusId",
        "variantId",
        "studyId",
        "beta",
        "pValueMantissa",
        "pValueExponent",
    )
    return (
        l2g.filter(f.col("score") >= min_score)
        .join(credible_set_columns, "studyLocusId", "inner")
        .join(studies, "studyId", "inner")
    )


# Same pipeline at two score thresholds: 0.5 and the more permissive 0.05.
l2g_signif = _annotate_l2g_predictions(0.5)
l2g_margin = _annotate_l2g_predictions(0.05)

In [5]:
# Preview the joined predictions without truncating cell contents
# (show() prints the first 20 rows by default).
l2g_signif.show(truncate=False)

25/04/10 11:19:50 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------------------------------+--------------------------------+---------------+------------------+----------------+---------------------+--------------+--------------+---------------------------------------------------------------------------------------------------------------------+------------------------------+------+---------+--------+-----------+---------+----------+--------------+---------+------+----------+-----------+-------------+--------------+--------+-----------+---------+---------+---------------+---------+-----------+-----------------+-----+
|studyId                         |studyLocusId                    |geneId         |score             |variantId       |beta                 |pValueMantissa|pValueExponent|traitFromSource                                                                                                      |traitFromSourceMappedIds      |nCases|nControls|nSamples|Haematology|Metabolic|Congenital|Signs/symptoms|Neurology|Immune|Psychiatry|Dermatology

In [6]:
# Sum every therapeutic-area column across all rows of `l2g_signif` and print
# the single resulting row vertically (one area per line).
THERAPEUTIC_AREAS = [
    "Haematology",
    "Metabolic",
    "Congenital",
    "Signs/symptoms",
    "Neurology",
    "Immune",
    "Psychiatry",
    "Dermatology",
    "Ophthalmology",
    "Cardiovascular",
    "Oncology",
    "Respiratory",
    "Digestive",
    "Endocrine",
    "Musculoskeletal",
    "Infection",
    "Other",
]
area_totals = [f.sum(area).alias(area) for area in THERAPEUTIC_AREAS]
l2g_signif.agg(*area_totals).show(vertical=True)



-RECORD 0---------------
 Haematology     | 394  
 Metabolic       | 5861 
 Congenital      | 5480 
 Signs/symptoms  | 265  
 Neurology       | 6677 
 Immune          | 6416 
 Psychiatry      | 4039 
 Dermatology     | 1535 
 Ophthalmology   | 2628 
 Cardiovascular  | 7205 
 Oncology        | 7176 
 Respiratory     | 3167 
 Digestive       | 3703 
 Endocrine       | 5644 
 Musculoskeletal | 5287 
 Infection       | 719  
 Other           | 2919 



                                                                                

In [7]:
# Sum every therapeutic-area column across all rows of `l2g_margin` and print
# the single resulting row vertically (one area per line).
THERAPEUTIC_AREAS = [
    "Haematology",
    "Metabolic",
    "Congenital",
    "Signs/symptoms",
    "Neurology",
    "Immune",
    "Psychiatry",
    "Dermatology",
    "Ophthalmology",
    "Cardiovascular",
    "Oncology",
    "Respiratory",
    "Digestive",
    "Endocrine",
    "Musculoskeletal",
    "Infection",
    "Other",
]
area_totals = [f.sum(area).alias(area) for area in THERAPEUTIC_AREAS]
l2g_margin.agg(*area_totals).show(vertical=True)



-RECORD 0----------------
 Haematology     | 1061  
 Metabolic       | 13174 
 Congenital      | 13071 
 Signs/symptoms  | 760   
 Neurology       | 16564 
 Immune          | 15445 
 Psychiatry      | 10282 
 Dermatology     | 3793  
 Ophthalmology   | 5615  
 Cardiovascular  | 16869 
 Oncology        | 15381 
 Respiratory     | 7788  
 Digestive       | 9113  
 Endocrine       | 12471 
 Musculoskeletal | 12803 
 Infection       | 1683  
 Other           | 6724  



                                                                                

In [32]:
# Column-wise totals of each therapeutic-area column over the study index,
# producing a one-row DataFrame.
#
# `studies` is already restricted to Measurement == 0 and
# bianry_less_cases == 1 where it is built, so those two filters are not
# repeated here. Repeating them also meant referencing the misspelled source
# column, which the select on `studies` has already renamed.
THERAPEUTIC_AREAS = [
    "Haematology",
    "Metabolic",
    "Congenital",
    "Signs/symptoms",
    "Neurology",
    "Immune",
    "Psychiatry",
    "Dermatology",
    "Ophthalmology",
    "Cardiovascular",
    "Oncology",
    "Respiratory",
    "Digestive",
    "Endocrine",
    "Musculoskeletal",
    "Infection",
    "Other",
]
studies_per_therapeutic_area = studies.agg(
    *[f.sum(area).alias(area) for area in THERAPEUTIC_AREAS]
)

In [34]:
# Therapeutic-area columns, in the order they appear in the output table.
THERAPEUTIC_AREAS = [
    "Haematology",
    "Metabolic",
    "Congenital",
    "Signs/symptoms",
    "Neurology",
    "Immune",
    "Psychiatry",
    "Dermatology",
    "Ophthalmology",
    "Cardiovascular",
    "Oncology",
    "Respiratory",
    "Digestive",
    "Endocrine",
    "Musculoskeletal",
    "Infection",
    "Other",
]

# `studies_per_therapeutic_area` is a one-row aggregate. Collect it once:
# calling .collect() per area would launch a separate Spark job for each of
# the proportion columns to fetch the same row.
study_totals = studies_per_therapeutic_area.collect()[0]

# One row per target carrying its "lof" constraint entry, plus basic gene
# annotation. The `constraint` array is exploded so each entry can be filtered.
target_lof_constraint = (
    target.withColumn("constraint", f.explode("constraint"))
    .select(
        "id",
        "approvedSymbol",
        "biotype",
        "genomicLocation.chromosome",
        "genomicLocation.start",
        "genomicLocation.end",
        "constraint.constraintType",
        "constraint.score",
    )
    .filter(f.col("constraintType") == "lof")
)

# Per-gene summary of the score >= 0.5 predictions:
#   <Area>              sum of the area column over the gene's rows
#   total               sum of all per-area sums
#   <area>_proportion   per-area sum divided by the study-index total for that area
#   lofConstraint       the target's "lof" constraint score
#   pleiotropy          fraction of areas in which the gene's sum is > 0
# The inner join drops genes without a "lof" constraint entry in `target`.
genes_therapeutic_areas = (
    l2g_signif.groupBy("geneId")
    .agg(*[f.sum(area).alias(area) for area in THERAPEUTIC_AREAS])
    .withColumn(
        "total",
        # Left-to-right column sum, seeded with the first area so that no
        # literal 0 is introduced into the expression.
        sum(
            (f.col(area) for area in THERAPEUTIC_AREAS[1:]),
            f.col(THERAPEUTIC_AREAS[0]),
        ),
    )
    .withColumns(
        {
            # e.g. "Signs/symptoms" -> "signs_symptoms_proportion"
            f"{area.lower().replace('/', '_')}_proportion": f.col(area)
            / study_totals[area]
            for area in THERAPEUTIC_AREAS
        }
    )
    .join(
        target_lof_constraint,
        target["id"] == f.col("geneId"),
        "inner",
    )
    .drop("id", "constraintType")
    .withColumnRenamed("score", "lofConstraint")
    .withColumn(
        "pleiotropy",
        sum(
            (
                f.when(f.col(area) > 0, 1).otherwise(0)
                for area in THERAPEUTIC_AREAS[1:]
            ),
            f.when(f.col(THERAPEUTIC_AREAS[0]) > 0, 1).otherwise(0),
        )
        # Divide by the number of areas rather than a hard-coded 17, so the
        # denominator cannot drift from the list above.
        / len(THERAPEUTIC_AREAS),
    )
    .sort(f.desc("total"))
)

In [37]:
# Spot check: the summed Cardiovascular column from the one-row study-index
# aggregate (this is the denominator used for `cardiovascular_proportion`).
studies_per_therapeutic_area.collect()[0]['Cardiovascular']

1698

In [35]:
# Preview the per-gene table; rows are sorted by `total` in descending order.
genes_therapeutic_areas.show()



+---------------+-----------+---------+----------+--------------+---------+------+----------+-----------+-------------+--------------+--------+-----------+---------+---------+---------------+---------+-----+-----+----------------------+--------------------+---------------------+-------------------------+--------------------+--------------------+---------------------+----------------------+------------------------+-------------------------+--------------------+----------------------+--------------------+--------------------+--------------------------+--------------------+--------------------+--------------+--------------+----------+---------+---------+-------------+-------------------+
|         geneId|Haematology|Metabolic|Congenital|Signs/symptoms|Neurology|Immune|Psychiatry|Dermatology|Ophthalmology|Cardiovascular|Oncology|Respiratory|Digestive|Endocrine|Musculoskeletal|Infection|Other|total|haematology_proportion|metabolic_proportion|congenital_proportion|signs_symptoms_proportion|n

                                                                                

In [30]:
# Materialise the whole per-gene Spark table as a pandas DataFrame on the driver.
genes_pd = genes_therapeutic_areas.toPandas()

                                                                                

In [11]:
# NOTE(review): the output saved with this cell looks stale. It shows a
# `lofScore` column and no `*_proportion` / `pleiotropy` columns, whereas the
# current `genes_therapeutic_areas` definition names the score `lofConstraint`
# and adds those columns. Re-run after rebuilding `genes_pd`.
genes_pd

Unnamed: 0,geneId,Haematology,Metabolic,Congenital,Signs/symptoms,Neurology,Immune,Psychiatry,Dermatology,Ophthalmology,...,Musculoskeletal,Infection,Other,total,approvedSymbol,biotype,chromosome,start,end,lofScore
0,ENSG00000147883,0,73,5,0,38,2,4,1,49,...,0,0,19,469,CDKN2B,protein_coding,9,22002903,22009305,7.336900e-03
1,ENSG00000130203,0,33,29,4,77,1,60,1,12,...,9,8,9,334,APOE,protein_coding,19,44905791,44909393,1.868500e-03
2,ENSG00000164093,0,0,26,0,38,24,2,0,9,...,7,0,8,317,PITX2,protein_coding,4,110617423,110642123,9.761800e-01
3,ENSG00000134242,3,32,13,0,11,93,0,0,7,...,51,0,3,304,PTPN22,protein_coding,1,113813811,113871753,3.492100e-21
4,ENSG00000162594,0,2,82,0,2,92,0,13,3,...,11,2,0,282,IL23R,protein_coding,1,67138907,67259979,5.596100e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6829,ENSG00000111145,0,0,0,0,0,0,0,0,0,...,0,0,0,1,ELK3,protein_coding,12,96194375,96269824,8.350800e-02
6830,ENSG00000141905,0,0,0,0,0,0,0,0,0,...,0,0,0,1,NFIC,protein_coding,19,3314403,3469217,9.053400e-02
6831,ENSG00000177189,0,0,0,0,0,0,0,0,0,...,0,0,0,1,RPS6KA3,protein_coding,X,20149911,20267519,9.999900e-01
6832,ENSG00000123094,0,0,0,0,0,0,0,0,0,...,0,0,0,1,RASSF8,protein_coding,12,25958232,26079892,9.415800e-01


In [12]:
# Print the schema of the target dataset (source of the `genomicLocation`
# and `constraint` fields selected for the per-gene table).
target.printSchema()

root
 |-- id: string (nullable = true)
 |-- approvedSymbol: string (nullable = true)
 |-- biotype: string (nullable = true)
 |-- transcriptIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- canonicalTranscript: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- chromosome: string (nullable = true)
 |    |-- start: long (nullable = true)
 |    |-- end: long (nullable = true)
 |    |-- strand: string (nullable = true)
 |-- canonicalExons: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genomicLocation: struct (nullable = true)
 |    |-- chromosome: string (nullable = true)
 |    |-- start: long (nullable = true)
 |    |-- end: long (nullable = true)
 |    |-- strand: integer (nullable = true)
 |-- alternativeGenes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- approvedName: string (nullable = true)
 |-- go: array (nullable = true)
 |    |-- element: struct (containsNull = tru