In [1]:
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=sLvoHutklTii364XYQ6y7ImqbGlZIf&access_type=offline&code_challenge=w7B4jg_6VNF62RQLyknrvoi2Q5c9ZJC-go5NR7RIaQ4&code_challenge_method=S256


Credentials saved to file: [/Users/yt4/.config/gcloud/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "open-targets-genetics-dev" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.


Updates are available for some Google Clo

In [1]:
!gcloud auth login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=UiDTZiS2COSkwLlSyy0l1pVKoF8Yfg&access_type=offline&code_challenge=4jMAWGLzSEe-ozTnSiVWYKCSv6hg9PyZzX727eiHbYk&code_challenge_method=S256


You are now logged in as [yt4@sanger.ac.uk].
Your current project is [open-targets-genetics-dev].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ gcloud components update



In [1]:
import os

import hail as hl
import numpy as np
import pyspark.sql.functions as f
from pyspark.sql import DataFrame

from gentropy.common.session import Session
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.summary_statistics import SummaryStatistics
from gentropy.dataset.study_locus import StudyLocus
from gentropy.susie_finemapper import SusieFineMapperStep
from gentropy.method.drug_enrichment_from_evid import chemblDrugEnrichment

"""Common utilities for the project."""

import os
from pathlib import Path
from gentropy.common.session import Session
import logging


def get_gcs_credentials() -> str:
    """Locate a Google Cloud credentials file in the user's gcloud config directory.

    Two files are probed, in order of preference:

    1. ``$HOME/.config/gcloud/application_default_credentials.json``
    2. ``$HOME/.config/gcloud/service_account_credentials.json``

    ``$HOME`` falls back to the current directory when the variable is unset.

    Returns:
        str: Path of the first credentials file that exists.

    Raises:
        FileNotFoundError: If neither file exists.
    """
    home = os.getenv("HOME", ".")
    candidates = (
        os.path.join(home, ".config/gcloud/application_default_credentials.json"),
        os.path.join(home, ".config/gcloud/service_account_credentials.json"),
    )

    # The service-account path used to be computed but never checked; it is now
    # the fallback when no application-default credentials are present.
    for candidate in candidates:
        if Path(candidate).exists():
            return candidate

    raise FileNotFoundError(
        "No GCS credentials found. Looked for: " + ", ".join(candidates)
    )


def get_gcs_hadoop_connector_jar() -> str:
    """Return the URL of the Google Cloud Storage Hadoop connector jar for Spark.

    Only the URL string is produced; this function does not download anything.
    """
    connector_url = "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar"
    return connector_url


def gcs_conf(
    credentials_path=None, project="open-targets-genetics-dev"
) -> dict[str, str]:
    """Build the Spark settings used to access ``gs://`` paths via the Hadoop connector.

    Args:
        credentials_path: Credentials JSON file passed to the connector. When falsy,
            the path is looked up with ``get_gcs_credentials()``.
        project: Value stored under ``spark.hadoop.fs.gs.project.id``.

    Returns:
        dict[str, str]: Spark configuration keys mapped to their string values.
    """
    if not credentials_path:
        credentials_path = get_gcs_credentials()

    # Driver-side resource settings.
    driver_settings = {
        "spark.driver.memory": "12g",
        "spark.kryoserializer.buffer.max": "500m",
        "spark.driver.maxResultSize": "2g",
    }

    # GCS filesystem implementation, connector jar and authentication.
    gcs_settings = {
        "spark.hadoop.fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
        "spark.jars": get_gcs_hadoop_connector_jar(),
        "spark.hadoop.google.cloud.auth.service.account.enable": "true",
        "spark.hadoop.fs.gs.project.id": project,
        "spark.hadoop.google.cloud.auth.service.account.json.keyfile": credentials_path,
        "spark.hadoop.fs.gs.requester.pays.mode": "AUTO",
    }

    return {**driver_settings, **gcs_settings}


class GentropySession(Session):
    """``Session`` subclass that always carries the GCS connector settings from ``gcs_conf()``."""

    def __init__(self, *args, **kwargs):
        """Merge the GCS defaults into ``extended_spark_conf`` and start the session.

        Keys the caller supplies in ``extended_spark_conf`` take precedence over the
        defaults from ``gcs_conf()``, so settings such as ``spark.driver.memory`` can
        actually be tuned through that parameter (overriding ``spark.jars`` or the
        auth keys therefore replaces the GCS defaults as well). The caller's dict is
        not modified. A missing or ``None`` ``extended_spark_conf`` yields the
        defaults only.

        Args:
            *args: Forwarded unchanged to ``Session.__init__``.
            **kwargs: Forwarded to ``Session.__init__`` after ``extended_spark_conf``
                has been replaced by the merged configuration.
        """
        user_conf = kwargs.get("extended_spark_conf") or {}
        # Build a fresh dict: defaults first, caller's values layered on top.
        kwargs["extended_spark_conf"] = {**gcs_conf(), **user_conf}
        super().__init__(*args, **kwargs)

    @property
    def conf(self):
        """Return every entry of the running Spark context's configuration.

        A warning is logged on each access because the configuration cannot be
        changed through this property.
        """
        logging.warning(
            "To change the config restart the session and use the `extended_spark_conf` parameter."
        )
        return self.spark.sparkContext.getConf().getAll()

# Start the session; GentropySession adds the gcs_conf() settings to the Spark configuration.
session= GentropySession()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/27 13:05:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/27 13:05:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Bucket prefix of the release being analysed; other cells build their paths from it.
path_to_release_folder = "gs://open-targets-data-releases/25.06/"

# Study index and credible sets of that release, loaded as gentropy datasets.
si = StudyIndex.from_parquet(session, f"{path_to_release_folder}output/study/")
sl = StudyLocus.from_parquet(session, f"{path_to_release_folder}output/credible_set/")

                                                                                

# Using the full training set to calculate FDR etc.

In [4]:
training_set=session.spark.read.json("gs://genetics-portal-dev-analysis/yt4/2506_release/training_set/20250625_gentropy_paper_v1.json")

                                                                                

In [5]:
training_set.show(1)

+--------------------+---------------+---------------+------------+--------------------+---------------+
|          diseaseIds|         geneId|goldStandardSet|     studyId|        studyLocusId|      variantId|
+--------------------+---------------+---------------+------------+--------------------+---------------+
|[EFO_0004611, EFO...|ENSG00000130173|       negative|GCST90091598|08ef835a25f0bf2c8...|19_11079858_G_A|
+--------------------+---------------+---------------+------------+--------------------+---------------+
only showing top 1 row



In [6]:
# Combining it with the L2G predictions: keep the locus/gene key and the score only.
l2g = (
    session.spark.read
    .parquet("gs://ot-team/irene/l2g/xgboost/2706/l2g_predictions")
    .select("studyLocusId", "geneId", "score")
)

In [7]:
l2g.show(1)

[Stage 6:>                                                          (0 + 1) / 1]

+--------------------+---------------+-----------+
|        studyLocusId|         geneId|      score|
+--------------------+---------------+-----------+
|5b9c516a7cf716630...|ENSG00000005189|0.011335643|
+--------------------+---------------+-----------+
only showing top 1 row



                                                                                

In [8]:
# Inputs for the evidence-vs-training-set evaluation.
# NOTE(review): the evaluation calls visible in this notebook pass these values as
# literals instead of reading these three names — confirm whether they are still needed.
table_with_score = l2g
score_column = "score"
min_score = 0.5

In [9]:
l2g.count()

                                                                                

10623371

In [10]:
training_set.groupBy("goldStandardSet").count().show()



+---------------+------+
|goldStandardSet| count|
+---------------+------+
|       positive|  8520|
|       negative|124450|
+---------------+------+



                                                                                

In [11]:
chemblDrugEnrichment.studyLocusId_based_evidence_table_vs_training_set(
    table_with_score=l2g,
    training_set=training_set,
    score_column="score",
    min_score=0.5,
    name_of_the_evidence="l2g",
)

                                                                                

Unnamed: 0,Evidence,TP,TN,FP,FN,Sensitivity (recall),Specificity (selectivity),PPV (precision),FDR,Balanced_accuracy
0,l2g,6321,123666,784,2199,0.741901,0.9937,0.889655,0.110345,0.867801


In [12]:
training_set.filter(f.col("goldStandardSet")=="positive").select("geneId", "diseaseIds", "variantId").distinct().count()

                                                                                

4468

In [13]:
# L2G feature matrix of the release, restricted to rows with isProteinCoding == 1 and cached.
fm = (
    session.spark.read
    .parquet(f"{path_to_release_folder}intermediate/l2g_feature_matrix/")
    .filter(f.col("isProteinCoding") == 1)
    .cache()
)
fm.count()

25/06/27 13:09:22 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

10623371

In [22]:
# Left-join the L2G score onto every feature-matrix row; null numeric values
# (including the score of rows without a prediction) are replaced by 0.
combined_df = (
    fm.join(
        l2g.select("studyLocusId", "geneId", "score"),
        on=["geneId", "studyLocusId"],
        how="left",
    )
    .fillna(0)
    .cache()
)
combined_df.count()

                                                                                

10623371

In [23]:
# Thresholds shared by the three colocalisation flags.
clpp_thr = 0.01
coloc_thr = 0.8


def _binary_flag(condition):
    """Return a Column that is 1 where `condition` holds and 0 otherwise."""
    return f.when(condition, 1).otherwise(0)


def _coloc_flag(clpp_column, h4_column):
    """Flag rows whose CLPP maximum or H4 maximum reaches its threshold."""
    return _binary_flag(
        (f.col(clpp_column) >= clpp_thr) | (f.col(h4_column) >= coloc_thr)
    )


combined_df = (
    combined_df
    # L2G score binarised at three cut-offs.
    .withColumn("l2g_05", _binary_flag(f.col("score") >= 0.5))
    .withColumn("l2g_005", _binary_flag(f.col("score") >= 0.05))
    .withColumn("l2g_08", _binary_flag(f.col("score") >= 0.8))
    # One colocalisation flag per QTL type.
    .withColumn("eQTL_coloc", _coloc_flag("eQtlColocClppMaximum", "eQtlColocH4Maximum"))
    .withColumn("pQTL_coloc", _coloc_flag("pQtlColocClppMaximum", "pQtlColocH4Maximum"))
    .withColumn("sQTL_coloc", _coloc_flag("sQtlColocClppMaximum", "sQtlColocH4Maximum"))
    # VEP and distance-neighbourhood flags.
    .withColumn("VEP", _binary_flag(f.col("vepMaximum") >= 0.66))
    .withColumn(
        "distance",
        _binary_flag(
            (f.col("distanceSentinelFootprintNeighbourhood") == 1)
            | (f.col("distanceSentinelTssNeighbourhood") == 1)
        ),
    )
    # Sum of the five non-L2G flags; depends on the columns added above.
    .withColumn(
        "sum_coloc_vep_distance",
        f.col("eQTL_coloc")
        + f.col("pQTL_coloc")
        + f.col("sQTL_coloc")
        + f.col("VEP")
        + f.col("distance"),
    )
    # eQTL colocalisation present while the L2G score is below 0.05.
    .withColumn(
        "eQTLs_not_in_L2G",
        _binary_flag((f.col("eQTL_coloc") == 1) & (f.col("l2g_005") == 0)),
    )
    .cache()
)

combined_df.count()

                                                                                

10623371

In [24]:
combined_df.show(1)

+---------------+--------------------+---------------------+---------------------+----------------------------------+-------------------------+--------------------------------------+-------------------+--------------------------------+---------------+----------------------------+--------------------+---------------------------------+------------------+-------------------------------+--------------+--------------------+---------------------------------+------------------+-------------------------------+---------------------+--------------------+---------------------------------+------------------+-------------------------------+----------+-----------------------+-------+--------------------+---------------+------------+------+-------+------+----------+----------+----------+---+--------+----------------------+----------------+
|         geneId|        studyLocusId|credibleSetConfidence|distanceFootprintMean|distanceFootprintMeanNeighbourhood|distanceSentinelFootprint|distanceSentinelFoot

In [17]:
# Binary evidence columns of combined_df that are evaluated against the training set.
colnames = [
    "l2g_05",
    "l2g_005",
    "l2g_08",
    "eQTL_coloc",
    "pQTL_coloc",
    "sQTL_coloc",
    "VEP",
    "distance",
    "sum_coloc_vep_distance",
    "eQTLs_not_in_L2G",
]

In [18]:
import pandas as pd
# One evaluation result per column in `colnames`, in the same order.
all_res = []

# Run the same training-set evaluation once per column of combined_df.
# The column name doubles as the evidence label of the returned table.
for coln in colnames:
    print(coln)
    res=chemblDrugEnrichment.studyLocusId_based_evidence_table_vs_training_set(
        table_with_score=combined_df,
        training_set=training_set,
        score_column=coln,
        min_score=0.5,
        name_of_the_evidence=coln,
    )
    all_res.append(res)

l2g_05


                                                                                

l2g_005


                                                                                

l2g_08


                                                                                

eQTL_coloc


                                                                                

pQTL_coloc


                                                                                

sQTL_coloc


                                                                                

VEP


                                                                                

distance


                                                                                

sum_coloc_vep_distance


                                                                                

eQTLs_not_in_L2G


                                                                                

In [19]:
combined_res= pd.concat(all_res, ignore_index=True)

In [21]:
combined_res

Unnamed: 0,Evidence,TP,TN,FP,FN,Sensitivity (recall),Specificity (selectivity),PPV (precision),FDR,Balanced_accuracy
0,l2g_05,6321,123666,784,2199,0.741901,0.9937,0.889655,0.110345,0.867801
1,l2g_005,7564,117699,6751,956,0.887793,0.945753,0.528397,0.471603,0.916773
2,l2g_08,3773,124276,174,4747,0.44284,0.998602,0.955916,0.044084,0.720721
3,eQTL_coloc,2966,119501,4949,5554,0.348122,0.960233,0.374732,0.625268,0.654178
4,pQTL_coloc,1198,124116,334,7322,0.14061,0.997316,0.781984,0.218016,0.568963
5,sQTL_coloc,2123,122286,2164,6397,0.249178,0.982611,0.495218,0.504782,0.615895
6,VEP,1761,124021,429,6759,0.20669,0.996553,0.80411,0.19589,0.601621
7,distance,6272,121581,2869,2248,0.73615,0.976947,0.686139,0.313861,0.856548
8,sum_coloc_vep_distance,7186,116309,8141,1334,0.843427,0.934584,0.468846,0.531154,0.889006
9,eQTLs_not_in_L2G,34,121815,2635,8486,0.003991,0.978827,0.012739,0.987261,0.491409


In [21]:
combined_res

Unnamed: 0,Evidence,TP,TN,FP,FN,Sensitivity (recall),Specificity (selectivity),PPV (precision),FDR,Balanced_accuracy
0,l2g_05,2925,50485,477,1736,0.627548,0.99064,0.859788,0.140212,0.809094
1,l2g_005,3625,48793,2169,1036,0.77773,0.957439,0.625647,0.374353,0.867584
2,l2g_08,2067,50789,173,2594,0.443467,0.996605,0.922768,0.077232,0.720036
3,eQTL_coloc,1304,49198,1764,3357,0.279768,0.965386,0.425033,0.574967,0.622577
4,pQTL_coloc,580,50861,101,4081,0.124437,0.998018,0.851689,0.148311,0.561227
5,sQTL_coloc,814,50121,841,3847,0.174641,0.983498,0.491843,0.508157,0.579069
6,VEP,622,50777,185,4039,0.133448,0.99637,0.770756,0.229244,0.564909
7,distance,3009,49925,1037,1652,0.64557,0.979652,0.743697,0.256303,0.812611
8,sum_coloc_vep_distance,3494,48118,2844,1167,0.749625,0.944194,0.551278,0.448722,0.846909
9,eQTLs_not_in_L2G,48,50160,802,4613,0.010298,0.984263,0.056471,0.943529,0.497281


In [24]:
combined_res

Unnamed: 0,Evidence,TP,TN,FP,FN,Sensitivity (recall),Specificity (selectivity),PPV (precision),FDR,Balanced_accuracy
0,l2g_05,2163,37036,373,1589,0.576493,0.990029,0.852918,0.147082,0.783261
1,l2g_005,2814,35748,1661,938,0.75,0.955599,0.628827,0.371173,0.852799
2,l2g_08,1602,37279,130,2150,0.426972,0.996525,0.924942,0.075058,0.711749
3,eQTL_coloc,1078,36193,1216,2674,0.287313,0.967494,0.469922,0.530078,0.627404
4,pQTL_coloc,461,37374,35,3291,0.122868,0.999064,0.929435,0.070565,0.560966
5,sQTL_coloc,639,36714,695,3113,0.170309,0.981422,0.47901,0.52099,0.575865
6,VEP,425,37261,148,3327,0.113273,0.996044,0.74171,0.25829,0.554658
7,distance,2538,35569,1840,1214,0.676439,0.950814,0.579717,0.420283,0.813627
8,sum_coloc_vep_distance,2897,34499,2910,855,0.772122,0.922211,0.498881,0.501119,0.847166
9,eQTLs_not_in_L2G,42,36912,497,3710,0.011194,0.986714,0.077922,0.922078,0.498954


In [137]:
combined_res

Unnamed: 0,Evidence,TP,TN,FP,FN,Sensitivity (recall),Specificity (selectivity),PPV (precision),FDR,Balanced_accuracy
0,l2g_05,2448,46068,407,2149,0.532521,0.991243,0.857443,0.142557,0.761882
1,l2g_005,3308,44289,2186,1289,0.7196,0.952964,0.602111,0.397889,0.836282
2,l2g_08,1520,46352,123,3077,0.33065,0.997353,0.925137,0.074863,0.664002
3,eQTL_coloc,1304,44799,1676,3293,0.283663,0.963938,0.437584,0.562416,0.6238
4,pQTL_coloc,501,46430,45,4096,0.108984,0.999032,0.917582,0.082418,0.554008
5,sQTL_coloc,763,45472,1003,3834,0.165978,0.978419,0.43205,0.56795,0.572198
6,VEP,464,46235,240,4133,0.100935,0.994836,0.659091,0.340909,0.547886
7,distance,2741,45307,1168,1856,0.596258,0.974868,0.701202,0.298798,0.785563
8,sum_coloc_vep_distance,3260,43635,2840,1337,0.709158,0.938892,0.534426,0.465574,0.824025
9,eQTLs_not_in_L2G,76,45768,707,4521,0.016533,0.984788,0.097063,0.902937,0.50066


25/05/13 01:38:58 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 940295 ms exceeds timeout 120000 ms
25/05/13 01:38:58 WARN SparkContext: Killing executors is not supported by current scheduler.
25/05/13 01:39:03 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at o

In [22]:
# Disease index of the release.
disease_index_path = f"{path_to_release_folder}output/disease/disease.parquet"
disease_index_orig = session.spark.read.parquet(disease_index_path)

# ChEMBL partition (sourceId=chembl) of the release's evidence output.
platform_chembl_evidence_path = f"{path_to_release_folder}output/evidence/sourceId=chembl"
chembl_evidence = session.spark.read.parquet(platform_chembl_evidence_path)

                                                                                

In [27]:
# Turn the L2G predictions into disease-target evidence.
evidence = chemblDrugEnrichment.to_disease_target_evidence(
    table_with_score=l2g,
    score_column="score",
    # Explicit label: this cell must not depend on a loop variable leaked from
    # another cell, which is undefined on a fresh kernel and otherwise holds
    # whatever column name the loop happened to end on.
    datasource_id="l2g",
    study_locus=sl,
    study_index=si,
    min_score=0.5,
)

# Drug enrichment of that evidence against the ChEMBL evidence of the release.
enrich = chemblDrugEnrichment.drug_enrichemnt_from_evidence(
    evid=evidence,
    disease_index_orig=disease_index_orig,
    chembl_orig=chembl_evidence,
    indirect_assoc_score_thr=0.5,
    efo_ancestors_to_remove=["MONDO_0045024"],
)
enrich

25/06/16 15:36:22 WARN CacheManager: Asked to cache already cached data.        
25/06/16 15:36:23 WARN CacheManager: Asked to cache already cached data.
                                                                                

Unnamed: 0,clinicalPhase,odds_ratio,p_value,ci_low,ci_high,no_evid-low_clinphase,no_evid-high_clinphase,yes_evid-low_clinphase,yes_evid-high_clinphase,total_indirect_assoc
0,2+,1.221076,0.0319445,1.016564,1.466733,6027,30377,136,837,645333
1,3+,1.81969,6.353541e-20,1.598483,2.071509,20178,16226,395,578,645333
2,4+,3.02936,1.0628119999999999e-44,2.627267,3.492991,32120,4284,693,280,645333


In [24]:
# Genetics-portal evidence of the 24.12 release, with `score` exposed as `resourceScore`.
old_l2g = (
    session.spark.read
    .parquet("gs://open-targets-pre-data-releases/partners/24.12/output/etl/parquet/evidence/sourceId=ot_genetics_portal")
    .select("targetId", "diseaseId", "studyId", "score", "variantId")
    .withColumnRenamed("score", "resourceScore")
    .cache()
)
old_l2g.count()

                                                                                

781184

In [25]:
enrich=chemblDrugEnrichment.drug_enrichemnt_from_evidence(
    evid=old_l2g,
    disease_index_orig=disease_index_orig,
    chembl_orig=chembl_evidence,
    indirect_assoc_score_thr=0.5,
    efo_ancestors_to_remove=["MONDO_0045024"]
)
enrich

                                                                                

Unnamed: 0,clinicalPhase,odds_ratio,p_value,ci_low,ci_high,no_evid-low_clinphase,no_evid-high_clinphase,yes_evid-low_clinphase,yes_evid-high_clinphase,total_indirect_assoc
0,2+,1.712922,0.0001131195,1.287199,2.279447,6110,30757,53,457,217543
1,3+,2.013262,1.029277e-14,1.681709,2.410182,20379,16488,194,316,217543
2,4+,3.796779,7.010254999999999e-38,3.152345,4.572954,32476,4391,337,173,217543


In [30]:
# Distinct study IDs whose pubmedId differs from 39024449
# (presumably the MVP publication — TODO confirm).
# NOTE(review): in Spark, `!=` evaluates to NULL when pubmedId is NULL and filter()
# drops such rows, so studies without a pubmedId are not kept here — confirm intended.
studyid_not_mvp = (
    si.df
    .filter(f.col("pubmedId") != 39024449)
    .select("studyId")
    .distinct()
    .cache()
)
studyid_not_mvp.count()

                                                                                

1954579

In [33]:
# Credible sets (studyLocusId) whose study is listed in studyid_not_mvp.
sl_no_mvp = (
    sl.df
    .select("studyLocusId", "studyId")
    .join(studyid_not_mvp, on="studyId", how="inner")
    .select("studyLocusId")
    .cache()
)
sl_no_mvp.count()

                                                                                

2608953

In [35]:
# L2G predictions restricted to the credible sets listed in sl_no_mvp.
l2g_no_mvp = l2g.join(sl_no_mvp, on="studyLocusId", how="inner")

# Turn the restricted predictions into disease-target evidence.
evidence = chemblDrugEnrichment.to_disease_target_evidence(
    table_with_score=l2g_no_mvp,
    score_column="score",
    # Explicit label: this cell must not depend on a loop variable leaked from
    # another cell, which is undefined on a fresh kernel and otherwise holds
    # whatever column name the loop happened to end on.
    datasource_id="l2g",
    study_locus=sl,
    study_index=si,
    min_score=0.5,
)

# Drug enrichment of that evidence against the ChEMBL evidence of the release.
enrich = chemblDrugEnrichment.drug_enrichemnt_from_evidence(
    evid=evidence,
    disease_index_orig=disease_index_orig,
    chembl_orig=chembl_evidence,
    indirect_assoc_score_thr=0.5,
    efo_ancestors_to_remove=["MONDO_0045024"],
)
enrich

                                                                                

Unnamed: 0,clinicalPhase,odds_ratio,p_value,ci_low,ci_high,no_evid-low_clinphase,no_evid-high_clinphase,yes_evid-low_clinphase,yes_evid-high_clinphase,total_indirect_assoc
0,2+,1.427641,0.001647258,1.135942,1.794246,6078,30603,85,611,479213
1,3+,1.862291,7.903428e-16,1.597963,2.170344,20295,16386,278,418,479213
2,4+,3.007808,1.272236e-32,2.545331,3.554316,32318,4363,495,201,479213


25/06/16 17:13:09 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1051371 ms exceeds timeout 120000 ms
25/06/16 17:13:09 WARN SparkContext: Killing executors is not supported by current scheduler.
25/06/16 17:13:15 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at 

# Using the strictest training set

In [99]:
training_set=session.spark.read.json("gs://genetics-portal-dev-analysis/yt4/2506_release/training_set/2506_traing_set_no_literature_0_95.json")

                                                                                

In [100]:
training_set.groupBy("goldStandardSet").count().show()



+---------------+-----+
|goldStandardSet|count|
+---------------+-----+
|       positive| 6809|
|       negative|68651|
+---------------+-----+



                                                                                

In [101]:
training_set.filter(f.col("goldStandardSet")=="positive").select("geneId", "diseaseIds", "variantId").distinct().count()

                                                                                

4164

In [102]:
training_set.filter(f.col("goldStandardSet")=="positive").select("geneId", "diseaseIds").distinct().count()

                                                                                

1121

In [103]:
chemblDrugEnrichment.studyLocusId_based_evidence_table_vs_training_set(
    table_with_score=l2g,
    training_set=training_set,
    score_column="score",
    min_score=0.5,
    name_of_the_evidence="l2g",
)

25/05/13 00:49:37 WARN CacheManager: Asked to cache already cached data.


Unnamed: 0,Evidence,TP,TN,FP,FN,Sensitivity (recall),Specificity (selectivity),PPV (precision),FDR,Balanced_accuracy
0,l2g,4605,68285,366,2204,0.676311,0.994669,0.926373,0.073627,0.83549


In [104]:
from pyspark.sql import Window

# Positive gold-standard rows with their locus, gene, disease and variant identifiers.
tmp1 = (
    training_set
    .filter(f.col("goldStandardSet") == "positive")
    .select("studyLocusId", "geneId", "diseaseIds", "variantId")
)
tmp1.count()

                                                                                

6809

In [105]:
# Keep one studyLocusId per (geneId, diseaseIds, variantId) combination.
# Ordering by studyLocusId makes the kept row the same on every run; ordering by a
# constant leaves the choice to row_number(), which picks arbitrarily among ties.
window_spec = Window.partitionBy("geneId", "diseaseIds", "variantId").orderBy(
    f.col("studyLocusId").asc()
)
tmp1 = tmp1.withColumn("row_number", f.row_number().over(window_spec))

# First row of each partition only; the helper column is no longer needed afterwards.
tmp1 = tmp1.filter(f.col("row_number") == 1).drop("row_number")

# Distinct list of the retained credible sets.
tmp1 = tmp1.select("studyLocusId").distinct().cache()
tmp1.count()

25/05/13 00:49:40 WARN CacheManager: Asked to cache already cached data.


4164

In [106]:
training_set=training_set.join(tmp1, on="studyLocusId", how="inner").cache()
training_set.count()

25/05/13 00:49:41 WARN CacheManager: Asked to cache already cached data.


46109

In [107]:
training_set.filter(f.col("goldStandardSet")=="positive").select("geneId", "diseaseIds", "variantId").distinct().count()

4164

In [108]:
chemblDrugEnrichment.studyLocusId_based_evidence_table_vs_training_set(
    table_with_score=l2g,
    training_set=training_set,
    score_column="score",
    min_score=0.5,
    name_of_the_evidence="l2g",
)

25/05/13 00:49:42 WARN CacheManager: Asked to cache already cached data.


Unnamed: 0,Evidence,TP,TN,FP,FN,Sensitivity (recall),Specificity (selectivity),PPV (precision),FDR,Balanced_accuracy
0,l2g,2240,41608,337,1924,0.537944,0.991966,0.869228,0.130772,0.764955


In [109]:
import pandas as pd
# One evaluation result per column in `colnames`, in the same order.
all_res = []

# Run the same training-set evaluation once per column of combined_df,
# using whatever `training_set` currently refers to.
for coln in colnames:
    print(coln)
    res=chemblDrugEnrichment.studyLocusId_based_evidence_table_vs_training_set(
        table_with_score=combined_df,
        training_set=training_set,
        score_column=coln,
        min_score=0.5,
        name_of_the_evidence=coln,
    )
    all_res.append(res)

l2g_05
l2g_005


                                                                                

l2g_08


                                                                                

eQTL_coloc


                                                                                

pQTL_coloc
sQTL_coloc
VEP
distance
sum_coloc_vep_distance


                                                                                

eQTLs_not_in_L2G


                                                                                

In [110]:
combined_res= pd.concat(all_res, ignore_index=True)

In [111]:
combined_res

Unnamed: 0,Evidence,TP,TN,FP,FN,Sensitivity (recall),Specificity (selectivity),PPV (precision),FDR,Balanced_accuracy
0,l2g_05,2240,41608,337,1924,0.537944,0.991966,0.869228,0.130772,0.764955
1,l2g_005,3025,39970,1975,1139,0.726465,0.952915,0.605,0.395,0.83969
2,l2g_08,1401,41850,95,2763,0.336455,0.997735,0.936497,0.063503,0.667095
3,eQTL_coloc,1198,40411,1534,2966,0.287704,0.963428,0.438507,0.561493,0.625566
4,pQTL_coloc,461,41910,35,3703,0.110711,0.999166,0.929435,0.070565,0.554938
5,sQTL_coloc,723,41004,941,3441,0.173631,0.977566,0.434495,0.565505,0.575598
6,VEP,411,41732,213,3753,0.098703,0.994922,0.658654,0.341346,0.546813
7,distance,2484,40897,1048,1680,0.596542,0.975015,0.703284,0.296716,0.785778
8,sum_coloc_vep_distance,2979,39323,2622,1185,0.715418,0.93749,0.531869,0.468131,0.826454
9,eQTLs_not_in_L2G,72,41283,662,4092,0.017291,0.984217,0.098093,0.901907,0.500754


# Selecting the most conservative gene

In [3]:
# Training set used in this section. The commented-out reads are alternative
# training-set files; only the uncommented `training_set=` line takes effect.
#training_set=session.spark.read.json("gs://genetics-portal-dev-analysis/yt4/20250403_for_gentropy_paper/training_sets/patched_training_2503-testrun-1_all_string_005_extended_EGL_variants.json")

training_set=session.spark.read.json("gs://genetics-portal-dev-analysis/yt4/2506_release/training_set/2506_traing_set_dedupl.json")

#training_set=session.spark.read.json("gs://genetics-portal-dev-analysis/yt4/2506_release/training_set/2506_traing_set_no_literature_0_95.json")

                                                                                

In [4]:
# L2G feature matrix of the release, restricted to rows with isProteinCoding == 1 and cached.
fm = (
    session.spark.read
    .parquet(f"{path_to_release_folder}intermediate/l2g_feature_matrix/")
    .filter(f.col("isProteinCoding") == 1)
    .cache()
)
fm.count()

25/05/15 15:23:25 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

7734143

In [5]:
x=session.spark.read.parquet("gs://genetics-portal-dev-analysis/dc16/output/qsl_l2g_pleiotropy")

In [6]:
x.printSchema()

root
 |-- geneId: string (nullable = true)
 |-- studyLocusId: string (nullable = true)
 |-- variantId: string (nullable = true)
 |-- studyId: string (nullable = true)
 |-- beta: double (nullable = true)
 |-- zScore: double (nullable = true)
 |-- pValueMantissa: float (nullable = true)
 |-- pValueExponent: integer (nullable = true)
 |-- standardError: double (nullable = true)
 |-- finemappingMethod: string (nullable = true)
 |-- studyType: string (nullable = true)
 |-- credibleSetSize: integer (nullable = true)
 |-- posteriorProbability: double (nullable = true)
 |-- nSamples: integer (nullable = true)
 |-- nControls: integer (nullable = true)
 |-- nCases: integer (nullable = true)
 |-- majorPopulation: struct (nullable = true)
 |    |-- ldPopulation: string (nullable = true)
 |    |-- relativeSampleSize: double (nullable = true)
 |-- allelefrequencies: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- populationName: string (nullable = true)
 |    |

In [8]:
# One row per distinct (geneId, lofConstraint, pleiotropy) combination.
x = (
    x.select("geneId", "lofConstraint", "pleiotropy")
    .distinct()
    .cache()
)
x.count()

                                                                                

5995

In [9]:
# Add the columns of `x` to the feature matrix by geneId.
# Inner join: feature-matrix rows whose geneId is absent from `x` are dropped.
fm=fm.join(x, on="geneId", how="inner").cache()
fm.count()

                                                                                

2686238

In [10]:
from pyspark.sql import Window
import pyspark.sql.functions as f

# Rank the rows of each studyLocusId by pleiotropy, highest first. Null pleiotropy
# values sort last, and geneId breaks ties so the selected row is the same on every
# run (row_number() alone picks arbitrarily among tied rows).
window_spec = Window.partitionBy("studyLocusId").orderBy(
    f.col("pleiotropy").desc_nulls_last(), f.col("geneId").asc()
)

# Number the rows inside each studyLocusId according to that ordering.
fm_with_rank = fm.withColumn("rank", f.row_number().over(window_spec))

# Keep only the top-ranked row of each studyLocusId.
fm_max_pleiotropy = fm_with_rank.filter(f.col("rank") == 1).drop("rank")

# Number of rows kept.
fm_max_pleiotropy.count()

                                                                                

557721

In [12]:
fm_max_pleiotropy.select("studyLocusId").distinct().count()

                                                                                

557721

In [13]:
fm_max_pleiotropy.show(1)



+---------------+--------------------+---------------------+---------------------+----------------------------------+-------------------------+--------------------------------------+-------------------+--------------------------------+---------------+----------------------------+--------------------+---------------------------------+------------------+-------------------------------+--------------+---------------+--------------------+---------------------------------+------------------+-------------------------------+---------------------+--------------------+---------------------------------+------------------+-------------------------------+----------+-----------------------+------------+--------------------+-------------+-------------------+
|         geneId|        studyLocusId|credibleSetConfidence|distanceFootprintMean|distanceFootprintMeanNeighbourhood|distanceSentinelFootprint|distanceSentinelFootprintNeighbourhood|distanceSentinelTss|distanceSentinelTssNeighbourhood|distanceTs

                                                                                

In [14]:
chemblDrugEnrichment.studyLocusId_based_evidence_table_vs_training_set(
    table_with_score=fm_max_pleiotropy,
    training_set=training_set,
    score_column="credibleSetConfidence",
    min_score=0,
    name_of_the_evidence="pleiytropy",
)

                                                                                

Unnamed: 0,Evidence,TP,TN,FP,FN,Sensitivity (recall),Specificity (selectivity),PPV (precision),FDR,Balanced_accuracy
0,pleiytropy,1925,45092,1383,2672,0.418751,0.970242,0.581923,0.418077,0.694497


In [16]:
from pyspark.sql import Window
import pyspark.sql.functions as f

# Rank the rows of each studyLocusId by lofConstraint, lowest first.
# asc_nulls_last() is required: a plain asc() puts NULLs first in Spark, so a row with
# a missing lofConstraint would be selected as the "minimum". geneId breaks ties so
# the selected row is the same on every run.
window_spec = Window.partitionBy("studyLocusId").orderBy(
    f.col("lofConstraint").asc_nulls_last(), f.col("geneId").asc()
)

# Number the rows inside each studyLocusId according to that ordering.
fm_with_rank = fm.withColumn("rank", f.row_number().over(window_spec))

# Keep only the top-ranked row of each studyLocusId.
fm_min_lofConstraint = fm_with_rank.filter(f.col("rank") == 1).drop("rank")

# Number of rows kept.
fm_min_lofConstraint.count()

                                                                                

557721

In [None]:
chemblDrugEnrichment.studyLocusId_based_evidence_table_vs_training_set(
    table_with_score=fm_min_lofConstraint,
    training_set=training_set,
    score_column="credibleSetConfidence",
    min_score=0,
    name_of_the_evidence="lof",
)

                                                                                

Unnamed: 0,Evidence,TP,TN,FP,FN,Sensitivity (recall),Specificity (selectivity),PPV (precision),FDR,Balanced_accuracy
0,lof,1137,44514,1961,3460,0.247335,0.957805,0.367011,0.632989,0.60257


25/05/16 06:48:32 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1010907 ms exceeds timeout 120000 ms
25/05/16 06:48:32 WARN SparkContext: Killing executors is not supported by current scheduler.
25/05/16 06:48:37 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$