In [1]:
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=kaLpz8Tzdlt0L9xTL1JVMn3Ouv7S4S&access_type=offline&code_challenge=ERR69nuwbMisSFAjyHAyeS6x1XVQKJfQglolz1RTN2Q&code_challenge_method=S256


Credentials saved to file: [/Users/yt4/.config/gcloud/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "open-targets-genetics-dev" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.


Updates are available for some Google Clo

In [None]:
!gcloud auth login

In [2]:
import os

import hail as hl
import numpy as np
import pyspark.sql.functions as f
from pyspark.sql import DataFrame

from gentropy.common.session import Session
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.summary_statistics import SummaryStatistics
from gentropy.dataset.study_locus import StudyLocus
from gentropy.susie_finemapper import SusieFineMapperStep
from gentropy.method.drug_enrichment_from_evid import chemblDrugEnrichment

"""Common utilities for the project."""

import os
from pathlib import Path
from gentropy.common.session import Session
import logging


def get_gcs_credentials() -> str:
    """Get the credentials for google cloud storage."""
    app_default_credentials = os.path.join(
        os.getenv("HOME", "."), ".config/gcloud/application_default_credentials.json"
    )

    service_account_credentials = os.path.join(
        os.getenv("HOME", "."), ".config/gcloud/service_account_credentials.json"
    )

    if Path(app_default_credentials).exists():
        return app_default_credentials
    else:
        raise FileNotFoundError("No GCS credentials found.")


def get_gcs_hadoop_connector_jar() -> str:
    """Get the google cloud storage hadoop connector for spark.

    This function will return the url to download the hadoop jar.
    """

    return (
        "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar"
    )


def gcs_conf(
    credentials_path=None, project="open-targets-genetics-dev"
) -> dict[str, str]:
    """Get the spark configuration with hadoop connector for google cloud storage."""
    credentials_path = credentials_path or get_gcs_credentials()
    return {
        "spark.driver.memory": "12g",
        "spark.kryoserializer.buffer.max": "500m",
        "spark.driver.maxResultSize":"2g",
        "spark.hadoop.fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
        "spark.jars": get_gcs_hadoop_connector_jar(),
        "spark.hadoop.google.cloud.auth.service.account.enable": "true",
        "spark.hadoop.fs.gs.project.id": project,
        "spark.hadoop.google.cloud.auth.service.account.json.keyfile": credentials_path,
        "spark.hadoop.fs.gs.requester.pays.mode": "AUTO",
    }


class GentropySession(Session):
    def __init__(self, *args, **kwargs):
        if "extended_spark_conf" in kwargs:
            kwargs["extended_spark_conf"].update(gcs_conf())
        else:
            kwargs["extended_spark_conf"] = gcs_conf()
        super().__init__(*args, **kwargs)

    @property
    def conf(self):
        logging.warning(
            "To change the config restart the session and use the `extended_spark_conf` parameter."
        )
        return self.spark.sparkContext.getConf().getAll()

session= GentropySession()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/01 17:41:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
path_to_release_folder="gs://open-targets-data-releases/25.06/"
#path_to_release_folder="gs://open-targets-pre-data-releases/24.12-uo_test-3/output/genetics/parquet/"
#path_to_release_folder="gs://ot_orchestration/releases/25.02_freeze1/"

si=StudyIndex.from_parquet(session,path_to_release_folder+"output/study/")
sl=StudyLocus.from_parquet(session,path_to_release_folder+"output/credible_set/")

                                                                                

# Defining novel and known L2G predictions

In [4]:
old_l2g=session.spark.read.parquet("gs://open-targets-data-releases/24.09/output/etl/parquet/evidence/sourceId=ot_genetics_portal")

In [5]:
#l2g=session.spark.read.parquet(path_to_release_folder+"output/l2g_prediction")
l2g=session.spark.read.parquet("gs://ot-team/irene/l2g/xgboost/2706/l2g_predictions").select("studyLocusId","geneId","score")

                                                                                

In [6]:
old_l2g.count()

                                                                                

781213

In [7]:
l2g.printSchema()

root
 |-- studyLocusId: string (nullable = true)
 |-- geneId: string (nullable = true)
 |-- score: float (nullable = true)



In [8]:
l2g.count()

                                                                                

10623371

In [16]:
new_l2g_evidence=chemblDrugEnrichment.to_disease_target_evidence(table_with_score=l2g,
    score_column="score",
    datasource_id="new_l2g",
    study_locus=sl,
    study_index=si,
    min_score=0.05
).cache()
new_l2g_evidence.count()

                                                                                

1667749

In [10]:
old_l2g.printSchema()

root
 |-- datasourceId: string (nullable = true)
 |-- targetId: string (nullable = true)
 |-- alleleOrigins: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- allelicRequirements: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ancestry: string (nullable = true)
 |-- ancestryId: string (nullable = true)
 |-- beta: double (nullable = true)
 |-- betaConfidenceIntervalLower: double (nullable = true)
 |-- betaConfidenceIntervalUpper: double (nullable = true)
 |-- biologicalModelAllelicComposition: string (nullable = true)
 |-- biologicalModelGeneticBackground: string (nullable = true)
 |-- biologicalModelId: string (nullable = true)
 |-- biomarkerName: string (nullable = true)
 |-- biomarkers: struct (nullable = true)
 |    |-- geneExpression: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |-- genetic

In [11]:
disease_index_path=path_to_release_folder+"output/disease/disease.parquet"
disease_index_orig = session.spark.read.parquet(disease_index_path)

In [12]:
indirect_assoc_old_l2g=chemblDrugEnrichment.evidence_to_indirect_assosiations(
        disease_target_evidence=old_l2g.select("targetId","diseaseId","score").withColumnRenamed("score", "resourceScore"),
        disease_index_orig=disease_index_orig,
        use_max=True,
        efo_to_remove=None,
).filter(f.col("indirect_assoc_score")>=0.5).cache()

In [13]:
indirect_assoc_old_l2g.count()

                                                                                

240071

In [14]:
indirect_assoc_old_l2g.show()

+---------------+-------------+--------------------+
|       targetId|    diseaseId|indirect_assoc_score|
+---------------+-------------+--------------------+
|ENSG00000049759|  EFO_0004747|        0.8746914864|
|ENSG00000109320|  EFO_0000684|        0.8085828424|
|ENSG00000148053|MONDO_0002025|        0.8656496406|
|ENSG00000171130|  EFO_0001444|        0.7053959966|
|ENSG00000196586|MONDO_0002149|        0.7589675784|
|ENSG00000213901|  EFO_0001444|        0.5279757977|
|ENSG00000088305|  EFO_0000508|        0.6777986288|
|ENSG00000115507|  EFO_0006848|         0.787563026|
|ENSG00000147883|  EFO_0001642|        0.8152068257|
|ENSG00000147883|  EFO_0000574|        0.8152068257|
|ENSG00000152034|  EFO_0005115|        0.5517882705|
|ENSG00000163714|  EFO_0004503|        0.6146195531|
|ENSG00000005020|  EFO_0007991|        0.6031781435|
|ENSG00000005073|  EFO_0008002|        0.6886193752|
|ENSG00000006607|  EFO_0004833|        0.5389696956|
|ENSG00000006611|  EFO_0004517|        0.86379

In [17]:
new_l2g_evidence.show()

+----------+------------+---------------+-----------+-------------+--------------------+
|datatypeId|datasourceId|       targetId|  diseaseId|resourceScore|        studyLocusId|
+----------+------------+---------------+-----------+-------------+--------------------+
|      GWAS|     new_l2g|ENSG00000167645|EFO_0010968|   0.18192655|0005218bc3a62e387...|
|      GWAS|     new_l2g|ENSG00000099337|EFO_0010968|   0.87497157|0005218bc3a62e387...|
|      GWAS|     new_l2g|ENSG00000215547|EFO_0004528|   0.47645262|002462a2da2f7c279...|
|      GWAS|     new_l2g|ENSG00000215545|EFO_0004528|  0.053832773|002462a2da2f7c279...|
|      GWAS|     new_l2g|ENSG00000184937|EFO_0004309|    0.7815953|00274cac95947bd00...|
|      GWAS|     new_l2g|ENSG00000087237|EFO_0004612|    0.8629739|005bc8624f8dd7f7c...|
|      GWAS|     new_l2g|ENSG00000140853|EFO_0004612|   0.09162829|005bc8624f8dd7f7c...|
|      GWAS|     new_l2g|ENSG00000197406|EFO_0004617|   0.62509125|0064268fb58ddabbb...|
|      GWAS|     new_

In [18]:
# Perform a left join
result_df = new_l2g_evidence.join(
    indirect_assoc_old_l2g,  # The second DataFrame
    on=["targetId", "diseaseId"],  # Join condition
    how="left"  # Left join ensures all rows from new_l2g_evidence are retained
)

# Show the result
result_df.count()

1667749

In [19]:
result_df=result_df.filter(f.col("indirect_assoc_score")>=0.5).cache()

In [20]:
result_df.count()

411574

In [21]:
known_studyLocusIds=result_df.select("studyLocusId").distinct().collect()

In [22]:
studyLocusId_list = [row.studyLocusId for row in known_studyLocusIds]

In [23]:
len(studyLocusId_list)

333130

In [24]:
studyLocusId_df = session.spark.createDataFrame([(id,) for id in studyLocusId_list], ["studyLocusId"])


In [25]:
studyLocusId_df.count()

25/07/01 17:53:31 WARN TaskSetManager: Stage 79 contains a task of very large size (1490 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

333130

In [26]:
studyLocusId_df.write.parquet("gs://genetics-portal-dev-analysis/yt4/20250403_for_gentropy_paper/known_studyLocusIds",mode="overwrite")

25/07/01 17:53:38 WARN TaskSetManager: Stage 82 contains a task of very large size (1490 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [27]:
session.spark.read.parquet("gs://genetics-portal-dev-analysis/yt4/20250403_for_gentropy_paper/known_studyLocusIds").count()

                                                                                

333130