In [1]:
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=eoTHtSoEC1Va3042qs25MAYdXHIso7&access_type=offline&code_challenge=DaKhUJ87jU7GhaawfYZSQaY7hmKPU8XryUg317ur7uM&code_challenge_method=S256


Credentials saved to file: [/Users/yt4/.config/gcloud/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "open-targets-genetics-dev" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.


Updates are available for some Google Clo

In [2]:
import os

import hail as hl
import numpy as np
import pyspark.sql.functions as f
from pyspark.sql import DataFrame

from gentropy.common.session import Session
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.summary_statistics import SummaryStatistics
from gentropy.dataset.study_locus import StudyLocus
from gentropy.susie_finemapper import SusieFineMapperStep
#from gentropy.method.drug_enrichment_from_evid import chemblDrugEnrichment

"""Common utilities for the project."""

import os
from pathlib import Path
from gentropy.common.session import Session
import logging


def get_gcs_credentials() -> str:
    """Locate a Google Cloud credentials file under ``$HOME/.config/gcloud``.

    The application-default credentials file is preferred. When it does not
    exist, the service-account credentials file in the same directory is used
    as a fallback. ``HOME`` defaults to the current directory when unset.

    Returns:
        str: Path of the first credentials file that exists.

    Raises:
        FileNotFoundError: If neither credentials file exists.
    """
    home = os.getenv("HOME", ".")
    # Ordered by preference: application-default first, service account second.
    candidates = (
        os.path.join(home, ".config/gcloud/application_default_credentials.json"),
        os.path.join(home, ".config/gcloud/service_account_credentials.json"),
    )
    for candidate in candidates:
        if Path(candidate).exists():
            return candidate
    raise FileNotFoundError("No GCS credentials found.")


def get_gcs_hadoop_connector_jar() -> str:
    """Return the URL of the Google Cloud Storage Hadoop connector jar.

    Only the URL string is returned; this function downloads nothing itself.
    """
    connector_jar_url = (
        "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar"
    )
    return connector_jar_url


def gcs_conf(
    credentials_path=None, project="open-targets-genetics-dev"
) -> dict[str, str]:
    """Build the Spark settings used to reach Google Cloud Storage.

    Args:
        credentials_path: Credentials file passed to the connector as its JSON
            keyfile. Any falsy value is replaced by ``get_gcs_credentials()``.
        project: Value stored under ``spark.hadoop.fs.gs.project.id``.

    Returns:
        dict[str, str]: Spark configuration keys mapped to their values.
    """
    if not credentials_path:
        credentials_path = get_gcs_credentials()

    # Driver / serializer sizing.
    conf = {
        "spark.driver.memory": "12g",
        "spark.kryoserializer.buffer.max": "500m",
        "spark.driver.maxResultSize": "2g",
    }
    # GCS file system implementation, connector jar and authentication.
    conf.update(
        {
            "spark.hadoop.fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
            "spark.jars": get_gcs_hadoop_connector_jar(),
            "spark.hadoop.google.cloud.auth.service.account.enable": "true",
            "spark.hadoop.fs.gs.project.id": project,
            "spark.hadoop.google.cloud.auth.service.account.json.keyfile": credentials_path,
            "spark.hadoop.fs.gs.requester.pays.mode": "AUTO",
        }
    )
    return conf


class GentropySession(Session):
    """``Session`` that always receives the GCS settings from ``gcs_conf()``."""

    def __init__(self, *args, **kwargs):
        """Create the session with GCS settings merged into ``extended_spark_conf``.

        The merged configuration is a new dict, so a dict passed by the caller
        is never modified. Values supplied by the caller take precedence over
        the ``gcs_conf()`` defaults, which lets callers tune settings such as
        the driver memory. ``extended_spark_conf=None`` is treated like an
        omitted argument.

        Args:
            *args: Forwarded unchanged to ``Session.__init__``.
            **kwargs: Forwarded to ``Session.__init__`` after
                ``extended_spark_conf`` has been replaced by the merged dict.
        """
        user_conf = kwargs.get("extended_spark_conf") or {}
        # Later entries win, so explicit caller settings override the defaults.
        kwargs["extended_spark_conf"] = {**gcs_conf(), **user_conf}
        super().__init__(*args, **kwargs)

    @property
    def conf(self):
        """Return the active Spark configuration.

        Logs a warning that the configuration can only be changed by starting
        a new session, then returns the result of
        ``self.spark.sparkContext.getConf().getAll()``.
        """
        logging.warning(
            "To change the config restart the session and use the `extended_spark_conf` parameter."
        )
        return self.spark.sparkContext.getConf().getAll()

# Start the session; GentropySession supplies the GCS settings from gcs_conf().
session = GentropySession()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/23 10:45:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Root folder of the 25.06 data release.
path_to_release_folder = "gs://open-targets-data-releases/25.06/"

# Study index and credible sets are read from the release's output/ folder.
si = StudyIndex.from_parquet(
    session,
    path_to_release_folder + "output/study/",
)
sl = StudyLocus.from_parquet(
    session,
    path_to_release_folder + "output/credible_set/",
)

                                                                                

# Load the data

In [4]:
# Print the column names and types of the study index DataFrame.
si.df.printSchema()

root
 |-- studyId: string (nullable = true)
 |-- projectId: string (nullable = true)
 |-- studyType: string (nullable = true)
 |-- traitFromSource: string (nullable = true)
 |-- traitFromSourceMappedIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- diseaseIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- geneId: string (nullable = true)
 |-- biosampleFromSourceId: string (nullable = true)
 |-- biosampleId: string (nullable = true)
 |-- pubmedId: string (nullable = true)
 |-- publicationTitle: string (nullable = true)
 |-- publicationFirstAuthor: string (nullable = true)
 |-- publicationDate: string (nullable = true)
 |-- publicationJournal: string (nullable = true)
 |-- backgroundTraitFromSourceMappedIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- backgroundDiseaseIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- initialSampleSize: string (nullable = true)
 

In [5]:
# Study-level columns to attach to each credible set.
si_df = si.df.select(
    "studyId",
    "diseaseIds",
    "geneId",
    "biosampleId",
    "pubmedId",
    "cohorts",
    "ldPopulationStructure",
)

# Left join on studyId keeps every credible-set row, matched or not.
result_df = (
    sl.df.select("studyId", "variantId", "studyType", "studyLocusId")
    .join(si_df, on="studyId", how="left")
    .cache()
)

In [6]:
# Number of rows in the credible-set / study-index join.
result_df.count()

                                                                                

2833758

# Replicated GWAS

In [7]:
# Keep only the rows whose studyType is "gwas".
gwas = (
    result_df
    .filter(f.col("studyType") == "gwas")
    .cache()
)
gwas.count()

                                                                                

789453

In [8]:
# Criterion: the lead variant was associated with the same EFO at least twice

In [9]:
# One row per (credible set, disease): explode the diseaseIds array into diseaseId.
# The frame is rebuilt from result_df rather than from `gwas` itself, so
# re-running this cell does not explode an already exploded frame and
# multiply its rows.
gwas = (
    result_df
    .filter(f.col("studyType") == "gwas")
    .withColumn("diseaseId", f.explode(f.col("diseaseIds")))
    .cache()
)
gwas.count()


                                                                                

835359

In [10]:
# Columns used to tell apart the associations of a variant with a disease.
df = gwas.select(
    "variantId",
    "diseaseId",
    "cohorts",
    "pubmedId",
    "ldPopulationStructure",
)
df.count()

835359

In [11]:
# Collapse rows that are identical across all selected columns.
df = df.distinct()
df.count()

                                                                                

709842

In [12]:
# Count how many rows of `df` share each (variantId, diseaseId) pair.
grouped = (
    df.groupBy("variantId", "diseaseId")
    .agg(f.count("*").alias("count"))
)

# Pairs seen at least twice.
filtered = grouped.filter(f.col("count") >= 2)

# Distinct studyLocusId values of the GWAS rows matching those pairs.
result = (
    gwas.join(filtered, on=["variantId", "diseaseId"], how="inner")
    .select("studyLocusId")
    .distinct()
)
result.count()

                                                                                

263705

In [13]:
# Write the studyLocusId list as parquet, overwriting any existing output.
(
    result.write
    .mode("overwrite")
    .parquet("gs://genetics-portal-dev-analysis/yt4/20250403_for_gentropy_paper/list_of_gwas_replicated_CSs.parquet")
)

                                                                                

# Replicated molQTLs

In [14]:
# Keep every row whose studyType is not "gwas".
molQTLs = (
    result_df
    .filter(f.col("studyType") != "gwas")
    .cache()
)
molQTLs.count()

                                                                                

2044305

In [15]:
# Criterion: the lead variant was associated with the same geneId at least twice

In [16]:
# Count how many molQTL rows share each (variantId, geneId) pair.
grouped = (
    molQTLs.groupBy("variantId", "geneId")
    .agg(f.count("*").alias("count"))
)

# Pairs seen at least twice.
filtered = grouped.filter(f.col("count") >= 2)

# Distinct studyLocusId values of the molQTL rows matching those pairs.
result = (
    molQTLs.join(filtered, on=["variantId", "geneId"], how="inner")
    .select("studyLocusId")
    .distinct()
    .cache()
)
result.count()

                                                                                

1461445

In [17]:
# Write the studyLocusId list as parquet, overwriting any existing output.
(
    result.write
    .mode("overwrite")
    .parquet("gs://genetics-portal-dev-analysis/yt4/20250403_for_gentropy_paper/list_of_molqtls_replicated_CSs.parquet")
)

                                                                                