In [None]:
import pyspark.sql.functions as f

from gentropy.common.session import Session
from gentropy.dataset.colocalisation import Colocalisation


In [None]:
# Start the gentropy Spark session, overriding the executor and driver memory settings.
spark_memory_conf = {
    "spark.executor.memory": "10g",
    "spark.driver.memory": "10g",
}
session = Session(extended_spark_conf=spark_memory_conf)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/01 10:15:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Load the two colocalisation result sets (COLOC and eCAVIAR) from their parquet directories.
coloc_path = "/users/dc16/data/releases/24.12/colocalisation/coloc/"
ecaviar_path = "/users/dc16/data/releases/24.12/colocalisation/ecaviar/"

coloc = Colocalisation.from_parquet(session, coloc_path)
ecaviar = Colocalisation.from_parquet(session, ecaviar_path)


In [None]:
# Totals count distinct (leftStudyLocusId, rightStudyLocusId) pairs per method.
coloc_count = coloc.df.select("leftStudyLocusId", "rightStudyLocusId").distinct().count()
ecaviar_count = ecaviar.df.select("leftStudyLocusId", "rightStudyLocusId").distinct().count()

# Thresholded counts are row counts; both cut-offs are inclusive (>=), and the
# printed labels below say so explicitly.
coloc_h4_count = coloc.df.filter(f.col("h4") >= 0.8).count()
ecaviar_clpp_count = ecaviar.df.filter(f.col("clpp") >= 0.01).count()

print(f"Total number of coloc overlaps: {coloc_count:,}")
print(f"Total number of ecaviar overlaps: {ecaviar_count:,}")
print(f"Number of coloc overlaps with H4 >= 0.8: {coloc_h4_count:,}")
print(f"Number of ecaviar overlaps with clpp >= 0.01: {ecaviar_clpp_count:,}")


                                                                                

Total number of coloc overlaps: 23,709,155
Total number of ecaviar overlaps: 48,821,549
Number of coloc overlaps with H4 > 0.8: 17,553,867
Number of ecaviar overlaps with clpp > 0.01: 31,873,679


In [None]:
# Per right-hand study type: total COLOC rows and how many of them reach H4 >= 0.8.
(
    coloc.df.groupBy("rightStudyType")
    .agg(
        f.format_number(f.count("*"), 0).alias("COLOC overlaps"),
        # `when` without `otherwise` yields null for non-matching rows and `count`
        # skips nulls, so this counts only rows passing the inclusive threshold.
        # The alias uses ">=" to match the filter actually applied.
        f.format_number(f.count(f.when(f.col("h4") >= 0.8, 1)), 0).alias("H4 >= 0.8"),
    )
    .show()
)


+--------------+--------------+----------+
|rightStudyType|COLOC overlaps|  H4 > 0.8|
+--------------+--------------+----------+
|          gwas|    12,850,416|11,439,294|
|          sqtl|       723,171|   428,430|
|          pqtl|     1,536,457| 1,408,379|
|         tuqtl|     1,341,078|   774,303|
|          eqtl|     4,609,874| 2,345,418|
|       sctuqtl|       279,962|   169,159|
|        sceqtl|     2,242,196|   918,555|
|        scsqtl|       126,001|    70,329|
+--------------+--------------+----------+



                                                                                

In [None]:
# Per right-hand study type: total eCAVIAR rows and how many of them reach CLPP >= 0.01.
(
    ecaviar.df.groupBy("rightStudyType")
    .agg(
        f.format_number(f.count("*"), 0).alias("eCAVIAR overlaps"),
        # `when` without `otherwise` yields null for non-matching rows and `count`
        # skips nulls, so this counts only rows passing the inclusive threshold.
        # The alias uses ">=" to match the filter actually applied.
        f.format_number(f.count(f.when(f.col("clpp") >= 0.01, 1)), 0).alias("CLPP >= 1%"),
    )
    .show()
)




+--------------+----------------+----------+
|rightStudyType|eCAVIAR overlaps| CLPP > 1%|
+--------------+----------------+----------+
|          gwas|      28,855,277|23,196,177|
|          sqtl|       1,448,104|   601,706|
|          pqtl|       2,972,496| 2,530,275|
|         tuqtl|       2,576,057| 1,029,218|
|          eqtl|       8,379,027| 3,062,911|
|       sctuqtl|         541,580|   219,830|
|        sceqtl|       3,801,921| 1,138,481|
|        scsqtl|         247,087|    95,081|
+--------------+----------------+----------+



                                                                                

In [None]:
# Maximum, median and mean of numberColocalisingVariants across all COLOC rows.
coloc_variant_stats = [
    summarise("numberColocalisingVariants") for summarise in (f.max, f.median, f.mean)
]
coloc.df.agg(*coloc_variant_stats).show()


+-------------------------------+----------------------------------+-------------------------------+
|max(numberColocalisingVariants)|median(numberColocalisingVariants)|avg(numberColocalisingVariants)|
+-------------------------------+----------------------------------+-------------------------------+
|                           2879|                               4.0|             153.01816812956852|
+-------------------------------+----------------------------------+-------------------------------+



                                                                                

In [None]:
# Maximum, median and mean of numberColocalisingVariants across all eCAVIAR rows.
ecaviar_variant_stats = [
    summarise("numberColocalisingVariants") for summarise in (f.max, f.median, f.mean)
]
ecaviar.df.agg(*ecaviar_variant_stats).show()




+-------------------------------+----------------------------------+-------------------------------+
|max(numberColocalisingVariants)|median(numberColocalisingVariants)|avg(numberColocalisingVariants)|
+-------------------------------+----------------------------------+-------------------------------+
|                           3081|                               6.0|             122.61254260900243|
+-------------------------------+----------------------------------+-------------------------------+



                                                                                

In [None]:
# Target index, used here only to restrict the feature matrix to protein-coding genes.
target = session.spark.read.parquet("/users/dc16/data/releases/25.03/target")
protein_coding_targets = target.select("id", "biotype").where(f.col("biotype") == "protein_coding")

# The inner join drops every feature-matrix row whose geneId is not a protein-coding target id.
l2g_features = session.spark.read.parquet("/users/dc16/data/releases/25.03/l2g_feature_matrix/")
feature_matrix = l2g_features.join(
    protein_coding_targets,
    on=f.col("geneId") == f.col("id"),
    how="inner",
)


In [None]:
# Count the distinct studyLocusId values left after the protein-coding filter.
unique_credible_sets = feature_matrix.select("studyLocusId").distinct().count()
print("Number of unique credible sets in feature matrix (protein-coding genes only):", unique_credible_sets)




Number of unique credible sets in feature matrix (protein-coding genes only): 568548


                                                                                

In [None]:
print("Numbers with at least one significant colocalisation:")
# A row qualifies when any of the eQTL / sQTL / pQTL colocalisation features reaches
# its threshold (CLPP >= 0.01 or H4 >= 0.8).
# Column names use the same "eQtl"/"sQtl"/"pQtl" casing as the other cells of this
# notebook; the previous "eQTl" spelling only resolved because Spark's analyzer is
# case-insensitive by default.
(
    feature_matrix.filter(
        (f.col("eQtlColocClppMaximum") >= 0.01)
        | (f.col("eQtlColocH4Maximum") >= 0.8)
        | (f.col("sQtlColocClppMaximum") >= 0.01)
        | (f.col("sQtlColocH4Maximum") >= 0.8)
        | (f.col("pQtlColocClppMaximum") >= 0.01)
        | (f.col("pQtlColocH4Maximum") >= 0.8)
    )
    .agg(
        f.count_distinct("studyLocusId").alias("numberOfCredibleSets"),
        f.count_distinct("geneId").alias("numberOfGenes"),
    )
    .show()
)


Numbers with at least one significant colocalisation:




+--------------------+-------------+
|numberOfCredibleSets|numberOfGenes|
+--------------------+-------------+
|              341412|        14412|
+--------------------+-------------+



                                                                                

In [None]:
print("Percentage of credible sets with at least one significant colocalisation:")
# Derive both counts from `feature_matrix` instead of hard-coding numbers copied from
# earlier cell outputs, so the percentage stays correct when the input data changes.
total_credible_sets = feature_matrix.select("studyLocusId").distinct().count()
significant_credible_sets = (
    feature_matrix.filter(
        (f.col("eQtlColocClppMaximum") >= 0.01)
        | (f.col("eQtlColocH4Maximum") >= 0.8)
        | (f.col("sQtlColocClppMaximum") >= 0.01)
        | (f.col("sQtlColocH4Maximum") >= 0.8)
        | (f.col("pQtlColocClppMaximum") >= 0.01)
        | (f.col("pQtlColocH4Maximum") >= 0.8)
    )
    .select("studyLocusId")
    .distinct()
    .count()
)
# Bare last expression so the notebook displays the value; NaN when the matrix is empty.
(significant_credible_sets / total_credible_sets) * 100 if total_credible_sets else float("nan")


Percentage of credible sets with at least one significant colocalisation:


60.04981109774372

In [None]:
print("Numbers with a significant eQTL in feature matrix:")
# A row counts as significant when either eQTL feature reaches its threshold.
eqtl_clpp_hit = f.col("eQtlColocClppMaximum") >= 0.01
eqtl_h4_hit = f.col("eQtlColocH4Maximum") >= 0.8
eqtl_summary = feature_matrix.where(eqtl_clpp_hit | eqtl_h4_hit).agg(
    f.count_distinct("studyLocusId").alias("numberOfCredibleSets"),
    f.count_distinct("geneId").alias("numberOfGenes"),
)
eqtl_summary.show()


Numbers with a significant eQTL in feature matrix:




+--------------------+-------------+
|numberOfCredibleSets|numberOfGenes|
+--------------------+-------------+
|              265334|        13526|
+--------------------+-------------+



                                                                                

In [None]:
print("Numbers with a significant pQTL in feature matrix:")
# A row counts as significant when either pQTL feature reaches its threshold.
pqtl_clpp_hit = f.col("pQtlColocClppMaximum") >= 0.01
pqtl_h4_hit = f.col("pQtlColocH4Maximum") >= 0.8
pqtl_summary = feature_matrix.where(pqtl_clpp_hit | pqtl_h4_hit).agg(
    f.count_distinct("studyLocusId").alias("numberOfCredibleSets"),
    f.count_distinct("geneId").alias("numberOfGenes"),
)
pqtl_summary.show()


Numbers with a significant pQTL in feature matrix:




+--------------------+-------------+
|numberOfCredibleSets|numberOfGenes|
+--------------------+-------------+
|              153199|         2489|
+--------------------+-------------+



                                                                                

In [None]:
print("Numbers with a significant sQTL in feature matrix:")
# A row counts as significant when either sQTL feature reaches its threshold.
sqtl_clpp_hit = f.col("sQtlColocClppMaximum") >= 0.01
sqtl_h4_hit = f.col("sQtlColocH4Maximum") >= 0.8
sqtl_summary = feature_matrix.where(sqtl_clpp_hit | sqtl_h4_hit).agg(
    f.count_distinct("studyLocusId").alias("numberOfCredibleSets"),
    f.count_distinct("geneId").alias("numberOfGenes"),
)
sqtl_summary.show()


Numbers with a significant sQTL in feature matrix:
+--------------------+-------------+
|numberOfCredibleSets|numberOfGenes|
+--------------------+-------------+
|              175962|         9541|
+--------------------+-------------+



                                                                                