In [2]:
import pyspark.sql.functions as f
import statsmodels.api as sm
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import scipy.stats as stats

from gentropy.common.session import Session
from gentropy.dataset.colocalisation import Colocalisation
from gentropy.dataset.l2g_prediction import L2GPrediction
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus
from gentropy.dataset.variant_index import VariantIndex

# Select the 'vscode' plotly renderer as the default for every figure shown in this notebook.
# NOTE(review): presumably figures will not display in other front ends (JupyterLab, nbviewer)
# with this setting — confirm before sharing the notebook.
pio.renderers.default = 'vscode'


In [3]:
# Create the gentropy Session, overriding the Spark memory settings so that
# both the executors and the driver are given 10g.
session = Session(
    extended_spark_conf={
        "spark.executor.memory": "10g",
        "spark.driver.memory": "10g",
    },
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/07 13:58:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/07 13:58:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [21]:
# Table used to subset the rescaled-beta dataset; it is joined on `studyLocusId` below.
qsl_ids = session.spark.read.parquet(
    "gs://genetics-portal-dev-analysis/yt4/20250403_for_gentropy_paper/qualified_CSs_with_oncology"
)

# Inner join: only rescaled-beta rows whose `studyLocusId` also appears in `qsl_ids` are kept.
qsl = session.spark.read.parquet(
    "gs://genetics-portal-dev-analysis/ss60/gentropy-manuscript/chapters/variant-effect-prediction/rescaled-betas.parquet"
).join(qsl_ids, on="studyLocusId", how="inner")

# Row count of the joined table (triggers the Spark job).
qsl.count()

                                                                                

51345

In [20]:
# Load the 25.03 pre-release L2G predictions and keep rows with score >= 0.5.
# `score` is renamed to `l2gScore`, and the `features` / `shapBaseValue` columns are dropped.
l2g = (
    session.spark.read.parquet(
        "gs://open-targets-pre-data-releases/25.03/output/l2g_prediction"
    )
    .where(f.col("score") >= 0.5)
    .withColumnRenamed("score", "l2gScore")
    .drop("features", "shapBaseValue")
)

# Number of predictions that pass the threshold.
l2g.count()

                                                                                

437338

In [19]:
# Attach the filtered L2G predictions to the `qsl` rows (inner join on `studyLocusId`),
# then order by `studyLocusId` with the highest `l2gScore` first within each locus.
# NOTE(review): the later join on `geneId` does not appear to keep this ordering — its displayed
# result lists l2gScore 0.515 before 0.929 for the same studyLocusId. Consider sorting at the
# point where the order is actually needed.
qsl_l2g = qsl.join(l2g, on="studyLocusId", how="inner").orderBy(
    "studyLocusId",
    f.col("l2gScore").desc(),
)

# Number of (row, gene prediction) pairs after the join.
qsl_l2g.count()

                                                                                

39053

In [27]:
# Per-gene annotation table, trimmed to the columns used downstream.
gene_pleiotropy = session.spark.read.parquet(
    "gs://genetics-portal-dev-analysis/dc16/output/pleiotropy_genes_therapeutic_areas"
).select(
    "geneId",
    "approvedSymbol",
    "lofConstraint",
    "misConstraint",
    "synConstraint",
    "pleiotropy",
    "tissueDistribution",
)

# Inner join on `geneId`: rows whose gene is absent from `gene_pleiotropy` are dropped.
# NOTE(review): the displayed result has 39052 rows versus 39053 before this join,
# presumably because one gene has no annotation — confirm this loss is intended.
qsl_l2g_pleiotropy = qsl_l2g.join(gene_pleiotropy, on="geneId", how="inner")

# Persist the joined table, replacing any previous output at this location.
qsl_l2g_pleiotropy.write.mode("overwrite").parquet(
    "gs://genetics-portal-dev-analysis/dc16/output/qsl_l2g_pleiotropy"
)

                                                                                

In [31]:
# Read the persisted table back and collect it to the driver as a pandas DataFrame for display.
# The result is not assigned to a variable; it is only shown as this cell's output.
session.spark.read.parquet(
    "gs://genetics-portal-dev-analysis/dc16/output/qsl_l2g_pleiotropy"
).toPandas()

                                                                                

Unnamed: 0,geneId,studyLocusId,variantId,studyId,beta,zScore,pValueMantissa,pValueExponent,standardError,finemappingMethod,...,majorPopulationMAF,leadVariantStats,rescaledStatistics,l2gScore,approvedSymbol,lofConstraint,misConstraint,synConstraint,pleiotropy,tissueDistribution
0,ENSG00000135404,000b16a49747a293f585449237262a29,12_54963054_G_A,FINNGEN_R12_ASTHMA_CHILD_EXMORE,0.249143,,8.412000,-12,0.093979,SuSie,...,0.007140,"(46.66732469592122, 8.411999702453612e-12, 0.0...","(0.6359536885497682, binary, 46.66732469592122...",0.515348,CD63,0.817,0.76667,0.40976,0.117647,-1.0
1,ENSG00000135426,000b16a49747a293f585449237262a29,12_54963054_G_A,FINNGEN_R12_ASTHMA_CHILD_EXMORE,0.249143,,8.412000,-12,0.093979,SuSie,...,0.007140,"(46.66732469592122, 8.411999702453612e-12, 0.0...","(0.6359536885497682, binary, 46.66732469592122...",0.929386,TESPA1,0.792,0.72850,1.00960,0.352941,0.0
2,ENSG00000164512,001f8e95cf73e89fc935530ef0a7c2c0,5_56510924_A_G,GCST004894,0.075000,,5.000000,-19,0.008163,PICS,...,0.257124,"(79.42848534846462, 5e-19, 0.0004324968845716311)","(0.07819234357077874, binary, 79.4284853484646...",0.919443,ANKRD55,0.867,0.17053,0.30476,0.647059,0.5
3,ENSG00000198797,00222644be9fda2c4cb6521c17c51b09,1_177438976_C_T,GCST90319327,0.034105,,9.000000,-12,0.005102,PICS,...,0.084488,"(46.534924538433145, 9e-12, 3.447320000743258e...","(0.03343290491341074, binary, 46.5349245384331...",0.782802,BRINP2,0.210,0.86379,0.18086,0.411765,0.5
4,ENSG00000121621,0023af13e3dc6ab3265533d067cd6d79,11_28374447_C_G,FINNGEN_R12_G6_SLEEPAPNO_INCLAVO,-0.000892,,4.324000,-13,0.006158,SuSie,...,0.473706,"(52.490203111591974, 4.323999881744385e-13, 0....","(-0.04380564197934827, binary, 52.490203111591...",0.553702,KIF18A,0.477,0.54458,-0.28077,0.294118,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39048,ENSG00000179295,ffa707df6444d8bedb0b577da0527ded,12_112445672_G_A,GCST90043954,0.009223,6.894906,5.390056,-12,,SuSiE-inf,...,0.427081,"(47.53972405939967, 5.3900561332702636e-12, 0....","(0.10774181156325302, binary, 47.5397240593996...",0.793718,PTPN11,0.135,3.12930,0.82437,0.588235,-1.0
39049,ENSG00000181915,ffadf247a1a19747bc046cf277539744,10_62685804_A_G,FINNGEN_R12_K11_UC_STRICT2,0.111874,,1.374000,-12,0.016860,SuSie,...,0.483636,"(50.22056947045655, 1.3739999532699584e-12, 0....","(0.11887276003940084, binary, 50.2205694704565...",0.590189,ADO,0.810,1.61310,-0.41566,0.823529,-1.0
39050,ENSG00000142168,ffaefa75c4a52d7394b99c752538aa01,21_31667290_A_C,GCST90027164,0.023010,8.689766,3.631858,-18,,SuSiE-inf,...,0.000614,"(75.51203861514783, 3.631857633590698e-18, 0.0...","(1.678861372529316, binary, 75.51203861514783,...",0.973302,SOD1,0.978,0.83817,-1.27330,0.058824,-1.0
39051,ENSG00000185630,ffced5aa63596234f0f42d3887e24398,1_164542784_A_G,GCST90043616,0.017704,7.008374,2.411030,-12,,SuSiE-inf,...,0.305529,"(49.11731122972767, 2.411029815673828e-12, 0.0...","(0.05070311654206578, binary, 49.1173112297276...",0.737302,PBX1,0.255,3.82660,0.92392,0.411765,-1.0
