In [2]:
import pyspark.sql.functions as f
import statsmodels.api as sm
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import scipy.stats as stats

from gentropy.common.session import Session
from gentropy.dataset.colocalisation import Colocalisation
from gentropy.dataset.l2g_prediction import L2GPrediction
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus
from gentropy.dataset.variant_index import VariantIndex

# Make 'vscode' the default plotly renderer for every figure shown in this notebook.
pio.renderers.default = 'vscode'


In [3]:
# Extra Spark settings passed to the gentropy Session: executor and driver memory, 10g each.
spark_memory_conf = {
    'spark.executor.memory': '10g',
    'spark.driver.memory': '10g',
}
session = Session(extended_spark_conf=spark_memory_conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/07 13:58:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/07 13:58:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [21]:
# Identifier table; it is only used as the right side of the studyLocusId join below.
qsl_ids = session.spark.read.parquet(
    'gs://genetics-portal-dev-analysis/yt4/20250403_for_gentropy_paper/qualified_CSs_with_oncology'
)

# Rescaled-beta table, restricted to the studyLocusId values that also appear in qsl_ids.
rescaled_betas = session.spark.read.parquet(
    'gs://genetics-portal-dev-analysis/ss60/gentropy-manuscript/chapters/variant-effect-prediction/rescaled-betas.parquet'
)
qsl = rescaled_betas.join(qsl_ids, on='studyLocusId', how='inner')

# Row count after the join (displayed as the cell output).
qsl.count()

                                                                                

51345

In [20]:
# L2G prediction table read from the 25.03 pre-release output bucket.
l2g_predictions = session.spark.read.parquet(
    'gs://open-targets-pre-data-releases/25.03/output/l2g_prediction'
)

# Keep rows whose score is at least 0.5, expose that score as `l2gScore`,
# and drop the `features` and `shapBaseValue` columns.
l2g_above_threshold = l2g_predictions.where(f.col('score') >= 0.5)
l2g = l2g_above_threshold.withColumnRenamed('score', 'l2gScore').drop('features', 'shapBaseValue')

# Number of predictions that pass the threshold (displayed as the cell output).
l2g.count()

                                                                                

437338

In [19]:
# Attach L2G predictions to the rescaled-beta rows that share a studyLocusId.
qsl_with_l2g = qsl.join(l2g, on='studyLocusId', how='inner')

# Order by studyLocusId, then by descending l2gScore within each studyLocusId.
qsl_l2g = qsl_with_l2g.orderBy('studyLocusId', f.col('l2gScore').desc())

# Row count of the joined table (displayed as the cell output).
qsl_l2g.count()

                                                                                

39053

In [27]:
# Columns retained from the gene-level pleiotropy table.
pleiotropy_columns = [
    'geneId',
    'approvedSymbol',
    'lofConstraint',
    'misConstraint',
    'synConstraint',
    'pleiotropy',
    'tissueDistribution',
]
gene_pleiotropy = session.spark.read.parquet(
    'gs://genetics-portal-dev-analysis/dc16/output/pleiotropy_genes_therapeutic_areas'
).select(*pleiotropy_columns)

# Add the gene-level columns to every row whose geneId is present in gene_pleiotropy.
qsl_l2g_pleiotropy = qsl_l2g.join(gene_pleiotropy, on='geneId', how='inner')

# Persist the result, replacing any previous output at this location.
qsl_l2g_pleiotropy.write.mode('overwrite').parquet(
    'gs://genetics-portal-dev-analysis/dc16/output/qsl_l2g_pleiotropy'
)

                                                                                

In [30]:
# Read the persisted join back from storage and view it through the pandas-on-Spark API.
qsl_l2g_pleiotropy_path = 'gs://genetics-portal-dev-analysis/dc16/output/qsl_l2g_pleiotropy'
session.spark.read.parquet(qsl_l2g_pleiotropy_path).pandas_api()

                                                                                

Unnamed: 0,geneId,studyLocusId,variantId,studyId,beta,zScore,pValueMantissa,pValueExponent,standardError,finemappingMethod,studyType,credibleSetSize,posteriorProbability,nSamples,nControls,nCases,majorPopulation,allelefrequencies,vepEffect,majorPopulationAF,majorPopulationMAF,leadVariantStats,rescaledStatistics,l2gScore,approvedSymbol,lofConstraint,misConstraint,synConstraint,pleiotropy,tissueDistribution
0,ENSG00000135404,000b16a49747a293f585449237262a29,12_54963054_G_A,FINNGEN_R12_ASTHMA_CHILD_EXMORE,0.249143,,8.412,-12,0.093979,SuSie,gwas,6,0.39671,268241,259839,8402,"(fin, 1.0)","[(sas_adj, 6.588772731265922e-05), (remaining_...","(missense_variant, 0.6600000262260437, None)","[(fin_adj, 0.007139732533433321)]",0.00714,"(46.66732469592122, 8.411999702453612e-12, 0.0...","(0.6359536885497682, binary, 46.66732469592122...",0.515348,CD63,0.817,0.76667,0.40976,0.117647,-1.0
1,ENSG00000135426,000b16a49747a293f585449237262a29,12_54963054_G_A,FINNGEN_R12_ASTHMA_CHILD_EXMORE,0.249143,,8.412,-12,0.093979,SuSie,gwas,6,0.39671,268241,259839,8402,"(fin, 1.0)","[(sas_adj, 6.588772731265922e-05), (remaining_...","(missense_variant, 0.6600000262260437, None)","[(fin_adj, 0.007139732533433321)]",0.00714,"(46.66732469592122, 8.411999702453612e-12, 0.0...","(0.6359536885497682, binary, 46.66732469592122...",0.929386,TESPA1,0.792,0.7285,1.0096,0.352941,0.0
2,ENSG00000164512,001f8e95cf73e89fc935530ef0a7c2c0,5_56510924_A_G,GCST004894,0.075,,5.0,-19,0.008163,PICS,gwas,18,0.119144,183651,138587,45064,"(nfe, 0.9263167638618902)","[(sas_adj, 0.6288981288981289), (remaining_adj...","(intron_variant, 0.10000000149011612, None)","[(nfe_adj, 0.7428760476400529)]",0.257124,"(79.42848534846462, 5e-19, 0.0004324968845716311)","(0.07819234357077874, binary, 79.4284853484646...",0.919443,ANKRD55,0.867,0.17053,0.30476,0.647059,0.5
3,ENSG00000198797,00222644be9fda2c4cb6521c17c51b09,1_177438976_C_T,GCST90319327,0.034105,,9.0,-12,0.005102,PICS,gwas,66,0.039543,1349887,978703,371184,"(nfe, 1.0)","[(sas_adj, 0.16895261845386533), (remaining_ad...","(intron_variant, 0.10000000149011612, None)","[(nfe_adj, 0.08448777905232507)]",0.084488,"(46.534924538433145, 9e-12, 3.447320000743258e...","(0.03343290491341074, binary, 46.5349245384331...",0.782802,BRINP2,0.21,0.86379,0.18086,0.411765,0.5
4,ENSG00000121621,0023af13e3dc6ab3265533d067cd6d79,11_28374447_C_G,FINNGEN_R12_G6_SLEEPAPNO_INCLAVO,-0.000892,,4.324,-13,0.006158,SuSie,gwas,189,0.021886,500348,437626,62722,"(fin, 1.0)","[(sas_adj, 0.3362226277372263), (remaining_adj...","(upstream_gene_variant, 0.0, None)","[(fin_adj, 0.47370609020993554)]",0.473706,"(52.490203111591974, 4.323999881744385e-13, 0....","(-0.04380564197934827, binary, 52.490203111591...",0.553702,KIF18A,0.477,0.54458,-0.28077,0.294118,0.5
5,ENSG00000169519,0023af13e3dc6ab3265533d067cd6d79,11_28374447_C_G,FINNGEN_R12_G6_SLEEPAPNO_INCLAVO,-0.000892,,4.324,-13,0.006158,SuSie,gwas,189,0.021886,500348,437626,62722,"(fin, 1.0)","[(sas_adj, 0.3362226277372263), (remaining_adj...","(upstream_gene_variant, 0.0, None)","[(fin_adj, 0.47370609020993554)]",0.473706,"(52.490203111591974, 4.323999881744385e-13, 0....","(-0.04380564197934827, binary, 52.490203111591...",0.71092,METTL15,1.331,-0.24147,-0.59351,0.588235,-1.0
6,ENSG00000106785,0037acf945a868b235b75b96cbb32e62,9_98105907_T_C,GCST009597,-0.074551,,1.0,-10,,PICS,gwas,10,0.35676,41505,26703,14802,"(nfe, 1.0)","[(sas_adj, 0.5146997929606625), (remaining_adj...","(intron_variant, 0.10000000149011612, None)","[(nfe_adj, 0.2525227266041011)]",0.252523,"(41.82145636476129, 1e-10, 0.0010076245359537715)","(-0.10785625293923334, binary, 41.821456364761...",0.807387,TRIM14,0.867,0.81687,0.21411,0.117647,-1.0
7,ENSG00000033867,003dff2d431adba05b8dabedb9833792,3_27521497_C_T,GCST90044351,0.016201,11.086573,1.457654,-28,,SuSiE-inf,gwas,1,0.999998,455303,332683,122620,"(nfe, 1.0)","[(sas_adj, 0.3404564315352697), (remaining_adj...","(upstream_gene_variant, 0.0, None)","[(nfe_adj, 0.44500220815545416)]",0.445002,"(122.91210254741203, 1.4576543569564818e-28, 0...","(0.05269991454179664, binary, 122.912102547412...",0.505859,SLC4A7,0.34,2.2455,-0.66171,0.176471,-1.0
8,ENSG00000198734,003e622c1a2bd25c32d83764947f1eec,1_169549811_C_T,GCST90044618,0.075748,6.776707,1.229462,-11,,SuSiE-inf,gwas,1,1.0,456348,452741,3607,"(nfe, 1.0)","[(sas_adj, 0.013945623050907014), (remaining_a...","(missense_variant, 0.6600000262260437, None)","[(nfe_adj, 0.024718275936931804)]",0.024718,"(45.92375294589228, 1.2294616699218749e-11, 0....","(0.5159167745895885, binary, 45.92375294589228...",0.909662,F5,0.562,0.46164,-0.17761,0.647059,0.0
9,ENSG00000151612,00458a8e76504eb789324b00be83c0b0,4_145863622_A_G,FINNGEN_R12_I9_THAORTANEUR,0.013754,,5.43,-9,0.020562,SuSie,gwas,17,0.12519,468214,463106,5108,"(fin, 1.0)","[(sas_adj, 0.4573804573804574), (remaining_adj...","(intron_variant, 0.10000000149011612, None)","[(fin_adj, 0.36130558183538314)]",0.361306,"(34.02888388157946, 5.429999828338623e-09, 7.2...","(0.12080405383860733, binary, 34.0288838815794...",0.774554,ZNF827,0.234,1.9033,-0.21891,0.529412,0.0
