In [1]:
import pyspark.sql.functions as f
import statsmodels.api as sm
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio


from gentropy.common.session import Session

# Make "vscode" the default Plotly renderer, so every later `fig.show()` in
# this notebook uses it unless a renderer is passed explicitly.
pio.renderers.default = "vscode"


In [2]:
# Spark settings handed to the gentropy Session: 10g of memory for both the
# executors and the driver.
spark_memory_conf = {
    "spark.executor.memory": "10g",
    "spark.driver.memory": "10g",
}
session = Session(extended_spark_conf=spark_memory_conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/28 11:00:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/28 11:00:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
# --- Tables read from the local 25.03 release directory ---------------------
target = session.spark.read.parquet("/users/dc16/data/releases/25.03/target")
l2g = (
    session.spark.read.parquet("/users/dc16/data/releases/25.03/l2g_prediction")
    # Only the locus/gene pair and its score are kept.
    .select("studyLocusId", "geneId", "score")
)
credible_sets = session.spark.read.parquet(
    "/users/dc16/data/releases/25.03/credible_set"
)

# --- Study table read from the gentropy_paper output bucket -----------------
# Rows are kept only when `measurement` is false AND `binaryLessCases` is true
# (`~` binds tighter than `&`, so the negation applies to `measurement` only).
studies = session.spark.read.parquet(
    "gs://genetics-portal-dev-analysis/dc16/output/gentropy_paper/gwas_therapeutic_areas"
).where(~f.col("measurement") & f.col("binaryLessCases"))

variants = session.spark.read.parquet("/users/dc16/data/releases/25.03/variant/")

# --- Pre-filtered ("qualifying") studies and credible sets ------------------
qualified_studies = session.spark.read.parquet(
    "gs://genetics-portal-dev-analysis/dc16/output/gentropy_paper/qualifying_studies"
)
qualified_cs = session.spark.read.parquet(
    "gs://genetics-portal-dev-analysis/dc16/output/gentropy_paper/qualifying_credible_sets"
)

# --- Rescaled betas from the variant-effect-prediction chapter --------------
rescaled_betas = session.spark.read.parquet(
    "gs://genetics-portal-dev-analysis/ss60/gentropy-manuscript/chapters/variant-effect-prediction/rescaled-betas.parquet"
)


In [8]:
# Per-therapeutic-area columns carried over from the study table
# (23 columns, one per therapeutic area).
therapeutic_area_columns = [
    "cancerOrBenignTumor",
    "infectiousDisease",
    "pregnancyOrPerinatalDisease",
    "disorderOfVisualSystem",
    "cardiovascularDisease",
    "pancreasDisease",
    "gastrointestinalDisease",
    "reproductiveSystemOrBreastDisease",
    "integumentarySystemDisease",
    "endocrineSystemDisease",
    "respiratoryOrThoracicDisease",
    "urinarySystemDisease",
    "musculoskeletalOrConnectiveTissueDisease",
    "disorderOfEar",
    "immuneSystemDisease",
    "hematologicDisease",
    "nervousSystemDisease",
    "psychiatricDisorder",
    "nutritionalOrMetabolicDisease",
    "geneticFamilialOrCongenitalDisease",
    "injuryPoisoningOrOtherComplication",
    "signOrSymptom",
    "other",
]

# Study-level annotation attached to every qualifying credible set.
study_annotation_columns = [
    "studyId",
    "diseaseIds",
    "mappedTherapeuticAreas",
    *therapeutic_area_columns,
    "totalTherapeuticAreas",
]

# Inner join: credible sets whose study is absent from the filtered `studies`
# table are dropped.
cred_sets = qualified_cs.join(
    studies.select(*study_annotation_columns),
    on="studyId",
    how="inner",
)

In [9]:
# Count the distinct variantId values among the annotated credible sets.
n_unique_variants = cred_sets.select("variantId").distinct().count()
print("Number of unique variantIds:", n_unique_variants)



Number of unique variantIds: 32469


                                                                                

In [13]:
# Count the distinct studyId values among the annotated credible sets.
n_unique_studies = cred_sets.select("studyId").distinct().count()
print("Number of unique study IDs:", n_unique_studies)



Number of unique study IDs: 2702


                                                                                

In [16]:
# Per-variant pleiotropy table.
#
# For every variantId in `cred_sets`:
#   * uniqueDiseases          - number of distinct entries across all
#                               `diseaseIds` arrays of its credible sets
#   * uniqueTherapeuticAreas  - number of distinct entries across all
#                               `mappedTherapeuticAreas` arrays
# and, left-joined from `variants`:
#   * gerpNormalised          - `normalisedScore` of the first `variantEffect`
#                               entry whose method is "GERP"
#   * vepScore                - `score` of the first `variantEffect` entry
#                               whose method is "VEP"
#
# The earlier version also summed each of the 23 therapeutic-area columns and
# derived a `totalStudies` column, but all of those were discarded by the
# `.select(...)` that followed. That dead work is removed here; the resulting
# columns and rows are unchanged.
variant_pleiotropy = (
    cred_sets.groupBy("variantId")
    .agg(
        # collect_list gathers one array per credible set; flatten merges
        # them and array_distinct removes repeats before counting.
        f.size(
            f.array_distinct(f.flatten(f.collect_list("diseaseIds")))
        ).alias("uniqueDiseases"),
        f.size(
            f.array_distinct(f.flatten(f.collect_list("mappedTherapeuticAreas")))
        ).alias("uniqueTherapeuticAreas"),
    )
    .select("variantId", "uniqueDiseases", "uniqueTherapeuticAreas")
    .join(
        variants.select("variantId", "variantEffect")
        .withColumns(
            {
                # Keep only the entries for the requested method, then take
                # the first match.
                "gerpNormalised": f.filter(
                    f.col("variantEffect"), lambda x: x["method"] == "GERP"
                )[0]["normalisedScore"],
                "vepScore": f.filter(
                    f.col("variantEffect"), lambda x: x["method"] == "VEP"
                )[0]["score"],
            }
        )
        .drop("variantEffect"),
        "variantId",
        # Left join: variants missing from the `variants` table are kept,
        # with null scores.
        "left",
    )
    # Most pleiotropic variants first.
    .sort(f.desc("uniqueTherapeuticAreas"), f.desc("uniqueDiseases"))
)

In [31]:
# Bring the per-variant table over from Spark as a pandas DataFrame and show
# its first ten rows.
variant_pd = variant_pleiotropy.toPandas()
variant_pd.head(10)

Unnamed: 0,variantId,uniqueDiseases,uniqueTherapeuticAreas,gerpNormalised,vepScore
0,19_44908684_T_C,55,13,0.695,0.66
1,12_111446804_T_C,27,13,0.50875,0.66
2,1_113834946_A_G,38,12,0.159,0.66
3,7_5397122_C_T,40,10,0.54,0.1
4,1_113761186_C_A,27,9,-1.0,0.0
5,5_56148856_G_A,21,9,-0.43,0.1
6,14_94378610_C_T,20,9,0.4375,0.66
7,2_162267541_C_T,16,9,0.50875,0.66
8,5_40487168_C_T,16,9,0.77125,0.1
9,12_56002984_G_C,15,9,0.125,0.1


In [32]:
# Pearson correlation (the pandas default, spelled out here) between the
# number of distinct diseases and the number of distinct therapeutic areas
# per variant.
variant_pd["uniqueDiseases"].corr(
    variant_pd["uniqueTherapeuticAreas"], method="pearson"
)

0.777561986981779

In [33]:
# Histogram of the number of distinct therapeutic areas per variant, with bar
# heights normalised to probabilities.
fig = px.histogram(
    data_frame=variant_pd,
    x="uniqueTherapeuticAreas",
    histnorm="probability",
    title="Distribution of Variant pleiotropy",
    color_discrete_sequence=["#636EFA"],
)
fig.update_layout(
    xaxis_title="uniqueTherapeuticAreas",
    yaxis_title="Probability",
    bargap=0.1,
)
fig.show()

In [34]:
# Poisson regression: uniqueDiseases ~ const + gerpNormalised.
# Rows with a missing value in either column are dropped first.
df = variant_pd[["gerpNormalised", "uniqueDiseases"]].dropna()
x = sm.add_constant(df[["gerpNormalised"]])  # adds the intercept column
y = df["uniqueDiseases"]

# Fit once and reuse the result. The earlier version called `.fit()` twice,
# which repeated the optimisation and printed its convergence message twice.
model = sm.Poisson(y, x).fit()
print(model.summary())
print("P-values:\n", model.pvalues)


Optimization terminated successfully.
         Current function value: 1.373100
         Iterations 4
                          Poisson Regression Results                          
Dep. Variable:         uniqueDiseases   No. Observations:                31318
Model:                        Poisson   Df Residuals:                    31316
Method:                           MLE   Df Model:                            1
Date:                Wed, 28 May 2025   Pseudo R-squ.:               0.0005613
Time:                        11:24:26   Log-Likelihood:                -43003.
converged:                       True   LL-Null:                       -43027.
Covariance Type:            nonrobust   LLR p-value:                 3.656e-12
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const              0.3687      0.005     69.762      0.000       0.358       0.379
gerpNormalised   

In [35]:
# Poisson regression: uniqueDiseases ~ const + vepScore.
# Rows with a missing value in either column are dropped first.
df = variant_pd[["vepScore", "uniqueDiseases"]].dropna()
x = sm.add_constant(df[["vepScore"]])  # adds the intercept column
y = df["uniqueDiseases"]

# Fit once and reuse the result. The earlier version called `.fit()` twice,
# which repeated the optimisation and printed its convergence message twice.
model = sm.Poisson(y, x).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

Optimization terminated successfully.
         Current function value: 1.371840
         Iterations 5
                          Poisson Regression Results                          
Dep. Variable:         uniqueDiseases   No. Observations:                32469
Model:                        Poisson   Df Residuals:                    32467
Method:                           MLE   Df Model:                            1
Date:                Wed, 28 May 2025   Pseudo R-squ.:                0.004269
Time:                        11:24:38   Log-Likelihood:                -44542.
converged:                       True   LL-Null:                       -44733.
Covariance Type:            nonrobust   LLR p-value:                 4.746e-85
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2842      0.006     48.767      0.000       0.273       0.296
vepScore       0.7375      0.

In [39]:
# Binomial GLM (default logit link): therapeutic areas hit out of 23 possible,
# regressed on the normalised GERP score.
n_therapeutic_areas = 23  # number of "trials" per variant in the binomial model

df = variant_pd[["gerpNormalised", "uniqueTherapeuticAreas"]].dropna()
x = sm.add_constant(df[["gerpNormalised"]])  # adds the intercept column

# The response is a two-column (successes, failures) array.
successes = df["uniqueTherapeuticAreas"]
failures = n_therapeutic_areas - successes
y = np.column_stack([successes, failures])

binomial_family = sm.families.Binomial()
model = sm.GLM(y, x, family=binomial_family).fit()
print(model.summary())
print("P-values:\n", model.pvalues)


                 Generalized Linear Model Regression Results                  
Dep. Variable:           ['y1', 'y2']   No. Observations:                31318
Model:                            GLM   Df Residuals:                    31316
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -35213.
Date:                Wed, 28 May 2025   Deviance:                       6213.7
Time:                        11:25:57   Pearson chi2:                 8.99e+03
No. Iterations:                     6   Pseudo R-squ. (CS):          0.0002201
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -2.9003      0.006   -485.

In [40]:
# Binomial GLM (default logit link): therapeutic areas hit out of 23 possible,
# regressed on the VEP score.
n_therapeutic_areas = 23  # number of "trials" per variant in the binomial model

df = variant_pd[["vepScore", "uniqueTherapeuticAreas"]].dropna()
x = sm.add_constant(df[["vepScore"]])  # adds the intercept column

# The response is a two-column (successes, failures) array.
successes = df["uniqueTherapeuticAreas"]
failures = n_therapeutic_areas - successes
y = np.column_stack([successes, failures])

binomial_family = sm.families.Binomial()
model = sm.GLM(y, x, family=binomial_family).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

                 Generalized Linear Model Regression Results                  
Dep. Variable:           ['y1', 'y2']   No. Observations:                32469
Model:                            GLM   Df Residuals:                    32467
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -36472.
Date:                Wed, 28 May 2025   Deviance:                       6373.9
Time:                        11:26:22   Pearson chi2:                 9.09e+03
No. Iterations:                     6   Pseudo R-squ. (CS):           0.002482
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.9441      0.007   -444.793      0.0