In [1]:
import pyspark.sql.functions as f
import pyspark.sql.types as t
import statsmodels.api as sm
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio

from gentropy.common.session import Session

# Select the "vscode" Plotly renderer as the default for every fig.show() call
# in this notebook.
pio.renderers.default = "vscode"


In [2]:
# Extra Spark settings handed to the gentropy Session: 10g of memory for both
# the executors and the driver.
spark_memory_conf = {
    "spark.executor.memory": "10g",
    "spark.driver.memory": "10g",
}
session = Session(extended_spark_conf=spark_memory_conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/28 10:22:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Base locations of the inputs, defined once so that switching release or
# bucket is a one-line change.
# NOTE(review): RELEASE_DIR looks like a local copy of the 25.03 release —
# confirm it is reachable from wherever this notebook is executed.
RELEASE_DIR = "/users/dc16/data/releases/25.03"
ANALYSIS_DIR = "gs://genetics-portal-dev-analysis/dc16/output/gentropy_paper"

# Target annotation (one of its columns, `constraint`, is unpacked later on).
target = session.spark.read.parquet(f"{RELEASE_DIR}/target")

# Locus-to-gene predictions, trimmed to the three columns used downstream.
l2g = session.spark.read.parquet(f"{RELEASE_DIR}/l2g_prediction").select(
    "studyLocusId", "geneId", "score"
)

# The full credible-set table of the release is read here; a previously used
# alternative input is f"{ANALYSIS_DIR}/qualifying_credible_sets".
cred_sets = session.spark.read.parquet(f"{RELEASE_DIR}/credible_set")

# Studies annotated with therapeutic areas: keep rows whose `measurement`
# flag is false and whose `binaryLessCases` flag is true, and drop `geneId`
# so it cannot clash with the L2G `geneId` in later joins.
studies = (
    session.spark.read.parquet(f"{ANALYSIS_DIR}/gwas_therapeutic_areas")
    .filter(~f.col("measurement"))
    .filter(f.col("binaryLessCases"))
    .drop("geneId")
)

# Tissue specificity / distribution columns of the target prioritisation table.
tissue_specificity = session.spark.read.parquet(
    f"{RELEASE_DIR}/target_prioritisation"
).select("targetId", "tissueSpecificity", "tissueDistribution")

25/05/28 10:22:19 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-google-hadoop-file-system.properties,hadoop-metrics2.properties


In [4]:
# Gene assignments with an L2G score of at least 0.5.
confident_l2g = l2g.where(f.col("score") >= 0.5)

# Lookup from credible set (studyLocusId) to the study it belongs to.
locus_to_study = cred_sets.select("studyLocusId", "studyId")

# Attach the study, then the study-level annotation; both joins are inner, so
# assignments without a matching credible set or a retained study are dropped.
l2g_signif = confident_l2g.join(
    locus_to_study, on="studyLocusId", how="inner"
).join(studies, on="studyId", how="inner")


In [5]:
# Number of rows in l2g_signif (L2G assignments that survived both inner joins).
l2g_signif.count()

                                                                                

44725

In [6]:
# Per-therapeutic-area columns of `l2g_signif` that are totalled below, in the
# order they are displayed.
# NOTE(review): presumably 0/1 indicators per study — confirm against the
# gwas_therapeutic_areas dataset.
THERAPEUTIC_AREA_COLUMNS = [
    "cancerOrBenignTumor",
    "infectiousDisease",
    "pregnancyOrPerinatalDisease",
    "disorderOfVisualSystem",
    "cardiovascularDisease",
    "pancreasDisease",
    "gastrointestinalDisease",
    "reproductiveSystemOrBreastDisease",
    "integumentarySystemDisease",
    "endocrineSystemDisease",
    "respiratoryOrThoracicDisease",
    "urinarySystemDisease",
    "musculoskeletalOrConnectiveTissueDisease",
    "disorderOfEar",
    "immuneSystemDisease",
    "hematologicDisease",
    "nervousSystemDisease",
    "psychiatricDisorder",
    "nutritionalOrMetabolicDisease",
    "geneticFamilialOrCongenitalDisease",
    "injuryPoisoningOrOtherComplication",
    "signOrSymptom",
    "other",
]

# Whole-dataset summary: number of distinct disease IDs and therapeutic areas
# (arrays are pooled over all rows, flattened and de-duplicated), followed by
# the column total of every therapeutic-area column. Printed as a single
# vertical record.
(
    l2g_signif.agg(
        f.size(f.array_distinct(f.flatten(f.collect_list("diseaseIds")))).alias(
            "uniqueDiseases"
        ),
        f.size(
            f.array_distinct(f.flatten(f.collect_list("mappedTherapeuticAreas")))
        ).alias("uniqueTherapeuticAreas"),
        *[f.sum(area).alias(area) for area in THERAPEUTIC_AREA_COLUMNS],
    ).show(vertical=True)
)

25/05/28 10:22:26 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

-RECORD 0----------------------------------------
 uniqueDiseases                           | 1361 
 uniqueTherapeuticAreas                   | 23   
 cancerOrBenignTumor                      | 7176 
 infectiousDisease                        | 796  
 pregnancyOrPerinatalDisease              | 153  
 disorderOfVisualSystem                   | 1815 
 cardiovascularDisease                    | 5954 
 pancreasDisease                          | 4095 
 gastrointestinalDisease                  | 928  
 reproductiveSystemOrBreastDisease        | 511  
 integumentarySystemDisease               | 539  
 endocrineSystemDisease                   | 1007 
 respiratoryOrThoracicDisease             | 2937 
 urinarySystemDisease                     | 910  
 musculoskeletalOrConnectiveTissueDisease | 4069 
 disorderOfEar                            | 40   
 immuneSystemDisease                      | 885  
 hematologicDisease                       | 76   
 nervousSystemDisease                     | 4083 


In [7]:
# Per-therapeutic-area columns of `l2g_signif` that are totalled per gene.
# NOTE(review): presumably 0/1 indicators per study — confirm against the
# gwas_therapeutic_areas dataset.
THERAPEUTIC_AREA_COLUMNS = [
    "cancerOrBenignTumor",
    "infectiousDisease",
    "pregnancyOrPerinatalDisease",
    "disorderOfVisualSystem",
    "cardiovascularDisease",
    "pancreasDisease",
    "gastrointestinalDisease",
    "reproductiveSystemOrBreastDisease",
    "integumentarySystemDisease",
    "endocrineSystemDisease",
    "respiratoryOrThoracicDisease",
    "urinarySystemDisease",
    "musculoskeletalOrConnectiveTissueDisease",
    "disorderOfEar",
    "immuneSystemDisease",
    "hematologicDisease",
    "nervousSystemDisease",
    "psychiatricDisorder",
    "nutritionalOrMetabolicDisease",
    "geneticFamilialOrCongenitalDisease",
    "injuryPoisoningOrOtherComplication",
    "signOrSymptom",
    "other",
]


def _constraint_value(constraint_type: str, field: str):
    """Pick one field from the first matching entry of the `constraint` array.

    Args:
        constraint_type: Value compared against each entry's `constraintType`
            (e.g. "lof", "mis", "syn").
        field: Name of the struct field to read from the first matching entry
            (e.g. "oeUpper" or "score").

    Returns:
        A Spark Column expression to be evaluated on a DataFrame that has a
        `constraint` column.
    """
    matching = f.filter("constraint", lambda x: x.constraintType == constraint_type)
    return matching[0][field]


# totalStudies is the left-to-right sum of the per-area totals.
# NOTE(review): a row flagged in several therapeutic areas contributes once per
# area, so this is not a count of distinct studies — confirm that is intended.
total_studies = f.col(THERAPEUTIC_AREA_COLUMNS[0])
for area in THERAPEUTIC_AREA_COLUMNS[1:]:
    total_studies = total_studies + f.col(area)

# Gene-level constraint metrics: the LoF value is read from `oeUpper`, the
# missense and synonymous values from `score`.
gene_constraint = target.withColumns(
    {
        "lofConstraint": _constraint_value("lof", "oeUpper"),
        "misConstraint": _constraint_value("mis", "score"),
        "synConstraint": _constraint_value("syn", "score"),
    }
).select(
    "id",
    "approvedSymbol",
    "lofConstraint",
    "misConstraint",
    "synConstraint",
)

# One row per gene: distinct diseases / therapeutic areas it is linked to, the
# per-area totals, constraint metrics and tissue specificity. Both joins are
# inner, so genes missing from `target` or `tissue_specificity` are dropped.
# The result is ordered by descending number of therapeutic areas.
genes_therapeutic_areas = (
    l2g_signif.groupBy("geneId")
    .agg(
        f.size(f.array_distinct(f.flatten(f.collect_list("diseaseIds")))).alias(
            "uniqueDiseases"
        ),
        f.size(
            f.array_distinct(f.flatten(f.collect_list("mappedTherapeuticAreas")))
        ).alias("uniqueTherapeuticAreas"),
        *[f.sum(area).alias(area) for area in THERAPEUTIC_AREA_COLUMNS],
    )
    .withColumn("totalStudies", total_studies)
    .join(gene_constraint, target["id"] == f.col("geneId"), "inner")
    .drop("id")
    .join(tissue_specificity, f.col("targetId") == f.col("geneId"), "inner")
    .drop("targetId")
    .sort(f.desc("uniqueTherapeuticAreas"))
)

In [8]:
# Number of distinct genes present in genes_therapeutic_areas.
genes_therapeutic_areas.select('geneId').distinct().count()

                                                                                

6999

In [9]:
# Print the first rows of the per-gene table (sorted by descending number of
# therapeutic areas when it was built).
genes_therapeutic_areas.show()



+---------------+--------------+----------------------+-------------------+-----------------+---------------------------+----------------------+---------------------+---------------+-----------------------+---------------------------------+--------------------------+----------------------+----------------------------+--------------------+----------------------------------------+-------------+-------------------+------------------+--------------------+-------------------+-----------------------------+----------------------------------+----------------------------------+-------------+-----+------------+--------------+-------------+-------------+-------------+-----------------+------------------+
|         geneId|uniqueDiseases|uniqueTherapeuticAreas|cancerOrBenignTumor|infectiousDisease|pregnancyOrPerinatalDisease|disorderOfVisualSystem|cardiovascularDisease|pancreasDisease|gastrointestinalDisease|reproductiveSystemOrBreastDisease|integumentarySystemDisease|endocrineSystemDisease|respira

                                                                                

In [10]:
# Persist the per-gene table as parquet, replacing any previous output at the
# same location.
genes_output_path = (
    "gs://genetics-portal-dev-analysis/dc16/output/gentropy_paper/genes_therapeutic_areas"
)
genes_therapeutic_areas.write.mode("overwrite").parquet(genes_output_path)

                                                                                

In [11]:
# Size of the assigned-gene set and the mean of each constraint metric.
# The label no longer claims that every counted gene has a constraint score:
# nothing upstream filters on the constraint columns, and f.mean skips nulls,
# so the count and the means can be based on different numbers of genes.
# NOTE(review): "protein-coding" is kept from the original label, but no
# biotype filter is applied when genes_therapeutic_areas is built — confirm.
n_assigned = genes_therapeutic_areas.count()
print(
    "Number of protein-coding genes assigned to a therapeutic area (constraint scores may be missing):",
    n_assigned,
)
genes_therapeutic_areas.agg(
    f.mean("lofConstraint"), f.mean("misConstraint"), f.mean("synConstraint")
).show()


                                                                                

Number of protein-coding genes with constraint score assigned to a therapeutic area: 6999


                                                                                

+------------------+------------------+--------------------+
|avg(lofConstraint)|avg(misConstraint)|  avg(synConstraint)|
+------------------+------------------+--------------------+
|0.7975087513527194|0.9320345636124465|-0.28406069025035596|
+------------------+------------------+--------------------+



In [12]:
# IDs of genes already assigned to a therapeutic area, renamed to `id` so the
# anti-join below can match on the column name.
assigned_gene_ids = genes_therapeutic_areas.select(f.col("geneId").alias("id"))

# Protein-coding targets that are NOT in genes_therapeutic_areas, with the same
# three constraint metrics. A left-anti join keeps the exclusion inside Spark
# instead of collecting every gene ID to the driver and embedding the list in
# an `isin` filter.
genes_not_assigned = (
    target.join(assigned_gene_ids, on="id", how="left_anti")
    .filter(f.col("biotype") == "protein_coding")
    .withColumns(
        {
            "lofConstraint": f.filter(
                "constraint", lambda x: x.constraintType == "lof"
            )[0].oeUpper,
            "misConstraint": f.filter(
                "constraint", lambda x: x.constraintType == "mis"
            )[0].score,
            "synConstraint": f.filter(
                "constraint", lambda x: x.constraintType == "syn"
            )[0].score,
        }
    )
    .select(
        "id",
        "approvedSymbol",
        "lofConstraint",
        "misConstraint",
        "synConstraint",
    )
)

# The count covers every unassigned protein-coding gene; no filter on the
# constraint columns is applied, so the label must not claim one. f.mean skips
# nulls, so the means only reflect genes that do have a value.
print(
    "Number of protein-coding genes not assigned to a therapeutic area (constraint scores may be missing):",
    genes_not_assigned.count(),
)
genes_not_assigned.agg(
    f.mean("lofConstraint"), f.mean("misConstraint"), f.mean("synConstraint")
).show()

                                                                                

Number of protein-coding genes with a loss-of-function constraint score not assigned to a therapeutic area: 13129
+------------------+------------------+--------------------+
|avg(lofConstraint)|avg(misConstraint)|  avg(synConstraint)|
+------------------+------------------+--------------------+
|0.9974878094563515|0.7125996131094996|-0.17120213635481868|
+------------------+------------------+--------------------+



In [13]:
# Collect the per-gene Spark table to the driver as a pandas DataFrame for the
# plots and regressions below.
genes_pd = genes_therapeutic_areas.toPandas()

                                                                                

In [14]:
# Display the per-gene pandas table.
genes_pd

Unnamed: 0,geneId,uniqueDiseases,uniqueTherapeuticAreas,cancerOrBenignTumor,infectiousDisease,pregnancyOrPerinatalDisease,disorderOfVisualSystem,cardiovascularDisease,pancreasDisease,gastrointestinalDisease,...,injuryPoisoningOrOtherComplication,signOrSymptom,other,totalStudies,approvedSymbol,lofConstraint,misConstraint,synConstraint,tissueSpecificity,tissueDistribution
0,ENSG00000175164,70,17,6,14,0,5,34,4,7,...,1,2,21,117,ABO,,,,0.50,0.0
1,ENSG00000111252,57,16,9,3,0,3,20,10,5,...,0,0,6,102,SH2B3,0.803,0.41489,1.92960,0.50,-1.0
2,ENSG00000166949,68,16,10,2,0,0,11,1,1,...,0,3,13,150,SMAD3,0.400,3.47820,-0.95833,-1.00,-1.0
3,ENSG00000130203,78,15,1,10,0,3,46,10,5,...,1,4,34,208,APOE,1.034,0.74028,1.03800,0.75,-1.0
4,ENSG00000140718,69,15,18,1,3,0,16,35,0,...,0,0,13,159,FTO,0.850,0.56199,-0.53789,-1.00,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6994,ENSG00000286105,2,1,0,0,0,0,0,0,0,...,0,0,0,2,TMEM217B,,,,1.00,1.0
6995,ENSG00000288636,1,1,0,0,0,0,0,0,0,...,0,0,5,5,ENSG00000288636,,,,0.50,0.5
6996,ENSG00000288701,1,1,0,0,0,0,0,0,0,...,0,0,0,2,PRRC2B,,,,-1.00,-1.0
6997,ENSG00000291237,1,1,0,0,0,0,0,0,0,...,0,0,0,1,SOD2,,,,0.50,-1.0


In [15]:
# Histogram of the number of unique disease IDs per gene, with bar heights
# normalised to probabilities.
fig = px.histogram(
    data_frame=genes_pd,
    x="uniqueDiseases",
    histnorm="probability",
    title="Distribution of unique diseases per gene",
    color_discrete_sequence=["#636EFA"],
).update_layout(
    xaxis_title="Unique disease IDs",
    yaxis_title="Probability",
    bargap=0.1,
)
fig.show()

In [16]:
# Histogram of the number of unique therapeutic areas per gene, with bar
# heights normalised to probabilities.
fig = px.histogram(
    data_frame=genes_pd,
    x="uniqueTherapeuticAreas",
    histnorm="probability",
    title="Distribution of unique therapeutic areas per gene",
    color_discrete_sequence=["#636EFA"],
).update_layout(
    xaxis_title="Unique therapeutic areas",
    yaxis_title="Probability",
    bargap=0.1,
)
fig.show()

In [17]:
# Summary statistics of the number of therapeutic areas per gene.
areas_per_gene = genes_pd["uniqueTherapeuticAreas"]
for label, value in (
    ("Min:", areas_per_gene.min()),
    ("Mean:", areas_per_gene.mean()),
    ("Median:", areas_per_gene.median()),
    ("Max:", areas_per_gene.max()),
):
    print(label, value)

Min: 1
Mean: 2.365337905415059
Median: 2.0
Max: 17


In [18]:
# Summary statistics of the LoF constraint value across assigned genes.
lof_per_gene = genes_pd["lofConstraint"]
for label, value in (
    ("Min:", lof_per_gene.min()),
    ("Mean:", lof_per_gene.mean()),
    ("Median:", lof_per_gene.median()),
    ("Max:", lof_per_gene.max()),
):
    print(label, value)

Min: 0.03
Mean: 0.7975088
Median: 0.719
Max: 1.995


In [19]:
# Correlation between the per-gene number of therapeutic areas and the per-gene
# number of unique diseases (pandas Series.corr with its default method).
# Author's note: this is higher than the correlation obtained with the previous
# 'pleiotropy' score (0.75).
genes_pd["uniqueTherapeuticAreas"].corr(genes_pd["uniqueDiseases"])

0.8428794241900018

In [20]:
# First ten genes (the table is sorted by descending number of therapeutic
# areas), restricted to the identifier, pleiotropy, constraint and tissue
# columns.
overview_columns = [
    "geneId",
    "approvedSymbol",
    "uniqueDiseases",
    "uniqueTherapeuticAreas",
    "lofConstraint",
    "misConstraint",
    "synConstraint",
    "tissueSpecificity",
    "tissueDistribution",
]
genes_pd.loc[:, overview_columns].head(10)

Unnamed: 0,geneId,approvedSymbol,uniqueDiseases,uniqueTherapeuticAreas,lofConstraint,misConstraint,synConstraint,tissueSpecificity,tissueDistribution
0,ENSG00000175164,ABO,70,17,,,,0.5,0.0
1,ENSG00000111252,SH2B3,57,16,0.803,0.41489,1.9296,0.5,-1.0
2,ENSG00000166949,SMAD3,68,16,0.4,3.4782,-0.95833,-1.0,-1.0
3,ENSG00000130203,APOE,78,15,1.034,0.74028,1.038,0.75,-1.0
4,ENSG00000140718,FTO,69,15,0.85,0.56199,-0.53789,-1.0,-1.0
5,ENSG00000181915,ADO,39,15,0.81,1.6131,-0.41566,-1.0,-1.0
6,ENSG00000115267,IFIH1,27,14,1.548,-0.78852,-1.3339,0.5,-1.0
7,ENSG00000147883,CDKN2B,107,14,1.833,-0.69508,-0.47619,0.5,0.0
8,ENSG00000107249,GLIS3,30,13,0.736,-3.1361,-4.7329,0.5,0.0
9,ENSG00000153814,JAZF1,36,13,0.405,2.3454,-0.89592,0.5,-1.0


In [23]:
# Poisson regression of the number of unique diseases per gene on the LoF
# constraint value (plus intercept); genes with a missing value are dropped.
df = genes_pd.loc[:, ["lofConstraint", "uniqueDiseases"]].dropna()
y = df["uniqueDiseases"]
x = sm.add_constant(df[["lofConstraint"]])
model = sm.Poisson(endog=y, exog=x).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

Optimization terminated successfully.
         Current function value: 3.407499
         Iterations 5
                          Poisson Regression Results                          
Dep. Variable:         uniqueDiseases   No. Observations:                 6799
Model:                        Poisson   Df Residuals:                     6797
Method:                           MLE   Df Model:                            1
Date:                Wed, 28 May 2025   Pseudo R-squ.:                 0.01463
Time:                        10:23:04   Log-Likelihood:                -23168.
converged:                       True   LL-Null:                       -23512.
Covariance Type:            nonrobust   LLR p-value:                1.208e-151
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             1.6115      0.012    140.068      0.000       1.589       1.634
lofConstraint    -0.

In [43]:
# Total number of therapeutic areas a gene could be linked to; this is the
# number of binomial "trials" per gene and replaces the bare literal 23.
N_THERAPEUTIC_AREAS = 23

# Binomial GLM (logit link) of therapeutic areas hit vs. not hit per gene on
# the LoF constraint value (plus intercept); genes with a missing value are
# dropped.
# NOTE(review): the captured Pearson chi2 is well above the residual degrees of
# freedom, which may indicate overdispersion — consider fit(scale="X2").
df = genes_pd[["lofConstraint", "uniqueTherapeuticAreas"]].dropna()
x = sm.add_constant(df[["lofConstraint"]])
successes = df["uniqueTherapeuticAreas"]
failures = N_THERAPEUTIC_AREAS - successes
# Two-column endog: (areas hit, areas not hit) per gene.
y = np.column_stack((successes, failures))
model = sm.GLM(y, x, family=sm.families.Binomial()).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

                 Generalized Linear Model Regression Results                  
Dep. Variable:           ['y1', 'y2']   No. Observations:                 6799
Model:                            GLM   Df Residuals:                     6797
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -13100.
Date:                Wed, 28 May 2025   Deviance:                       9555.3
Time:                        10:55:33   Pearson chi2:                 1.17e+04
No. Iterations:                     6   Pseudo R-squ. (CS):            0.04607
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -1.9186      0.016   -122.733

In [24]:
# Poisson regression of the number of unique diseases per gene on the missense
# constraint score (plus intercept); genes with a missing value are dropped.
df = genes_pd.loc[:, ["misConstraint", "uniqueDiseases"]].dropna()
y = df["uniqueDiseases"]
x = sm.add_constant(df[["misConstraint"]])
model = sm.Poisson(endog=y, exog=x).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

Optimization terminated successfully.
         Current function value: 3.447441
         Iterations 4
                          Poisson Regression Results                          
Dep. Variable:         uniqueDiseases   No. Observations:                 6834
Model:                        Poisson   Df Residuals:                     6832
Method:                           MLE   Df Model:                            1
Date:                Wed, 28 May 2025   Pseudo R-squ.:                0.001467
Time:                        10:23:04   Log-Likelihood:                -23560.
converged:                       True   LL-Null:                       -23594.
Covariance Type:            nonrobust   LLR p-value:                 8.709e-17
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             1.3116      0.008    173.403      0.000       1.297       1.326
misConstraint     0.

In [46]:
# Total number of therapeutic areas a gene could be linked to; this is the
# number of binomial "trials" per gene and replaces the bare literal 23.
N_THERAPEUTIC_AREAS = 23

# Binomial GLM (logit link) of therapeutic areas hit vs. not hit per gene on
# the missense constraint score (plus intercept); genes with a missing value
# are dropped.
df = genes_pd[["uniqueTherapeuticAreas", "misConstraint"]].dropna()
# The predictor is selected as a one-column DataFrame, like the other model
# cells in this notebook, rather than as a bare Series.
x = sm.add_constant(df[["misConstraint"]])
successes = df["uniqueTherapeuticAreas"]
failures = N_THERAPEUTIC_AREAS - successes
# Two-column endog: (areas hit, areas not hit) per gene.
y = np.column_stack((successes, failures))
model = sm.GLM(y, x, family=sm.families.Binomial()).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

                 Generalized Linear Model Regression Results                  
Dep. Variable:           ['y1', 'y2']   No. Observations:                 6834
Model:                            GLM   Df Residuals:                     6832
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -13298.
Date:                Wed, 28 May 2025   Deviance:                       9872.6
Time:                        10:56:19   Pearson chi2:                 1.21e+04
No. Iterations:                     5   Pseudo R-squ. (CS):           0.005305
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -2.1967      0.010   -217.261

In [26]:
# Poisson regression of the number of unique diseases per gene on the
# synonymous constraint score (plus intercept); genes with a missing value are
# dropped.
df = genes_pd.loc[:, ["synConstraint", "uniqueDiseases"]].dropna()
y = df["uniqueDiseases"]
x = sm.add_constant(df[["synConstraint"]])
model = sm.Poisson(endog=y, exog=x).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

Optimization terminated successfully.
         Current function value: 3.436806
         Iterations 5
                          Poisson Regression Results                          
Dep. Variable:         uniqueDiseases   No. Observations:                 6834
Model:                        Poisson   Df Residuals:                     6832
Method:                           MLE   Df Model:                            1
Date:                Wed, 28 May 2025   Pseudo R-squ.:                0.004548
Time:                        10:23:04   Log-Likelihood:                -23487.
converged:                       True   LL-Null:                       -23594.
Covariance Type:            nonrobust   LLR p-value:                 1.354e-48
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             1.3266      0.006    208.593      0.000       1.314       1.339
synConstraint    -0.

In [47]:
# Total number of therapeutic areas a gene could be linked to; this is the
# number of binomial "trials" per gene and replaces the bare literal 23.
N_THERAPEUTIC_AREAS = 23

# Binomial GLM (logit link) of therapeutic areas hit vs. not hit per gene on
# the synonymous constraint score (plus intercept); genes with a missing value
# are dropped.
df = genes_pd[["uniqueTherapeuticAreas", "synConstraint"]].dropna()
# The predictor is selected as a one-column DataFrame, like the other model
# cells in this notebook, rather than as a bare Series.
x = sm.add_constant(df[["synConstraint"]])
successes = df["uniqueTherapeuticAreas"]
failures = N_THERAPEUTIC_AREAS - successes
# Two-column endog: (areas hit, areas not hit) per gene.
y = np.column_stack((successes, failures))
model = sm.GLM(y, x, family=sm.families.Binomial()).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

                 Generalized Linear Model Regression Results                  
Dep. Variable:           ['y1', 'y2']   No. Observations:                 6834
Model:                            GLM   Df Residuals:                     6832
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -13279.
Date:                Wed, 28 May 2025   Deviance:                       9834.2
Time:                        10:56:32   Pearson chi2:                 1.21e+04
No. Iterations:                     5   Pseudo R-squ. (CS):            0.01087
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -2.1785      0.009   -255.596

In [48]:
# Total number of therapeutic areas a gene could be linked to; this is the
# number of binomial "trials" per gene and replaces the bare literal 23.
N_THERAPEUTIC_AREAS = 23

# Binomial GLM (logit link) of therapeutic areas hit vs. not hit per gene on
# the tissue specificity score (plus intercept); genes with a missing value
# are dropped. The earlier `y = df[...]` assignment was removed because it was
# overwritten before ever being read.
df = genes_pd[["tissueSpecificity", "uniqueTherapeuticAreas"]].dropna()
x = sm.add_constant(df[["tissueSpecificity"]])
successes = df["uniqueTherapeuticAreas"]
failures = N_THERAPEUTIC_AREAS - successes
# Two-column endog: (areas hit, areas not hit) per gene.
y = np.column_stack((successes, failures))
model = sm.GLM(y, x, family=sm.families.Binomial()).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

                 Generalized Linear Model Regression Results                  
Dep. Variable:           ['y1', 'y2']   No. Observations:                 6869
Model:                            GLM   Df Residuals:                     6867
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -13408.
Date:                Wed, 28 May 2025   Deviance:                       10010.
Time:                        10:57:16   Pearson chi2:                 1.23e+04
No. Iterations:                     5   Pseudo R-squ. (CS):           0.001124
Covariance Type:            nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -2.1619      0.00

In [29]:
# Poisson regression of the number of unique diseases per gene on the tissue
# specificity score (plus intercept); genes with a missing value are dropped.
df = genes_pd.loc[:, ["tissueSpecificity", "uniqueDiseases"]].dropna()
y = df["uniqueDiseases"]
x = sm.add_constant(df[["tissueSpecificity"]])
model = sm.Poisson(endog=y, exog=x).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

Optimization terminated successfully.
         Current function value: 3.453446
         Iterations 4
                          Poisson Regression Results                          
Dep. Variable:         uniqueDiseases   No. Observations:                 6869
Model:                        Poisson   Df Residuals:                     6867
Method:                           MLE   Df Model:                            1
Date:                Wed, 28 May 2025   Pseudo R-squ.:               0.0009323
Time:                        10:23:04   Log-Likelihood:                -23722.
converged:                       True   LL-Null:                       -23744.
Covariance Type:            nonrobust   LLR p-value:                 2.855e-11
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 1.3481      0.006    219.170      0.000       1.336       1.360
tissueSp

In [49]:
# Total number of therapeutic areas a gene could be linked to; this is the
# number of binomial "trials" per gene and replaces the bare literal 23.
N_THERAPEUTIC_AREAS = 23

# Binomial GLM (logit link) of therapeutic areas hit vs. not hit per gene on
# the tissue distribution score (plus intercept); genes with a missing value
# are dropped. The earlier `y = df[...]` assignment was removed because it was
# overwritten before ever being read.
df = genes_pd[["tissueDistribution", "uniqueTherapeuticAreas"]].dropna()
x = sm.add_constant(df[["tissueDistribution"]])
successes = df["uniqueTherapeuticAreas"]
failures = N_THERAPEUTIC_AREAS - successes
# Two-column endog: (areas hit, areas not hit) per gene.
y = np.column_stack((successes, failures))
model = sm.GLM(y, x, family=sm.families.Binomial()).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

                 Generalized Linear Model Regression Results                  
Dep. Variable:           ['y1', 'y2']   No. Observations:                 6869
Model:                            GLM   Df Residuals:                     6867
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -13411.
Date:                Wed, 28 May 2025   Deviance:                       10015.
Time:                        10:57:45   Pearson chi2:                 1.24e+04
No. Iterations:                     5   Pseudo R-squ. (CS):          0.0003052
Covariance Type:            nonrobust                                         
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -2.1563      0

In [31]:
# Poisson regression of the number of unique diseases per gene on the tissue
# distribution score (plus intercept); genes with a missing value are dropped.
df = genes_pd.loc[:, ["tissueDistribution", "uniqueDiseases"]].dropna()
y = df["uniqueDiseases"]
x = sm.add_constant(df[["tissueDistribution"]])
model = sm.Poisson(endog=y, exog=x).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

Optimization terminated successfully.
         Current function value: 3.455865
         Iterations 4
                          Poisson Regression Results                          
Dep. Variable:         uniqueDiseases   No. Observations:                 6869
Model:                        Poisson   Df Residuals:                     6867
Method:                           MLE   Df Model:                            1
Date:                Wed, 28 May 2025   Pseudo R-squ.:               0.0002325
Time:                        10:23:05   Log-Likelihood:                -23738.
converged:                       True   LL-Null:                       -23744.
Covariance Type:            nonrobust   LLR p-value:                 0.0008918
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  1.3579      0.007    197.696      0.000       1.344       1.371
tissu

In [32]:
# Drug molecules of the 25.03 release, reduced to a few status columns and
# exploded so that every row carries a single entry of `linkedTargets.rows`
# (exposed as `linkedTargets`).
drug_status_columns = [
    "id",
    "blackBoxWarning",
    "isApproved",
    "maximumClinicalTrialPhase",
    "hasBeenWithdrawn",
]
drug_molecule_path = "gs://open-targets-data-releases/25.03/output/drug_molecule"
drugs = session.spark.read.parquet(drug_molecule_path).select(
    *drug_status_columns,
    f.explode(f.col("linkedTargets.rows")).alias("linkedTargets"),
)

In [33]:
# Columns taken from the target prioritisation table.
prioritisation_columns = [
    "targetId",
    "hasSafetyEvent",
    "geneticConstraint",
    "mouseKOScore",
    "maxClinicalTrialPhase",
    "tissueSpecificity",
    "tissueDistribution",
]

# Recode hasSafetyEvent as a 0/1 indicator: 1 where the source value is -1.0,
# 0 for anything else (including missing values).
safety_indicator = f.when(f.col("hasSafetyEvent") == -1.0, 1).otherwise(0)

# Protein-coding targets only.
protein_coding_targets = target.select("id", "biotype").filter(
    f.col("biotype") == "protein_coding"
)

# Per-gene pleiotropy counts computed earlier in the notebook.
gene_pleiotropy = genes_therapeutic_areas.select(
    "geneId", "uniqueTherapeuticAreas", "uniqueDiseases"
)

# Targets that are linked to at least one drug (semi join), are protein-coding
# (inner join), annotated with the pleiotropy counts (left join). Targets
# without a count get 0 for both. The result is collected to pandas.
target_prioritisation = (
    session.spark.read.parquet("/users/dc16/data/releases/25.03/target_prioritisation")
    .select(*prioritisation_columns)
    .withColumn("hasSafetyEvent", safety_indicator)
    .join(
        drugs.select("linkedTargets"),
        drugs["linkedTargets"] == f.col("targetId"),
        "semi",
    )
    .join(protein_coding_targets, target["id"] == f.col("targetId"), "inner")
    .join(gene_pleiotropy, f.col("geneId") == f.col("targetId"), "left")
    .fillna({"uniqueTherapeuticAreas": 0.0, "uniqueDiseases": 0.0})
    .drop("geneId", "id", "biotype")
    .toPandas()
)

                                                                                

In [34]:
# Display the drug-target table assembled above.
target_prioritisation

Unnamed: 0,targetId,hasSafetyEvent,geneticConstraint,mouseKOScore,maxClinicalTrialPhase,tissueSpecificity,tissueDistribution,uniqueTherapeuticAreas,uniqueDiseases
0,ENSG00000003436,0,-0.418108,-0.898656,1.0,0.5,-1.0,2,2
1,ENSG00000004468,1,0.265368,-0.276253,1.0,0.5,0.0,1,1
2,ENSG00000004487,0,-0.789852,-0.948130,0.5,-1.0,-1.0,0,0
3,ENSG00000004948,0,0.021567,-0.533431,1.0,0.5,0.5,1,1
4,ENSG00000005339,0,-0.994999,-0.981162,0.5,-1.0,-1.0,1,2
...,...,...,...,...,...,...,...,...,...
1545,ENSG00000275546,0,,,0.5,,,0,0
1546,ENSG00000276011,0,,,0.5,,,0,0
1547,ENSG00000277443,0,,-0.694578,0.5,-1.0,-1.0,2,2
1548,ENSG00000278731,0,,,0.5,,,0,0


In [35]:
# Drug targets with no therapeutic area assigned (count filled with 0 above).
target_prioritisation.loc[target_prioritisation["uniqueTherapeuticAreas"].eq(0)]

Unnamed: 0,targetId,hasSafetyEvent,geneticConstraint,mouseKOScore,maxClinicalTrialPhase,tissueSpecificity,tissueDistribution,uniqueTherapeuticAreas,uniqueDiseases
2,ENSG00000004487,0,-0.789852,-0.948130,0.5,-1.00,-1.0,0,0
5,ENSG00000005882,0,0.009273,-0.224664,1.0,0.50,-1.0,0,0
7,ENSG00000006116,0,-0.891852,0.000000,1.0,0.75,0.5,0,0
8,ENSG00000006283,0,-0.876433,-0.905834,1.0,0.50,0.5,0,0
10,ENSG00000006638,1,0.508439,-0.772443,1.0,-1.00,0.0,0,0
...,...,...,...,...,...,...,...,...,...
1544,ENSG00000275407,0,,,0.5,,,0,0
1545,ENSG00000275546,0,,,0.5,,,0,0
1546,ENSG00000276011,0,,,0.5,,,0,0
1548,ENSG00000278731,0,,,0.5,,,0,0


In [50]:
# Logistic regression of the safety-event indicator on genetic constraint,
# number of therapeutic areas and tissue specificity (plus intercept), over
# all drug targets.
# Rows are required to be complete in every column listed here, including
# uniqueDiseases and tissueDistribution, which are not used as predictors.
# NOTE(review): tissueDistribution was deliberately left out of the predictors;
# confirm it should still take part in the missing-value filter.
complete_case_columns = [
    "hasSafetyEvent",
    "geneticConstraint",
    "uniqueTherapeuticAreas",
    "uniqueDiseases",
    "tissueSpecificity",
    "tissueDistribution",
]
predictor_columns = [
    "geneticConstraint",
    "uniqueTherapeuticAreas",
    "tissueSpecificity",
]
df = target_prioritisation.loc[:, complete_case_columns].dropna()
x = sm.add_constant(df[predictor_columns])
y = df["hasSafetyEvent"].astype(int)
model = sm.Logit(y, x).fit()
print(model.summary())
print("P-values:\n", model.pvalues)


Optimization terminated successfully.
         Current function value: 0.570131
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:         hasSafetyEvent   No. Observations:                 1486
Model:                          Logit   Df Residuals:                     1482
Method:                           MLE   Df Model:                            3
Date:                Wed, 28 May 2025   Pseudo R-squ.:                 0.02780
Time:                        10:58:16   Log-Likelihood:                -847.22
converged:                       True   LL-Null:                       -871.44
Covariance Type:            nonrobust   LLR p-value:                 1.712e-10
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     -1.3136      0.080    -16.372      0.000      -1.471      

In [37]:
# Keep only the rows whose maxClinicalTrialPhase value is strictly above 0.50.
# NOTE(review): what the 0.50 cut-off represents on this score is not shown in
# this notebook section; confirm against the data documentation.
subset = target_prioritisation.loc[
    target_prioritisation["maxClinicalTrialPhase"].gt(0.50)
]
subset

Unnamed: 0,targetId,hasSafetyEvent,geneticConstraint,mouseKOScore,maxClinicalTrialPhase,tissueSpecificity,tissueDistribution,uniqueTherapeuticAreas,uniqueDiseases
0,ENSG00000003436,0,-0.418108,-0.898656,1.0,0.50,-1.0,2,2
1,ENSG00000004468,1,0.265368,-0.276253,1.0,0.50,0.0,1,1
3,ENSG00000004948,0,0.021567,-0.533431,1.0,0.50,0.5,1,1
5,ENSG00000005882,0,0.009273,-0.224664,1.0,0.50,-1.0,0,0
6,ENSG00000006071,1,-0.179204,-0.263614,1.0,0.75,0.0,3,4
...,...,...,...,...,...,...,...,...,...
1540,ENSG00000265681,0,-0.659200,,1.0,-1.00,-1.0,0,0
1541,ENSG00000267534,0,0.168160,-0.909651,1.0,-1.00,0.0,0,0
1542,ENSG00000267855,0,0.855387,-0.402576,1.0,-1.00,-1.0,0,0
1543,ENSG00000274286,1,,-0.841078,1.0,-1.00,0.0,0,0


In [51]:
# Complete-case frame built from `subset`: a row survives only when all six
# listed columns are non-null, including the two (uniqueDiseases,
# tissueDistribution) that are not used as predictors below.
df = subset.loc[
    :,
    [
        "hasSafetyEvent",
        "geneticConstraint",
        "uniqueTherapeuticAreas",
        "uniqueDiseases",
        "tissueSpecificity",
        "tissueDistribution",
    ],
].dropna()

# Design matrix: three predictors plus the intercept column that statsmodels
# adds. tissueDistribution is currently excluded from the predictors.
x = sm.add_constant(
    df.loc[:, ["geneticConstraint", "uniqueTherapeuticAreas", "tissueSpecificity"]]
)

# Response: the safety-event flag cast to integers.
y = df.loc[:, "hasSafetyEvent"].astype(int)

# Fit the logistic regression, then print the summary table and the p-values.
model = sm.Logit(endog=y, exog=x).fit()
print(model.summary())
print("P-values:\n", model.pvalues)

Optimization terminated successfully.
         Current function value: 0.581804
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:         hasSafetyEvent   No. Observations:                 1183
Model:                          Logit   Df Residuals:                     1179
Method:                           MLE   Df Model:                            3
Date:                Wed, 28 May 2025   Pseudo R-squ.:                 0.03126
Time:                        10:58:37   Log-Likelihood:                -688.27
converged:                       True   LL-Null:                       -710.48
Covariance Type:            nonrobust   LLR p-value:                 1.230e-09
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     -1.2682      0.091    -13.952      0.000      -1.446      