In [1]:
import pyspark.sql.functions as f
import pyspark.sql.types as t
import statsmodels.api as sm
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import scipy.stats as stats


from gentropy.common.session import Session
from gentropy.dataset.colocalisation import Colocalisation
from gentropy.dataset.l2g_prediction import L2GPrediction
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus

# Use the 'vscode' renderer as the default for every plotly figure shown below.
pio.renderers.default = 'vscode'



IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [2]:
# Spark settings passed to the gentropy Session: 10g for the driver and 10g per executor.
spark_memory_conf = {
    'spark.executor.memory': '10g',
    'spark.driver.memory': '10g',
}
session = Session(extended_spark_conf=spark_memory_conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/12 10:36:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Root of the local 25.03 release; the target, L2G, credible-set and
# target-prioritisation datasets are all read from below it.
RELEASE_DIR = "/users/dc16/data/releases/25.03"

target = session.spark.read.parquet(f"{RELEASE_DIR}/target")

# Only the identifiers and the score are needed from the L2G predictions.
l2g = L2GPrediction.from_parquet(
    session, f"{RELEASE_DIR}/l2g_prediction"
).df.select("studyLocusId", "geneId", "score")

cred_sets = StudyLocus.from_parquet(session, f"{RELEASE_DIR}/credible_set")

studies = (
    session.spark.read.parquet(
        "/users/dc16/data/gentropy_paper/gwas_study_index_therapeutic_areas"
    )
    # Filter on the source columns *before* projecting, so the filters refer to
    # columns that actually exist at that point. "bianry_less_cases" is the
    # (misspelled) column name as stored in the input dataset.
    .filter(f.col("Measurement") == 0)
    .filter(f.col("bianry_less_cases") == 1)
    # Study metadata followed by the therapeutic-area indicator columns.
    # Each column is selected exactly once (the original listed "Oncology"
    # twice, which produced a duplicated column name); the two filter columns
    # are not carried forward.
    .select(
        "studyId",
        "traitFromSource",
        "traitFromSourceMappedIds",
        "nCases",
        "nControls",
        "nSamples",
        "Haematology",
        "Metabolic",
        "Congenital",
        "Signs/symptoms",
        "Neurology",
        "Immune",
        "Psychiatry",
        "Dermatology",
        "Ophthalmology",
        "Oncology",
        "Cardiovascular",
        "Respiratory",
        "Digestive",
        "Endocrine",
        "Musculoskeletal",
        "Infection",
        "Other",
    )
)

tissue_specificity = session.spark.read.parquet(
    f"{RELEASE_DIR}/target_prioritisation"
).select("targetId", "tissueSpecificity", "tissueDistribution")


In [5]:
# Credible-set columns attached to every retained prediction.
credible_set_columns = cred_sets.df.select(
    "studyLocusId",
    "variantId",
    "studyId",
    "beta",
    "pValueMantissa",
    "pValueExponent",
)

# Predictions with score >= 0.5, joined to their credible set and then to the
# filtered study table. Both joins are inner, so predictions without a
# matching credible set or study are dropped.
high_scoring_l2g = l2g.filter(f.col("score") >= 0.5)
l2g_signif = high_scoring_l2g.join(
    credible_set_columns, "studyLocusId", "inner"
).join(studies, "studyId", "inner")


In [6]:
# Therapeutic-area indicator columns, in the order they are reported.
therapeutic_areas = [
    'Haematology',
    'Metabolic',
    'Congenital',
    'Signs/symptoms',
    'Neurology',
    'Immune',
    'Psychiatry',
    'Dermatology',
    'Ophthalmology',
    'Oncology',
    'Cardiovascular',
    'Respiratory',
    'Digestive',
    'Endocrine',
    'Musculoskeletal',
    'Infection',
    'Other',
]

# Number of distinct EFO ids across all rows of l2g_signif.
unique_efo_count = f.size(
    f.array_distinct(f.flatten(f.collect_list('traitFromSourceMappedIds')))
).alias('uniqueEFOs')

# One sum per therapeutic-area column, keeping the column's own name.
area_totals = [f.sum(area).alias(area) for area in therapeutic_areas]

l2g_signif.agg(unique_efo_count, *area_totals).show(vertical=True)

                                                                                

-RECORD 0---------------
 uniqueEFOs      | 1384 
 Haematology     | 394  
 Metabolic       | 5861 
 Congenital      | 5480 
 Signs/symptoms  | 265  
 Neurology       | 6677 
 Immune          | 6416 
 Psychiatry      | 4039 
 Dermatology     | 1535 
 Ophthalmology   | 2628 
 Oncology        | 7176 
 Cardiovascular  | 7205 
 Respiratory     | 3167 
 Digestive       | 3703 
 Endocrine       | 5644 
 Musculoskeletal | 5287 
 Infection       | 719  
 Other           | 2919 



In [7]:
# Therapeutic-area indicator columns. "Oncology" is included so that this
# aggregation matches the per-gene and overall aggregations in this notebook
# (it was missing from the original list of sums here).
therapeutic_areas = [
    'Haematology',
    'Metabolic',
    'Congenital',
    'Signs/symptoms',
    'Neurology',
    'Immune',
    'Psychiatry',
    'Dermatology',
    'Ophthalmology',
    'Oncology',
    'Cardiovascular',
    'Respiratory',
    'Digestive',
    'Endocrine',
    'Musculoskeletal',
    'Infection',
    'Other',
]

# `studies` is already restricted to Measurement == 0 and
# bianry_less_cases == 1 where it is built, and neither column is part of its
# schema, so those filters are not repeated here.
studies_per_therapeutic_area = studies.agg(
    # Number of distinct EFO ids across all retained studies.
    f.size(
        f.array_distinct(f.flatten(f.collect_list('traitFromSourceMappedIds')))
    ).alias('uniqueEFOs'),
    # Sum of each therapeutic-area indicator column.
    *[f.sum(area).alias(area) for area in therapeutic_areas],
)

In [8]:
# Therapeutic-area indicator columns. The same list drives the per-gene sums,
# `totalStudies`, and the `pleiotropy` denominator (17 areas).
therapeutic_areas = [
    "Haematology",
    "Metabolic",
    "Congenital",
    "Signs/symptoms",
    "Neurology",
    "Immune",
    "Psychiatry",
    "Dermatology",
    "Ophthalmology",
    "Oncology",
    "Cardiovascular",
    "Respiratory",
    "Digestive",
    "Endocrine",
    "Musculoskeletal",
    "Infection",
    "Other",
]


def _constraint_entry(constraint_type):
    """Return the first element of the `constraint` array with the given constraintType."""
    return f.filter("constraint", lambda x: x.constraintType == constraint_type)[0]


# Per-area sums and per-area "at least one" flags, as column expressions.
area_counts = [f.col(area) for area in therapeutic_areas]
area_flags = [f.when(f.col(area) > 0, 1).otherwise(0) for area in therapeutic_areas]

# Target annotation: symbol, biotype, location and the three constraint
# metrics. The lof metric reads `oeUpper`; mis and syn read `score`.
target_constraints = target.withColumns(
    {
        "lofConstraint": _constraint_entry("lof").oeUpper,
        "misConstraint": _constraint_entry("mis").score,
        "synConstraint": _constraint_entry("syn").score,
    }
).select(
    "id",
    "approvedSymbol",
    "biotype",
    "genomicLocation.chromosome",
    "genomicLocation.start",
    "genomicLocation.end",
    "lofConstraint",
    "misConstraint",
    "synConstraint",
)

genes_therapeutic_areas = (
    l2g_signif.groupBy("geneId")
    .agg(
        # Distinct EFO ids seen for the gene.
        f.size(
            f.array_distinct(f.flatten(f.collect_list("traitFromSourceMappedIds")))
        ).alias("uniqueEFOs"),
        # Sum of each therapeutic-area indicator for the gene.
        *[f.sum(area).alias(area) for area in therapeutic_areas],
    )
    # Sum of the per-area sums (built left to right, starting from the first area).
    .withColumn("totalStudies", sum(area_counts[1:], area_counts[0]))
    # Inner join: genes absent from the target table are dropped.
    .join(target_constraints, target["id"] == f.col("geneId"), "inner")
    .drop("id")
    # Fraction of therapeutic areas in which the gene has a non-zero sum.
    .withColumn(
        "pleiotropy",
        sum(area_flags[1:], area_flags[0]) / len(therapeutic_areas),
    )
    # Inner join: genes without a target_prioritisation row are dropped.
    .join(tissue_specificity, f.col("targetId") == f.col("geneId"), "inner")
    .drop("targetId")
    .sort(f.desc("pleiotropy"), f.desc("uniqueEFOs"), f.desc("totalStudies"))
)

In [9]:
# Persist the per-gene table as parquet, overwriting any existing output at this path.
pleiotropy_output_path = 'gs://genetics-portal-dev-analysis/dc16/output/pleiotropy_genes_therapeutic_areas'
genes_therapeutic_areas.write.mode('overwrite').parquet(pleiotropy_output_path)

25/05/12 10:38:31 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-google-hadoop-file-system.properties,hadoop-metrics2.properties
25/05/12 10:38:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [10]:
# Row count of the per-gene table, followed by the mean of each constraint metric.
# NOTE(review): no biotype filter and no non-null-constraint filter is applied to
# genes_therapeutic_areas in the visible code, so the printed label may overstate
# what is being counted — confirm.
n_assigned_genes = genes_therapeutic_areas.count()
print('Number of protein-coding genes with constraint score assigned to a therapeutic area:', n_assigned_genes)

constraint_means = [
    f.mean(metric) for metric in ('lofConstraint', 'misConstraint', 'synConstraint')
]
genes_therapeutic_areas.agg(*constraint_means).show()


                                                                                

Number of protein-coding genes with constraint score assigned to a therapeutic area: 6999


                                                                                

+------------------+------------------+--------------------+
|avg(lofConstraint)|avg(misConstraint)|  avg(synConstraint)|
+------------------+------------------+--------------------+
|0.7975087513527194|0.9320345636124465|-0.28406069025035596|
+------------------+------------------+--------------------+



In [11]:
# Ids of genes that already have a row in genes_therapeutic_areas, renamed to
# `id` so they can be matched against the target table by column name.
assigned_gene_ids = genes_therapeutic_areas.select(f.col('geneId').alias('id'))

genes_not_assigned = (
    target
    # left_anti keeps only target rows whose id has no match among the assigned
    # genes. This replaces collecting every assigned id to the driver and
    # embedding the list in an `isin` expression. (Only difference: a target row
    # with a null id, if one existed, would now be kept rather than dropped.)
    .join(assigned_gene_ids, 'id', 'left_anti')
    .filter(f.col('biotype') == 'protein_coding')
    # Constraint metrics: first matching element of the `constraint` array;
    # lof reads `oeUpper`, mis and syn read `score`.
    .withColumns(
        {
            "lofConstraint": f.filter(
                "constraint", lambda x: x.constraintType == "lof"
            )[0].oeUpper,
            "misConstraint": f.filter(
                "constraint", lambda x: x.constraintType == "mis"
            )[0].score,
            "synConstraint": f.filter(
                "constraint", lambda x: x.constraintType == "syn"
            )[0].score,
        }
    )
    .select(
        'id',
        'approvedSymbol',
        'lofConstraint',
        'misConstraint',
        'synConstraint',
    )
)
# NOTE(review): rows are not filtered on lofConstraint being non-null, so the
# count below includes genes without a constraint value — confirm the label.
print("Number of protein-coding genes with a loss-of-function constraint score not assigned to a therapeutic area:", genes_not_assigned.count())
genes_not_assigned.agg(f.mean('lofConstraint'), f.mean('misConstraint'), f.mean('synConstraint')).show()

                                                                                

Number of protein-coding genes with a loss-of-function constraint score not assigned to a therapeutic area: 13129
+------------------+------------------+--------------------+
|avg(lofConstraint)|avg(misConstraint)|  avg(synConstraint)|
+------------------+------------------+--------------------+
|0.9974878094563515|0.7125996131094996|-0.17120213635481868|
+------------------+------------------+--------------------+



In [12]:
# Collect the per-gene Spark table to the driver as a pandas DataFrame; the
# plotting and regression cells below read from this in-memory copy.
genes_pd = genes_therapeutic_areas.toPandas()

                                                                                

In [13]:
# Display the collected per-gene table.
genes_pd

Unnamed: 0,geneId,uniqueEFOs,Haematology,Metabolic,Congenital,Signs/symptoms,Neurology,Immune,Psychiatry,Dermatology,...,biotype,chromosome,start,end,lofConstraint,misConstraint,synConstraint,pleiotropy,tissueSpecificity,tissueDistribution
0,ENSG00000175164,70,6,5,6,2,12,7,1,5,...,protein_coding,9,133233278,133276024,,,,1.000000,0.50,0.0
1,ENSG00000130203,79,0,33,29,4,77,1,60,1,...,protein_coding,19,44905791,44909393,1.034,0.74028,1.03800,0.941176,0.75,-1.0
2,ENSG00000140718,72,1,58,12,0,14,6,11,3,...,protein_coding,16,53701692,54158512,0.850,0.56199,-0.53789,0.882353,-1.00,-1.0
3,ENSG00000111252,58,1,14,13,0,8,36,1,3,...,protein_coding,12,111405923,111451623,0.803,0.41489,1.92960,0.882353,0.50,-1.0
4,ENSG00000091831,40,0,0,2,2,10,1,9,1,...,protein_coding,6,151656691,152129619,0.190,1.61200,-1.24030,0.882353,0.50,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6994,ENSG00000288611,1,0,0,0,0,0,0,0,0,...,protein_coding,8,52939182,52943734,,,,0.058824,0.50,0.5
6995,ENSG00000288658,1,0,0,0,0,1,0,0,0,...,protein_coding,2,222314148,222320155,,,,0.058824,0.50,1.0
6996,ENSG00000291237,1,0,0,0,0,0,0,0,0,...,protein_coding,6,159669069,159762529,,,,0.058824,0.50,-1.0
6997,ENSG00000293253,1,0,0,0,0,0,0,0,0,...,protein_coding,1,53413149,53440018,,,,0.058824,,


In [17]:
# Probability-normalised histogram of the number of unique EFO ids per gene.
efo_hist_options = {
    "x": "uniqueEFOs",
    "histnorm": "probability",
    "title": "Distribution of Gene Study Scores",
    "color_discrete_sequence": ["#636EFA"],
}
efo_hist_layout = {
    "xaxis_title": "Unique EFOs",
    "yaxis_title": "Probability",
    "bargap": 0.1,
}

fig = px.histogram(genes_pd, **efo_hist_options)
fig.update_layout(**efo_hist_layout)
fig.show()

In [18]:
# Probability-normalised histogram of the per-gene pleiotropy score, in 18 bins.
pleiotropy_hist_options = {
    "x": "pleiotropy",
    "histnorm": "probability",
    "nbins": 18,
    "title": "Distribution of Gene Pleiotropy Scores",
    "color_discrete_sequence": ["#636EFA"],
}
pleiotropy_hist_layout = {
    "xaxis_title": "Pleiotropy Score",
    "yaxis_title": "Probability",
    "bargap": 0.1,
}

fig = px.histogram(genes_pd, **pleiotropy_hist_options)
fig.update_layout(**pleiotropy_hist_layout)
fig.show()

In [20]:
# Min / mean / median / max of the pleiotropy score across genes.
pleiotropy_scores = genes_pd['pleiotropy']
for label, value in (
    ('Min:', pleiotropy_scores.min()),
    ('Mean:', pleiotropy_scores.mean()),
    ('Median:', pleiotropy_scores.median()),
    ('Max:', pleiotropy_scores.max()),
):
    print(label, value)

Min: 0.058823529411764705
Mean: 0.18827899784002758
Median: 0.11764705882352941
Max: 1.0


In [21]:
# Min / mean / median / max of the lof constraint metric across genes.
lof_constraint_values = genes_pd['lofConstraint']
for label, value in (
    ('Min:', lof_constraint_values.min()),
    ('Mean:', lof_constraint_values.mean()),
    ('Median:', lof_constraint_values.median()),
    ('Max:', lof_constraint_values.max()),
):
    print(label, value)

Min: 0.03
Mean: 0.7975087
Median: 0.719
Max: 1.995


In [57]:
# Correlation between the pleiotropy score and the number of unique EFO ids
# per gene (pandas Series.corr with its default method).
genes_pd['pleiotropy'].corr(genes_pd['uniqueEFOs'])

0.751559466084168

In [41]:
# One single-predictor Poisson regression of uniqueEFOs per constraint metric.
# Each model is fitted once and the fitted result is reused for both the
# summary and the p-value (the original fitted every model twice).
for predictor in ('lofConstraint', 'misConstraint', 'synConstraint'):
    # Complete cases only: drop genes missing the predictor.
    df = genes_pd[[predictor, 'uniqueEFOs']].dropna()
    x = sm.add_constant(df[[predictor]])  # intercept + predictor
    y = df['uniqueEFOs']
    result = sm.Poisson(y, x).fit()
    print(result.summary())
    print(f'{predictor} P-value:', result.pvalues[predictor])


Optimization terminated successfully.
         Current function value: 3.436808
         Iterations 5
                          Poisson Regression Results                          
Dep. Variable:             uniqueEFOs   No. Observations:                 6799
Model:                        Poisson   Df Residuals:                     6797
Method:                           MLE   Df Model:                            1
Date:                Mon, 12 May 2025   Pseudo R-squ.:                 0.01447
Time:                        10:53:17   Log-Likelihood:                -23367.
converged:                       True   LL-Null:                       -23710.
Covariance Type:            nonrobust   LLR p-value:                2.969e-151
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             1.6180      0.011    141.161      0.000       1.596       1.640
lofConstraint    -0.

In [40]:
# One single-predictor Poisson regression of pleiotropy per constraint metric.
# Each model is fitted once and the fitted result is reused for both the
# summary and the p-value (the original fitted every model twice).
# NOTE(review): `pleiotropy` is a fraction (a count divided by 17 where it is
# computed), not an integer count — confirm that a Poisson model is the
# intended choice for this response.
for predictor in ('lofConstraint', 'misConstraint', 'synConstraint'):
    # Complete cases only: drop genes missing the predictor.
    df = genes_pd[['pleiotropy', predictor]].dropna()
    x = sm.add_constant(df[[predictor]])  # intercept + predictor
    y = df['pleiotropy']
    result = sm.Poisson(y, x).fit()
    print(result.summary())
    print(f'{predictor} P-value:', result.pvalues[predictor])

Optimization terminated successfully.
         Current function value: 0.432751
         Iterations 4
                          Poisson Regression Results                          
Dep. Variable:             pleiotropy   No. Observations:                 6799
Model:                        Poisson   Df Residuals:                     6797
Method:                           MLE   Df Model:                            1
Date:                Mon, 12 May 2025   Pseudo R-squ.:                0.003180
Time:                        10:52:06   Log-Likelihood:                -2942.3
converged:                       True   LL-Null:                       -2951.7
Covariance Type:            nonrobust   LLR p-value:                 1.475e-05
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -1.4688      0.052    -28.054      0.000      -1.571      -1.366
lofConstraint    -0.

In [42]:
# Poisson regression of pleiotropy on tissueDistribution (complete cases only).
# NOTE(review): `pleiotropy` is a fraction (a count divided by 17 where it is
# computed), not an integer count — confirm that a Poisson model is intended.
df = genes_pd[['pleiotropy', 'tissueDistribution']].dropna()
x = sm.add_constant(df[['tissueDistribution']])  # intercept + predictor
y = df['pleiotropy']
# Fit once and reuse the result for both the summary and the p-value.
result = sm.Poisson(y, x).fit()
print(result.summary())
print('Tissue distribution P-value:', result.pvalues['tissueDistribution'])

Optimization terminated successfully.
         Current function value: 0.433810
         Iterations 3
                          Poisson Regression Results                          
Dep. Variable:             pleiotropy   No. Observations:                 6869
Model:                        Poisson   Df Residuals:                     6867
Method:                           MLE   Df Model:                            1
Date:                Mon, 12 May 2025   Pseudo R-squ.:               7.534e-06
Time:                        10:53:30   Log-Likelihood:                -2979.8
converged:                       True   LL-Null:                       -2979.9
Covariance Type:            nonrobust   LLR p-value:                    0.8322
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -1.6638      0.031    -53.332      0.000      -1.725      -1.603
tissu

In [38]:
# Poisson regression of uniqueEFOs on tissueDistribution (complete cases only).
df = genes_pd[['tissueDistribution', 'uniqueEFOs']].dropna()
x = sm.add_constant(df[['tissueDistribution']])  # intercept + predictor
y = df['uniqueEFOs']
# Fit once and reuse the result for both the summary and the p-value.
result = sm.Poisson(y, x).fit()
print(result.summary())
print('Tissue distribution P-value:', result.pvalues['tissueDistribution'])

Optimization terminated successfully.
         Current function value: 3.484989
         Iterations 4
                          Poisson Regression Results                          
Dep. Variable:             uniqueEFOs   No. Observations:                 6869
Model:                        Poisson   Df Residuals:                     6867
Method:                           MLE   Df Model:                            1
Date:                Mon, 12 May 2025   Pseudo R-squ.:               0.0002584
Time:                        10:50:41   Log-Likelihood:                -23938.
converged:                       True   LL-Null:                       -23945.
Covariance Type:            nonrobust   LLR p-value:                 0.0004350
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  1.3664      0.007    199.819      0.000       1.353       1.380
tissu

In [43]:
# Drug molecules from the 25.03 release, with one output row per element of
# `linkedTargets.rows` (exploded into the `linkedTargets` column).
drug_molecule_path = "gs://open-targets-data-releases/25.03/output/drug_molecule"
drug_columns = [
    "id",
    "blackBoxWarning",
    "isApproved",
    "maximumClinicalTrialPhase",
    "hasBeenWithdrawn",
    f.explode("linkedTargets.rows").alias("linkedTargets"),
]
drugs = session.spark.read.parquet(drug_molecule_path).select(*drug_columns)

In [45]:
# Columns read from the target-prioritisation dataset.
prioritisation_columns = [
    "targetId",
    "hasSafetyEvent",
    "geneticConstraint",
    "mouseKOScore",
    "maxClinicalTrialPhase",
    "tissueDistribution",
]

# Binary flag: 1 where hasSafetyEvent equals -1.0, otherwise 0.
safety_event_flag = f.when(f.col("hasSafetyEvent") == -1.0, 1).otherwise(0)

# Protein-coding rows of the target table (id + biotype only).
protein_coding_targets = target.select("id", "biotype").filter(
    f.col("biotype") == "protein_coding"
)

# Per-gene pleiotropy and unique-EFO counts computed earlier in the notebook.
gene_scores = genes_therapeutic_areas.select("geneId", "pleiotropy", "uniqueEFOs")

target_prioritisation = (
    session.spark.read.parquet(
        "/users/dc16/data/releases/25.03/target_prioritisation"
    )
    .select(*prioritisation_columns)
    .withColumn("hasSafetyEvent", safety_event_flag)
    # Semi join: keep targets that appear as a linked target of some drug.
    .join(
        drugs.select("linkedTargets"),
        drugs["linkedTargets"] == f.col("targetId"),
        "semi",
    )
    # Inner join: keep protein-coding targets only.
    .join(protein_coding_targets, target["id"] == f.col("targetId"), "inner")
    # Left join: targets without a gene-level score get nulls, filled with 0 below.
    .join(gene_scores, f.col("geneId") == f.col("targetId"), "left")
    .fillna({"pleiotropy": 0.0, "uniqueEFOs": 0.0})
    .drop("geneId", "id", "biotype")
    .toPandas()
)

                                                                                

In [46]:
# Display the collected drug-target table.
target_prioritisation

Unnamed: 0,targetId,hasSafetyEvent,geneticConstraint,mouseKOScore,maxClinicalTrialPhase,tissueDistribution,pleiotropy,uniqueEFOs
0,ENSG00000000938,0,-0.592936,-0.469734,1.00,0.0,0.000000,0
1,ENSG00000001626,1,0.457700,-0.964738,1.00,0.0,0.058824,2
2,ENSG00000002549,0,-0.017295,,0.50,-1.0,0.000000,0
3,ENSG00000002726,0,0.073870,-0.780918,1.00,0.0,0.235294,2
4,ENSG00000003400,1,0.438112,,0.50,-1.0,0.058824,3
...,...,...,...,...,...,...,...,...
1545,ENSG00000278540,1,,-0.583476,0.75,-1.0,0.000000,0
1546,ENSG00000278731,0,,,0.50,,0.000000,0
1547,ENSG00000282608,1,,-0.827099,1.00,0.0,0.058824,1
1548,ENSG00000292332,0,,,1.00,,0.000000,0


In [47]:
# Rows whose pleiotropy is exactly 0; this includes targets whose missing
# gene-level score was filled with 0.0 when the table was built.
target_prioritisation[target_prioritisation['pleiotropy'] == 0]

Unnamed: 0,targetId,hasSafetyEvent,geneticConstraint,mouseKOScore,maxClinicalTrialPhase,tissueDistribution,pleiotropy,uniqueEFOs
0,ENSG00000000938,0,-0.592936,-0.469734,1.00,0.0,0.0,0
2,ENSG00000002549,0,-0.017295,,0.50,-1.0,0.0,0
7,ENSG00000004487,0,-0.789852,-0.948130,0.50,-1.0,0.0,0
8,ENSG00000004779,0,-0.197333,,1.00,-1.0,0.0,0
9,ENSG00000004799,1,0.390706,-0.705350,1.00,-1.0,0.0,0
...,...,...,...,...,...,...,...,...
1544,ENSG00000278195,0,,,1.00,0.5,0.0,0
1545,ENSG00000278540,1,,-0.583476,0.75,-1.0,0.0,0
1546,ENSG00000278731,0,,,0.50,,0.0,0
1548,ENSG00000292332,0,,,1.00,,0.0,0


In [49]:
# Logistic regression of hasSafetyEvent on genetic constraint, pleiotropy and
# tissue distribution, over all drug targets with complete data.
# uniqueEFOs is part of the complete-case filter but is not a predictor.
complete_case_columns = ['hasSafetyEvent', 'geneticConstraint', 'pleiotropy', 'tissueDistribution', 'uniqueEFOs']
safety_predictors = ['geneticConstraint', 'pleiotropy', 'tissueDistribution']

df = target_prioritisation.loc[:, complete_case_columns].dropna()
x = sm.add_constant(df.loc[:, safety_predictors])  # intercept + predictors
y = df['hasSafetyEvent'].astype(int)
print(sm.Logit(y, x).fit().summary())

Optimization terminated successfully.
         Current function value: 0.571879
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:         hasSafetyEvent   No. Observations:                 1486
Model:                          Logit   Df Residuals:                     1482
Method:                           MLE   Df Model:                            3
Date:                Mon, 12 May 2025   Pseudo R-squ.:                 0.02482
Time:                        10:55:52   Log-Likelihood:                -849.81
converged:                       True   LL-Null:                       -871.44
Covariance Type:            nonrobust   LLR p-value:                 2.175e-09
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -1.1934      0.080    -14.838      0.000      -1.351      -1.036
genet

In [52]:
# Refit the logistic model on the `x` / `y` currently in scope (whatever the most
# recently executed regression cell left behind) and show per-coefficient p-values.
sm.Logit(y, x).fit().pvalues

Optimization terminated successfully.
         Current function value: 0.571879
         Iterations 5


const                 8.347639e-50
geneticConstraint     4.906259e-03
pleiotropy            1.417036e-06
tissueDistribution    1.159689e-03
dtype: float64

In [53]:
# Keep only targets whose maxClinicalTrialPhase value is above 0.50, then display them.
above_phase_threshold = target_prioritisation['maxClinicalTrialPhase'] > 0.50
subset = target_prioritisation[above_phase_threshold]
subset

Unnamed: 0,targetId,hasSafetyEvent,geneticConstraint,mouseKOScore,maxClinicalTrialPhase,tissueDistribution,pleiotropy,uniqueEFOs
0,ENSG00000000938,0,-0.592936,-0.469734,1.00,0.0,0.000000,0
1,ENSG00000001626,1,0.457700,-0.964738,1.00,0.0,0.058824,2
3,ENSG00000002726,0,0.073870,-0.780918,1.00,0.0,0.235294,2
5,ENSG00000003436,0,-0.418108,-0.898656,1.00,-1.0,0.117647,2
6,ENSG00000004468,1,0.265368,-0.276253,1.00,0.0,0.117647,1
...,...,...,...,...,...,...,...,...
1544,ENSG00000278195,0,,,1.00,0.5,0.000000,0
1545,ENSG00000278540,1,,-0.583476,0.75,-1.0,0.000000,0
1547,ENSG00000282608,1,,-0.827099,1.00,0.0,0.058824,1
1548,ENSG00000292332,0,,,1.00,,0.000000,0


In [54]:
# Same logistic regression as for all targets, restricted to `subset`
# (targets with maxClinicalTrialPhase above 0.50) with complete data.
# uniqueEFOs is part of the complete-case filter but is not a predictor.
complete_case_columns = ['hasSafetyEvent', 'geneticConstraint', 'pleiotropy', 'tissueDistribution', 'uniqueEFOs']
safety_predictors = ['geneticConstraint', 'pleiotropy', 'tissueDistribution']

df = subset.loc[:, complete_case_columns].dropna()
x = sm.add_constant(df.loc[:, safety_predictors])  # intercept + predictors
y = df['hasSafetyEvent'].astype(int)
print(sm.Logit(y, x).fit().summary())

Optimization terminated successfully.
         Current function value: 0.583285
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:         hasSafetyEvent   No. Observations:                 1183
Model:                          Logit   Df Residuals:                     1179
Method:                           MLE   Df Model:                            3
Date:                Mon, 12 May 2025   Pseudo R-squ.:                 0.02879
Time:                        10:58:09   Log-Likelihood:                -690.03
converged:                       True   LL-Null:                       -710.48
Covariance Type:            nonrobust   LLR p-value:                 6.821e-09
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -1.1321      0.089    -12.696      0.000      -1.307      -0.957
genet

In [55]:
# Refit the logistic model on the `x` / `y` currently in scope (whatever the most
# recently executed regression cell left behind) and show per-coefficient p-values.
sm.Logit(y, x).fit().pvalues


Optimization terminated successfully.
         Current function value: 0.583285
         Iterations 5


const                 6.193024e-37
geneticConstraint     2.762529e-02
pleiotropy            1.884337e-06
tissueDistribution    1.018129e-03
dtype: float64