In [1]:
import pyspark.sql.functions as f
import statsmodels.api as sm
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio

from gentropy.common.session import Session
from gentropy.dataset.colocalisation import Colocalisation
from gentropy.dataset.l2g_prediction import L2GPrediction
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus

pio.renderers.default = 'vscode'



IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [2]:
session = Session(
    extended_spark_conf={
        'spark.executor.memory':'10g',
        'spark.driver.memory':'10g'}
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/24 13:23:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [42]:
target = session.spark.read.parquet("/users/dc16/data/releases/25.03/target")
l2g = L2GPrediction.from_parquet(
    session, "/users/dc16/data/releases/25.03/l2g_prediction"
).df.select("studyLocusId", "geneId", "score")
cred_sets = StudyLocus.from_parquet(
    session, "/users/dc16/data/releases/25.03/credible_set"
)
studies = (
    session.spark.read.parquet(
        "/users/dc16/data/gentropy_paper/gwas_study_index_therapeutic_areas"
    )
    .select(
        "studyId",
        "traitFromSource",
        "traitFromSourceMappedIds",
        "nCases",
        "nControls",
        "nSamples",
        "Haematology",
        "Metabolic",
        "Congenital",
        "Signs/symptoms",
        "Neurology",
        "Immune",
        "Psychiatry",
        "Dermatology",
        "Ophthalmology",
        "Cardiovascular",
        "Oncology",
        "Respiratory",
        "Digestive",
        "Endocrine",
        "Musculoskeletal",
        "Infection",
        "Measurement",
        f.col("bianry_less_cases").alias("binary_less_cases"),
        "Other",
    )
    .filter(f.col("Measurement") == 0)
    .filter(f.col("bianry_less_cases") == 1)
    .filter(f.col('Oncology') == 0)
    .drop('Measurement','binary_less_cases', 'Oncology')
)
tissue_specificity = (
    session.spark.read.parquet('gs://open-targets-data-releases/25.03/output/target_prioritisation')
    .select('targetId', 'tissueSpecificity', 'tissueDistribution')
)


In [4]:
l2g_signif = (
    l2g.filter(f.col("score") >= 0.5)
    .join(
        cred_sets.df.select(
            "studyLocusId",
            "variantId",
            "studyId",
            "beta",
            "pValueMantissa",
            "pValueExponent",
        ),
        "studyLocusId",
        "inner",
    )
    .join(studies, "studyId", "inner")
)


In [5]:
(
    l2g_signif
    .agg(
        f.sum('Haematology').alias('Haematology'),
        f.sum('Metabolic').alias('Metabolic'),
        f.sum('Congenital').alias('Congenital'),
        f.sum('Signs/symptoms').alias('Signs/symptoms'),
        f.sum('Neurology').alias('Neurology'),
        f.sum('Immune').alias('Immune'),
        f.sum('Psychiatry').alias('Psychiatry'),
        f.sum('Dermatology').alias('Dermatology'),
        f.sum('Ophthalmology').alias('Ophthalmology'),
        f.sum('Cardiovascular').alias('Cardiovascular'),
        f.sum('Respiratory').alias('Respiratory'),
        f.sum('Digestive').alias('Digestive'),
        f.sum('Endocrine').alias('Endocrine'),
        f.sum('Musculoskeletal').alias('Musculoskeletal'),
        f.sum('Infection').alias('Infection'),
        f.sum('Other').alias('Other'),
    )
    .show(vertical=True)
)

                                                                                

-RECORD 0---------------
 Haematology     | 394  
 Metabolic       | 5861 
 Congenital      | 5480 
 Signs/symptoms  | 265  
 Neurology       | 6677 
 Immune          | 6416 
 Psychiatry      | 4039 
 Dermatology     | 1535 
 Ophthalmology   | 2628 
 Cardiovascular  | 7205 
 Respiratory     | 3167 
 Digestive       | 3703 
 Endocrine       | 5644 
 Musculoskeletal | 5287 
 Infection       | 719  
 Other           | 2919 



In [6]:
studies_per_therapeutic_area = (
    studies
    .filter(f.col("Measurement") == 0)
    .filter(f.col("bianry_less_cases") == 1)
    .agg(
        f.sum('Haematology').alias('Haematology'),
        f.sum('Metabolic').alias('Metabolic'),
        f.sum('Congenital').alias('Congenital'),
        f.sum('Signs/symptoms').alias('Signs/symptoms'),
        f.sum('Neurology').alias('Neurology'),
        f.sum('Immune').alias('Immune'),
        f.sum('Psychiatry').alias('Psychiatry'),
        f.sum('Dermatology').alias('Dermatology'),
        f.sum('Ophthalmology').alias('Ophthalmology'),
        f.sum('Cardiovascular').alias('Cardiovascular'),
        f.sum('Respiratory').alias('Respiratory'),
        f.sum('Digestive').alias('Digestive'),
        f.sum('Endocrine').alias('Endocrine'),
        f.sum('Musculoskeletal').alias('Musculoskeletal'),
        f.sum('Infection').alias('Infection'),
        f.sum('Other').alias('Other'),
    )
)

In [47]:
genes_therapeutic_areas = (
    l2g_signif.groupBy("geneId")
    .agg(
        f.sum("Haematology").alias("Haematology"),
        f.sum("Metabolic").alias("Metabolic"),
        f.sum("Congenital").alias("Congenital"),
        f.sum("Signs/symptoms").alias("Signs/symptoms"),
        f.sum("Neurology").alias("Neurology"),
        f.sum("Immune").alias("Immune"),
        f.sum("Psychiatry").alias("Psychiatry"),
        f.sum("Dermatology").alias("Dermatology"),
        f.sum("Ophthalmology").alias("Ophthalmology"),
        f.sum("Cardiovascular").alias("Cardiovascular"),
        f.sum("Respiratory").alias("Respiratory"),
        f.sum("Digestive").alias("Digestive"),
        f.sum("Endocrine").alias("Endocrine"),
        f.sum("Musculoskeletal").alias("Musculoskeletal"),
        f.sum("Infection").alias("Infection"),
        f.sum("Other").alias("Other"),
    )
    .withColumn(
        "total",
        f.col("Haematology")
        + f.col("Metabolic")
        + f.col("Congenital")
        + f.col("Signs/symptoms")
        + f.col("Neurology")
        + f.col("Immune")
        + f.col("Psychiatry")
        + f.col("Dermatology")
        + f.col("Ophthalmology")
        + f.col("Cardiovascular")
        + f.col("Respiratory")
        + f.col("Digestive")
        + f.col("Endocrine")
        + f.col("Musculoskeletal")
        + f.col("Infection")
        + f.col("Other"),
    )
    .withColumns(
        {
            "haematology_proportion": f.col("Haematology")
            / studies_per_therapeutic_area.collect()[0]["Haematology"],
            "metabolic_proportion": f.col("Metabolic")
            / studies_per_therapeutic_area.collect()[0]["Metabolic"],
            "congenital_proportion": f.col("Congenital")
            / studies_per_therapeutic_area.collect()[0]["Congenital"],
            "signs_symptoms_proportion": f.col("Signs/symptoms")
            / studies_per_therapeutic_area.collect()[0]["Signs/symptoms"],
            "neurology_proportion": f.col("Neurology")
            / studies_per_therapeutic_area.collect()[0]["Neurology"],
            "immune_proportion": f.col("Immune")
            / studies_per_therapeutic_area.collect()[0]["Immune"],
            "psychiatry_proportion": f.col("Psychiatry")
            / studies_per_therapeutic_area.collect()[0]["Psychiatry"],
            "dermatology_proportion": f.col("Dermatology")
            / studies_per_therapeutic_area.collect()[0]["Dermatology"],
            "ophthalmology_proportion": f.col("Ophthalmology")
            / studies_per_therapeutic_area.collect()[0]["Ophthalmology"],
            "cardiovascular_proportion": f.col("Cardiovascular")
            / studies_per_therapeutic_area.collect()[0]["Cardiovascular"],
            "respiratory_proportion": f.col("Respiratory")
            / studies_per_therapeutic_area.collect()[0]["Respiratory"],
            "digestive_proportion": f.col("Digestive")
            / studies_per_therapeutic_area.collect()[0]["Digestive"],
            "endocrine_proportion": f.col("Endocrine")
            / studies_per_therapeutic_area.collect()[0]["Endocrine"],
            "musculoskeletal_proportion": f.col("Musculoskeletal")
            / studies_per_therapeutic_area.collect()[0]["Musculoskeletal"],
            "infection_proportion": f.col("Infection")
            / studies_per_therapeutic_area.collect()[0]["Infection"],
            "other_proportion": f.col("Other")
            / studies_per_therapeutic_area.collect()[0]["Other"],
        }
    )
    .join(
        target.withColumns(
            {
                "lofConstraint": f.filter(
                    "constraint", lambda x: x.constraintType == "lof"
                )[0].oeUpper,
                "misConstraint": f.filter(
                    "constraint", lambda x: x.constraintType == "mis"
                )[0].score,
                "synConstraint": f.filter(
                    "constraint", lambda x: x.constraintType == "syn"
                )[0].score,
            }
        ).select(
            "id",
            "approvedSymbol",
            "biotype",
            "genomicLocation.chromosome",
            "genomicLocation.start",
            "genomicLocation.end",
            "lofConstraint",
            "misConstraint",
            "synConstraint",
        ),
        target["id"] == f.col("geneId"),
        "inner",
    )
    .drop(
        "id",
    )
    .withColumn(
        "pleiotropy",
        (
            f.when(f.col("Haematology") > 0, 1).otherwise(0)
            + f.when(f.col("Metabolic") > 0, 1).otherwise(0)
            + f.when(f.col("Congenital") > 0, 1).otherwise(0)
            + f.when(f.col("Signs/symptoms") > 0, 1).otherwise(0)
            + f.when(f.col("Neurology") > 0, 1).otherwise(0)
            + f.when(f.col("Immune") > 0, 1).otherwise(0)
            + f.when(f.col("Psychiatry") > 0, 1).otherwise(0)
            + f.when(f.col("Dermatology") > 0, 1).otherwise(0)
            + f.when(f.col("Ophthalmology") > 0, 1).otherwise(0)
            + f.when(f.col("Cardiovascular") > 0, 1).otherwise(0)
            + f.when(f.col("Respiratory") > 0, 1).otherwise(0)
            + f.when(f.col("Digestive") > 0, 1).otherwise(0)
            + f.when(f.col("Endocrine") > 0, 1).otherwise(0)
            + f.when(f.col("Musculoskeletal") > 0, 1).otherwise(0)
            + f.when(f.col("Infection") > 0, 1).otherwise(0)
            + f.when(f.col("Other") > 0, 1).otherwise(0)
        )
        / 16,
    )
    .join(tissue_specificity, f.col('targetId') == f.col('geneId'), 'inner')
    .drop('targetId') 
    .sort(f.desc("total"))
)

In [48]:
genes_therapeutic_areas.show()



+---------------+-----------+---------+----------+--------------+---------+------+----------+-----------+-------------+--------------+-----------+---------+---------+---------------+---------+-----+-----+----------------------+--------------------+---------------------+-------------------------+--------------------+--------------------+---------------------+----------------------+------------------------+-------------------------+----------------------+--------------------+--------------------+--------------------------+--------------------+--------------------+--------------+--------------+----------+---------+---------+-------------+-------------+-------------+----------+-----------------+------------------+
|         geneId|Haematology|Metabolic|Congenital|Signs/symptoms|Neurology|Immune|Psychiatry|Dermatology|Ophthalmology|Cardiovascular|Respiratory|Digestive|Endocrine|Musculoskeletal|Infection|Other|total|haematology_proportion|metabolic_proportion|congenital_proportion|signs_symp

                                                                                

In [56]:
genes_therapeutic_areas.write.parquet('gs://genetics-portal-dev-analysis/dc16/output/pleiotropy_genes_therapeutic_areas', mode='overwrite')

                                                                                

In [50]:
print('Number of protein-coding genes with constraint score assigned to a therapeutic area:', genes_therapeutic_areas.count())
genes_therapeutic_areas.agg(f.mean('lofConstraint'), f.mean('misConstraint'), f.mean('synConstraint')).show()


                                                                                

Number of protein-coding genes with constraint score assigned to a therapeutic area: 6502


                                                                                

+------------------+------------------+-------------------+
|avg(lofConstraint)|avg(misConstraint)| avg(synConstraint)|
+------------------+------------------+-------------------+
|0.7903414093857588|0.9481722309148974|-0.2910159379832186|
+------------------+------------------+-------------------+



In [10]:
genes_not_assigned =(
    target
    .filter(~f.col('id').isin([row.geneId for row in genes_therapeutic_areas.select('geneId').collect()]))
    .filter(f.col('biotype') == 'protein_coding')
    .withColumns(
            {
                "lofConstraint": f.filter(
                    "constraint", lambda x: x.constraintType == "lof"
                )[0].oeUpper,
                "misConstraint": f.filter(
                    "constraint", lambda x: x.constraintType == "mis"
                )[0].score,
                "synConstraint": f.filter(
                    "constraint", lambda x: x.constraintType == "syn"
                )[0].score,
            }
        )
    .select(
        'id',
        'approvedSymbol',
        'lofConstraint',
        'misConstraint', 
        'synConstraint',
    )
)
print("Number of protein-coding genes with a loss-of-function constraint score not assigned to a therapeutic area:", genes_not_assigned.count())
genes_not_assigned.agg(f.mean('lofConstraint'), f.mean('misConstraint'), f.mean('synConstraint')).show()

                                                                                

Number of protein-coding genes with a loss-of-function constraint score not assigned to a therapeutic area: 13626
+------------------+------------------+--------------------+
|avg(lofConstraint)|avg(misConstraint)|  avg(synConstraint)|
+------------------+------------------+--------------------+
|0.9931970192489402|0.7129619251015237|-0.17208502554732985|
+------------------+------------------+--------------------+



In [54]:
genes_pd = genes_therapeutic_areas.toPandas()

                                                                                

In [103]:
genes_pd

Unnamed: 0,geneId,Haematology,Metabolic,Congenital,Signs/symptoms,Neurology,Immune,Psychiatry,Dermatology,Ophthalmology,...,biotype,chromosome,start,end,lofConstraint,misConstraint,synConstraint,pleiotropy,tissueSpecificity,tissueDistribution
0,ENSG00000147883,0,73,5,0,38,2,4,1,49,...,protein_coding,9,22002903,22009305,1.833,-0.69508,-0.476190,0.7500,0.50,0.0
1,ENSG00000130203,0,33,29,4,77,1,60,1,12,...,protein_coding,19,44905791,44909393,1.034,0.74028,1.038000,0.9375,0.75,-1.0
2,ENSG00000164093,0,0,26,0,38,24,2,0,9,...,protein_coding,4,110617423,110642123,0.260,1.55020,-0.667880,0.6250,0.50,0.0
3,ENSG00000134242,3,32,13,0,11,93,0,0,7,...,protein_coding,1,113813811,113871753,1.089,0.86915,-0.621660,0.6875,0.50,0.0
4,ENSG00000162594,0,2,82,0,2,92,0,13,3,...,protein_coding,1,67138907,67259979,0.606,1.08380,-0.017413,0.6250,0.50,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6497,ENSG00000178381,0,0,0,0,0,0,0,0,0,...,protein_coding,7,1151903,1160759,1.685,-0.94616,-2.226700,0.0625,-1.00,-1.0
6498,ENSG00000177189,0,0,0,0,0,0,0,0,0,...,protein_coding,X,20149911,20267519,0.087,4.52080,1.410900,0.0625,-1.00,-1.0
6499,ENSG00000197555,0,0,0,0,0,0,0,0,0,...,protein_coding,14,71320449,71741229,0.167,2.98150,-0.056802,0.0625,-1.00,-1.0
6500,ENSG00000134504,0,0,0,0,0,0,0,0,0,...,protein_coding,18,26454910,26657401,0.442,1.82400,1.185100,0.0625,-1.00,0.0


In [102]:
fig = px.histogram(genes_pd, x='pleiotropy', histnorm='probability', nbins=16, title='Distribution of Gene Pleiotropy Scores')
fig.show()

In [104]:
print('Min:', genes_pd['pleiotropy'].min())
print('Mean:', genes_pd['pleiotropy'].mean())
print('Median:', genes_pd['pleiotropy'].median())
print('Max:', genes_pd['pleiotropy'].max())

Min: 0.0625
Mean: 0.19942902183943403
Median: 0.125
Max: 1.0


In [105]:
print('Min:', genes_pd['lofConstraint'].min())
print('Mean:', genes_pd['lofConstraint'].mean())
print('Median:', genes_pd['lofConstraint'].median())
print('Max:', genes_pd['lofConstraint'].max())

Min: 0.03
Mean: 0.7903414
Median: 0.711
Max: 1.995


In [114]:
df = genes_pd[['pleiotropy', 'lofConstraint', 'tissueSpecificity']].dropna()
x = df[['lofConstraint']]
x = sm.add_constant(x)
y = df['pleiotropy']
print(sm.GLM(y, x, family=sm.families.Poisson()).fit().summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             pleiotropy   No. Observations:                 6227
Model:                            GLM   Df Residuals:                     6225
Model Family:                 Poisson   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2795.7
Date:                Thu, 24 Apr 2025   Deviance:                       550.86
Time:                        15:41:46   Pearson chi2:                     631.
No. Iterations:                     4   Pseudo R-squ. (CS):           0.002182
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -1.4360      0.053    -27.059

In [115]:
df = genes_pd[['pleiotropy', 'misConstraint']].dropna()
x = df['misConstraint']  
x = sm.add_constant(x)
y = df['pleiotropy']
print(sm.GLM(y, x, family=sm.families.Poisson()).fit().summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             pleiotropy   No. Observations:                 6347
Model:                            GLM   Df Residuals:                     6345
Model Family:                 Poisson   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2847.8
Date:                Thu, 24 Apr 2025   Deviance:                       570.68
Time:                        15:41:49   Pearson chi2:                     657.
No. Iterations:                     4   Pseudo R-squ. (CS):          0.0003146
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -1.6364      0.034    -47.695

In [116]:
df = genes_pd[['pleiotropy', 'synConstraint']].dropna()
x = df['synConstraint']
x = sm.add_constant(x)
y = df['pleiotropy']
print(sm.GLM(y, x, family=sm.families.Poisson()).fit().summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             pleiotropy   No. Observations:                 6347
Model:                            GLM   Df Residuals:                     6345
Model Family:                 Poisson   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2846.8
Date:                Thu, 24 Apr 2025   Deviance:                       568.63
Time:                        15:41:51   Pearson chi2:                     655.
No. Iterations:                     4   Pseudo R-squ. (CS):          0.0006383
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -1.6217      0.029    -56.218

In [119]:
df = genes_pd[['pleiotropy', 'tissueSpecificity', 'tissueDistribution']].dropna()
x = df[['tissueSpecificity', 'tissueDistribution']]
x = sm.add_constant(x)
y = df['pleiotropy']
print(sm.Poisson(y, x).fit().summary())

Optimization terminated successfully.
         Current function value: 0.448932
         Iterations 3
                          Poisson Regression Results                          
Dep. Variable:             pleiotropy   No. Observations:                 6379
Model:                        Poisson   Df Residuals:                     6376
Method:                           MLE   Df Model:                            2
Date:                Thu, 24 Apr 2025   Pseudo R-squ.:               3.404e-05
Time:                        15:43:42   Log-Likelihood:                -2863.7
converged:                       True   LL-Null:                       -2863.8
Covariance Type:            nonrobust   LLR p-value:                    0.9071
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -1.6097      0.033    -48.295      0.000      -1.675      -1.544
tissu

In [117]:
df = genes_pd[['pleiotropy', 'lofConstraint', 'misConstraint', 'synConstraint', 'tissueSpecificity']].dropna()
x = df[['lofConstraint', 'misConstraint', 'synConstraint', 'tissueSpecificity']]
x = sm.add_constant(x)
y = df['pleiotropy']
print(sm.Poisson(y, x).fit().summary())

Optimization terminated successfully.
         Current function value: 0.448431
         Iterations 5
                          Poisson Regression Results                          
Dep. Variable:             pleiotropy   No. Observations:                 6227
Model:                        Poisson   Df Residuals:                     6222
Method:                           MLE   Df Model:                            4
Date:                Thu, 24 Apr 2025   Pseudo R-squ.:                0.003608
Time:                        15:42:08   Log-Likelihood:                -2792.4
converged:                       True   LL-Null:                       -2802.5
Covariance Type:            nonrobust   LLR p-value:                 0.0004514
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -1.4257      0.088    -16.127      0.000      -1.599      -1.252
lofConst

In [29]:
drugs = (
    session.spark.read.parquet('gs://open-targets-data-releases/25.03/output/drug_molecule')
    .select('id', 'blackBoxWarning', 'maximumClinicalTrialPhase', 'hasBeenWithdrawn', f.explode('linkedTargets.rows').alias('linkedTargets'))
)



In [30]:
target_prioritisation = (
    session.spark.read.parquet(
        "gs://open-targets-data-releases/25.03/output/target_prioritisation"
    )
    .select(
        "targetId",
        "hasSafetyEvent",
        "geneticConstraint",
        "mouseKOScore",
        "maxClinicalTrialPhase",
        "tissueSpecificity",
        "tissueDistribution",
    )
    .withColumn(
        "hasSafetyEvent", f.when(f.col("hasSafetyEvent") == -1.0, 1).otherwise(0)
    )
    .join(
        drugs.select('linkedTargets'),
        drugs["linkedTargets"] == f.col("targetId"),
        'semi'
    )
    .join(
        target.select('id', 'biotype').filter(f.col('biotype') == 'protein_coding'),
        target["id"] == f.col("targetId"),
        'inner'
    )
    .join(
        genes_therapeutic_areas.select("geneId", "pleiotropy"),
        f.col("geneId") == f.col("targetId"),
        "left",
    )
    .fillna({"pleiotropy": 0.0})
    .drop('geneId', 'id', 'biotype')
    .toPandas()
)

                                                                                

In [31]:
target_prioritisation

Unnamed: 0,targetId,hasSafetyEvent,geneticConstraint,mouseKOScore,maxClinicalTrialPhase,tissueSpecificity,tissueDistribution,pleiotropy
0,ENSG00000000938,0,-0.592936,-0.469734,1.00,0.75,0.0,0.0000
1,ENSG00000001626,1,0.457700,-0.964738,1.00,0.50,0.0,0.0000
2,ENSG00000002549,0,-0.017295,,0.50,-1.00,-1.0,0.0000
3,ENSG00000002726,0,0.073870,-0.780918,1.00,0.75,0.0,0.2500
4,ENSG00000003400,1,0.438112,,0.50,-1.00,-1.0,0.0000
...,...,...,...,...,...,...,...,...
1545,ENSG00000278540,1,,-0.583476,0.75,-1.00,-1.0,0.0000
1546,ENSG00000278731,0,,,0.50,,,0.0000
1547,ENSG00000282608,1,,-0.827099,1.00,0.50,0.0,0.0625
1548,ENSG00000292332,0,,,1.00,,,0.0000


In [33]:
df = target_prioritisation[['hasSafetyEvent', 'geneticConstraint', 'pleiotropy', 'tissueSpecificity']].dropna()
x = df[['geneticConstraint', 'pleiotropy', 'tissueSpecificity',]]
x = sm.add_constant(x)
y = df['hasSafetyEvent'].astype(int)
print(sm.Logit(y, x).fit().summary())       

Optimization terminated successfully.
         Current function value: 0.572380
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:         hasSafetyEvent   No. Observations:                 1486
Model:                          Logit   Df Residuals:                     1482
Method:                           MLE   Df Model:                            3
Date:                Thu, 24 Apr 2025   Pseudo R-squ.:                 0.02396
Time:                        13:52:36   Log-Likelihood:                -850.56
converged:                       True   LL-Null:                       -871.44
Covariance Type:            nonrobust   LLR p-value:                 4.507e-09
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -1.2916      0.080    -16.065      0.000      -1.449      -1.134
geneticC

In [34]:
subset = target_prioritisation[target_prioritisation['maxClinicalTrialPhase'] > 0.50]
subset

Unnamed: 0,targetId,hasSafetyEvent,geneticConstraint,mouseKOScore,maxClinicalTrialPhase,tissueSpecificity,tissueDistribution,pleiotropy
0,ENSG00000000938,0,-0.592936,-0.469734,1.00,0.75,0.0,0.0000
1,ENSG00000001626,1,0.457700,-0.964738,1.00,0.50,0.0,0.0000
3,ENSG00000002726,0,0.073870,-0.780918,1.00,0.75,0.0,0.2500
5,ENSG00000003436,0,-0.418108,-0.898656,1.00,0.50,-1.0,0.1250
6,ENSG00000004468,1,0.265368,-0.276253,1.00,0.50,0.0,0.1250
...,...,...,...,...,...,...,...,...
1544,ENSG00000278195,0,,,1.00,0.50,0.5,0.0000
1545,ENSG00000278540,1,,-0.583476,0.75,-1.00,-1.0,0.0000
1547,ENSG00000282608,1,,-0.827099,1.00,0.50,0.0,0.0625
1548,ENSG00000292332,0,,,1.00,,,0.0000


In [51]:
df = subset[['hasSafetyEvent', 'geneticConstraint', 'pleiotropy', 'tissueSpecificity']].dropna()
x = df[['geneticConstraint', 'pleiotropy', 'tissueSpecificity']]
x = sm.add_constant(x)
y = df['hasSafetyEvent'].astype(int)
print(sm.Logit(y, x).fit().summary())  

Optimization terminated successfully.
         Current function value: 0.583233
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:         hasSafetyEvent   No. Observations:                 1183
Model:                          Logit   Df Residuals:                     1179
Method:                           MLE   Df Model:                            3
Date:                Thu, 24 Apr 2025   Pseudo R-squ.:                 0.02888
Time:                        14:07:36   Log-Likelihood:                -689.96
converged:                       True   LL-Null:                       -710.48
Covariance Type:            nonrobust   LLR p-value:                 6.424e-09
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -1.2554      0.091    -13.813      0.000      -1.433      -1.077
geneticC