In [22]:
import pyspark.sql.functions as f
import statsmodels.api as sm
import pandas as pd
import numpy as np

from gentropy.common.session import Session
from gentropy.dataset.colocalisation import Colocalisation
from gentropy.dataset.l2g_prediction import L2GPrediction
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus


In [2]:
# Spark settings passed to the gentropy Session: 10g each for executors and driver.
spark_memory_conf = {
    'spark.executor.memory': '10g',
    'spark.driver.memory': '10g',
}
session = Session(extended_spark_conf=spark_memory_conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/15 14:22:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Target annotations from the 25.03 release (plain Spark DataFrame).
target = session.spark.read.parquet('/users/dc16/data/releases/25.03/target')

# L2G predictions, trimmed to the three columns used downstream.
l2g = L2GPrediction.from_parquet(
    session, "/users/dc16/data/releases/25.03/l2g_prediction"
).df.select("studyLocusId", "geneId", "score")

# Credible sets, kept as a StudyLocus dataset; the Spark frame is `cred_sets.df`.
cred_sets = StudyLocus.from_parquet(
    session, "/users/dc16/data/releases/25.03/credible_set"
)

# GWAS study index with one numeric column per therapeutic area
# (presumably 0/1 membership flags -- TODO confirm against the input table).
studies = (
    session.spark.read.parquet(
        "/users/dc16/data/gentropy_paper/gwas_study_index_therapeutic_areas"
    )
    .select(
        "studyId",
        "traitFromSource",
        "traitFromSourceMappedIds",
        "nCases",
        "nControls",
        "nSamples",
        "Haematology",
        "Metabolic",
        "Congenital",
        "Signs/symptoms",
        "Neurology",
        "Immune",
        "Psychiatry",
        "Dermatology",
        "Ophthalmology",
        "Cardiovascular",
        "Oncology",
        "Respiratory",
        "Digestive",
        "Endocrine",
        "Musculoskeletal",
        "Infection",
        "Measurement",
        # The input parquet misspells this column; expose it under the
        # corrected name from here on.
        f.col("bianry_less_cases").alias("binary_less_cases"),
        "Other",
    )
    # Keep only studies flagged Measurement == 0 ...
    .filter(f.col("Measurement") == 0)
    # ... and binary_less_cases == 1. Filter on the aliased name: the
    # misspelled source name is no longer part of the projected schema and
    # only resolved through Spark's analyzer fallback.
    .filter(f.col("binary_less_cases") == 1)
)


In [4]:
# Credible-set columns attached to every retained L2G prediction.
cred_set_cols = cred_sets.df.select(
    "studyLocusId",
    "variantId",
    "studyId",
    "beta",
    "pValueMantissa",
    "pValueExponent",
)

# Predictions with score >= 0.5, joined to their credible set and then to the
# filtered study index. Both joins are inner, so predictions without a
# matching credible set or study are dropped.
l2g_signif = (
    l2g.filter(f.col("score") >= 0.5)
    .join(cred_set_cols, on="studyLocusId", how="inner")
    .join(studies, on="studyId", how="inner")
)


In [5]:
# Column-wise totals of the therapeutic-area columns over all rows of
# `l2g_signif`, printed one area per line.
summary_area_cols = [
    'Haematology',
    'Metabolic',
    'Congenital',
    'Signs/symptoms',
    'Neurology',
    'Immune',
    'Psychiatry',
    'Dermatology',
    'Ophthalmology',
    'Cardiovascular',
    'Oncology',
    'Respiratory',
    'Digestive',
    'Endocrine',
    'Musculoskeletal',
    'Infection',
    'Other',
]
l2g_signif.agg(
    *[f.sum(area).alias(area) for area in summary_area_cols]
).show(vertical=True)

                                                                                

-RECORD 0---------------
 Haematology     | 394  
 Metabolic       | 5861 
 Congenital      | 5480 
 Signs/symptoms  | 265  
 Neurology       | 6677 
 Immune          | 6416 
 Psychiatry      | 4039 
 Dermatology     | 1535 
 Ophthalmology   | 2628 
 Cardiovascular  | 7205 
 Oncology        | 7176 
 Respiratory     | 3167 
 Digestive       | 3703 
 Endocrine       | 5644 
 Musculoskeletal | 5287 
 Infection       | 719  
 Other           | 2919 



In [6]:
# Single-row Spark DataFrame holding, for each therapeutic area, the sum of
# that area's column over `studies`.
#
# `studies` is already restricted to Measurement == 0 and the
# binary-less-cases flag == 1 in the cell that defines it, so those filters
# are not repeated here. The previous copy also filtered on the misspelled
# source column name, which is not part of the schema of `studies`.
study_area_cols = [
    'Haematology',
    'Metabolic',
    'Congenital',
    'Signs/symptoms',
    'Neurology',
    'Immune',
    'Psychiatry',
    'Dermatology',
    'Ophthalmology',
    'Cardiovascular',
    'Oncology',
    'Respiratory',
    'Digestive',
    'Endocrine',
    'Musculoskeletal',
    'Infection',
    'Other',
]
studies_per_therapeutic_area = studies.agg(
    *[f.sum(area).alias(area) for area in study_area_cols]
)

In [7]:
# Therapeutic-area columns used by every aggregation in this cell. The order
# fixes the order of the generated output columns.
therapeutic_areas = [
    "Haematology",
    "Metabolic",
    "Congenital",
    "Signs/symptoms",
    "Neurology",
    "Immune",
    "Psychiatry",
    "Dermatology",
    "Ophthalmology",
    "Cardiovascular",
    "Oncology",
    "Respiratory",
    "Digestive",
    "Endocrine",
    "Musculoskeletal",
    "Infection",
    "Other",
]

# Materialise the per-area study totals ONCE. `.collect()` is a Spark action,
# so calling it per area (as before) launched 17 separate jobs for one row.
study_totals = studies_per_therapeutic_area.collect()[0]

# Per-area count columns and "gene has at least one hit in this area" flags.
area_count_cols = [f.col(area) for area in therapeutic_areas]
area_hit_flags = [
    f.when(f.col(area) > 0, 1).otherwise(0) for area in therapeutic_areas
]

# One row per target and constraint entry, restricted to the "lof" constraint
# type. `explode` drops targets whose `constraint` array is null or empty.
lof_constraint = (
    target.withColumn("constraint", f.explode("constraint"))
    .select(
        "id",
        "approvedSymbol",
        "biotype",
        "genomicLocation.chromosome",
        "genomicLocation.start",
        "genomicLocation.end",
        "constraint.constraintType",
        "constraint.score",
    )
    .filter(f.col("constraintType") == "lof")
)

genes_therapeutic_areas = (
    l2g_signif.groupBy("geneId")
    # Per-gene sum of each therapeutic-area column.
    .agg(*[f.sum(area).alias(area) for area in therapeutic_areas])
    # Sum of the per-area sums (same left-to-right addition as before).
    .withColumn("total", sum(area_count_cols[1:], area_count_cols[0]))
    # "<area>_proportion": per-gene area sum divided by the area-wide total
    # over `studies`. Names are the lower-cased area with "/" replaced by "_"
    # (e.g. "Signs/symptoms" -> "signs_symptoms_proportion").
    .withColumns(
        {
            f"{area.lower().replace('/', '_')}_proportion": f.col(area)
            / study_totals[area]
            for area in therapeutic_areas
        }
    )
    # Inner join: genes without a "lof" constraint row are dropped.
    .join(
        lof_constraint,
        target["id"] == f.col("geneId"),
        "inner",
    )
    .drop("id", "constraintType")
    .withColumnRenamed("score", "lofConstraint")
    # Fraction of therapeutic areas in which the gene has at least one hit.
    .withColumn(
        'pleiotropy',
        sum(area_hit_flags[1:], area_hit_flags[0]) / len(therapeutic_areas),
    )
    .sort(f.desc("total"))
)

In [8]:
# Persist the per-gene table as parquet; mode='overwrite' replaces any
# existing output at this path.
genes_therapeutic_areas.write.parquet(
    'gs://genetics-portal-dev-analysis/dc16/output/pleiotropy_genes_therapeutic_areas',
    mode='overwrite',
)

25/04/15 14:22:22 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-google-hadoop-file-system.properties,hadoop-metrics2.properties
25/04/15 14:22:23 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [67]:
# Drug molecules with one row per linked target id (exploded from
# `linkedTargets.rows`).
drugs = session.spark.read.parquet(
    'gs://open-targets-data-releases/25.03/output/drug_molecule'
).select(
    'id',
    'blackBoxWarning',
    'maximumClinicalTrialPhase',
    'hasBeenWithdrawn',
    f.explode('linkedTargets.rows').alias('linkedTargets'),
)

# Keep only drugs whose maximum clinical trial phase is 3 or 4.
is_phase_3_or_4 = f.col('maximumClinicalTrialPhase').isin([3.0, 4.0])

# One row per (gene, drug) pair: the gene table is repeated for every drug
# that lists the gene among its linked targets.
genes_drugs = (
    genes_therapeutic_areas.join(
        drugs,
        genes_therapeutic_areas['geneId'] == drugs['linkedTargets'],
        'inner',
    )
    .drop('linkedTargets')
    .filter(is_phase_3_or_4)
)

In [70]:
# Bring both tables to pandas for modelling, keeping only rows that have a
# non-null LoF constraint value.
has_lof_constraint = f.col('lofConstraint').isNotNull()
genes_pd = genes_therapeutic_areas.filter(has_lof_constraint).toPandas()
genes_drugs_pd = genes_drugs.filter(has_lof_constraint).toPandas()

                                                                                

In [71]:
# Display the per-gene pandas table (rows and columns are elided in the
# rendered output).
genes_pd

Unnamed: 0,geneId,Haematology,Metabolic,Congenital,Signs/symptoms,Neurology,Immune,Psychiatry,Dermatology,Ophthalmology,...,musculoskeletal_proportion,infection_proportion,other_proportion,approvedSymbol,biotype,chromosome,start,end,lofConstraint,pleiotropy
0,ENSG00000147883,0,73,5,0,38,2,4,1,49,...,0.000000,0.000000,0.006369,CDKN2B,protein_coding,9,22002903,22009305,7.336900e-03,0.764706
1,ENSG00000130203,0,33,29,4,77,1,60,1,12,...,0.005056,0.011994,0.003017,APOE,protein_coding,19,44905791,44909393,1.868500e-03,0.941176
2,ENSG00000164093,0,0,26,0,38,24,2,0,9,...,0.003933,0.000000,0.002682,PITX2,protein_coding,4,110617423,110642123,9.761800e-01,0.647059
3,ENSG00000134242,3,32,13,0,11,93,0,0,7,...,0.028652,0.000000,0.001006,PTPN22,protein_coding,1,113813811,113871753,3.492100e-21,0.705882
4,ENSG00000162594,0,2,82,0,2,92,0,13,3,...,0.006180,0.002999,0.000000,IL23R,protein_coding,1,67138907,67259979,5.596100e-04,0.647059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6794,ENSG00000111145,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,ELK3,protein_coding,12,96194375,96269824,8.350800e-02,0.058824
6795,ENSG00000141905,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,NFIC,protein_coding,19,3314403,3469217,9.053400e-02,0.058824
6796,ENSG00000177189,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,RPS6KA3,protein_coding,X,20149911,20267519,9.999900e-01,0.058824
6797,ENSG00000123094,0,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,RASSF8,protein_coding,12,25958232,26079892,9.415800e-01,0.058824


In [30]:
# OLS: lofConstraint regressed on pleiotropy plus an intercept term.
y = genes_pd['lofConstraint']
x = sm.add_constant(genes_pd['pleiotropy'])
print(sm.OLS(y, x).fit().summary())

                            OLS Regression Results                            
Dep. Variable:          lofConstraint   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     150.1
Date:                Tue, 15 Apr 2025   Prob (F-statistic):           3.80e-34
Time:                        14:56:15   Log-Likelihood:                -3602.7
No. Observations:                6799   AIC:                             7209.
Df Residuals:                    6797   BIC:                             7223.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2220      0.008     26.877      0.0

In [36]:
# OLS: lofConstraint regressed on the 17 per-area proportion columns plus an
# intercept term.
proportion_cols = [
    'haematology_proportion',
    'metabolic_proportion',
    'congenital_proportion',
    'signs_symptoms_proportion',
    'neurology_proportion',
    'immune_proportion',
    'psychiatry_proportion',
    'dermatology_proportion',
    'ophthalmology_proportion',
    'cardiovascular_proportion',
    'oncology_proportion',
    'respiratory_proportion',
    'digestive_proportion',
    'endocrine_proportion',
    'musculoskeletal_proportion',
    'infection_proportion',
    'other_proportion',
]
y = genes_pd['lofConstraint']
x = sm.add_constant(genes_pd[proportion_cols])
print(sm.OLS(y, x).fit().summary())

                            OLS Regression Results                            
Dep. Variable:          lofConstraint   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.012
Method:                 Least Squares   F-statistic:                     5.994
Date:                Tue, 15 Apr 2025   Prob (F-statistic):           5.20e-14
Time:                        15:07:25   Log-Likelihood:                -3626.3
No. Observations:                6799   AIC:                             7289.
Df Residuals:                    6781   BIC:                             7411.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               

In [79]:
# Logistic regression: blackBoxWarning (cast to 0/1) on pleiotropy and
# lofConstraint plus an intercept term. Rows are (gene, drug) pairs.
y = genes_drugs_pd['blackBoxWarning'].astype(int)
x = sm.add_constant(genes_drugs_pd[['pleiotropy', 'lofConstraint']])
print(sm.Logit(y, x).fit().summary())

Optimization terminated successfully.
         Current function value: 0.544019
         Iterations 5
                           Logit Regression Results                           
Model:                          Logit   Df Residuals:                     5585
Method:                           MLE   Df Model:                            2
Date:                Tue, 15 Apr 2025   Pseudo R-squ.:                0.004109
Time:                        18:32:15   Log-Likelihood:                -3040.0
converged:                       True   LL-Null:                       -3052.5
Covariance Type:            nonrobust   LLR p-value:                 3.576e-06
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -1.0710      0.059    -18.114      0.000      -1.187      -0.955
pleiotropy       -1.0159      0.228     -4.456      0.000      -1.463      -0.569
lofConstraint    

In [78]:
# Logistic regression: hasBeenWithdrawn (cast to 0/1) on pleiotropy and
# lofConstraint plus an intercept term. Rows are (gene, drug) pairs.
y = genes_drugs_pd['hasBeenWithdrawn'].astype(int)
x = sm.add_constant(genes_drugs_pd[['pleiotropy', 'lofConstraint']])
print(sm.Logit(y, x).fit().summary())

Optimization terminated successfully.
         Current function value: 0.239517
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:       hasBeenWithdrawn   No. Observations:                 5588
Model:                          Logit   Df Residuals:                     5585
Method:                           MLE   Df Model:                            2
Date:                Tue, 15 Apr 2025   Pseudo R-squ.:                 0.01346
Time:                        18:31:15   Log-Likelihood:                -1338.4
converged:                       True   LL-Null:                       -1356.7
Covariance Type:            nonrobust   LLR p-value:                 1.181e-08
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -2.3504      0.104    -22.517      0.000      -2.555      -2.146
pleiotropy       -2.