In [24]:
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import types as T
import plotly.express as px

# Initialise Spark

spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.4")\
    .getOrCreate()

In [25]:
# Prepare input file from chembl evidence

evd = (
    spark.read.json('data/cttv008-20-01-2022.json.gz')
)

In [26]:
studies = (
        evd

        # Extract studies with their reasons to stop
        .filter(F.col('studyStopReason').isNotNull())
        .withColumn('urls', F.explode('urls'))
        .filter(F.col('urls.niceName').contains('ClinicalTrials'))
        .withColumn('nct_id', F.element_at(F.split(F.col('urls.url'), '%22'), -2))
        .select('nct_id', F.col('studyStopReason').alias('why_stopped'))
        .distinct()

        # Convert to Pandas DF
        #.toPandas()
    )

studies.show()

+-----------+--------------------+
|     nct_id|         why_stopped|
+-----------+--------------------+
|NCT01286974|Terminated early,...|
|NCT00318474|DSMB recommended ...|
|NCT01328093|The decision to s...|
|NCT00072189|Early termination...|
|NCT03119623|Lost funding prio...|
|NCT03315052|Delay in IRB appr...|
|NCT00895297|Slow recruitment....|
|NCT00383331|Trial was stopped...|
|NCT00421317|Study stopped at ...|
|NCT00880373|The funding withd...|
|NCT02389764|        Slow Accrual|
|NCT00296959|early termination...|
|NCT01197235|difficult to enro...|
|NCT02677948|FDA has placed al...|
|NCT00509366|Study terminated ...|
|NCT00447369|Because we did no...|
|NCT02811159|   Business decision|
|NCT02715804|    Sponsor decision|
|NCT01446575|This study was re...|
|NCT02179151|   FDA Clinical Hold|
+-----------+--------------------+
only showing top 20 rows



In [78]:
from pyspark.sql import DataFrame
from typing import Iterable
import pyspark.sql.types as t
import pyspark.sql.functions as f

def flatten(schema, prefix=None):
    """Required to flatten the schema."""
    fields = []
    for field in schema.fields:
        name = prefix + '.' + field.name if prefix else field.name
        dtype = field.dataType
        if isinstance(dtype, t.ArrayType):
            dtype = dtype.elementType
        if isinstance(dtype, t.StructType):
            fields += flatten(dtype, prefix=name)
        else:
            fields.append(name)
    return fields

def melt(
        df: DataFrame,
        id_vars: Iterable[str],
        value_vars: Iterable[str],
        var_name: str = 'variable',
        value_name: str = 'value'
) -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format."""

    # Create array<struct<variable: str, value: ...>>
    _vars_and_vals = f.array(*(
        f.struct(f.lit(c).alias(var_name), f.col(c).alias(value_name))
        for c in value_vars
    ))

    # Add to the DataFrame and explode
    _tmp = df.withColumn('_vars_and_vals', f.explode(_vars_and_vals))

    cols = list(id_vars) + [f.col('_vars_and_vals')[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)

def evidence_distinct_fields_count(
        df: DataFrame,
        var_name: str
) -> DataFrame:
    """Count unique values in variable (e.g. targetId) and datasource."""

    # flatten dataframe schema
    flat_df = df.select([f.col(c).alias(c) for c in flatten(df.schema)])
    # Unique counts per column field
    exprs = [f.countDistinct(f.col(field.name)).alias(field.name)
             for field in list(filter(lambda x: x.name != 'datasourceId',
                                      flat_df.schema))]
    out = df.groupBy(f.col('datasourceId')).agg(*exprs)
    # Clean column names
    out_cleaned = out.toDF(*(c.replace('.', '_') for c in out.columns))
    cols = [c.name for c in filter(lambda x: x.name != 'datasourceId',
                                   out_cleaned.schema.fields)]
    melted = melt(out_cleaned,
                  id_vars=['datasourceId'],
                  var_name='field',
                  value_vars=cols,
                  value_name='value')
    melted = melted.withColumn('variable', f.lit(var_name))
    return melted

def get_columns_to_report(dataset_columns):
    return [
        'datasourceId',
        'targetFromSourceId',
        'diseaseFromSourceMappedId' if 'diseaseFromSourceMappedId' in dataset_columns else 'diseaseFromSourceId',
        'drugId',
        'urls',
        #'variantId',
        #'literature'
    ]

columns_to_report = get_columns_to_report(evd_w_predictions.columns)

In [81]:
evidence_distinct_fields_count(evd_w_predictions.select(columns_to_report),
                                           'evidenceDistinctFieldsCountByDatasource').show()

+------------+--------------------+-----+--------------------+
|datasourceId|               field|value|            variable|
+------------+--------------------+-----+--------------------+
|      chembl|  targetFromSourceId| 1418|evidenceDistinctF...|
|      chembl|diseaseFromSource...| 2447|evidenceDistinctF...|
|      chembl|              drugId| 3756|evidenceDistinctF...|
|      chembl|       urls_niceName|    4|evidenceDistinctF...|
|      chembl|            urls_url|87160|evidenceDistinctF...|
+------------+--------------------+-----+--------------------+



In [None]:
evd_w_predictions.withColumn('s', F.filter((F.col('urls.niceName').contains('ClinicalTrials')) & ())

In [None]:
def evidence_distinct_clinical_trials(df:DataFrame):

    df
    .withColumn('urls', F.explode('urls'))
    .filter(F.col('urls.niceName').contains('ClinicalTrials'))
    .withColumn('nct_id', F.element_at(F.split(F.col('urls.url'), '%22'), -2))

In [79]:
#evd_w_predictions.filter(F.col('datasourceId') == 'chembl')

flat_df = evd_w_predictions.select([F.col(c).alias(c) for c in flatten(evd_w_predictions.select(*columns_to_report).schema)])

exprs = [f.countDistinct(f.col(field.name)).alias(field.name)
            for field in list(filter(lambda x: x.name != 'datasourceId',
                                    flat_df.schema))]

out = evd_w_predictions.select(*columns_to_report).groupBy(f.col('datasourceId')).agg(*exprs)

out.show()


+------------+------------------+-------------------------+------+-------------+--------+
|datasourceId|targetFromSourceId|diseaseFromSourceMappedId|drugId|urls.niceName|urls.url|
+------------+------------------+-------------------------+------+-------------+--------+
|      chembl|              1418|                     2447|  3756|            4|   87160|
+------------+------------------+-------------------------+------+-------------+--------+



In [135]:
evd_w_predictions.first()

Row(clinicalPhase=3, clinicalStatus='Completed', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='Diabetes Mellitus, Type 2', diseaseFromSourceMappedId='EFO_0001360', drugId='CHEMBL1431', studyStartDate='2006-03-01', studyStopReason=None, targetFromSource='CHEMBL2363065', targetFromSourceId='O43920', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT00295633%22')], studyStopReasonSubclasses=None, studyStopReasonSuperclasses=None, classes_count='0')

In [27]:
# Export table to make predictions
# studies.coalesce(1).write.csv('data/studies_latest.tsv', sep='\t', header=True)

In [28]:
# Load results from predict.py

schema = T.ArrayType(T.StringType())

predictions = (
    spark.read.csv('data/chembl_predictions-2022-01-24.tsv', sep='\t', header=True)

    # Lists are represented as strings. They must be converted
    .withColumn("subclasses", F.from_json(F.regexp_replace(F.col('subclasses'), "(u')", "'"), schema=schema))
    .withColumn("superclasses", F.from_json(F.regexp_replace(F.col('superclasses'), "(u')", "'"), schema=schema))
)

predictions.show(10, False, True)


-RECORD 0-----------------------------------------------------------------------------------
 why_stopped  | Health restrictions due to the COVID-19 pandemic                            
 subclasses   | [Covid19]                                                                   
 superclasses | [Neutral]                                                                   
-RECORD 1-----------------------------------------------------------------------------------
 why_stopped  | Study withdrawn due to logistical and safety concerns relating to Covid-19. 
 subclasses   | [Covid19]                                                                   
 superclasses | [Neutral]                                                                   
-RECORD 2-----------------------------------------------------------------------------------
 why_stopped  | SARS-CoV-2 Pandemic                                                         
 subclasses   | [Covid19]                                             

In [29]:
predictions.filter(F.col('why_stopped').isNotNull()).filter(F.col('subclasses').isNull()).count()

3213

In [30]:
"""
This does not work because I end up with a combination of each subclass with their superclass
I want to keep the one subclass to one superclass mapping

predictions_exploded = (
    predictions.withColumn("subclass", F.explode('subclass_raw'))
    .withColumn("superclass", F.explode('superclass_raw'))
    .drop('subclass_raw', 'superclass_raw')
)
"""

predictions_exploded = (
    predictions
    .withColumn('tmp', F.arrays_zip('subclasses', 'superclasses'))
    .withColumn('tmp', F.explode('tmp'))
    .select('why_stopped', F.col('tmp.subclasses').alias('subclass'), F.col('tmp.superclasses').alias('superclass'))
)

predictions_exploded.show(5, False, True)



-RECORD 0----------------------------------------------------------------------------------
 why_stopped | Health restrictions due to the COVID-19 pandemic                            
 subclass    | Covid19                                                                     
 superclass  | Neutral                                                                     
-RECORD 1----------------------------------------------------------------------------------
 why_stopped | Study withdrawn due to logistical and safety concerns relating to Covid-19. 
 subclass    | Covid19                                                                     
 superclass  | Neutral                                                                     
-RECORD 2----------------------------------------------------------------------------------
 why_stopped | SARS-CoV-2 Pandemic                                                         
 subclass    | Covid19                                                          

In [31]:
print('Broader class distribution:', predictions_exploded.groupBy('superclass').count().orderBy(F.col('count').desc()).show())
print('\n')
print('Finer class distribution:', predictions_exploded.groupBy('subclass').count().orderBy(F.col('count').desc()).show(truncate=False))

+------------------+-----+
|        superclass|count|
+------------------+-----+
|           Neutral|22983|
| Possibly_Negative| 7294|
|    Invalid_Reason| 3497|
|          Negative| 3178|
|Safety_Sideeffects|  937|
|           Success|  400|
+------------------+-----+

Broader class distribution: None


+-----------------------+-----+
|subclass               |count|
+-----------------------+-----+
|Insufficient_Enrollment|10768|
|Business_Administrative|7294 |
|Negative               |3178 |
|Logistics_Resources    |2720 |
|Study_Design           |2524 |
|Invalid_Reason         |2369 |
|Study_Staff_Moved      |1710 |
|Covid19                |1617 |
|Another_Study          |1335 |
|No_Context             |1128 |
|Safety_Sideeffects     |937  |
|Regulatory             |933  |
|Interim_Analysis       |928  |
|Success                |400  |
|Endpoint_Met           |267  |
|Ethical_Reason         |156  |
|Insufficient_Data      |25   |
+-----------------------+-----+

Finer class distribut

In [32]:
fig = px.bar(predictions_exploded.toPandas(), y='subclass', color='superclass', title='ChEMBL 22.02 Reasons to Stop Distribution')
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})

fig.show()

In [33]:
# Build results into evidence by merging both datasets on the reason to stop field

print(evd.first())

print(predictions.first())

Row(clinicalPhase=3, clinicalStatus='Completed', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='HIV Infections', diseaseFromSourceMappedId='EFO_0000764', drugId='CHEMBL163', studyStartDate='2005-07-01', studyStopReason=None, targetFromSource='CHEMBL2364675', targetFromSourceId='P08684', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT00102960%22')])
Row(why_stopped='Health restrictions due to the COVID-19 pandemic', subclasses=['Covid19'], superclasses=['Neutral'])


In [34]:
predictions_exploded.filter(F.col('subclass') == 'Business_Administrative').show(20, False, True)

-RECORD 0--------------------------------------------------------------------------------------------
 why_stopped | Unable to secure clinic support for conducting research                               
 subclass    | Business_Administrative                                                               
 superclass  | Possibly_Negative                                                                     
-RECORD 1--------------------------------------------------------------------------------------------
 why_stopped | Lack of Funding                                                                       
 subclass    | Business_Administrative                                                               
 superclass  | Possibly_Negative                                                                     
-RECORD 2--------------------------------------------------------------------------------------------
 why_stopped | First cohort completed. Per sponsor decision.                      

In [35]:
print(f"{evd.filter(F.col('studyStopReason').isNull()).count()} / {evd.count()} ({evd.filter(F.col('studyStopReason').isNull()).count()/evd.count()*100}%) of ChEMBL evidence does not have a reason to stop.")

567765 / 632297 (89.7940366631504%) of ChEMBL evidence does not have a reason to stop.


In [37]:
(
    evd.join(predictions, evd['studyStopReason'] == predictions['why_stopped'], how='left')
    .select('studyStopReason', 'subclasses', 'superclasses')
    .filter(F.col('studyStopReason').isNotNull())
    .show(truncate=False)
)

+----------------------------------------+-------------------------+------------+
|studyStopReason                         |subclasses               |superclasses|
+----------------------------------------+-------------------------+------------+
|Failure to recruit adequate patients    |[Insufficient_Enrollment]|[Neutral]   |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of ef

In [136]:
evd_w_predictions = (
    evd.join(predictions, evd['studyStopReason'] == predictions['why_stopped'], how='left')
    .withColumnRenamed('subclasses', 'studyStopReasonSubclasses').withColumnRenamed('superclasses','studyStopReasonSuperclasses')
    .drop('why_stopped')
    .distinct()
)

evd_w_predictions.filter(F.col('studyStopReason').isNotNull()).first()

Row(clinicalPhase=1, clinicalStatus='Terminated', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='Acute Lymphoblastic Leukemia With Failed Remission', diseaseFromSourceMappedId='EFO_0000220', drugId='CHEMBL1421', studyStartDate='2018-04-20', studyStopReason='Due to departure of PI from St. Jude', targetFromSource='CHEMBL1913', targetFromSourceId='P09619', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT03515200%22')], studyStopReasonSubclasses=['Study_Staff_Moved'], studyStopReasonSuperclasses=['Neutral'])

In [139]:
evd_w_predictions.filter(F.array_contains(F.col('studyStopReasonSubclasses'), 'Success').select('studyStopReason').show()

SyntaxError: unexpected EOF while parsing (<ipython-input-139-fcda3a247d74>, line 1)

In [39]:
evd_w_predictions.first()

Row(clinicalPhase=3, clinicalStatus='Completed', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='Diabetes Mellitus, Type 2', diseaseFromSourceMappedId='EFO_0001360', drugId='CHEMBL1431', studyStartDate='2006-03-01', studyStopReason=None, targetFromSource='CHEMBL2363065', targetFromSourceId='O43920', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT00295633%22')], studyStopReasonSubclasses=None, studyStopReasonSuperclasses=None)

In [40]:
evd_w_predictions = (
    evd_w_predictions.withColumn(
        # About 2000 records have a reason to stop but the model throws no prediction
        'classes_count', F.when(F.col('studyStopReasonSubclasses').isNull(), '0')
        .when(F.col('studyStopReasonSubclasses').isNotNull(), F.size('studyStopReasonSubclasses'))
    )
)

In [41]:
fig2 = px.pie(evd_w_predictions.groupBy('classes_count').count().toPandas(), values='count', names='classes_count', title='ChEMBL 22.02 Predicted Classes Count')
#fig2.update_traces(textposition='inside', textinfo='percent+label')

fig2.show()

In [42]:
evd_w_predictions.drop('classes_count', 'studyStopReasonSuperclasses').withColumnRenamed('studyStopReasonSubclasses', 'studyStopReasonCategory').printSchema()

root
 |-- clinicalPhase: long (nullable = true)
 |-- clinicalStatus: string (nullable = true)
 |-- datasourceId: string (nullable = true)
 |-- datatypeId: string (nullable = true)
 |-- diseaseFromSource: string (nullable = true)
 |-- diseaseFromSourceMappedId: string (nullable = true)
 |-- drugId: string (nullable = true)
 |-- studyStartDate: string (nullable = true)
 |-- studyStopReason: string (nullable = true)
 |-- targetFromSource: string (nullable = true)
 |-- targetFromSourceId: string (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- niceName: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |-- studyStopReasonCategory: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [43]:
evd_w_predictions.filter(F.col('studyStopReasonSubclasses').isNotNull()).show(1, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------
 clinicalPhase               | 1                                                                          
 clinicalStatus              | Terminated                                                                 
 datasourceId                | chembl                                                                     
 datatypeId                  | known_drug                                                                 
 diseaseFromSource           | Acute Lymphoblastic Leukemia With Failed Remission                         
 diseaseFromSourceMappedId   | EFO_0000220                                                                
 drugId                      | CHEMBL1421                                                                 
 studyStartDate              | 2018-04-20                                                                 
 studyStopReason             | Due to

- Daniel comment: use the granular class and keep the superclass out from the schema
- Add logic to the BE config. List all subclasses and map to the superclass score.
- How frequently we see more than one superclass.
- Add table instead of bar chart.
- Why are we missing so many stop reasons? 

In [44]:
evd_w_predictions.filter(F.array_contains(F.col('studyStopReasonSubclasses'), 'Safety_Sideeffects')).first()

Row(clinicalPhase=3, clinicalStatus='Terminated', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='Depression', diseaseFromSourceMappedId='MONDO_0002050', drugId='CHEMBL1175', studyStartDate='2015-05-01', studyStopReason='principle investigator decision due to many adverse events in patients', targetFromSource='CHEMBL222', targetFromSourceId='P23975', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT02443194%22')], studyStopReasonSubclasses=['Safety_Sideeffects', 'Insufficient_Enrollment'], studyStopReasonSuperclasses=['Safety_Sideeffects', 'Neutral'], classes_count='2')

In [45]:
# Hacer downweight en caso de negative o de safety, el score se conv¡erta en 0.
#  


## Scoring




In [None]:
tmp = evd_w_predictions.withColumn('subclass', F.explode('studyStopReasonSubclasses')).filter(F.col('subclass').isNotNull()).select('clinicalPhase', 'subclass')
tmp2 = evd_w_predictions.withColumn('superclass', F.explode('studyStopReasonSuperclasses')).filter(F.col('superclass').isNotNull()).select('clinicalPhase', 'superclass')

In [84]:

print('-------NEGATIVE-------')
a = tmp2.filter(F.col('superclass') == 'Negative').groupby('clinicalPhase').count().withColumn('superclass', F.lit('Negative'))
print(a.show())
print('\n')

print('-------SAFETY-------')
b = tmp2.filter(F.col('superclass') == 'Safety_Sideeffects').groupby('clinicalPhase').count().withColumn('superclass', F.lit('Safety_Sideeffects'))
print(b.show())

print('-------SUCCESS-------')
c = tmp2.filter(F.col('superclass') == 'Success').groupby('clinicalPhase').count().withColumn('superclass', F.lit('Success'))
print(c.show())
print('\n')

print('-------POSSIBLY NEGATIVE-------')
d = tmp2.filter(F.col('superclass') == 'Possibly_Negative').groupby('clinicalPhase').count().withColumn('superclass', F.lit('Possibly_Negative'))
print(d.show())
print('\n')

print('-------INVALID REASON-------')
e = tmp2.filter(F.col('superclass') == 'Invalid_Reason').groupby('clinicalPhase').count().withColumn('superclass', F.lit('Invalid_Reason'))
print(e.show())
print('\n')

print('-------NEUTRAL-------')
f = tmp2.filter(F.col('superclass') == 'Neutral').groupby('clinicalPhase').count().withColumn('superclass', F.lit('Neutral'))
print(f.show())
print('\n')

-------NEGATIVE-------
+-------------+-----+----------+
|clinicalPhase|count|superclass|
+-------------+-----+----------+
|            0|   16|  Negative|
|            1| 1451|  Negative|
|            3|  894|  Negative|
|            2| 2903|  Negative|
|            4|  248|  Negative|
+-------------+-----+----------+

None


-------SAFETY-------
+-------------+-----+------------------+
|clinicalPhase|count|        superclass|
+-------------+-----+------------------+
|            0|   16|Safety_Sideeffects|
|            1|  839|Safety_Sideeffects|
|            3|  449|Safety_Sideeffects|
|            2| 1516|Safety_Sideeffects|
|            4|  119|Safety_Sideeffects|
+-------------+-----+------------------+

None
-------SUCCESS-------
+-------------+-----+----------+
|clinicalPhase|count|superclass|
+-------------+-----+----------+
|            1|   47|   Success|
|            3|  152|   Success|
|            2|  183|   Success|
|            4|   69|   Success|
+-------------+-----+--

In [91]:
all = a.union(b).union(c).union(d).union(e).union(f).toPandas()
all['clinicalPhase'] = all['clinicalPhase'].astype(str)

all.head()

Unnamed: 0,clinicalPhase,count,superclass
0,0,16,Negative
1,1,1451,Negative
2,3,894,Negative
3,2,2903,Negative
4,4,248,Negative


In [95]:
fig = px.bar(all[(all['superclass'] != 'Neutral') & (all['superclass'] != 'Possibly_Negative') & (all['superclass'] != 'Invalid_Reason')], x="superclass", y="count", color="clinicalPhase", title="Distribution per phase of study")
fig.show()

In [96]:
evd_w_predictions.first()

Row(clinicalPhase=3, clinicalStatus='Completed', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='Diabetes Mellitus, Type 2', diseaseFromSourceMappedId='EFO_0001360', drugId='CHEMBL1431', studyStartDate='2006-03-01', studyStopReason=None, targetFromSource='CHEMBL2363065', targetFromSourceId='O43920', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT00295633%22')], studyStopReasonSubclasses=None, studyStopReasonSuperclasses=None, classes_count='0')

### Among the phase 4 studies that are downgraded, how many of them are backed by trials of a lower phase?

In [119]:
affected_assocs = (
    # Return all the associations affected by the downgrading
    evd_w_predictions
    .filter((F.array_contains(F.col('studyStopReasonSuperclasses'), 'Negative')) | (F.array_contains(F.col('studyStopReasonSuperclasses'), 'Safety_Sideeffects')))
    .select('targetFromSourceId', 'diseaseFromSourceMappedId')
    .distinct()
)

affected_assocs.show()

+------------------+-------------------------+
|targetFromSourceId|diseaseFromSourceMappedId|
+------------------+-------------------------+
|            P35372|              EFO_0003843|
|            P04049|              EFO_1001465|
|            P04350|              EFO_1001974|
|            Q9UBN7|            MONDO_0002691|
|            Q13547|            MONDO_0001657|
|            P56556|              EFO_0002618|
|            P30872|              EFO_0003843|
|            Q9H4B7|              EFO_1001830|
|            P36507|              EFO_0000311|
|            P78334|              EFO_0000612|
|            P23921|            MONDO_0002158|
|            Q5JZY3|              EFO_0000501|
|            Q13885|              EFO_0003869|
|            Q9BVA1|              EFO_0003869|
|            P17181|              EFO_1001373|
|            Q13547|            MONDO_0008170|
|            P47869|            Orphanet_3451|
|            P48551|            MONDO_0100130|
|            

In [115]:
(
    # Return the rest of evidence of the affected associations that is not affected by the downgrading
    evd_w_predictions
    .join(affected_assocs, on=['targetFromSourceId', 'diseaseFromSourceMappedId'], how='inner')
    .filter((~F.array_contains(F.col('studyStopReasonSuperclasses'), 'Negative')) & (~F.array_contains(F.col('studyStopReasonSuperclasses'), 'Safety_Sideeffects')))
    .select('targetFromSourceId', 'diseaseFromSourceMappedId', 'studyStopReasonSuperclasses', 'clinicalStatus', 'clinicalPhase')
    .show()
)

# This is interesting. All associations present a reason to stop...

+------------------+-------------------------+---------------------------+--------------+-------------+
|targetFromSourceId|diseaseFromSourceMappedId|studyStopReasonSuperclasses|clinicalStatus|clinicalPhase|
+------------------+-------------------------+---------------------------+--------------+-------------+
|            P09619|              EFO_0000220|                  [Neutral]|    Terminated|            1|
|            P07437|              EFO_0003060|                  [Neutral]|    Terminated|            2|
|            Q13885|              EFO_0000574|        [Possibly_Negative]|     Withdrawn|            1|
|            O43678|            MONDO_0008315|                  [Neutral]|    Terminated|            2|
|            Q07864|              EFO_0000565|                  [Neutral]|    Terminated|            2|
|            P04150|              EFO_0004599|       [Neutral, Possibl...|     Withdrawn|            4|
|            P11836|              EFO_0000403|        [Possibly_

In [124]:
# How are scores affected?

affected_evd_score = (   
    evd_w_predictions
    .filter((F.array_contains(F.col('studyStopReasonSuperclasses'), 'Negative')) | (F.array_contains(F.col('studyStopReasonSuperclasses'), 'Safety_Sideeffects')))
    .withColumn('originalScore', F.expr("element_at(map(0, 0.09, 1, 0.1, 2, 0.2, 3, 0.7, 4, 1.0), clinicalPhase)"))
    .withColumn('newScore', F.col('originalScore') * 0.5)
    .select('targetFromSourceId', 'diseaseFromSourceMappedId', 'clinicalPhase', 'originalScore', 'newScore')
)

affected_evd_score.show()

+------------------+-------------------------+-------------+-------------+--------+
|targetFromSourceId|diseaseFromSourceMappedId|clinicalPhase|originalScore|newScore|
+------------------+-------------------------+-------------+-------------+--------+
|            P20711|              EFO_0002508|            2|         0.20|     0.1|
|            P23975|            MONDO_0002050|            3|         0.70|    0.35|
|            P35346|              EFO_0007331|            2|         0.20|     0.1|
|            P17948|              EFO_0003060|            1|         0.10|    0.05|
|            P07550|              EFO_0003843|            2|         0.20|     0.1|
|            P27815|              EFO_0000729|            2|         0.20|     0.1|
|            Q9NRF9|              EFO_0000717|            3|         0.70|    0.35|
|            P35499|              EFO_0003030|            4|         1.00|     0.5|
|            Q9Y5Y9|               HP_0000989|            2|         0.20|  

In [127]:
print(affected_evd_score.agg(F.avg(F.col('originalScore'))).show())
print(affected_evd_score.agg(F.avg(F.col('newScore'))).show())

+------------------+
|avg(originalScore)|
+------------------+
|          0.283578|
+------------------+

None
+-------------------+
|      avg(newScore)|
+-------------------+
|0.14178892397376322|
+-------------------+

None


In [132]:
upweighted_evd_score = (   
    evd_w_predictions
    .filter((F.array_contains(F.col('studyStopReasonSuperclasses'), 'Success')))
    .withColumn('originalScore', F.expr("element_at(map(0, 0.09, 1, 0.1, 2, 0.2, 3, 0.7, 4, 1.0), clinicalPhase)"))
    .withColumn('newScore', F.col('originalScore') * 2)
    .select('targetFromSourceId', 'diseaseFromSourceMappedId', 'clinicalPhase', 'originalScore', 'newScore')
)

upweighted_evd_score.show()

+------------------+-------------------------+-------------+-------------+--------+
|targetFromSourceId|diseaseFromSourceMappedId|clinicalPhase|originalScore|newScore|
+------------------+-------------------------+-------------+-------------+--------+
|            P35462|              EFO_0000692|            4|         1.00|    2.00|
|            P10275|              EFO_0000673|            2|         0.20|    0.40|
|            Q15858|              EFO_0000692|            4|         1.00|    2.00|
|            Q9H4B7|            MONDO_0007254|            3|         0.70|    1.40|
|            P35968|              EFO_0001365|            2|         0.20|    0.40|
|            Q9UKV0|              EFO_0003060|            1|         0.10|    0.20|
|            P04350|            MONDO_0007254|            3|         0.70|    1.40|
|            P49005|              EFO_0003060|            4|         1.00|    2.00|
|            P48169|               HP_0002315|            3|         0.70|  

In [133]:
print(upweighted_evd_score.agg(F.avg(F.col('originalScore'))).show())
print(upweighted_evd_score.agg(F.avg(F.col('newScore'))).show())

+------------------+
|avg(originalScore)|
+------------------+
|          0.480488|
+------------------+

None
+-------------+
|avg(newScore)|
+-------------+
|     0.960976|
+-------------+

None
