In [7]:
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import types as T
import plotly.express as px

# Initialise Spark

spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.4")\
    .getOrCreate()

In [2]:
# Prepare input file from chembl evidence

evd = (
    spark.read.json('data/cttv008-20-01-2022.json.gz')
)

In [3]:
studies = (
        evd

        # Extract studies with their reasons to stop
        .filter(F.col('studyStopReason').isNotNull())
        .withColumn('urls', F.explode('urls'))
        .filter(F.col('urls.niceName').contains('ClinicalTrials'))
        .withColumn('nct_id', F.element_at(F.split(F.col('urls.url'), '%22'), -2))
        .select('nct_id', F.col('studyStopReason').alias('why_stopped'))
        .distinct()

        # Convert to Pandas DF
        #.toPandas()
    )

studies.show()

+-----------+--------------------+
|     nct_id|         why_stopped|
+-----------+--------------------+
|NCT01286974|Terminated early,...|
|NCT00318474|DSMB recommended ...|
|NCT01328093|The decision to s...|
|NCT00072189|Early termination...|
|NCT03119623|Lost funding prio...|
|NCT03315052|Delay in IRB appr...|
|NCT00895297|Slow recruitment....|
|NCT00383331|Trial was stopped...|
|NCT00421317|Study stopped at ...|
|NCT00880373|The funding withd...|
|NCT02389764|        Slow Accrual|
|NCT00296959|early termination...|
|NCT01197235|difficult to enro...|
|NCT02677948|FDA has placed al...|
|NCT00509366|Study terminated ...|
|NCT00447369|Because we did no...|
|NCT02811159|   Business decision|
|NCT02715804|    Sponsor decision|
|NCT01446575|This study was re...|
|NCT02179151|   FDA Clinical Hold|
+-----------+--------------------+
only showing top 20 rows



In [9]:
# Export table to make predictions
# studies.coalesce(1).write.csv('data/studies_latest.tsv', sep='\t', header=True)

In [4]:
# Load results from predict.py

schema = T.ArrayType(T.StringType())

predictions = (
    spark.read.csv('/Users/irene/MEGAsync/EBI/repos/evidence_datasource_parsers/exploration/stopReasons/data/chembl_predictions_latest.tsv', sep='\t', header=False)
    .toDF('why_stopped', 'subclass_raw', 'superclass_raw')

    # Lists are represented as strings. They must be converted
    .withColumn("subclass_raw", F.from_json(F.regexp_replace(F.col('subclass_raw'), "(u')", "'"), schema=schema))
    .withColumn("superclass_raw", F.from_json(F.regexp_replace(F.col('superclass_raw'), "(u')", "'"), schema=schema))
)

predictions.show(10, False, True)

# Look at record #2

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------
 why_stopped    | Terminated early, Sponsor Decision                                                                                                                           
 subclass_raw   | [Business_Administrative]                                                                                                                                    
 superclass_raw | [Possibly_Negative]                                                                                                                                          
-RECORD 1----------------------------------------------------------------------------------------------------------------------------------------------------------------------
 why_stopped    | DSMB recommended stopping the trial because of lack of effect.                                        

In [5]:
"""
This does not work because I end up with a combination of each subclass with their superclass
I want to keep the one subclass to one superclass mapping

predictions_exploded = (
    predictions.withColumn("subclass", F.explode('subclass_raw'))
    .withColumn("superclass", F.explode('superclass_raw'))
    .drop('subclass_raw', 'superclass_raw')
)
"""

predictions_exploded = (
    predictions
    .withColumn('tmp', F.arrays_zip('subclass_raw', 'superclass_raw'))
    .withColumn('tmp', F.explode('tmp'))
    .select('why_stopped', F.col('tmp.subclass_raw').alias('subclass'), F.col('tmp.superclass_raw').alias('superclass'))
)

predictions_exploded.show(5, False, True)



-RECORD 0------------------------------------------------------------------------------------------------------------------------
 why_stopped | Terminated early, Sponsor Decision                                                                                
 subclass    | Business_Administrative                                                                                           
 superclass  | Possibly_Negative                                                                                                 
-RECORD 1------------------------------------------------------------------------------------------------------------------------
 why_stopped | DSMB recommended stopping the trial because of lack of effect.                                                    
 subclass    | Negative                                                                                                          
 superclass  | Negative                                                                   

In [62]:
print('Broader class distribution:', predictions_exploded.groupBy('superclass').count().orderBy(F.col('count').desc()).show())
print('\n')
print('Finer class distribution:', predictions_exploded.groupBy('subclass').count().orderBy(F.col('count').desc()).show(truncate=False))

+------------------+-----+
|        superclass|count|
+------------------+-----+
|           Neutral| 6342|
| Possibly_Negative| 2213|
|          Negative| 1319|
|    Invalid_Reason|  995|
|Safety_Sideeffects|  359|
|           Success|  105|
+------------------+-----+

Broader class distribution: None


+-----------------------+-----+
|subclass               |count|
+-----------------------+-----+
|Insufficient_Enrollment|3375 |
|Business_Administrative|2213 |
|Negative               |1319 |
|Study_Design           |696  |
|Invalid_Reason         |614  |
|Logistics_Resources    |590  |
|Another_Study          |511  |
|No_Context             |381  |
|Safety_Sideeffects     |359  |
|Study_Staff_Moved      |306  |
|Covid19                |255  |
|Interim_Analysis       |248  |
|Regulatory             |244  |
|Success                |105  |
|Endpoint_Met           |79   |
|Ethical_Reason         |33   |
|Insufficient_Data      |5    |
+-----------------------+-----+

Finer class distribut

In [8]:
fig = px.bar(predictions_exploded.toPandas(), y='subclass', color='superclass', title='ChEMBL 22.02 Reasons to Stop Distribution')
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})

fig.show()

In [67]:
# Build results into evidence by merging both datasets on the reason to stop field

print(evd.first())

print(predictions.first())

Row(clinicalPhase=3, clinicalStatus='Completed', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='HIV Infections', diseaseFromSourceMappedId='EFO_0000764', drugId='CHEMBL163', studyStartDate='2005-07-01', studyStopReason=None, targetFromSource='CHEMBL2364675', targetFromSourceId='P08684', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT00102960%22')])
Row(why_stopped='Terminated early, Sponsor Decision', subclass_raw=['Business_Administrative'], superclass_raw=['Possibly_Negative'])


In [77]:
print(f"{evd.filter(F.col('studyStopReason').isNull()).count()} / {evd.count()} ({evd.filter(F.col('studyStopReason').isNull()).count()/evd.count()*100}%) of ChEMBL evidence does not have a reason to stop.")

567765 / 632297 (89.7940366631504%) of ChEMBL evidence does not have a reason to stop.


In [73]:
(
    evd.join(predictions, evd['studyStopReason'] == predictions['why_stopped'], how='left')
    .select('studyStopReason', 'subclass_raw', 'superclass_raw')
    .filter(F.col('studyStopReason').isNotNull())
    .show(truncate=False)
)

+----------------------------------------+-------------------------+--------------+
|studyStopReason                         |subclass_raw             |superclass_raw|
+----------------------------------------+-------------------------+--------------+
|Failure to recruit adequate patients    |[Insufficient_Enrollment]|[Neutral]     |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]    |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]    |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]    |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]    |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]    |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]    |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]    |
|No signal of efficacy with Entospletinib|[Negative]               |[Negativ

In [49]:
evd_w_predictions = (
    evd.join(predictions, evd['studyStopReason'] == predictions['why_stopped'], how='left')
    .withColumnRenamed('subclass_raw', 'studyStopReasonSubclasses').withColumnRenamed('superclass_raw','studyStopReasonSuperclasses')
    .drop('why_stopped')
    .distinct()
)

evd_w_predictions.filter(F.col('studyStopReason').isNotNull()).first()

Row(clinicalPhase=1, clinicalStatus='Terminated', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='Acute Lymphoblastic Leukemia With Failed Remission', diseaseFromSourceMappedId='EFO_0000220', drugId='CHEMBL1421', studyStartDate='2018-04-20', studyStopReason='Due to departure of PI from St. Jude', targetFromSource='CHEMBL1913', targetFromSourceId='P09619', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT03515200%22')], studyStopReasonSubclasses=['Study_Staff_Moved'], studyStopReasonSuperclasses=['Neutral'])

In [51]:
evd_w_predictions.first()

Row(clinicalPhase=3, clinicalStatus='Completed', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='Diabetes Mellitus, Type 2', diseaseFromSourceMappedId='EFO_0001360', drugId='CHEMBL1431', studyStartDate='2006-03-01', studyStopReason=None, targetFromSource='CHEMBL2363065', targetFromSourceId='O43920', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT00295633%22')], studyStopReasonSubclasses=None, studyStopReasonSuperclasses=None)

In [52]:
evd_w_predictions = (
    evd_w_predictions.withColumn(
        # About 2000 records have a reason to stop but the model throws no prediction
        'classes_count', F.when(F.col('studyStopReasonSubclasses').isNull(), '0')
        .when(F.col('studyStopReasonSubclasses').isNotNull(), F.size('studyStopReasonSubclasses'))
    )
)

In [55]:
fig2 = px.pie(evd_w_predictions.groupBy('classes_count').count().toPandas(), values='count', names='classes_count', title='ChEMBL 22.02 Predicted Classes Count')
#fig2.update_traces(textposition='inside', textinfo='percent+label')

fig2.show()

In [57]:
evd_w_predictions.drop('classes_count').printSchema()

root
 |-- clinicalPhase: long (nullable = true)
 |-- clinicalStatus: string (nullable = true)
 |-- datasourceId: string (nullable = true)
 |-- datatypeId: string (nullable = true)
 |-- diseaseFromSource: string (nullable = true)
 |-- diseaseFromSourceMappedId: string (nullable = true)
 |-- drugId: string (nullable = true)
 |-- studyStartDate: string (nullable = true)
 |-- studyStopReason: string (nullable = true)
 |-- targetFromSource: string (nullable = true)
 |-- targetFromSourceId: string (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- niceName: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |-- studyStopReasonSubclasses: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- studyStopReasonSuperclasses: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [58]:
evd_w_predictions.filter(F.col('studyStopReasonSubclasses').isNotNull()).show(1, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------
 clinicalPhase               | 1                                                                          
 clinicalStatus              | Terminated                                                                 
 datasourceId                | chembl                                                                     
 datatypeId                  | known_drug                                                                 
 diseaseFromSource           | Acute Lymphoblastic Leukemia With Failed Remission                         
 diseaseFromSourceMappedId   | EFO_0000220                                                                
 drugId                      | CHEMBL1421                                                                 
 studyStartDate              | 2018-04-20                                                                 
 studyStopReason             | Due to

- Daniel comment: use the granular class and keep the superclass out from the schema
- Add logic to the BE config. List all subclasses and map to the superclass score.
- How frequently we see more than one superclass.
- Add table instead of bar chart.
- Why are we missing so many stop reasons? 

In [60]:
evd_w_predictions.filter(F.array_contains(F.col('studyStopReasonSubclasses'), 'Safety_Sideeffects')).first()

Row(clinicalPhase=3, clinicalStatus='Terminated', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='Depression', diseaseFromSourceMappedId='MONDO_0002050', drugId='CHEMBL1175', studyStartDate='2015-05-01', studyStopReason='principle investigator decision due to many adverse events in patients', targetFromSource='CHEMBL222', targetFromSourceId='P23975', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT02443194%22')], studyStopReasonSubclasses=['Safety_Sideeffects', 'Insufficient_Enrollment'], studyStopReasonSuperclasses=['Safety_Sideeffects', 'Neutral'], classes_count='2')

In [61]:
evd_w_predictions.filter(F.array_contains(F.col('studyStopReasonSubclasses'), 'Safety_Sideeffects')).select('targetFromSourceId').distinct().count()

482