In [24]:
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import types as T
import plotly.express as px

# Initialise Spark

spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.4")\
    .getOrCreate()

In [25]:
# Prepare input file from chembl evidence

evd = (
    spark.read.json('data/cttv008-20-01-2022.json.gz')
)

In [26]:
studies = (
        evd

        # Extract studies with their reasons to stop
        .filter(F.col('studyStopReason').isNotNull())
        .withColumn('urls', F.explode('urls'))
        .filter(F.col('urls.niceName').contains('ClinicalTrials'))
        .withColumn('nct_id', F.element_at(F.split(F.col('urls.url'), '%22'), -2))
        .select('nct_id', F.col('studyStopReason').alias('why_stopped'))
        .distinct()

        # Convert to Pandas DF
        #.toPandas()
    )

studies.show()

+-----------+--------------------+
|     nct_id|         why_stopped|
+-----------+--------------------+
|NCT01286974|Terminated early,...|
|NCT00318474|DSMB recommended ...|
|NCT01328093|The decision to s...|
|NCT00072189|Early termination...|
|NCT03119623|Lost funding prio...|
|NCT03315052|Delay in IRB appr...|
|NCT00895297|Slow recruitment....|
|NCT00383331|Trial was stopped...|
|NCT00421317|Study stopped at ...|
|NCT00880373|The funding withd...|
|NCT02389764|        Slow Accrual|
|NCT00296959|early termination...|
|NCT01197235|difficult to enro...|
|NCT02677948|FDA has placed al...|
|NCT00509366|Study terminated ...|
|NCT00447369|Because we did no...|
|NCT02811159|   Business decision|
|NCT02715804|    Sponsor decision|
|NCT01446575|This study was re...|
|NCT02179151|   FDA Clinical Hold|
+-----------+--------------------+
only showing top 20 rows



In [27]:
# Export table to make predictions
# studies.coalesce(1).write.csv('data/studies_latest.tsv', sep='\t', header=True)

In [28]:
# Load results from predict.py

schema = T.ArrayType(T.StringType())

predictions = (
    spark.read.csv('data/chembl_predictions-2022-01-24.tsv', sep='\t', header=True)

    # Lists are represented as strings. They must be converted
    .withColumn("subclasses", F.from_json(F.regexp_replace(F.col('subclasses'), "(u')", "'"), schema=schema))
    .withColumn("superclasses", F.from_json(F.regexp_replace(F.col('superclasses'), "(u')", "'"), schema=schema))
)

predictions.show(10, False, True)


-RECORD 0-----------------------------------------------------------------------------------
 why_stopped  | Health restrictions due to the COVID-19 pandemic                            
 subclasses   | [Covid19]                                                                   
 superclasses | [Neutral]                                                                   
-RECORD 1-----------------------------------------------------------------------------------
 why_stopped  | Study withdrawn due to logistical and safety concerns relating to Covid-19. 
 subclasses   | [Covid19]                                                                   
 superclasses | [Neutral]                                                                   
-RECORD 2-----------------------------------------------------------------------------------
 why_stopped  | SARS-CoV-2 Pandemic                                                         
 subclasses   | [Covid19]                                             

In [29]:
predictions.filter(F.col('why_stopped').isNotNull()).filter(F.col('subclasses').isNull()).count()

3213

In [30]:
"""
This does not work because I end up with a combination of each subclass with their superclass
I want to keep the one subclass to one superclass mapping

predictions_exploded = (
    predictions.withColumn("subclass", F.explode('subclass_raw'))
    .withColumn("superclass", F.explode('superclass_raw'))
    .drop('subclass_raw', 'superclass_raw')
)
"""

predictions_exploded = (
    predictions
    .withColumn('tmp', F.arrays_zip('subclasses', 'superclasses'))
    .withColumn('tmp', F.explode('tmp'))
    .select('why_stopped', F.col('tmp.subclasses').alias('subclass'), F.col('tmp.superclasses').alias('superclass'))
)

predictions_exploded.show(5, False, True)



-RECORD 0----------------------------------------------------------------------------------
 why_stopped | Health restrictions due to the COVID-19 pandemic                            
 subclass    | Covid19                                                                     
 superclass  | Neutral                                                                     
-RECORD 1----------------------------------------------------------------------------------
 why_stopped | Study withdrawn due to logistical and safety concerns relating to Covid-19. 
 subclass    | Covid19                                                                     
 superclass  | Neutral                                                                     
-RECORD 2----------------------------------------------------------------------------------
 why_stopped | SARS-CoV-2 Pandemic                                                         
 subclass    | Covid19                                                          

In [31]:
print('Broader class distribution:', predictions_exploded.groupBy('superclass').count().orderBy(F.col('count').desc()).show())
print('\n')
print('Finer class distribution:', predictions_exploded.groupBy('subclass').count().orderBy(F.col('count').desc()).show(truncate=False))

+------------------+-----+
|        superclass|count|
+------------------+-----+
|           Neutral|22983|
| Possibly_Negative| 7294|
|    Invalid_Reason| 3497|
|          Negative| 3178|
|Safety_Sideeffects|  937|
|           Success|  400|
+------------------+-----+

Broader class distribution: None


+-----------------------+-----+
|subclass               |count|
+-----------------------+-----+
|Insufficient_Enrollment|10768|
|Business_Administrative|7294 |
|Negative               |3178 |
|Logistics_Resources    |2720 |
|Study_Design           |2524 |
|Invalid_Reason         |2369 |
|Study_Staff_Moved      |1710 |
|Covid19                |1617 |
|Another_Study          |1335 |
|No_Context             |1128 |
|Safety_Sideeffects     |937  |
|Regulatory             |933  |
|Interim_Analysis       |928  |
|Success                |400  |
|Endpoint_Met           |267  |
|Ethical_Reason         |156  |
|Insufficient_Data      |25   |
+-----------------------+-----+

Finer class distribut

In [32]:
fig = px.bar(predictions_exploded.toPandas(), y='subclass', color='superclass', title='ChEMBL 22.02 Reasons to Stop Distribution')
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})

fig.show()

In [33]:
# Build results into evidence by merging both datasets on the reason to stop field

print(evd.first())

print(predictions.first())

Row(clinicalPhase=3, clinicalStatus='Completed', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='HIV Infections', diseaseFromSourceMappedId='EFO_0000764', drugId='CHEMBL163', studyStartDate='2005-07-01', studyStopReason=None, targetFromSource='CHEMBL2364675', targetFromSourceId='P08684', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT00102960%22')])
Row(why_stopped='Health restrictions due to the COVID-19 pandemic', subclasses=['Covid19'], superclasses=['Neutral'])


In [34]:
predictions_exploded.filter(F.col('subclass') == 'Business_Administrative').show(20, False, True)

-RECORD 0--------------------------------------------------------------------------------------------
 why_stopped | Unable to secure clinic support for conducting research                               
 subclass    | Business_Administrative                                                               
 superclass  | Possibly_Negative                                                                     
-RECORD 1--------------------------------------------------------------------------------------------
 why_stopped | Lack of Funding                                                                       
 subclass    | Business_Administrative                                                               
 superclass  | Possibly_Negative                                                                     
-RECORD 2--------------------------------------------------------------------------------------------
 why_stopped | First cohort completed. Per sponsor decision.                      

In [35]:
print(f"{evd.filter(F.col('studyStopReason').isNull()).count()} / {evd.count()} ({evd.filter(F.col('studyStopReason').isNull()).count()/evd.count()*100}%) of ChEMBL evidence does not have a reason to stop.")

567765 / 632297 (89.7940366631504%) of ChEMBL evidence does not have a reason to stop.


In [37]:
(
    evd.join(predictions, evd['studyStopReason'] == predictions['why_stopped'], how='left')
    .select('studyStopReason', 'subclasses', 'superclasses')
    .filter(F.col('studyStopReason').isNotNull())
    .show(truncate=False)
)

+----------------------------------------+-------------------------+------------+
|studyStopReason                         |subclasses               |superclasses|
+----------------------------------------+-------------------------+------------+
|Failure to recruit adequate patients    |[Insufficient_Enrollment]|[Neutral]   |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of efficacy with Entospletinib|[Negative]               |[Negative]  |
|No signal of ef

In [38]:
evd_w_predictions = (
    evd.join(predictions, evd['studyStopReason'] == predictions['why_stopped'], how='left')
    .withColumnRenamed('subclasses', 'studyStopReasonSubclasses').withColumnRenamed('superclasses','studyStopReasonSuperclasses')
    .drop('why_stopped')
    .distinct()
)

evd_w_predictions.filter(F.col('studyStopReason').isNotNull()).first()

Row(clinicalPhase=1, clinicalStatus='Terminated', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='Acute Lymphoblastic Leukemia With Failed Remission', diseaseFromSourceMappedId='EFO_0000220', drugId='CHEMBL1421', studyStartDate='2018-04-20', studyStopReason='Due to departure of PI from St. Jude', targetFromSource='CHEMBL1913', targetFromSourceId='P09619', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT03515200%22')], studyStopReasonSubclasses=['Study_Staff_Moved'], studyStopReasonSuperclasses=['Neutral'])

In [39]:
evd_w_predictions.first()

Row(clinicalPhase=3, clinicalStatus='Completed', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='Diabetes Mellitus, Type 2', diseaseFromSourceMappedId='EFO_0001360', drugId='CHEMBL1431', studyStartDate='2006-03-01', studyStopReason=None, targetFromSource='CHEMBL2363065', targetFromSourceId='O43920', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT00295633%22')], studyStopReasonSubclasses=None, studyStopReasonSuperclasses=None)

In [40]:
evd_w_predictions = (
    evd_w_predictions.withColumn(
        # About 2000 records have a reason to stop but the model throws no prediction
        'classes_count', F.when(F.col('studyStopReasonSubclasses').isNull(), '0')
        .when(F.col('studyStopReasonSubclasses').isNotNull(), F.size('studyStopReasonSubclasses'))
    )
)

In [41]:
fig2 = px.pie(evd_w_predictions.groupBy('classes_count').count().toPandas(), values='count', names='classes_count', title='ChEMBL 22.02 Predicted Classes Count')
#fig2.update_traces(textposition='inside', textinfo='percent+label')

fig2.show()

In [42]:
evd_w_predictions.drop('classes_count', 'studyStopReasonSuperclasses').withColumnRenamed('studyStopReasonSubclasses', 'studyStopReasonCategory').printSchema()

root
 |-- clinicalPhase: long (nullable = true)
 |-- clinicalStatus: string (nullable = true)
 |-- datasourceId: string (nullable = true)
 |-- datatypeId: string (nullable = true)
 |-- diseaseFromSource: string (nullable = true)
 |-- diseaseFromSourceMappedId: string (nullable = true)
 |-- drugId: string (nullable = true)
 |-- studyStartDate: string (nullable = true)
 |-- studyStopReason: string (nullable = true)
 |-- targetFromSource: string (nullable = true)
 |-- targetFromSourceId: string (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- niceName: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |-- studyStopReasonCategory: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [43]:
evd_w_predictions.filter(F.col('studyStopReasonSubclasses').isNotNull()).show(1, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------
 clinicalPhase               | 1                                                                          
 clinicalStatus              | Terminated                                                                 
 datasourceId                | chembl                                                                     
 datatypeId                  | known_drug                                                                 
 diseaseFromSource           | Acute Lymphoblastic Leukemia With Failed Remission                         
 diseaseFromSourceMappedId   | EFO_0000220                                                                
 drugId                      | CHEMBL1421                                                                 
 studyStartDate              | 2018-04-20                                                                 
 studyStopReason             | Due to

- Daniel comment: use the granular class and keep the superclass out from the schema
- Add logic to the BE config. List all subclasses and map to the superclass score.
- How frequently we see more than one superclass.
- Add table instead of bar chart.
- Why are we missing so many stop reasons? 

In [44]:
evd_w_predictions.filter(F.array_contains(F.col('studyStopReasonSubclasses'), 'Safety_Sideeffects')).first()

Row(clinicalPhase=3, clinicalStatus='Terminated', datasourceId='chembl', datatypeId='known_drug', diseaseFromSource='Depression', diseaseFromSourceMappedId='MONDO_0002050', drugId='CHEMBL1175', studyStartDate='2015-05-01', studyStopReason='principle investigator decision due to many adverse events in patients', targetFromSource='CHEMBL222', targetFromSourceId='P23975', urls=[Row(niceName='ClinicalTrials', url='https://clinicaltrials.gov/search?id=%22NCT02443194%22')], studyStopReasonSubclasses=['Safety_Sideeffects', 'Insufficient_Enrollment'], studyStopReasonSuperclasses=['Safety_Sideeffects', 'Neutral'], classes_count='2')

In [45]:
# Hacer downweight en caso de negative o de safety, el score se conv¡erta en 0.
#  


## Scoring




In [None]:
tmp = evd_w_predictions.withColumn('subclass', F.explode('studyStopReasonSubclasses')).filter(F.col('subclass').isNotNull()).select('clinicalPhase', 'subclass')
tmp2 = evd_w_predictions.withColumn('superclass', F.explode('studyStopReasonSuperclasses')).filter(F.col('superclass').isNotNull()).select('clinicalPhase', 'superclass')

In [61]:

print('-------NEGATIVE-------')
print(tmp2.filter(F.col('superclass') == 'Negative').groupby('clinicalPhase').count().show())
print('\n')

print('-------SAFETY-------')
print(tmp2.filter(F.col('superclass') == 'Safety_Sideeffects').groupby('clinicalPhase').count().show())

print('-------SUCCESS-------')
print(tmp2.filter(F.col('superclass') == 'Success').groupby('clinicalPhase').count().show())
print('\n')

print('-------POSSIBLY NEGATIVE-------')
print(tmp2.filter(F.col('superclass') == 'Possibly_Negative').groupby('clinicalPhase').count().show())
print('\n')

print('-------INVALID REASON-------')
print(tmp2.filter(F.col('superclass') == 'Invalid_Reason').groupby('clinicalPhase').count().show())
print('\n')

print('-------NEUTRAL-------')
print(tmp2.filter(F.col('superclass') == 'Neutral').groupby('clinicalPhase').count().show())
print('\n')

-------NEGATIVE-------
+-------------+-----+
|clinicalPhase|count|
+-------------+-----+
|            0|   16|
|            1| 1451|
|            3|  894|
|            2| 2903|
|            4|  248|
+-------------+-----+

None


-------SAFETY-------
+-------------+-----+
|clinicalPhase|count|
+-------------+-----+
|            0|   16|
|            1|  839|
|            3|  449|
|            2| 1516|
|            4|  119|
+-------------+-----+

None
-------SUCCESS-------
+-------------+-----+
|clinicalPhase|count|
+-------------+-----+
|            1|   47|
|            3|  152|
|            2|  183|
|            4|   69|
+-------------+-----+

None


-------POSSIBLY NEGATIVE-------
+-------------+-----+
|clinicalPhase|count|
+-------------+-----+
|            0|  190|
|            1| 3984|
|            3| 2199|
|            2| 5816|
|            4| 1608|
+-------------+-----+

None


-------INVALID REASON-------
+-------------+-----+
|clinicalPhase|count|
+-------------+-----+
|      

In [51]:
tmp = evd_w_predictions.withColumn('subclass', F.explode('studyStopReasonSubclasses')).filter(F.col('subclass').isNotNull()).select('clinicalPhase', 'subclass')
tmp2 = evd_w_predictions.withColumn('superclass', F.explode('studyStopReasonSuperclasses')).filter(F.col('superclass').isNotNull()).select('clinicalPhase', 'superclass')


In [50]:
fig = px.bar(tmp.toPandas(), y='subclass', color='clinicalPhase', title='ChEMBL 22.02 Reasons to Stop Distribution per Phase')
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})

fig.show()

In [52]:
fig2 = px.bar(tmp2.toPandas(), y='superclass', color='clinicalPhase', title='ChEMBL 22.02 Reasons to Stop Distribution per Phase')
fig2.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})

fig2.show()

In [56]:
from pyspark.sql import Window
w = Window.partitionBy('clinicalPhase')
windowed = tmp2.select('clinicalPhase', 'superclass', F.count('clinicalPhase').over(w).alias('n')).sort('clinicalPhase', 'superclass').distinct()

windowed.show()

+-------------+------------------+-----+
|clinicalPhase|        superclass|    n|
+-------------+------------------+-----+
|            0|           Neutral|  686|
|            0| Possibly_Negative|  686|
|            0|    Invalid_Reason|  686|
|            0|          Negative|  686|
|            0|Safety_Sideeffects|  686|
|            1|           Neutral|18214|
|            1| Possibly_Negative|18214|
|            1|    Invalid_Reason|18214|
|            1|Safety_Sideeffects|18214|
|            1|          Negative|18214|
|            1|           Success|18214|
|            2| Possibly_Negative|31223|
|            2|           Neutral|31223|
|            2|          Negative|31223|
|            2|    Invalid_Reason|31223|
|            2|Safety_Sideeffects|31223|
|            2|           Success|31223|
|            3|           Neutral| 9983|
|            3|Safety_Sideeffects| 9983|
|            3| Possibly_Negative| 9983|
+-------------+------------------+-----+
only showing top

In [59]:
fig = px.bar(windowed.toPandas(), x='superclass', y='n', color='clinicalPhase', title='ChEMBL 22.02 Reasons to Stop Distribution per Phase')

fig.show()