In [1]:
# Install pandas if you dont have it already
!pip install pandas pytrials

Looking in indexes: https://pypi.org/simple, https://aiuser:****@pypi.isbei.ai/
Collecting pytrials
  Downloading pytrials-0.3.0-py2.py3-none-any.whl.metadata (3.3 kB)
Downloading pytrials-0.3.0-py2.py3-none-any.whl (6.3 kB)
Installing collected packages: pytrials
Successfully installed pytrials-0.3.0


#### Import Libraries

In [2]:
import pandas as pd
from pytrials.client import ClinicalTrials
import re

#### Read Input Data

To download data:
- go to https://clinicaltrials.gov/
- search for expression i.e. cancer 
- go to **Table View**
- click on the download button
- Select CSV file format, all studies and select at least these fields:

***['NCT Number', 'Study Title', 'Study URL', 'Conditions', 'Interventions']***

- save data under data/clinical_trials.csv in this directory

In [24]:
trials = pd.read_csv("data/clinical_trials.csv")

#### PreProcessing

In [201]:
# Dropping null rows
trials = trials.dropna(axis=0, how='any')

#### Filtering only cancer trials

These synonyms can be replaced with for any other drug. They can be found once you search for your studies and click on ***Synonyms of conditions or disease***

In [224]:
# Cancer Synonyms List
cancer_synonyms = ["cancer", "Neoplasms", "Tumors", "Tumor", "Neoplasm", "malignancies", "Cancers",
                    "Oncology", "Malignancy", "Tumour", "tumours", "Neoplasia", "Malignant neoplasm", "malignant tumors",
                    "Malignant tumor", "Malignant Neoplasms", "primary cancer", "Neoplastic Disease", "Cancer NOS",
                    "neoplasias", "neoplastic syndrome", "Primary Malignant Neoplasm", "tumor NOS"," Malignant neoplastic disease",
                    "malignant tumours", "Malignant tumour", "malignant neoplasm primary"]

In [203]:
# Cancer Regex
cancer_regex = "|".join(cancer_synonyms)

In [204]:
# Using regex to filter cancer trials
cancer_trials = trials[trials['Conditions'].str.contains(f"r'{cancer_regex}'", regex=True, flags=re.IGNORECASE) | trials['Study Title'].str.contains(f"r'{cancer_regex}'", regex=True, flags=re.IGNORECASE)]

#### Extracting Drugs Text Only

In [205]:
# Filter interventions that have DRUG mentioned
cancer_drug_trials = cancer_trials[cancer_trials['Interventions'].str.contains(r"DRUG", regex=True)]

In [206]:
# Extract all drugs and dosages within same text
pattern = r"DRUG: (.*?)\|"
drug_names = cancer_drug_trials['Interventions'].str.extractall(pattern)
drug_names.columns = ["Drug w/ dosages"]

In [207]:
# Merging to original dataset for each NCT number multiple drugs can exist so there can be multiple rows
drug_dosages = cancer_drug_trials.reset_index().merge(drug_names.reset_index().drop(columns=['match']), left_on='index', right_on='level_0').drop(columns=['level_0', 'index'])

#### Extracting Units

In [208]:
# Using regex to extract all possible units
drug_dosages['Drug w/ dosages'] = drug_dosages['Drug w/ dosages'].str.lower()
units = drug_dosages['Drug w/ dosages'].str.extract(r"\d+\s*([A-Za-z]+[\^]?(?:/[A-Za-z]+[\^]?[\d+]?)*)", re.IGNORECASE)

In [209]:
# Filter actual units that make sense
units[0].unique()

array([nan, 'f', 'mg', 'mg/ml', 'dose', 'mg/m^2', 'mg/ml/min', 'hp',
       'monoclonal', 'tablets', 'ga', 'dotatate', 'mg/kg/day', 'mg/day',
       'agonist', 'pb', 'or', 'lu', 'capsules', 'dichloride',
       'monotherapy', 'sodium', 'consecutive', 'antibody', 'protein',
       'gel', 'days', 'mg/d', 'mg/m', 'mg/kg', 'darpin', 'ac', 'in', 'b',
       'a', 'mg/m2', 'i', 'cc', 'weeks', 'cw', 'hcl', 'tnf', 'mab', 'bar',
       'inhibitor', 'inhibitors', 'and', 'pembrolizumab', 'for', 'fu',
       'plus', 'c', 'courses', 'low', 'high', 'medium', 'complex', 'mw',
       'mgkg', 'citrate', 'integrin', 'of', 'instillation', 'fluoro',
       'zr', 'injection', 'pyruvate', 'radioisotope', 'daily', 'mcg',
       'mg/m^2/day', 'ascending', 'day', 'mtc', 're', 'gallium', 'weekly',
       'labeled', 'combination', 'q', 'l', 'k', 'd', 'at', 'trike',
       'combined', 'escalating', 'with', 'regimen', 'es', 'ige', 'alpha',
       'single', 'milligram', 'na', 'ml', 'powder', 'granule', 'il', 'on',
 

In [210]:
# All units that will be used to extract dosages
units = ['mg',  'mg/ml', 'mg/m', 'mg/ml/min', 'mg/kg/day', 'mg/day',
        'mg/d', 'mg/kg', 'mg/m2', 'mgkg', 'mcg', 'milligram', 'ml', 'mm', 'ul/kg', 'ppm', 
        'gbq', 'mu/m2', 'mg/m2/dose', 'mci', 'mbq/kg', 'iu/kg', 'g/kgof', 'g/kg',  'ml/day', 'ml/kg', 'mcg/kg', 'microg/ml', 
        'mg/dose', 'ml/h', 'mg/m^2', 'mg/m^2/day', 'g']

In [211]:
# Escaping units regex
units_regex = "|".join(map(re.escape, units))

#### Extracting Dosages

In [212]:
# Using drugs-dosage text and units to extract dosages for each row
dosage_extraction_pattern = fr'((?:\d+(?:\.\d+|,\d+|-\d+|/\d+)*)\s?(?:\b{units_regex}\b)(?:\s|$))'
dosages = drug_dosages['Drug w/ dosages'].str.extractall(dosage_extraction_pattern, re.IGNORECASE)

In [213]:
# Joining multiple dosages for same drug and merging to original dataset
dosages.reset_index(inplace=True, level= 0, names="index")
dosages.columns = ['index', 'dosages']
dosages = dosages.groupby(['index'], as_index=False)['dosages'].apply(lambda x: " + ".join(x))
drug_dosages = drug_dosages.reset_index().merge(dosages, on='index', how='left')

#### Extracting Drugs Only

In [219]:
# Using dosages regex to extract drugs only
drug_dosages['drug'] = drug_dosages['Drug w/ dosages'].str.replace(f"({dosage_extraction_pattern})", "",regex=True)

#### Saving Result

In [221]:
drug_dosages[["NCT Number","Study Title", "Study URL", "Conditions", "Interventions", "drug", "dosages" ]].to_csv("cancer_extracted_dosages.csv", index=False)