## Imports

In [1]:
import numpy as np
import pandas as pd
import requests
import urllib.parse
import json
import os
import requests
from tqdm import tqdm

## Helper Functions for Data Processing/Loading

Trials are selected if they
  - match one of the listed chronic conditions (chronic kidney disease, diabetes, obesity, cancer, hypertension)
  - are interventional studies
  - are complete
  - have at least 6 reported baseline measures on the ClinicalTrials.gov portal

Data extracted for each trial
  - NCTId
  - Brief Title
  - Condition
  - Brief Summary
  - Eligibility Criteria
  - Baseline MeasureTitle
  - Primary Outcome Measure
  - Intervention Name
  - Study Type (we don't use this for generation as all studies we use are interventional)
  - TrialGroup (the search condition that returned the trial; we don't use this as trial meta-data)

In [4]:
def fetch_api_v2_data(condition, size, timeout=60):
  '''
  Fetches data from the Clinical Trials API v2.
  condition: The condition to search for.
  size: The number of maximum results/trials to return for each condition.
  timeout: Seconds to wait for the server before giving up (passed to requests.get).
  return: The decoded JSON body of the response (a dict; the matching trials are under its 'studies' key).
  raises: requests.HTTPError if the API answers with any status code other than 200.
  '''

  #this is the main URL of the API
  base_url = 'https://clinicaltrials.gov/api/v2/studies'

  #parameters of the API call
  ## query.cond - is the condition of the trial we are searching for (e.g. diabetes/hypertension etc.)
  ## pageSize - is the max number of trials fetched for each condition
  ## filter.advanced - requires all fetched trials to be - interventional, complete and have at least 6 or more baseline measures
  ## fields - is the fields/information of the trial we are fetching
  ## NOTE(review): the filter.advanced clauses are joined with '|' - confirm against the API's
  ## search-expression documentation that this combines them as AND, as intended above
  params = {
      'query.cond': condition,
      'pageSize' : size,
      'filter.advanced': 'AREA[StudyType]INTERVENTIONAL | AREA[OverallStatus]COMPLETED | AREA[BaselineMeasure:size]RANGE[6,MAX]',
      'fields': 'NCTId|BriefTitle|Condition|BriefSummary|EligibilityCriteria|BaselineMeasureTitle|PrimaryOutcomeMeasure|InterventionName|StudyType'
  }

  #make the API call; the timeout keeps a stalled connection from hanging the notebook
  response = requests.get(base_url, params=params, timeout=timeout)

  #fail loudly on anything but success, instead of falling through to an
  #unassigned result variable (which surfaced as an unrelated UnboundLocalError)
  if response.status_code != 200:
      raise requests.HTTPError(
          f"Error fetching data, status code: {response.status_code}",
          response=response,
      )

  # Process the response if successful and return JSON
  return response.json()



In [17]:
def _dig(mapping, *keys, default=''):
  '''Walks nested dicts along keys and returns default as soon as one key is absent.'''
  node = mapping
  for key in keys:
      if not isinstance(node, dict) or key not in node:
          return default
      node = node[key]
  return node


def _join_with_sep(values):
  '''
  Concatenates values, each followed by ', '.
  The trailing separator is kept on purpose so the output matches the format
  this parser has always produced (e.g. 'Paricalcitol, Cinacalcet, ').
  '''
  return ''.join(value + ', ' for value in values)


def parse_api_v2_data_to_df(data, group):

  '''
  Parses the data object returned from the Clinical Trials API v2, and converts it into a df.
  data: The decoded JSON response (a dict) whose 'studies' entry is the list of trials to parse.
  group: Value stored in the 'TrialGroup' column of every row (the notebook passes the search condition).
  return: A DataFrame containing the parsed data, one row per study.
  raises: KeyError if 'studies' is missing from data, or a study has no NCT id.

  Any other section missing from a study yields an empty string in the corresponding
  column instead of aborting the whole parse.
  '''

  # Create empty lists to store the extracted data
  nct_ids = []
  brief_titles = []
  eligibility_criteria = []
  brief_summaries = []
  conditions = []
  study_types = []
  interventions = []
  primary_outcomes = []
  measures = []

  # Iterate over the list of studies
  for study in data['studies']:

      # The NCT id is the only field treated as mandatory: a row without it is unusable
      nct_ids.append(study['protocolSection']['identificationModule']['nctId'])

      protocol = study.get('protocolSection', {})

      # Single-valued text fields; '' when the section is absent
      brief_titles.append(_dig(protocol, 'identificationModule', 'briefTitle'))
      eligibility_criteria.append(_dig(protocol, 'eligibilityModule', 'eligibilityCriteria'))
      brief_summaries.append(_dig(protocol, 'descriptionModule', 'briefSummary'))
      study_types.append(_dig(protocol, 'designModule', 'studyType'))

      # Multi-valued fields, flattened into one ', '-separated string each
      conditions.append(_join_with_sep(
          _dig(protocol, 'conditionsModule', 'conditions', default=())))
      interventions.append(_join_with_sep(
          item['name']
          for item in _dig(protocol, 'armsInterventionsModule', 'interventions', default=())))
      primary_outcomes.append(_join_with_sep(
          item['measure']
          for item in _dig(protocol, 'outcomesModule', 'primaryOutcomes', default=())))

      # Baseline measures are read from the results section, not the protocol section
      measures.append(_join_with_sep(
          item['title']
          for item in _dig(study, 'resultsSection', 'baselineCharacteristicsModule', 'measures', default=())))

  # Create a DataFrame from the extracted data
  studies_df = pd.DataFrame({
      'NCTId': nct_ids,
      'BriefTitle': brief_titles,
      'EligibilityCriteria': eligibility_criteria,
      'BriefSummary': brief_summaries,
      'Conditions': conditions,
      'StudyType': study_types,
      'Interventions': interventions,
      'PrimaryOutcomes': primary_outcomes,
      'BaselineMeasures': measures,
      'TrialGroup' : group
  })

  return studies_df



In [18]:
#run the loop for fetching data from clinicaltrials.gov
#one API request per disease/condition; the per-condition dataframes are stacked into final_df

conditions = ['chronic kidney disease', 'diabetes', 'obesity', 'cancer', 'hypertension']

#this is the number of maximum trials to return per condition
#tune it as needed, I used 500
max_number_of_trials_to_return = 500

#collect one dataframe per condition and concatenate once at the end,
#instead of re-copying a growing dataframe on every iteration
condition_dfs = []

for condition in conditions:

  #fetch the json response
  data = fetch_api_v2_data(condition, size=max_number_of_trials_to_return)

  #convert the json response into a dataframe
  df = parse_api_v2_data_to_df(data, condition)

  #keep it for the single concat below
  condition_dfs.append(df)

#pd.concat rejects an empty list, so fall back to an empty dataframe when there are no conditions
final_df = pd.concat(condition_dfs, ignore_index=True) if condition_dfs else pd.DataFrame()

In [19]:
#preview the first 3 rows of the combined dataframe
final_df.head(3)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,StudyType,Interventions,PrimaryOutcomes,BaselineMeasures,TrialGroup
0,NCT03196076,Contrast-enhanced Ultrasound for Complex Kidne...,Inclusion Criteria:\n\nTo be eligible for the ...,The purpose of this research study is to evalu...,"Chronic Kidney Diseases, Cystic Kidney Disease,",INTERVENTIONAL,"Perflutren Lipid microsphere,",Number of Participants With Change in Radiolog...,"Age, Categorical, Age, Continuous, Sex: Female...",chronic kidney disease
1,NCT00977080,Evaluation of the Effectiveness of Paricalcito...,Inclusion Criteria\n\n1. Male or female patien...,Evaluates the effectiveness of on-label Parica...,"Chronic Kidney Disease, Secondary Hyperparathy...",INTERVENTIONAL,"Paricalcitol, Cinacalcet,",The Number of Participants Who Achieve a Mean ...,"Age Continuous, Sex: Female, Male, History of ...",chronic kidney disease
2,NCT02915029,Home-base Kidney Care in Zuni Indians,Inclusion Criteria:\n\n* Clinical diagnosis of...,People reach End Stage Renal Disease (ESRD) du...,"Chronic Kidney Disease,",INTERVENTIONAL,"Educational and lifestyle coaching,",Patient Activation Measure (PAM) -13 Item Ques...,"Age, Continuous, Sex: Female, Male, Race (NIH/...",chronic kidney disease


In [15]:
#size of the dataframe as (rows, columns) - before any processing
final_df.shape

(1825, 10)

In [20]:
#see the number of trials (rows) in each TrialGroup
final_df.TrialGroup.value_counts()

Unnamed: 0_level_0,count
TrialGroup,Unnamed: 1_level_1
diabetes,500
cancer,500
obesity,345
hypertension,317
chronic kidney disease,163


In [22]:
#save the dataset to Dataset.csv (path is relative to the working directory), without the index column
final_df.to_csv('Dataset.csv', index=False)