## API Pull Code
Author: Kate Meldrum (kmm4ap@virginia.edu)

Requests trial information from the clinicaltrials.gov API and converts it to a pandas DataFrame.

possible fields: https://clinicaltrials.gov/api/info/study_fields_list

DEMO CODE 

In [1]:
import pandas as pd
import numpy as np
import requests
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
import time

In [3]:
# Query settings for the demo request below (change as desired).

# Lowest rank to pull.
min_rnk = "1"

# Highest rank to pull.
max_rnk = "100"

# Field(s) to pull.
fields = "BriefTitle"

# Response format. Do not change: the rest of the code expects CSV.
fmt = "csv"

In [4]:
# Build the study_fields request URL from the settings defined in the cell above.
api_url = (
    'https://clinicaltrials.gov/api/query/study_fields?%20'
    + '&fields=' + fields
    + '&min_rnk=' + min_rnk
    + '&max_rnk=' + max_rnk
    + '&fmt=' + fmt
)
# A timeout keeps the cell from hanging indefinitely if the server never answers.
r = requests.get(api_url, timeout=60)
# Fail here with the HTTP status instead of later, when an error page would be parsed as CSV.
r.raise_for_status()

In [5]:
# The response text is an API header, a blank line, then the CSV table.
# partition() splits on the FIRST blank line only, so a blank line occurring later in the
# text can no longer cut the table short (split('\n\n')[1] kept just the second segment).
api_header, separator, csv_body = str(r.text).partition('\n\n')
if not separator:
    raise ValueError('Unexpected API response: no blank line between header and CSV data')
table = pd.read_csv(StringIO(csv_body))  # convert to pd dataframe

In [6]:
# Display the DataFrame parsed from the demo request.
table

Unnamed: 0,Rank,BriefTitle
0,1,Untreated Sleep Apnea as an Aggravating Factor...
1,2,Uterine Manipulator Versus no Uterine Manipula...
2,3,A Long-term Trial of ETC-1002 in Patients With...
3,4,Empagliflozin in ESKD - A Feasibility Study
4,5,High Flow Nasal Cannula in Patients Undergoing...
...,...,...
95,96,"Laser Speckle Contrast Imaging, Surgical Eye &..."
96,97,Evaluation of Maternal Fetal Cardiac Structure...
97,98,89Zr-Bevacizumab PET/CT Imaging in NF2 Patients
98,99,The Impact of Conventional Hemodialysis and Re...


FUNCTION

In [2]:
# As Function: 
def api_pull(min_rnk,max_rnk,fields):
    '''
    Request study fields from the clinicaltrials.gov study_fields endpoint and
    return the CSV part of the response as a pandas DataFrame.

    min_rnk: STR or INT NUMBER newest trial to request (1 is newest in database)
    max_rnk: STR or INT NUMBER oldest trial to request
    fields: STR fields to request separated by comma in single string

    Returns: pd.DataFrame parsed from the CSV table that follows the API header.

    Raises:
        requests.HTTPError: the server answered with an HTTP error status.
        ValueError: the response has no blank line separating header from table.

    NOTE(review): this URL targets the older "classic" query API; confirm the
    endpoint is still served (ClinicalTrials.gov has since published a v2 API).
    '''
    # str() lets callers pass the ranks as ints as well as numeric strings.
    api_url = (
        'https://clinicaltrials.gov/api/query/study_fields?%20'
        + '&fields=' + fields
        + '&min_rnk=' + str(min_rnk)
        + '&max_rnk=' + str(max_rnk)
        + '&fmt=csv'
    )
    # A timeout prevents a stalled connection from hanging a long multi-request pull.
    r = requests.get(api_url, timeout=60)
    # Surface HTTP failures directly rather than trying to parse an error page as CSV.
    r.raise_for_status()

    # The response text is an API header, a blank line, then the CSV table.
    # Split on the FIRST blank line only: split('\n\n')[1] would keep just the text
    # between the first and second blank line and silently truncate the table.
    _, separator, csv_body = str(r.text).partition('\n\n')
    if not separator:
        raise ValueError('Unexpected API response: no blank line between header and CSV data')
    return pd.read_csv(StringIO(csv_body))  # convert to pd dataframe

In [8]:
# Pull BriefTitle for ranks 1 through 100 using the function form.
api_pull('1','100','BriefTitle')

Unnamed: 0,Rank,BriefTitle
0,1,Untreated Sleep Apnea as an Aggravating Factor...
1,2,Uterine Manipulator Versus no Uterine Manipula...
2,3,A Long-term Trial of ETC-1002 in Patients With...
3,4,Empagliflozin in ESKD - A Feasibility Study
4,5,High Flow Nasal Cannula in Patients Undergoing...
...,...,...
95,96,"Laser Speckle Contrast Imaging, Surgical Eye &..."
96,97,Evaluation of Maternal Fetal Cardiac Structure...
97,98,89Zr-Bevacizumab PET/CT Imaging in NF2 Patients
98,99,The Impact of Conventional Hemodialysis and Re...


GET DATA SET

In [3]:
def get_num_trials():
    '''
    Returns the number of trials currently available from clinicaltrials.gov

    The count is read from the "NStudiesAvail:" entry of the header that the
    study_fields endpoint places in front of its CSV table.

    Raises:
        requests.HTTPError: the server answered with an HTTP error status.
        ValueError: the "NStudiesAvail:" entry is missing or is not a number.
    '''
    api_url='https://clinicaltrials.gov/api/query/study_fields?%20&fields=BriefTitle&min_rnk=1&max_rnk=10&fmt=csv'
    # A timeout prevents the call from hanging if the server never answers.
    r = requests.get(api_url, timeout=60)
    # Surface HTTP failures directly instead of searching an error page for the count.
    r.raise_for_status()

    marker = 'NStudiesAvail:'
    _, found, remainder = str(r.text).partition(marker)
    if not found:
        raise ValueError('NStudiesAvail not found in API response header')
    # The number runs up to the closing double quote of the header entry. Cutting at the
    # quote alone (not quote + '\n') also works when lines end in '\r\n'. int() ignores
    # the space that follows the colon.
    return int(remainder.split('"', 1)[0])
# Show the current trial count.
get_num_trials()

439377

In [4]:
# The pull didn't take that long, so maybe we just want to fetch a fresh copy every time:
def get_df(fields='NCTId,OfficialTitle,OrgFullName,Condition,InterventionType,MinimumAge,MaximumAge,Gender,HealthyVolunteers,EligibilityCriteria', chunk_size=1000):
    '''
    Download the requested fields for every available trial and return them
    as a single DataFrame.

    fields: STR fields to request separated by comma in single string
            (defaults to the field set this notebook was written for)
    chunk_size: INT number of ranks requested per api_pull call (default 1000)

    Returns: pd.DataFrame holding the rows of every chunk, in rank order, with a
             fresh 0..n-1 index. When no trials are available, an empty frame
             with columns 'Rank' + the requested fields is returned.

    Raises:
        ValueError: chunk_size is smaller than 1.
    '''
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    mx = get_num_trials()

    # Walk ranks 1..mx in windows of chunk_size. min() clips the final window, so the
    # last request never has min_rnk > max_rnk (the separate remainder pull did exactly
    # that whenever mx was an exact multiple of the chunk size).
    chunks = []
    for first_rank in range(1, mx + 1, chunk_size):
        last_rank = min(first_rank + chunk_size - 1, mx)
        chunks.append(api_pull(str(first_rank), str(last_rank), fields))

    if not chunks:
        return pd.DataFrame(columns=['Rank'] + fields.split(','))

    # Concatenate once at the end: concatenating inside the loop re-copies the growing
    # frame on every iteration. ignore_index replaces the per-chunk 0-based indices
    # (which would otherwise repeat) with one unique index.
    return pd.concat(chunks, axis=0, ignore_index=True)

In [5]:
# Download the complete data set and print how many seconds the pull took.
st = time.time()
data_set = get_df()
et = time.time()
elapsed = et - st
print(elapsed)

357.6852889060974


In [6]:
# Preview the first rows of the downloaded data set.
data_set.head()

Unnamed: 0,Rank,NCTId,OfficialTitle,OrgFullName,Condition,InterventionType,MinimumAge,MaximumAge,Gender,HealthyVolunteers,EligibilityCriteria
0,1,NCT05693311,Inflammatory Bowel Disease Related Joint Manif...,Assiut University,IBD|Joint Diseases,,18 Years,90 Years,All,,Inclusion Criteria:||Patients previously diagn...
1,2,NCT05693298,High-flow Nasal Cannula Oxygen Therapy for Out...,University Magna Graecia,Gastroscopy,Device|Device,18 Months,,All,No,Inclusion Criteria:||outpatients with the indi...
2,3,NCT05693285,Preterm Birth and Biomarkers for Cardiovascula...,Uppsala University,Preterm Birth|Cardiovascular Diseases,Other,,60 Years,Female,No,Inclusion Criteria:||Women identified as cases...
3,4,NCT05693272,"A Randomized, Observer-Blind, Dose Escalation,...",University of Saskatchewan,Severe Acute Respiratory Syndrome Coronavirus ...,Biological|Biological,18 Years,65 Years,All,Accepts Healthy Volunteers,Inclusion Criteria:||Male and female participa...
4,5,NCT05693259,Improvement Effect of Functional Dyspepsia Aft...,Wonju Severance Christian Hospital,Functional Dyspepsia,Device,,,All,Accepts Healthy Volunteers,Inclusion Criteria:||Symptoms onset should sta...


In [7]:
# Number of rows downloaded, for comparison with the trial count reported by get_num_trials().
len(data_set)

439377

In [8]:
# Trial count reported by the API, for comparison with len(data_set).
get_num_trials()

439377

In [9]:
# Write the downloaded data set to a CSV file.
# NOTE(review): absolute, machine-specific path; adjust before running on another machine.
output_csv_path = '/Users/meldrumapple/Desktop/Capstone/api_pull_data.csv'
data_set.to_csv(output_csv_path)