## API Pull Code
Author: Kate Meldrum (kmm4ap@virginia.edu)

Requests trial information from the ClinicalTrials.gov API and converts it to a pandas DataFrame.

possible fields: https://clinicaltrials.gov/api/info/study_fields_list

DEMO CODE 

In [21]:
import re
import sys

import numpy as np
import pandas as pd
import requests

if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

In [3]:
# Field Definitions (change as desired)
# Ranks are strings because they are concatenated straight into the request URL.
min_rnk='1' #min pull rank (first rank requested)
max_rnk='100' #max pull rank (last rank requested)
fields='BriefTitle' #Fields to pull, comma-separated in a single string
fmt='csv' #Do not change: the parsing cells read the response with pd.read_csv

In [4]:
#define url structure
api_url='https://clinicaltrials.gov/api/query/study_fields?%20&fields='+fields+'&min_rnk='+min_rnk+'&max_rnk='+max_rnk+'&fmt='+fmt
r = requests.get(api_url, timeout=60) #timeout so a stalled connection cannot hang the notebook
r.raise_for_status() #stop here on an HTTP error instead of failing later while parsing an error page

In [5]:
# the response text is an API header, a blank line, then the CSV table: keep the part after the first blank line
csv_text=str(r.text).split('\n\n')[1]
table=pd.read_csv(StringIO(csv_text)) # convert to pd dataframe

In [6]:
# display the parsed DataFrame
table

Unnamed: 0,Rank,BriefTitle
0,1,Untreated Sleep Apnea as an Aggravating Factor...
1,2,Uterine Manipulator Versus no Uterine Manipula...
2,3,A Long-term Trial of ETC-1002 in Patients With...
3,4,Empagliflozin in ESKD - A Feasibility Study
4,5,High Flow Nasal Cannula in Patients Undergoing...
...,...,...
95,96,"Laser Speckle Contrast Imaging, Surgical Eye &..."
96,97,Evaluation of Maternal Fetal Cardiac Structure...
97,98,89Zr-Bevacizumab PET/CT Imaging in NF2 Patients
98,99,The Impact of Conventional Hemodialysis and Re...


FUNCTION

In [22]:
# As Function: 
def api_pull(min_rnk,max_rnk,fields,timeout=60):
    '''
    Request study fields from the clinicaltrials.gov API and return them as a pd dataframe.

    min_rnk: STR or INT NUMBER newest trial to request (1 is newest in database)
    max_rnk: STR or INT NUMBER oldest trial to request
    fields: STR fields to request separated by comma in single string
    timeout: seconds passed to requests.get as its timeout (default 60)

    Returns: pd.DataFrame read from the CSV table in the response.
    Raises: requests.HTTPError (from raise_for_status) on an HTTP error status;
            ValueError when the response has no blank line separating the API
            header from the table.
    '''
    #define url structure; str() lets callers pass the ranks as ints or as strings
    api_url=('https://clinicaltrials.gov/api/query/study_fields?%20&fields='+fields
             +'&min_rnk='+str(min_rnk)+'&max_rnk='+str(max_rnk)+'&fmt=csv')
    r = requests.get(api_url, timeout=timeout) #request from api
    r.raise_for_status() #fail here rather than while parsing an error page
    parts=r.text.split('\n\n') #API header and CSV table are separated by a blank line
    if len(parts)<2:
        raise ValueError('unexpected API response (no blank line between header and table): '+r.text[:200])
    return pd.read_csv(StringIO(parts[1])) # convert to pd dataframe

In [8]:
# same request as the demo cells (ranks 1-100, BriefTitle only), now made through the function
api_pull('1','100','BriefTitle')

Unnamed: 0,Rank,BriefTitle
0,1,Untreated Sleep Apnea as an Aggravating Factor...
1,2,Uterine Manipulator Versus no Uterine Manipula...
2,3,A Long-term Trial of ETC-1002 in Patients With...
3,4,Empagliflozin in ESKD - A Feasibility Study
4,5,High Flow Nasal Cannula in Patients Undergoing...
...,...,...
95,96,"Laser Speckle Contrast Imaging, Surgical Eye &..."
96,97,Evaluation of Maternal Fetal Cardiac Structure...
97,98,89Zr-Bevacizumab PET/CT Imaging in NF2 Patients
98,99,The Impact of Conventional Hemodialysis and Re...


GET INITIAL DATA SET

In [9]:
# To get all trials, pull 1000 (the max) at a time.
# Only need to run this code once - then write separate code to update.

# total number of trials when this was run: 438902
fields='NCTId,OfficialTitle,OrgFullName,Condition,InterventionType,MinimumAge,MaximumAge,Gender,HealthyVolunteers,EligibilityCriteria'
out_dir='/Users/meldrumapple/Desktop/Capstone/api_pull/'
for i in range(438):
    # window i covers ranks i*1000+1 through (i+1)*1000
    min_rnk, max_rnk = str(i*1000+1), str(i*1000+1000)
    df=api_pull(min_rnk,max_rnk,fields)
    df.to_csv(out_dir+str(i)) # one file per window, named by the window number

In [10]:
#get the last 902 trials: 
# ranks 438001-438902 are the partial window left after the 438 full windows of 1000;
# the result is saved next to the other windows as file 438
df=api_pull('438001','438902',fields) 
df.to_csv('/Users/meldrumapple/Desktop/Capstone/api_pull/'+str(438))

In [11]:
#read in all the files we just made (windows 0 through 438), one dataframe per file
pull_dir='/Users/meldrumapple/Desktop/Capstone/api_pull/'
file_list=[pd.read_csv(pull_dir+str(n)) for n in range(439)]

In [16]:
# combine into one df, then drop the first two columns by position
# (presumably the saved index and the Rank column - verify against the files)
# NOTE(review): row labels restart in every file, so the combined index is not unique
full_pull=pd.concat(file_list, axis=0).iloc[:,2:]
full_pull

Unnamed: 0,NCTId,OfficialTitle,OrgFullName,Condition,InterventionType,MinimumAge,MaximumAge,Gender,HealthyVolunteers,EligibilityCriteria
0,NCT05687097,Untreated Sleep-related Breathing Disorders as...,"University Health Network, Toronto","Sleep Apnea|Spinal Cord Injuries|Spasticity, M...",Device|Device|Device,18 Years,,All,No,Inclusion Criteria:||Individuals with subacute...
1,NCT05687084,Randomized Controlled Trial on the Oncologic O...,Universita di Verona,Endometrial Neoplasms,Device,18 Years,100 Years,Female,No,Inclusion Criteria:||Diagnosis of Endometrial ...
2,NCT05687071,"A Multicenter, Open-label, Uncontrolled, Long-...","Otsuka Pharmaceutical Co., Ltd.",Hyper-low-density Lipoprotein (LDL) Cholestero...,Drug,18 Years,85 Years,All,No,Inclusion Criteria:||Patients with inadequate ...
3,NCT05687058,Empagliflozin in ESKD - A Feasibility Study,University of Mississippi Medical Center,"Kidney Failure, Chronic|Heart Failure",Drug|Drug,18 Years,,All,No,Inclusion Criteria:||age ≥18 years;|diagnosis ...
4,NCT05687045,High-flow Nasal Cannula Oxygen Therapy for Out...,University Magna Graecia,Colonoscopy,Device|Device,18 Years,,All,No,Inclusion Criteria:||outpatients with the indi...
...,...,...,...,...,...,...,...,...,...,...
897,NCT00000107,,National Center for Research Resources (NCRR),"Heart Defects, Congenital",,17 Years,60 Years,All,Accepts Healthy Volunteers,Inclusion Criteria:||Resting blood pressure be...
898,NCT00000106,,National Center for Research Resources (NCRR),Rheumatic Diseases,Device,18 Years,65 Years,All,No,Inclusion Criteria:||Patients are required to ...
899,NCT00000105,Vaccination With Tetanus Toxoid and Keyhole Li...,"Masonic Cancer Center, University of Minnesota",Cancer,Biological|Biological|Drug|Biological,18 Years,,All,Accepts Healthy Volunteers,Inclusion Criteria:||Patients must have a diag...
900,NCT00000104,,National Center for Research Resources (NCRR),Lead Poisoning,Procedure,0 Years,,Female,Accepts Healthy Volunteers,Inclusion Criteria:||Pregnant mothers of the P...


In [18]:
# write big df to csv (note: the file name has no .csv extension):
full_pull.to_csv('/Users/meldrumapple/Desktop/Capstone/api_pull_data')

UPDATE INITIAL DATA SET

In [35]:
def get_num_trials():
    '''
    Returns the number of trials currently available from clinicaltrials.gov

    Sends a small study_fields request and reads the integer that follows
    "NStudiesAvail:" in the response text.

    Raises: requests.HTTPError (from raise_for_status) on an HTTP error status;
            ValueError when "NStudiesAvail:" is not in the response.
    '''
    api_url='https://clinicaltrials.gov/api/query/study_fields?%20&fields=BriefTitle&min_rnk=1&max_rnk=10&fmt=csv'
    r = requests.get(api_url, timeout=60)
    r.raise_for_status() #surface HTTP errors instead of failing while parsing an error page
    #match the digits directly so the quoting and \n vs \r\n line endings do not matter
    match=re.search(r'NStudiesAvail:\s*(\d+)', r.text)
    if match is None:
        raise ValueError('NStudiesAvail not found in API response')
    return int(match.group(1))
# check the count currently reported by the API
get_num_trials()

438902

In [47]:
## CURRENTLY UNTESTED, USE AT YOUR OWN RISK 
def update_df(df_old):
    '''
    Append trials added since df_old was pulled and return the updated dataframe.

    df_old: pd.DataFrame of previously pulled trials, one row per trial

    The number of new trials is get_num_trials() - len(df_old); they are requested
    as ranks 1..N in windows of at most 1000 and appended after the rows of df_old.
    NOTE(review): this assumes the new trials occupy the lowest ranks and that no
    trial was removed since df_old was pulled - confirm against the API.

    Returns: df_old itself when there is nothing to add, otherwise a new dataframe.
    Prints how many trials were added.
    '''
    fields='NCTId,OfficialTitle,OrgFullName,Condition,InterventionType,MinimumAge,MaximumAge,Gender,HealthyVolunteers,EligibilityCriteria'
    chunk=1000 #largest window of ranks requested per call
    mx=get_num_trials()
    new_trials=max(mx-len(df_old), 0) #always defined, so the summary print cannot fail
    pulled=[]
    #one request per window: 1-1000, 1001-2000, ..., with the last window capped at new_trials
    for start in range(1, new_trials+1, chunk):
        stop=min(start+chunk-1, new_trials)
        df=api_pull(str(start), str(stop), fields)
        if 'Rank' not in df_old.columns:
            #the API table comes with a Rank column; drop it so new rows line up with df_old
            df=df.drop(columns=['Rank'], errors='ignore')
        pulled.append(df)
    #concatenate once instead of growing the dataframe inside the loop
    df_new=pd.concat([df_old]+pulled, axis=0) if pulled else df_old
    print(str(new_trials)+' trials added to df')
    return df_new