## API Pull Code
Author: Kathryn Meldrum (kmm4ap@virginia.edu)

Requests trial information from the ClinicalTrials.gov API and converts it to a pandas DataFrame.

possible fields: https://clinicaltrials.gov/api/info/study_fields_list

## Import Packages

In [8]:
import pandas as pd
import numpy as np
import requests
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
import time

## Define Function to pull csv from clinicaltrials.gov

In [9]:
# As Function: 
def api_pull(min_rnk,max_rnk,fields):
    '''
    Request study fields from the clinicaltrials.gov API and return them as a pandas DataFrame.

    min_rnk: STR or INT rank of the newest trial to request (1 is newest in database)
    max_rnk: STR or INT rank of the oldest trial to request
    fields: STR fields to request separated by comma in single string

    Returns: pd.DataFrame parsed from the CSV part of the response
    Raises: requests.HTTPError if the API answers with an error status,
            ValueError if the response has no CSV body after the API header
    '''
    # define url structure (ranks are coerced so ints work as well as strings)
    api_url = (
        'https://clinicaltrials.gov/api/query/study_fields?%20&fields=' + fields
        + '&min_rnk=' + str(min_rnk)
        + '&max_rnk=' + str(max_rnk)
        + '&fmt=csv'
    )
    # request from api; the timeout keeps a stalled connection from hanging a long pull forever
    r = requests.get(api_url, timeout=120)
    # fail loudly on 4xx/5xx instead of trying to parse an error page as CSV
    r.raise_for_status()

    # the API header is separated from the CSV table by a blank line
    sections = r.text.split('\n\n')
    if len(sections) < 2:
        raise ValueError('clinicaltrials.gov response has no CSV body for ranks '
                         + str(min_rnk) + '-' + str(max_rnk))
    # drop the API header and parse the table
    return pd.read_csv(StringIO(sections[1]))

Demo

In [10]:
api_pull('1','100','BriefTitle')

Unnamed: 0,Rank,BriefTitle
0,1,Supplementing Brief Psychotherapy With a Mobil...
1,2,A Study of RC198 in Patients With Locally Adva...
2,3,Music for Sleep After Stroke
3,4,"Effect of Music on Nursing Students' Skills, A..."
4,5,Effect of Early Postoperative Oral Carbohydrat...
...,...,...
95,96,Stent Omission After Ureteroscopy and Lithotri...
96,97,Use of Clomiphene Citrate as an Inhibitor of O...
97,98,A Phase 1 Dose Escalation Study of VX-973 in H...
98,99,Boosting Psychotherapy Effects by Means of Tra...


## Define function to get the number of trials currently on ClinicalTrials.gov (CTG)

In [11]:
def get_num_trials():
    '''
    Returns the number of trials currently available from clinicaltrials.gov

    The count is read from the 'NStudiesAvail:' entry of the API header that
    precedes the CSV table in a small (10 trial) request.

    Returns: INT number of studies available
    Raises: requests.HTTPError if the API answers with an error status,
            ValueError if the header does not contain an NStudiesAvail count
    '''
    import re  # local import so this cell does not depend on the notebook's import cell

    api_url='https://clinicaltrials.gov/api/query/study_fields?%20&fields=BriefTitle&min_rnk=1&max_rnk=10&fmt=csv'
    r = requests.get(api_url, timeout=120)
    r.raise_for_status()

    # match the digits directly rather than relying on the exact quote/newline that follows them
    found = re.search(r'NStudiesAvail:\s*(\d+)', r.text)
    if found is None:
        raise ValueError('NStudiesAvail not found in clinicaltrials.gov response header')
    return int(found.group(1))
# Notebook demo: show the current trial count (this performs a live API request)
get_num_trials()

452745

## Load Custom NER Model

In [30]:
# Load the custom spaCy NER pipeline from its saved directory
import spacy
nlp = spacy.load('/Users/meldrumapple/Desktop/Capstone/mod_chia/model-best')

# Entity labels tracked in ents_dict
label_list = [
    'Person', 'Condition', 'Drug', 'Observation', 'Measurement', 'Procedure',
    'Device', 'Visit', 'Negation', 'Qualifier', 'Temporal', 'Value',
    'Multiplier', 'Reference_point', 'Mood', 'Post-eligibility',
    'Pregnancy_considerations', 'Informed_consent',
]

# Maps each label to its own (initially empty) list of collected spans
ents_dict = {label: [] for label in label_list}

# define function to get labels and spans for 2 columns of text, return lists of labels and spans
def analyze(inc, exc):
    '''
    Run the NER model over paired inclusion / exclusion criteria texts.

    inc: iterable of inclusion criteria text, one item per trial
    exc: iterable of exclusion criteria text, paired with inc by position;
         None / NaN items are treated as "no exclusion criteria"

    Returns: tuple (text_col, ents_col, spans_col) of lists, one item per trial
        text_col: combined 'Inclusion Criteria: ... Exclusion Criteria: ...' string
        ents_col: list of entity labels found in the trial's text
        spans_col: list of the matching lower-cased entity spans (same order as ents_col)

    Side effect: every span is also appended to the module-level ents_dict under its label.
    '''
    text_col = []
    ents_col = []
    spans_col = []

    for (raw_inc, raw_exc) in zip(inc, exc):
        # Missingness has to be tested BEFORE str(): str(None) is the string 'None',
        # so a check made after the conversion can never detect a missing value.
        exc_missing = raw_exc is None or pd.isna(raw_exc)

        txt1 = str(raw_inc).replace('|', ' ')
        txt2 = str(raw_exc).replace('|', ' ')

        ents = []
        spans = []

        # only run the model on the exclusion text when there actually is some
        to_scan = [txt1] if exc_missing else [txt1, txt2]
        for txt in to_scan:
            for e in nlp(txt).ents:
                span = txt[e.start_char:e.end_char].lower()
                ents.append(e.label_)
                spans.append(span)
                # setdefault: a label missing from label_list is recorded instead of raising KeyError
                ents_dict.setdefault(e.label_, []).append(span)

        # r'\n' is a raw string: a literal backslash-n is written between the two parts.
        # The stored text is unchanged for missing exclusions (stringified placeholder).
        text_col.append(str('Inclusion Criteria: '+txt1+r'\n'+' Exclusion Criteria: '+txt2))
        ents_col.append(ents)
        spans_col.append(spans)
    return (text_col, ents_col, spans_col)

# define function to get entity span indexes for two columns of text, return one list of dicts per column
def analyze2(inc, exc):
    '''
    Run the NER model over paired inclusion / exclusion criteria texts, keeping character offsets.

    inc: iterable of inclusion criteria text, one item per trial
    exc: iterable of exclusion criteria text, paired with inc by position;
         None / NaN items are treated as "no exclusion criteria"

    Returns: tuple (inc_col, exc_col) of lists, one dict per trial:
        {'text': <text with '|' replaced by ' '>,
         'entities': [(start_char, end_char, label), ...]}
        For a missing exclusion text 'entities' is pd.NA.
    '''
    inc_col = []
    exc_col = []

    for (raw_inc, raw_exc) in zip(inc, exc):
        # Missingness has to be tested BEFORE str(): str(None) is the string 'None',
        # so a check made after the conversion can never detect a missing value.
        exc_missing = raw_exc is None or pd.isna(raw_exc)

        txt1 = str(raw_inc).replace('|', ' ')
        txt2 = str(raw_exc).replace('|', ' ')

        doc1 = nlp(txt1)
        pred_ents_inc = [(e.start_char, e.end_char, e.label_) for e in doc1.ents]
        inc_col.append({'text': txt1, 'entities': pred_ents_inc})

        if exc_missing:
            pred_ents_exc = pd.NA
        else:
            doc2 = nlp(txt2)
            pred_ents_exc = [(e.start_char, e.end_char, e.label_) for e in doc2.ents]
        # store the EXCLUSION text and entities (the inclusion ones were stored here before)
        exc_col.append({'text': txt2, 'entities': pred_ents_exc})

    return (inc_col, exc_col)



## Define function to concat trial info and NER Results

In [46]:
def get_df():
    '''
    Pull every trial currently listed by the API in batches of 1000 ranks, run
    analyze() on the eligibility criteria, and return everything as one DataFrame.

    Returns: pd.DataFrame with the requested trial fields plus
             'CriteriaText', 'Entities' and 'Spans' from analyze()
    Note: each batch's min and max rank are printed as progress, and the
          returned frame keeps each batch's own row index (no re-indexing).
    '''
    # define fields to pull:
    fields='NCTId,OfficialTitle,OrgFullName,CompletionDate,CompletionDateType,Condition,InterventionType,MinimumAge,MaximumAge,Gender,HealthyVolunteers,EligibilityCriteria'
    batch_size = 1000
    # define max rank to pull
    mx=get_num_trials()
    # the empty frame goes first so it fixes the column order of the result
    frames = [pd.DataFrame(columns=['NCTId','OfficialTitle','OrgFullName','CompletionDate', 'CompletionDateType','Condition','InterventionType','MinimumAge','MaximumAge','Gender','HealthyVolunteers','CriteriaText','Entities','Spans'])]

    #helper function to clean chunk
    def clean_chunk(chunk):
        # drop the first 21 characters -- presumably the leading 'Inclusion Criteria:||'
        # header (19 + 2 characters); TODO confirm against the API output
        criteria = chunk.EligibilityCriteria.str[21:]
        # split eligibility into inclusion and exclusion
        parts = criteria.str.split("Exclusion Criteria:\\|\\|",expand=True)
        # always exactly two columns: if no row in the chunk contains the exclusion
        # header the split yields one column and the assignment below would fail
        parts = parts.reindex(columns=[0, 1])
        chunk[['InclusionCriteria', 'ExclusionCriteria']] = parts
        # perform nlp analysis and store
        text_col, ents_col, spans_col = analyze(chunk['InclusionCriteria'], chunk['ExclusionCriteria'])
        chunk['CriteriaText'] = text_col
        chunk['Entities'] = ents_col
        chunk['Spans'] = spans_col
        # drop cols we don't want
        return chunk.drop(['EligibilityCriteria','InclusionCriteria', 'ExclusionCriteria', 'Rank'], axis=1)

    # One loop covers the full batches and the final partial batch. When mx is an
    # exact multiple of batch_size there is no extra request with min_rnk > max_rnk.
    for start in range(1, mx + 1, batch_size):
        # define ranks to pull
        min_rnk = str(start)
        max_rnk = str(min(start + batch_size - 1, mx))
        print(min_rnk, max_rnk)

        # pull ranks
        chunk = api_pull(min_rnk, max_rnk, fields)
        frames.append(clean_chunk(chunk))

    # concatenate once at the end; concatenating inside the loop re-copies all earlier rows every batch
    return pd.concat(frames, axis=0)

## Alternate concat function with only NER results and sparse trial info

In [57]:
def get_df2(): #this one pulls entity span indexes
    '''
    Pull every trial currently listed by the API in batches of 1000 ranks, run
    analyze2() on the eligibility criteria, and return everything as one DataFrame.

    Returns: pd.DataFrame with the requested trial fields plus
             'InclusionNER' and 'ExclusionNER' from analyze2()
    Note: each batch's min and max rank are printed as progress, and the
          returned frame keeps each batch's own row index (no re-indexing).
    '''
    # define fields to pull:
    fields='NCTId,OfficialTitle,CompletionDate,EligibilityCriteria'
    batch_size = 1000
    # define max rank to pull
    mx=get_num_trials()
    # the empty frame goes first so it fixes the order of the leading columns
    frames = [pd.DataFrame(columns=['NCTId','OfficialTitle','CompletionDate','InclusionNER', 'ExclusionNER'])]

    #helper function to clean chunk
    def clean_chunk(chunk):
        # drop the first 21 characters -- presumably the leading 'Inclusion Criteria:||'
        # header (19 + 2 characters); TODO confirm against the API output
        criteria = chunk.EligibilityCriteria.str[21:]
        # split eligibility into inclusion and exclusion
        parts = criteria.str.split("Exclusion Criteria:\\|\\|",expand=True)
        # always exactly two columns: if no row in the chunk contains the exclusion
        # header the split yields one column and the assignment below would fail
        parts = parts.reindex(columns=[0, 1])
        chunk[['InclusionCriteria', 'ExclusionCriteria']] = parts
        # perform nlp analysis and store
        inc_ner, exc_ner = analyze2(chunk['InclusionCriteria'], chunk['ExclusionCriteria'])
        chunk['InclusionNER'] = inc_ner
        chunk['ExclusionNER'] = exc_ner
        return chunk.drop(['EligibilityCriteria', 'InclusionCriteria', 'ExclusionCriteria'], axis=1)

    # One loop covers the full batches and the final partial batch. When mx is an
    # exact multiple of batch_size there is no extra request with min_rnk > max_rnk.
    for start in range(1, mx + 1, batch_size):
        # define ranks to pull
        min_rnk = str(start)
        max_rnk = str(min(start + batch_size - 1, mx))
        print(min_rnk, max_rnk)

        # pull ranks
        chunk = api_pull(min_rnk, max_rnk, fields)
        frames.append(clean_chunk(chunk))

    # concatenate once at the end; concatenating inside the loop re-copies all earlier rows every batch
    return pd.concat(frames, axis=0)

## Demo

In [58]:
# Get streamlit-style dataframe
# st=time.time()
# data_set=get_df()
# et=time.time()
# print('Time Elapsed: '+str(et-st))

In [59]:
# print(data_set.head())

In [60]:
#data_set.to_csv('/Users/meldrumapple/Desktop/Capstone/api_pull_data_sl.csv')

In [61]:
#len(pd.read_csv('/Users/meldrumapple/Desktop/Capstone/api_pull_data_sl.csv'))

In [62]:
# make a padded style dataframe with all possible spans for entity types for streamlit
# set_ents_dict={}
# longest=0
# for x in ents_dict.keys():
#     if len(list(set(ents_dict[x])))>longest:
#         longest=len(list(set(ents_dict[x])))
# for x in ents_dict.keys():
#     spans=list(set(ents_dict[x]))
#     pad=longest-len(spans)
#     set_ents_dict[x]=list(set(ents_dict[x]))+[pd.NA]*pad
# ents_df=pd.DataFrame(set_ents_dict)
# ents_df.to_csv('/Users/meldrumapple/Desktop/Capstone/ents_spans_small.csv')

In [63]:
# Build the NER results dataframe, time the pull, and save it as CSV
results_path = '/Users/meldrumapple/Desktop/Capstone/results.csv'

st = time.time()
data_set = get_df2()
et = time.time()

data_set.to_csv(results_path)

print('Time Elapsed: {}'.format(et - st))
print(data_set.head())
# row count in memory, then row count of the file read back from disk
print(len(data_set))
print(len(pd.read_csv(results_path)))

1 1000
1001 2000
2001 3000
3001 4000
4001 5000
5001 6000
6001 7000
7001 8000
8001 9000
9001 10000
10001 11000
11001 12000
12001 13000
13001 14000
14001 15000
15001 16000
16001 17000
17001 18000
18001 19000
19001 20000
20001 21000
21001 22000
22001 23000
23001 24000
24001 25000
25001 26000
26001 27000
27001 28000
28001 29000
29001 30000
30001 31000
31001 32000
32001 33000
33001 34000
34001 35000
35001 36000
36001 37000
37001 38000
38001 39000
39001 40000
40001 41000
41001 42000
42001 43000
43001 44000
44001 45000
45001 46000
46001 47000
47001 48000
48001 49000
49001 50000
50001 51000
51001 52000
52001 53000
53001 54000
54001 55000
55001 56000
56001 57000
57001 58000
58001 59000
59001 60000
60001 61000
61001 62000
62001 63000
63001 64000
64001 65000
65001 66000
66001 67000
67001 68000
68001 69000
69001 70000
70001 71000
71001 72000
72001 73000
73001 74000
74001 75000
75001 76000
76001 77000
77001 78000
78001 79000
79001 80000
80001 81000
81001 82000
82001 83000
83001 84000
84001 85000
85