In [44]:
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math as math

In [45]:
# Metadata ("info") endpoints of the ClinicalTrials.gov API, each fetched as JSON.

fields_url = 'http://ClinicalTrials.gov/api/info/study_fields_list?fmt=JSON'
def_url = 'http://ClinicalTrials.gov/api/info/api_defs?fmt=JSON'
struct_url = 'http://ClinicalTrials.gov/api/info/study_structure?fmt=JSON'
stats_url = 'http://ClinicalTrials.gov/api/info/study_statistics?fmt=JSON'
areas_url = 'http://ClinicalTrials.gov/api/info/search_areas?fmt=JSON'

# One GET per endpoint, issued in the order listed; every response body is decoded from JSON.
studyFields, api_def, api_struct, study_stats, search_areas = [
    requests.get(info_url).json()
    for info_url in (fields_url, def_url, struct_url, stats_url, areas_url)
]

In [75]:
# Search the ClinicalTrials.gov API for studies relevant to COVID-19 and report how many match.

query_term1 = 'covid 19'
query_term1 = query_term1.strip()
query_term1 = query_term1.replace(' ', '+')    # spaces are sent as '+'
query_term1 = query_term1.replace('&', '%26')  # a literal '&' would otherwise end the expr parameter

# The URL is assembled from adjacent string literals. A backslash continuation *inside*
# a string literal keeps the next line's indentation, which would put a run of spaces
# between the search expression and '&min_rnk'.
# Only the match count is read from this response, so a narrow rank window is requested.
fullStudyQ1_url = (
    f'http://ClinicalTrials.gov/api/query/study_fields?&expr={query_term1}'
    '&min_rnk=2000&max_rnk=2010&fields=NCTId,BriefTitle,StatusVerifiedDate,CompletionDate&fmt=JSON'
)
fullStudyQ1 = requests.get(fullStudyQ1_url).json()
fullStudyQ1['StudyFieldsResponse']['NStudiesFound']

2788

In [76]:
# Studies on the fringe of relevance may only be matched by a second search term: 'coronavirus'.

query_term2 = 'coronavirus'
query_term2 = query_term2.strip()
query_term2 = query_term2.replace(' ', '+')    # spaces are sent as '+', per the ClinicalTrials API guidelines
query_term2 = query_term2.replace('&', '%26')  # same guidelines: escape a literal '&'

# The URL is assembled from adjacent string literals. A backslash continuation *inside*
# a string literal keeps the next line's indentation, which would put a run of spaces
# between the search expression and '&min_rnk'.
fullStudyQ2_url = (
    f'http://ClinicalTrials.gov/api/query/study_fields?&expr={query_term2}'
    '&min_rnk=1&fields=NCTId,BriefTitle,StatusVerifiedDate,CompletionDate&fmt=JSON'
)
fullStudyQ2 = requests.get(fullStudyQ2_url).json()   # decoded JSON response
fullStudyQ2['StudyFieldsResponse']['NStudiesFound']  # number of studies matched by the query

1423

In [48]:
# Download every 'covid 19' result. The data is large and is requested in rank windows
# ("packets") of at most 1,000 studies each.
limit1 = fullStudyQ1['StudyFieldsResponse']['NStudiesFound']  # upper bound, so we never ask for ranks past the last match

# NOTE(review): three fixed packets only cover the query when 2001 <= limit1 <= 3000;
# a different match count needs a different number of packets.
# Each URL is built from adjacent string literals so that no source indentation ends up
# inside the request (a backslash continuation within the literal would embed it).
tempURL1 = (
    f'http://ClinicalTrials.gov/api/query/study_fields?&expr={query_term1}'
    '&min_rnk=1&max_rnk=1000&fields=NCTId,BriefTitle,StatusVerifiedDate,CompletionDate&fmt=JSON'
)
tempL1 = requests.get(tempURL1).json()

tempURL2 = (
    f'http://ClinicalTrials.gov/api/query/study_fields?&expr={query_term1}'
    '&min_rnk=1001&max_rnk=2000&fields=NCTId,BriefTitle,StatusVerifiedDate,CompletionDate&fmt=JSON'
)
tempL2 = requests.get(tempURL2).json()

tempURL3 = (
    f'http://ClinicalTrials.gov/api/query/study_fields?&expr={query_term1}'
    f'&min_rnk=2001&max_rnk={limit1}&fields=NCTId,BriefTitle,StatusVerifiedDate,CompletionDate&fmt=JSON'
)
tempL3 = requests.get(tempURL3).json()

# Keep only the per-study field records of each packet.
Q1_tempL1 = tempL1['StudyFieldsResponse']['StudyFields']
Q1_tempL2 = tempL2['StudyFieldsResponse']['StudyFields']
Q1_tempL3 = tempL3['StudyFieldsResponse']['StudyFields']

In [59]:
# Download every 'coronavirus' result in rank windows ("packets") of at most 1,000 studies.
limit2 = fullStudyQ2['StudyFieldsResponse']['NStudiesFound']  # upper bound, so we never ask for ranks past the last match

# These requests must search for query_term2. Using query_term1 here would simply
# re-download the first `limit2` results of the 'covid 19' query.
# NOTE(review): two fixed packets only cover the query when 1001 <= limit2 <= 2000.
# Each URL is built from adjacent string literals so that no source indentation ends up
# inside the request (a backslash continuation within the literal would embed it).
tempURL1 = (
    f'http://ClinicalTrials.gov/api/query/study_fields?&expr={query_term2}'
    '&min_rnk=1&max_rnk=1000&fields=NCTId,BriefTitle,StatusVerifiedDate,CompletionDate&fmt=JSON'
)
tempL1 = requests.get(tempURL1).json()

tempURL2 = (
    f'http://ClinicalTrials.gov/api/query/study_fields?&expr={query_term2}'
    f'&min_rnk=1001&max_rnk={limit2}&fields=NCTId,BriefTitle,StatusVerifiedDate,CompletionDate&fmt=JSON'
)
tempL2 = requests.get(tempURL2).json()


# Keep only the per-study field records of each packet.
Q2_tempL1 = tempL1['StudyFieldsResponse']['StudyFields']
Q2_tempL2 = tempL2['StudyFieldsResponse']['StudyFields']


In [60]:
# Turn each downloaded packet of study records into its own DataFrame.
Q1_tempDF1, Q1_tempDF2, Q1_tempDF3 = [
    pd.DataFrame(packet) for packet in (Q1_tempL1, Q1_tempL2, Q1_tempL3)
]

Q2_tempDF1, Q2_tempDF2 = [
    pd.DataFrame(packet) for packet in (Q2_tempL1, Q2_tempL2)
]

In [64]:
# Flattens one field across all packet DataFrames of a query into a single list.

def unListItems(q, col, missing=None):
    """Collect the first value of every cell in column `col` for one query.

    Every cell of the packet DataFrames is indexed with ``[0]``, i.e. each cell
    is expected to be a sequence of values; only its first value is kept.
    The packets are read in download order, so the result is ordered by
    packet and then by row.

    Parameters
    ----------
    q : str
        'Q1' reads the module-level frames Q1_tempDF1, Q1_tempDF2 and
        Q1_tempDF3; 'Q2' reads Q2_tempDF1 and Q2_tempDF2.
    col : str
        Name of the column to flatten, e.g. 'NCTId'.
    missing : object, optional
        Value stored for a cell whose sequence is empty (default None).

    Returns
    -------
    list or None
        One entry per row across the query's packets, or None when `q` is
        neither 'Q1' nor 'Q2'.
    """
    if q == 'Q1':
        frames = (Q1_tempDF1, Q1_tempDF2, Q1_tempDF3)
    elif q == 'Q2':
        frames = (Q2_tempDF1, Q2_tempDF2)
    else:
        return None

    values = []
    for frame in frames:
        # A packet with no content has nothing to contribute; indexing it by
        # column name would fail instead of yielding zero rows.
        if len(frame) == 0:
            continue
        # An empty cell has no first element, so it gets the placeholder
        # rather than raising IndexError.
        values.extend(cell[0] if cell else missing for cell in frame[col])
    return values

In [65]:
# Retrieve the flattened id, title and verified-date values for Query 1
# and assemble them into the Query 1 DataFrame.

ID, Title, VerifiedDate = [
    unListItems('Q1', field)
    for field in ('NCTId', 'BriefTitle', 'StatusVerifiedDate')
]

# Column order: ID, Title, Date Verified.
DATE_Q1 = pd.DataFrame({'ID': ID, 'Title': Title, 'Date Verified': VerifiedDate})

In [66]:
# Same assembly for Query 2: flatten the id, title and verified-date values
# and build the Query 2 DataFrame from them.

ID, Title, VerifiedDate = [
    unListItems('Q2', field)
    for field in ('NCTId', 'BriefTitle', 'StatusVerifiedDate')
]

# Column order: ID, Title, Date Verified.
DATE_Q2 = pd.DataFrame({'ID': ID, 'Title': Title, 'Date Verified': VerifiedDate})

In [71]:
# Union of both queries. A study matched by both searches would otherwise appear twice,
# so only the first row per study ID is kept and the index is renumbered.
COVID19_DATES = (
    pd.concat([DATE_Q1, DATE_Q2], ignore_index=True)
    .drop_duplicates(subset='ID')
    .reset_index(drop=True)
)

In [73]:
# Number of distinct study IDs in COVID19_DATES.
COVID19_DATES['ID'].nunique()

2788

In [77]:
# Rebind COVID19_DATES to the Query 1 frame.
# NOTE(review): this is a plain assignment, so both names refer to the same DataFrame
# object (no copy is made), and whatever COVID19_DATES held before is no longer
# reachable under this name. Confirm that dropping the Query 2 rows is intended.
COVID19_DATES = DATE_Q1

In [78]:
# Display the frame (ID, Title, Date Verified).
COVID19_DATES

Unnamed: 0,ID,Title,Date Verified
0,NCT04395482,Lung CT Scan Analysis of SARS-CoV2 Induced Lun...,May 2020
1,NCT04395924,Maternal-foetal Transmission of SARS-Cov-2,June 2020
2,NCT04476940,COVID-19 Breastfeeding Guideline for African-A...,July 2020
3,NCT04412265,Frailty in Elderly Patients With COVID-19,June 2020
4,NCT04427332,Smell and Taste Disorders in COVID-19 Patients,June 2020
...,...,...,...
2783,NCT03728257,Lung Transplant G0 (LTGO): Improving Self-Mana...,June 2020
2784,NCT03823469,Evaluating the Impact of a Culinary Coaching T...,May 2020
2785,NCT04429061,Reaching 90 90 90 in Adolescents in Zambia: Us...,June 2020
2786,NCT03922334,Navigating New Motherhood 2,June 2020


In [79]:
# Display the frame again.
# NOTE(review): the recorded output below ends with index labels 2508, 2682, 2697, 2715,
# whereas the previous cell's output ends with 2783-2786, and no cell shown in between
# modifies COVID19_DATES. This presumably reflects state from a cell that is not shown
# here; verify with Restart Kernel -> Run All.
COVID19_DATES

Unnamed: 0,ID,Title,Date Verified
0,NCT04395482,Lung CT Scan Analysis of SARS-CoV2 Induced Lun...,May 2020
1,NCT04395924,Maternal-foetal Transmission of SARS-Cov-2,June 2020
2,NCT04476940,COVID-19 Breastfeeding Guideline for African-A...,July 2020
3,NCT04412265,Frailty in Elderly Patients With COVID-19,June 2020
4,NCT04427332,Smell and Taste Disorders in COVID-19 Patients,June 2020
...,...,...,...
2508,NCT04061382,Sero-epidemiological Survey of England in 2019...,July 2019
2682,NCT01306084,Viral Infections in Healthy and Immunocompromi...,"June 25, 2020"
2697,NCT02656381,Uveitis/Intraocular Inflammatory Disease Bioba...,"May 4, 2020"
2715,NCT04278404,"Pharmacokinetics, Pharmacodynamics, and Safety...",October 2019
