In [130]:
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math as math

In [151]:
questURL = 'http://ClinicalTrials.gov/api/query/study_fields?expr=covid+19\
           &min_rnk=1\
           &max_rnk=1\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationCity,\
               LocationCountry\
           &fmt=JSON'
Quest = requests.get(questURL).json()
Quest

{'StudyFieldsResponse': {'APIVrs': '1.01.02',
  'DataVrs': '2020:07:18 22:20:26.715',
  'Expression': 'covid 19',
  'NStudiesAvail': 346147,
  'NStudiesFound': 2788,
  'MinRank': 1,
  'MaxRank': 1,
  'NStudiesReturned': 1,
  'FieldList': ['NCTId',
   'BriefTitle',
   'StatusVerifiedDate',
   'CompletionDate',
   'OrgFullName',
   'OrgClass',
   'Keyword',
   'LocationCity',
   'LocationCountry'],
  'StudyFields': [{'Rank': 1,
    'NCTId': ['NCT04395482'],
    'BriefTitle': ['Lung CT Scan Analysis of SARS-CoV2 Induced Lung Injury'],
    'StatusVerifiedDate': ['May 2020'],
    'CompletionDate': ['October 15, 2020'],
    'OrgFullName': ['University of Milano Bicocca'],
    'OrgClass': ['OTHER'],
    'Keyword': ['Lung injury', 'sars-covid-2', 'coronavirus infection'],
    'LocationCity': ['Bergamo',
     'Bergamo',
     'Ferrara',
     'Lecco',
     'Melzo',
     'Monza',
     'Rimini',
     'San Marino'],
    'LocationCountry': ['Italy',
     'Italy',
     'Italy',
     'Italy',
     'Ita

In [152]:
limit1 = Quest['StudyFieldsResponse']['NStudiesFound'] #make sure we don't ask for results outside of the scope of our query
limit1

2788

In [153]:
# Query the api in packets of 1000 then merge into a single dataframe

tempURL1 = 'http://ClinicalTrials.gov/api/query/study_fields?expr=covid+19\
           &min_rnk=1\
           &max_rnk=1000\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationCity,\
               LocationCountry\
           &fmt=JSON'   
tempL1 = requests.get(tempURL1).json()

tempURL2 = 'http://ClinicalTrials.gov/api/query/study_fields?expr=covid+19\
           &min_rnk=1001\
           &max_rnk=2000\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationCity,\
               LocationCountry\
           &fmt=JSON'
tempL2 = requests.get(tempURL2).json()                                                             

tempURL3 = f'http://ClinicalTrials.gov/api/query/study_fields?expr=covid+19\
           &min_rnk=2001\
           &max_rnk={limit1}\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationCity,\
               LocationCountry\
           &fmt=JSON'
tempL3 = requests.get(tempURL3).json()

Q1_tempL1 = tempL1['StudyFieldsResponse']['StudyFields']
Q1_tempL2 = tempL2['StudyFieldsResponse']['StudyFields']
Q1_tempL3 = tempL3['StudyFieldsResponse']['StudyFields']

In [198]:
Q1_tempDF1 = pd.DataFrame(Q1_tempL1)
Q1_tempDF2 = pd.DataFrame(Q1_tempL2)
Q1_tempDF3 = pd.DataFrame(Q1_tempL3)

In [199]:
#returns single list of col values from all 3 temp DF's 

def unListItems(col):
    lst_1 = [' ']*len(Q1_tempDF1[col])  # create empty list the length of 'Q1_tempDF1[col]'
    i = 0
    while i < len(Q1_tempDF1[col]):
        if len(Q1_tempDF1[col][i])==0:
            lst_1[i] = None
        else:
            lst_1[i] = Q1_tempDF1[col][i][0]
        i += 1
    lst_2 = [' ']*len(Q1_tempDF2[col])
    i = 0
    while i < len(Q1_tempDF2[col]):
        if len(Q1_tempDF2[col][i])==0:
            lst_2[i] = None
        else:
            lst_2[i] = Q1_tempDF2[col][i][0]
        i += 1      
    lst_3 = [' ']*len(Q1_tempDF3[col])
    i = 0
    while i < len(Q1_tempDF3[col]):
        if len(Q1_tempDF3[col][i])==0:
            lst_3[i] = None
        else:
            lst_3[i] = Q1_tempDF3[col][i][0]
        i += 1
    LIST = lst_1 + lst_2 + lst_3
    return LIST

In [250]:
#retrive unListed Items, populate Query1 DataFrame with query results from id, title, and date keys

ID = unListItems('NCTId')
Title = unListItems('BriefTitle')
VerifiedDate = unListItems('StatusVerifiedDate')
CompletionDate = unListItems('CompletionDate')
OrgFullName = unListItems('OrgFullName')
OrgClass = unListItems('OrgClass')
QueryType = ['covid 19']*len(ID)
Country = unListItems('LocationCountry')
City = unListItems('LocationCity')


DF_Q1 = pd.DataFrame()
DF_Q1.insert(0, 'ID', ID)
DF_Q1.insert(1, 'Title', Title)
DF_Q1.insert(2, 'Date Verified', VerifiedDate)
DF_Q1.insert(3, 'Completion Date', CompletionDate)
DF_Q1.insert(4, 'Organization', OrgFullName)
DF_Q1.insert(5, 'Organization Class', OrgClass)
DF_Q1.insert(6, 'Country', Country)
DF_Q1.insert(7, 'City', City)

In [210]:
groupID = DF_Q1.groupby('ID')

In [212]:
df = pd.DataFrame(groupID.count())

In [215]:
DF_Q1.loc[DF_Q1['ID']=='NCT04330144']

Unnamed: 0,ID,Title,Date Verified,Completion Date,Organization,Organization Class
995,NCT04280913,Clinical Outcomes of Patients With COVID19,March 2020,"March 31, 2020",Guangzhou Institute of Respiratory Disease,OTHER
1234,NCT04280913,Clinical Outcomes of Patients With COVID19,March 2020,"March 31, 2020",Guangzhou Institute of Respiratory Disease,OTHER


In [None]:
df = df.loc[df['Title']>1]

In [221]:
dupeList = []
for i in df.index:
    dupeList.append(i)
print(dupeList)

['NCT04280913', 'NCT04316884', 'NCT04330144', 'NCT04336761', 'NCT04341207', 'NCT04343742', 'NCT04344197', 'NCT04344379', 'NCT04348240', 'NCT04349618', 'NCT04353271', 'NCT04358042', 'NCT04360759', 'NCT04362124', 'NCT04363008', 'NCT04363060', 'NCT04366167', 'NCT04367883', 'NCT04370119', 'NCT04371354', 'NCT04371471', 'NCT04375670', 'NCT04376944', 'NCT04377308', 'NCT04379297', 'NCT04382391', 'NCT04383613', 'NCT04385121', 'NCT04386265', 'NCT04387253', 'NCT04392531', 'NCT04401540', 'NCT04403438', 'NCT04403906', 'NCT04413747', 'NCT04416308', 'NCT04416334', 'NCT04422678', 'NCT04426279', 'NCT04430062', 'NCT04432350', 'NCT04434144', 'NCT04441710', 'NCT04442087', 'NCT04442165', 'NCT04462120', 'NCT04468009', 'NCT04473547']


In [222]:
query = 'coronavirus'
quest2URL = f'http://ClinicalTrials.gov/api/query/study_fields?expr={query}\
           &min_rnk=1\
           &max_rnk=1\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationCity,\
               LocationCountry\
           &fmt=JSON'
Quest2 = requests.get(quest2URL).json()
Quest2

{'StudyFieldsResponse': {'APIVrs': '1.01.02',
  'DataVrs': '2020:07:18 22:20:26.715',
  'Expression': 'coronavirus',
  'NStudiesAvail': 346147,
  'NStudiesFound': 1423,
  'MinRank': 1,
  'MaxRank': 1,
  'NStudiesReturned': 1,
  'FieldList': ['NCTId',
   'BriefTitle',
   'StatusVerifiedDate',
   'CompletionDate',
   'OrgFullName',
   'OrgClass',
   'Keyword',
   'LocationCity',
   'LocationCountry'],
  'StudyFields': [{'Rank': 1,
    'NCTId': ['NCT04279795'],
    'BriefTitle': ['Detection of 2019 Novel Coronavirus in Multiple Organ System and Its Relationship With Clinical Manifestations'],
    'StatusVerifiedDate': ['February 2020'],
    'CompletionDate': ['February 28, 2021'],
    'OrgFullName': ['Third Affiliated Hospital, Sun Yat-Sen University'],
    'OrgClass': ['OTHER'],
    'Keyword': ['2019 Novel Coronavirus'],
    'LocationCity': ['Guangzhou'],
    'LocationCountry': ['China']}]}}

In [152]:
limit2 = Quest2['StudyFieldsResponse']['NStudiesFound'] #make sure we don't ask for results outside of the scope of our query
limit2 

2788

In [223]:
# Query the api in packets of 1000 then merge into a single dataframe

tempURL1 = 'http://ClinicalTrials.gov/api/query/study_fields?expr=covid+19\
           &min_rnk=1\
           &max_rnk=1000\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationCity,\
               LocationCountry\
           &fmt=JSON'   
tempL1 = requests.get(tempURL1).json()

tempURL2 = f'http://ClinicalTrials.gov/api/query/study_fields?expr=covid+19\
           &min_rnk=1001\
           &max_rnk={limit2}\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationCity,\
               LocationCountry\
           &fmt=JSON'
tempL2 = requests.get(tempURL2).json()                                                             

Q2_tempL1 = tempL1['StudyFieldsResponse']['StudyFields']
Q2_tempL2 = tempL2['StudyFieldsResponse']['StudyFields']

In [224]:
Q2_tempDF1 = pd.DataFrame(Q2_tempL1)
Q2_tempDF2 = pd.DataFrame(Q2_tempL2)

In [229]:
#returns single list of col values from all 3 temp DF's 

def unListItemsQ2(col):
    lst_1 = [' ']*len(Q2_tempDF1[col])  # create empty list the length of 'Q1_tempDF1[col]'
    i = 0
    while i < len(Q2_tempDF1[col]):
        if len(Q2_tempDF1[col][i])==0:
            lst_1[i] = None
        else:
            lst_1[i] = Q2_tempDF1[col][i][0]
        i += 1
    lst_2 = [' ']*len(Q2_tempDF2[col])
    i = 0
    while i < len(Q2_tempDF2[col]):
        if len(Q2_tempDF2[col][i])==0:
            lst_2[i] = None
        else:
            lst_2[i] = Q2_tempDF2[col][i][0]
        i += 1      
    LIST = lst_1 + lst_2
    return LIST

In [251]:
#retrive unListed Items, populate Query1 DataFrame with query results from id, title, and date keys

ID = unListItemsQ2('NCTId')
Title = unListItemsQ2('BriefTitle')
VerifiedDate = unListItemsQ2('StatusVerifiedDate')
CompletionDate = unListItemsQ2('CompletionDate')
OrgFullName = unListItemsQ2('OrgFullName')
OrgClass = unListItemsQ2('OrgClass')
QueryType = ['coronavirus']*len(ID)
Country = unListItemsQ2('LocationCountry')
City = unListItemsQ2('LocationCity')

DF_Q2 = pd.DataFrame()
DF_Q2.insert(0, 'ID', ID)
DF_Q2.insert(1, 'Title', Title)
DF_Q2.insert(2, 'Date Verified', VerifiedDate)
DF_Q2.insert(3, 'Completion Date', CompletionDate)
DF_Q2.insert(4, 'Organization', OrgFullName)
DF_Q2.insert(5, 'Organization Class', OrgClass)
DF_Q2.insert(6, 'Country', Country)
DF_Q2.insert(7, 'City', City)

In [252]:
DF_QQ = pd.concat([DF_Q1,DF_Q2], ignore_index=True)
DF_QQ = DF_QQ.drop_duplicates()
DF_QQ.set_index('ID', inplace=True)
DF_QQ

Unnamed: 0_level_0,Title,Date Verified,Completion Date,Organization,Organization Class,Country,City
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NCT04395482,Lung CT Scan Analysis of SARS-CoV2 Induced Lun...,May 2020,"October 15, 2020",University of Milano Bicocca,OTHER,Italy,Bergamo
NCT04395924,Maternal-foetal Transmission of SARS-Cov-2,June 2020,May 2021,Centre Hospitalier Régional d'Orléans,OTHER,France,Orléans
NCT04476940,COVID-19 Breastfeeding Guideline for African-A...,July 2020,June 2022,Meharry Medical College,OTHER,United States,Nashville
NCT04412265,Frailty in Elderly Patients With COVID-19,June 2020,"March 1, 2021",University of Milano Bicocca,OTHER,Italy,Monza
NCT04427332,Smell and Taste Disorders in COVID-19 Patients,June 2020,"September 30, 2020",University of Milano Bicocca,OTHER,Italy,Monza
...,...,...,...,...,...,...,...
NCT04361643,Low-dose Lenalidomide for Non-severe COVID-19 ...,April 2020,"December 31, 2020",Hospital Universitario Getafe,OTHER,Spain,Getafe
NCT04394104,COVID-19 Wellness Survey,May 2020,"May 31, 2020","Rutgers, The State University of New Jersey",OTHER,United States,New Brunswick
NCT04401423,TXA COVID-19 Clinical Trial,May 2020,December 2021,Columbia University,OTHER,,
NCT04349371,Saved From COVID-19,April 2020,April 2021,Columbia University,OTHER,United States,New York
