In [1]:
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math as math

In [2]:
#Create url search with Keywrod "COVID-19" containing the relevant fields we need for analysis and make single call to get data example
questURL = 'http://ClinicalTrials.gov/api/query/study_fields?expr=covid+19\
           &min_rnk=1\
           &max_rnk=1\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationCity,\
               LocationCountry,\
               EnrollmentCount,\
               STDAge,\
               Gender\
           &fmt=JSON'
Quest = requests.get(questURL).json()
Quest

{'StudyFieldsResponse': {'APIVrs': '1.01.02',
  'DataVrs': '2020:07:30 00:15:40.081',
  'Expression': 'covid 19',
  'NStudiesAvail': 347327,
  'NStudiesFound': 2949,
  'MinRank': 1,
  'MaxRank': 1,
  'NStudiesReturned': 1,
  'FieldList': ['NCTId',
   'BriefTitle',
   'StatusVerifiedDate',
   'CompletionDate',
   'OrgFullName',
   'OrgClass',
   'Keyword',
   'LocationCity',
   'LocationCountry',
   'EnrollmentCount',
   'StdAge',
   'Gender'],
  'StudyFields': [{'Rank': 1,
    'NCTId': ['NCT04395482'],
    'BriefTitle': ['Lung CT Scan Analysis of SARS-CoV2 Induced Lung Injury'],
    'StatusVerifiedDate': ['May 2020'],
    'CompletionDate': ['October 15, 2020'],
    'OrgFullName': ['University of Milano Bicocca'],
    'OrgClass': ['OTHER'],
    'Keyword': ['Lung injury', 'sars-covid-2', 'coronavirus infection'],
    'LocationCity': ['Bergamo',
     'Bergamo',
     'Ferrara',
     'Lecco',
     'Melzo',
     'Monza',
     'Rimini',
     'San Marino'],
    'LocationCountry': ['Italy',
   

In [3]:
# Take the amount of possible results and use as our limit, so we know how many times to call the API
limit1 = Quest['StudyFieldsResponse']['NStudiesFound'] 
limit1

2949

In [4]:
# Make our series of 3 requests and store them in three variables

tempURL1 = 'http://ClinicalTrials.gov/api/query/study_fields?expr=covid+19\
           &min_rnk=1\
           &max_rnk=1000\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               StartDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationFacility,\
               LocationStatus,\
               LocationCity,\
               LocationCountry,\
               EnrollmentCount,\
               STDAge,\
               Gender\
           &fmt=JSON'   
tempL1 = requests.get(tempURL1).json()

tempURL2 = 'http://ClinicalTrials.gov/api/query/study_fields?expr=covid+19\
           &min_rnk=1001\
           &max_rnk=2000\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               StartDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationFacility,\
               LocationStatus,\
               LocationCity,\
               LocationCountry,\
               EnrollmentCount,\
               STDAge,\
               Gender\
           &fmt=JSON'
tempL2 = requests.get(tempURL2).json()                                                             

tempURL3 = f'http://ClinicalTrials.gov/api/query/study_fields?expr=covid+19\
           &min_rnk=2001\
           &max_rnk={limit1}\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               StartDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationFacility,\
               LocationStatus,\
               LocationCity,\
               LocationCountry,\
               EnrollmentCount,\
               STDAge,\
               Gender\
           &fmt=JSON'
tempL3 = requests.get(tempURL3).json()

Q1_tempL1 = tempL1['StudyFieldsResponse']['StudyFields']
Q1_tempL2 = tempL2['StudyFieldsResponse']['StudyFields']
Q1_tempL3 = tempL3['StudyFieldsResponse']['StudyFields']

In [5]:
# Turn those variables into a Dataframe
Q1_tempDF1 = pd.DataFrame(Q1_tempL1)
Q1_tempDF2 = pd.DataFrame(Q1_tempL2)
Q1_tempDF3 = pd.DataFrame(Q1_tempL3)

In [6]:
# create function that cleans up the results and transforms column values into the necessary data types
def change_remove45(df):
    df = df.astype(str)
    df["NCTId"] = df["NCTId"].str.lstrip("['")
    df["NCTId"] = df["NCTId"].str.rstrip("']")
    df["BriefTitle"] = df["BriefTitle"].str.lstrip("['")
    df["BriefTitle"] = df["BriefTitle"].str.rstrip("']")
    df["StatusVerifiedDate"] = df["StatusVerifiedDate"].str.lstrip("['")
    df["StatusVerifiedDate"] = df["StatusVerifiedDate"].str.rstrip("']")
    df["CompletionDate"] = df["CompletionDate"].str.lstrip("['")
    df["CompletionDate"] = df["CompletionDate"].str.rstrip("']")
    df["OrgFullName"] = df["OrgFullName"].str.lstrip("['")
    df["OrgFullName"] = df["OrgFullName"].str.rstrip("']")
    df["OrgClass"] = df["OrgClass"].str.lstrip("['")
    df["OrgClass"] = df["OrgClass"].str.rstrip("']")
    df["Keyword"] = df["Keyword"].str.lstrip("['")
    df["Keyword"] = df["Keyword"].str.rstrip("']")
    df["LocationCity"] = df["LocationCity"].str.lstrip("['")
    df["LocationCity"] = df["LocationCity"].str.rstrip("']")
    df["LocationCountry"] = df["LocationCountry"].str.lstrip("['")
    df["LocationCountry"] = df["LocationCountry"].str.rstrip("']")
    df["EnrollmentCount"] = df["EnrollmentCount"].str.lstrip("['")
    df["EnrollmentCount"] = df["EnrollmentCount"].str.rstrip("']")
    df["StdAge"] = df["StdAge"].str.lstrip("['")
    df["StdAge"] = df["StdAge"].str.rstrip("']")
    df["Gender"] = df["Gender"].str.lstrip("['")
    df["Gender"] = df["Gender"].str.rstrip("']")
    df["LocationFacility"] = df["LocationFacility"].str.lstrip("['")
    df["LocationFacility"] = df["LocationFacility"].str.rstrip("']")
    df["StartDate"] = df["StartDate"].str.lstrip("['")
    df["StartDate"] = df["StartDate"].str.rstrip("']")
    df["LocationStatus"] = df["LocationStatus"].str.lstrip("['")
    df["LocationStatus"] = df["LocationStatus"].str.rstrip("']")
    df["NCTId"] = df["NCTId"].str.replace("'","")
    df["BriefTitle"] = df["BriefTitle"].str.replace("'","")
    df["StatusVerifiedDate"] = df["StatusVerifiedDate"].str.replace("'","")
    df["CompletionDate"] = df["CompletionDate"].str.replace("'","")
    df["OrgFullName"] = df["OrgFullName"].str.replace("'","")
    df["OrgClass"] = df["OrgClass"].str.replace("'","")
    df["Keyword"] = df["Keyword"].str.replace("'","")
    df["LocationCity"] = df["LocationCity"].str.replace("'","")
    df["LocationCountry"] = df["LocationCountry"].str.replace("'","")
    df["EnrollmentCount"] = df["EnrollmentCount"].str.replace("'","")
    df["StdAge"] = df["StdAge"].str.replace("'","")
    df["Gender"] = df["Gender"].str.replace("'","")
    df["StartDate"] = df["StartDate"].str.replace("'","")
    df["LocationFacility"] = df["LocationFacility"].str.replace("'","")
    df["LocationFacility"] = df["LocationStatus"].str.replace("'","")
    df["EnrollmentCount"] = pd.to_numeric(df["EnrollmentCount"])
    return df 

In [7]:
# run the first dataframe against the function
clean_Q1_tempDF3 = change_remove45(Q1_tempDF3)
clean_Q1_tempDF3

Unnamed: 0,Rank,NCTId,BriefTitle,StatusVerifiedDate,StartDate,CompletionDate,OrgFullName,OrgClass,Keyword,LocationFacility,LocationStatus,LocationCity,LocationCountry,EnrollmentCount,StdAge,Gender
0,2001,NCT04375501,A Study of Outcomes in Patients With Fractured...,May 2020,"February 1, 2020","April 15, 2020",Barts & The London NHS Trust,OTHER,"COVID-19, Coronavirus, Neck of femur, Morbidit...",,,London,United Kingdom,442.0,"Adult, Older Adult",All
1,2002,NCT04378920,A Study of Trans Crocetin in Patients With Acu...,May 2020,"April 14, 2020","October 31, 2020",Institut de Cancerologie Strasbourg Europe,OTHER,,"Recruiting, Recruiting","Recruiting', 'Recruiting","Strasbourg, Strasbourg","France, France",180.0,"Adult, Older Adult",All
2,2003,NCT04416009,Extracellular Water in Covid 19 Pneumonia,June 2020,"July 5, 2020","February 20, 2021",Gaziosmanpasa Taksim Research and Education Ho...,OTHER_GOV,"covid-19, NİCaS, pneumonia",Recruiting,Recruiting,Istanbul,Turkey,52.0,"Adult, Older Adult",All
3,2004,NCT04389671,The Safety and Preliminary Efficacy of Lucinac...,May 2020,"June 1, 2020",November 2020,Windtree Therapeutics,INDUSTRY,,,,,,30.0,"Adult, Older Adult",All
4,2005,NCT04320472,Acute Encephalopathy in Critically Ill Patient...,June 2020,"March 23, 2020",December 2020,Ictal Group,OTHER,,"Recruiting, Recruiting, Recruiting, Recruiting...","Recruiting', 'Recruiting', 'Recruiting', 'Recr...","Miami, Atlanta, São Paulo, Cali, Argenteuil, B...","United States, United States, Brazil, Colombia...",250.0,"Adult, Older Adult",All
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
944,2945,NCT03728257,Lung Transplant G0 (LTGO): Improving Self-Mana...,June 2020,"March 25, 2019","April 8, 2022",University of Pittsburgh,OTHER,,Recruiting,Recruiting,Pittsburgh,United States,112.0,"Adult, Older Adult",All
945,2946,NCT03823469,Evaluating the Impact of a Culinary Coaching T...,May 2020,"May 20, 2019",February 2022,Spaulding Rehabilitation Hospital,OTHER,"Home cooking, Telemedicine, Nutrition, Health ...","Completed, Recruiting","Completed', 'Recruiting","Charlestown, Ramat Gan","United States, Israel",78.0,"Adult, Older Adult",All
946,2947,NCT04429061,Reaching 90 90 90 in Adolescents in Zambia: Us...,June 2020,"March 1, 2020","June 30, 2023",University of Alabama at Birmingham,OTHER,"Adolescents, Zambia, Football",Recruiting,Recruiting,Lusaka,Zambia,3200.0,"Child, Adult, Older Adult",Female
947,2948,NCT03922334,Navigating New Motherhood 2,June 2020,"January 21, 2020",December 2024,Northwestern University,OTHER,,Recruiting,Recruiting,Chicago,United States,400.0,"Child, Adult",Female


In [8]:
# run the second dataframe against the function
clean_Q1_tempDF2 = change_remove45(Q1_tempDF2)
clean_Q1_tempDF2

Unnamed: 0,Rank,NCTId,BriefTitle,StatusVerifiedDate,StartDate,CompletionDate,OrgFullName,OrgClass,Keyword,LocationFacility,LocationStatus,LocationCity,LocationCountry,EnrollmentCount,StdAge,Gender
0,1001,NCT04387214,COVID-19 Pandemic and Academic Performance of ...,June 2020,"April 13, 2020","May 31, 2020",South Valley University,OTHER,,,,Qinā,Egypt,857.0,"Child, Adult",All
1,1002,NCT04472013,Systematic Assessment of SARS-CoV-2 Neurotropi...,July 2020,September 2020,December 2022,"University Hospital, Basel, Switzerland",OTHER,"SARS-CoV-2, neurotropism of SARS-CoV-2, microg...","Recruiting, Recruiting","Recruiting', 'Recruiting","Basel, Liestal","Switzerland, Switzerland",40.0,"Adult, Older Adult",All
2,1003,NCT04354870,COVID-19 PrEP HCW HCQ Study,April 2020,"April 3, 2020","September 1, 2020",NYU Langone Health,OTHER,,Recruiting,Recruiting,New York,United States,350.0,"Adult, Older Adult",All
3,1004,NCT04460534,Detection of COVID-19 (SARS-CoV-2) in the Seme...,July 2020,"May 7, 2020","August 31, 2021",GCS Ramsay Santé pour l’Enseignement et la Rec...,OTHER,,Recruiting,Recruiting,Le Chesnay,France,100.0,"Adult, Older Adult",Male
4,1005,NCT04306497,TCM Differentiation and Treatment Protocol of ...,June 2020,"January 22, 2020","May 30, 2020","Jiangsu Famous Medical Technology Co., Ltd.",INDUSTRY,"COVID-19, Syndrome investigation, differentiat...",,,Huaian,China,340.0,"Adult, Older Adult",All
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1996,NCT04336215,Rutgers COVID-19 Cohort Study,April 2020,"April 7, 2020","October 21, 2021","Rutgers, The State University of New Jersey",OTHER,,"Recruiting, Recruiting, Recruiting, Recruiting...","Recruiting', 'Recruiting', 'Recruiting', 'Recr...","New Brunswick, New Brunswick, Newark, Newark, ...","United States, United States, United States, U...",750.0,"Adult, Older Adult",All
996,1997,NCT04389671,The Safety and Preliminary Efficacy of Lucinac...,May 2020,"June 1, 2020",November 2020,Windtree Therapeutics,INDUSTRY,,,,,,30.0,"Adult, Older Adult",All
997,1998,NCT04468646,To Determine the Efficacy of Neurokinin 1 Rece...,July 2020,"June 15, 2020","August 30, 2020",University of Health Sciences Lahore,OTHER,"neurokinin 1 receptor, Substance P, Respirator...",Recruiting,Recruiting,Lahore,Pakistan,100.0,"Adult, Older Adult",All
998,1999,NCT04336748,HCQ for Primary Prophylaxis Against COVID19 in...,April 2020,April 2020,August 2020,Medical University of Vienna,OTHER,,,,,,440.0,"Adult, Older Adult",All


In [9]:
# run the 3rd dataframe against the function
clean_Q1_tempDF1 = change_remove45(Q1_tempDF1)
clean_Q1_tempDF1

Unnamed: 0,Rank,NCTId,BriefTitle,StatusVerifiedDate,StartDate,CompletionDate,OrgFullName,OrgClass,Keyword,LocationFacility,LocationStatus,LocationCity,LocationCountry,EnrollmentCount,StdAge,Gender
0,1,NCT04395924,Maternal-foetal Transmission of SARS-Cov-2,June 2020,"May 5, 2020",May 2021,"""Centre Hospitalier Régional dOrléans""",OTHER,"Pregnancy, RT-PCR-COVID-19, SARS-CoV 2 serolog...",Recruiting,Recruiting,Orléans,France,50.0,Adult,Female
1,2,NCT04395482,Lung CT Scan Analysis of SARS-CoV2 Induced Lun...,May 2020,"April 15, 2020","October 15, 2020",University of Milano Bicocca,OTHER,"Lung injury, sars-covid-2, coronavirus infection","Recruiting, Recruiting, Recruiting, Recruiting...","Recruiting', 'Recruiting', 'Recruiting', 'Recr...","Bergamo, Bergamo, Ferrara, Lecco, Melzo, Monza...","Italy, Italy, Italy, Italy, Italy, Italy, Ital...",500.0,"Adult, Older Adult",All
2,3,NCT04476940,COVID-19 Breastfeeding Guideline for African-A...,July 2020,September 2020,June 2022,Meharry Medical College,OTHER,"COVID-19, Exclusive Breastfeeding, Breastfeedi...",,,Nashville,United States,200.0,Adult,Female
3,4,NCT04412265,Frailty in Elderly Patients With COVID-19,June 2020,"March 1, 2020","March 1, 2021",University of Milano Bicocca,OTHER,"Coronavirus, sars-covid-2, coronavirus infecti...",Recruiting,Recruiting,Monza,Italy,300.0,"Adult, Older Adult",All
4,5,NCT04427332,Smell and Taste Disorders in COVID-19 Patients,June 2020,"May 15, 2020","September 30, 2020",University of Milano Bicocca,OTHER,"Coronavirus, sars-covid-2, coronavirus infection",Recruiting,Recruiting,Monza,Italy,500.0,"Adult, Older Adult",All
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,NCT04410328,Aggrenox To Treat Acute Covid-19,July 2020,"July 28, 2020","December 15, 2021","Rutgers, The State University of New Jersey",OTHER,"COVID-19, SARS-CoV, Aggrenox, Dipyridamole, As...",,,,,132.0,"Adult, Older Adult",All
996,997,NCT04342221,Hydroxychloroquine for COVID-19,March 2020,"March 29, 2020",February 2022,University Hospital Tuebingen,OTHER,,"Recruiting, Recruiting, Recruiting, Recruiting...","Recruiting', 'Recruiting', 'Recruiting', 'Recr...","Balingen, Darmstadt, Hamburg, Minden, Reutling...","Germany, Germany, Germany, Germany, Germany, G...",220.0,"Adult, Older Adult",All
997,998,NCT04420247,Efficacy of Chloroquine or Hydroxychloroquine ...,June 2020,"April 16, 2020","August 31, 2020",Centro de Estudos e Pesquisa em Emergencias Me...,OTHER,"VIRAL PNEUMONIA, CORONAVIRUS, COVID-19, COVID,...",Recruiting,Recruiting,Curitiba,Brazil,100.0,"Adult, Older Adult",All
998,999,NCT04342182,Convalescent Plasma as Therapy for Covid-19 Se...,May 2020,"April 8, 2020","July 1, 2020",Erasmus Medical Center,OTHER,,"Recruiting, Recruiting, Recruiting, Recruiting...","Recruiting', 'Recruiting', 'Recruiting', 'Recr...","Rotterdam, Alkmaar, Amsterdam, Arnhem, Delft, ...","Netherlands, Netherlands, Netherlands, Netherl...",426.0,"Adult, Older Adult",All


In [10]:
#Create url search with Keywrod "COVID-19" containing the relevant fields we need for analysis and make single call to get data example
query = 'coronavirus'
quest2URL = f'http://ClinicalTrials.gov/api/query/study_fields?expr={query}\
           &min_rnk=1\
           &max_rnk=1\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               StartDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationFacility,\
               LocationStatus,\
               LocationCity,\
               LocationCountry,\
               EnrollmentCount,\
               STDAge,\
               Gender\
           &fmt=JSON'
Quest2 = requests.get(quest2URL).json()
Quest2

{'StudyFieldsResponse': {'APIVrs': '1.01.02',
  'DataVrs': '2020:07:30 00:15:40.081',
  'Expression': 'coronavirus',
  'NStudiesAvail': 347327,
  'NStudiesFound': 1491,
  'MinRank': 1,
  'MaxRank': 1,
  'NStudiesReturned': 1,
  'FieldList': ['NCTId',
   'BriefTitle',
   'StatusVerifiedDate',
   'StartDate',
   'CompletionDate',
   'OrgFullName',
   'OrgClass',
   'Keyword',
   'LocationFacility',
   'LocationStatus',
   'LocationCity',
   'LocationCountry',
   'EnrollmentCount',
   'StdAge',
   'Gender'],
  'StudyFields': [{'Rank': 1,
    'NCTId': ['NCT04279795'],
    'BriefTitle': ['Detection of 2019 Novel Coronavirus in Multiple Organ System and Its Relationship With Clinical Manifestations'],
    'StatusVerifiedDate': ['February 2020'],
    'StartDate': ['January 20, 2020'],
    'CompletionDate': ['February 28, 2021'],
    'OrgFullName': ['Third Affiliated Hospital, Sun Yat-Sen University'],
    'OrgClass': ['OTHER'],
    'Keyword': ['2019 Novel Coronavirus'],
    'LocationFacility'

In [11]:
# Take the amount of possible results and use as our limit, so we know how many times to call the API
limit2 = Quest2['StudyFieldsResponse']['NStudiesFound'] 
limit2

1491

In [12]:
# Make our series of 2 requests and store them in three variables
tempURL1 = 'http://ClinicalTrials.gov/api/query/study_fields?expr=covid+19\
           &min_rnk=1\
           &max_rnk=1000\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               StartDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationFacility,\
               LocationStatus,\
               LocationCity,\
               LocationCountry,\
               EnrollmentCount,\
               STDAge,\
               Gender\
           &fmt=JSON'   

tempL1 = requests.get(tempURL1).json()

tempURL2 = f'http://ClinicalTrials.gov/api/query/study_fields?expr=covid+19\
           &min_rnk=1001\
           &max_rnk={limit2}\
           &fields=\
               NCTId,\
               BriefTitle,\
               StatusVerifiedDate,\
               StartDate,\
               CompletionDate,\
               OrgFullName,\
               OrgClass,\
               keyword,\
               LocationFacility,\
               LocationStatus,\
               LocationCity,\
               LocationCountry,\
               EnrollmentCount,\
               STDAge,\
               Gender\
           &fmt=JSON'
tempL2 = requests.get(tempURL2).json()                                                             

Q2_tempL1 = tempL1['StudyFieldsResponse']['StudyFields']
Q2_tempL2 = tempL2['StudyFieldsResponse']['StudyFields']

In [13]:
# Turn the variables into DataFrames
Q2_tempDF1 = pd.DataFrame(Q2_tempL1)
Q2_tempDF2 = pd.DataFrame(Q2_tempL2)

In [14]:
# run the 4th dataframe against the function
clean_Q2_tempDF1 = change_remove45(Q2_tempDF1)
clean_Q2_tempDF1

Unnamed: 0,Rank,NCTId,BriefTitle,StatusVerifiedDate,StartDate,CompletionDate,OrgFullName,OrgClass,Keyword,LocationFacility,LocationStatus,LocationCity,LocationCountry,EnrollmentCount,StdAge,Gender
0,1,NCT04395482,Lung CT Scan Analysis of SARS-CoV2 Induced Lun...,May 2020,"April 15, 2020","October 15, 2020",University of Milano Bicocca,OTHER,"Lung injury, sars-covid-2, coronavirus infection","Recruiting, Recruiting, Recruiting, Recruiting...","Recruiting', 'Recruiting', 'Recruiting', 'Recr...","Bergamo, Bergamo, Ferrara, Lecco, Melzo, Monza...","Italy, Italy, Italy, Italy, Italy, Italy, Ital...",500.0,"Adult, Older Adult",All
1,2,NCT04395924,Maternal-foetal Transmission of SARS-Cov-2,June 2020,"May 5, 2020",May 2021,"""Centre Hospitalier Régional dOrléans""",OTHER,"Pregnancy, RT-PCR-COVID-19, SARS-CoV 2 serolog...",Recruiting,Recruiting,Orléans,France,50.0,Adult,Female
2,3,NCT04476940,COVID-19 Breastfeeding Guideline for African-A...,July 2020,September 2020,June 2022,Meharry Medical College,OTHER,"COVID-19, Exclusive Breastfeeding, Breastfeedi...",,,Nashville,United States,200.0,Adult,Female
3,4,NCT04412265,Frailty in Elderly Patients With COVID-19,June 2020,"March 1, 2020","March 1, 2021",University of Milano Bicocca,OTHER,"Coronavirus, sars-covid-2, coronavirus infecti...",Recruiting,Recruiting,Monza,Italy,300.0,"Adult, Older Adult",All
4,5,NCT04427332,Smell and Taste Disorders in COVID-19 Patients,June 2020,"May 15, 2020","September 30, 2020",University of Milano Bicocca,OTHER,"Coronavirus, sars-covid-2, coronavirus infection",Recruiting,Recruiting,Monza,Italy,500.0,"Adult, Older Adult",All
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,NCT04341480,The Safety of Chemotherapy for Patients With G...,April 2020,"April 10, 2020","July 10, 2020",Tongji Hospital,OTHER,"COVID-19, SARS-CoV-2, Gynecological cancer, Ch...",,,Wuhan,China,207.0,"Adult, Older Adult",Female
996,997,NCT04445389,"Safety and Immunogenicity Study of GX-19, a CO...",July 2020,"June 17, 2020","June 17, 2022","Genexine, Inc.",INDUSTRY,,Recruiting,Recruiting,Seoul,"Korea, Republic of",210.0,Adult,All
997,998,NCT04383886,Evaluation of Emergency Department (ED) Staff ...,April 2020,"April 18, 2020","August 18, 2020",Hospices Civils de Lyon,OTHER,"COVID-19 pandemic, level of stress, emergency ...","Recruiting, Recruiting, Recruiting, Recruiting...","Recruiting', 'Recruiting', 'Recruiting', 'Recr...","Lyon, Lyon, Lyon, Pierre-Bénite, Villefranche-...","France, France, France, France, France",200.0,"Adult, Older Adult",All
998,999,NCT04393311,Ulinastatin for the Treatment of COVID-19 in H...,May 2020,July 2020,January 2021,Stanford University,OTHER,,,,Stanford,United States,150.0,"Adult, Older Adult",All


In [15]:
# run the 5th dataframe against the function
clean_Q2_tempDF2 = change_remove45(Q2_tempDF2)
clean_Q2_tempDF2

Unnamed: 0,Rank,NCTId,BriefTitle,StatusVerifiedDate,StartDate,CompletionDate,OrgFullName,OrgClass,Keyword,LocationFacility,LocationStatus,LocationCity,LocationCountry,EnrollmentCount,StdAge,Gender
0,1001,NCT04432805,Descriptive and Evaluation Study of the Use of...,June 2020,"June 15, 2020","June 15, 2021",Assistance Publique Hopitaux De Marseille,OTHER,,Recruiting,Recruiting,Marseille,France,160.0,"Adult, Older Adult",Female
1,1002,NCT04320277,Baricitinib in Symptomatic Patients Infected b...,April 2020,"May 16, 2020","July 30, 2020",Hospital of Prato,OTHER,"COVID-19, Baricitinib, Moderate disease, Infec...",,,Prato,Italy,200.0,"Adult, Older Adult",All
2,1003,NCT04445389,"Safety and Immunogenicity Study of GX-19, a CO...",July 2020,"June 17, 2020","June 17, 2022","Genexine, Inc.",INDUSTRY,,Recruiting,Recruiting,Seoul,"Korea, Republic of",210.0,Adult,All
3,1004,NCT04352842,Echocardiographic Manifestation in Patient Wit...,April 2020,"January 21, 2020","April 8, 2020",RenJi Hospital,OTHER,"COVID-19, Echocardiography, cardiac structure,...",,,Shanghai,China,51.0,"Adult, Older Adult",All
4,1005,NCT04346342,PRactice of VENTilation in COVID-19 Patients (...,July 2020,"March 6, 2020","October 1, 2020",Academisch Medisch Centrum - Universiteit van ...,OTHER,COVID,"Recruiting, Recruiting, Recruiting, Recruiting...","Recruiting', 'Recruiting', 'Recruiting', 'Recr...","Almere, Amsterdam, Amsterdam, Apeldoorn, Arnhe...","Netherlands, Netherlands, Netherlands, Netherl...",1000.0,"Adult, Older Adult",All
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,1487,NCT04347941,Awake Prone Positioning to Reduce Invasive VEn...,April 2020,"April 11, 2020","May 11, 2021",University College Hospital Galway,OTHER,,,,Galway,Ireland,200.0,"Adult, Older Adult",All
487,1488,NCT04390464,mulTi-Arm Therapeutic Study in Pre-ICu Patient...,May 2020,"May 8, 2020","May 1, 2022",Cambridge University Hospitals NHS Foundation ...,OTHER,,Recruiting,Recruiting,Cambridge,United Kingdom,1167.0,"Adult, Older Adult",All
488,1489,NCT04335552,Pragmatic Factorial Trial of Hydroxychloroquin...,April 2020,"April 17, 2020","August 1, 2020",Duke University,OTHER,COVID-19,"Recruiting, Recruiting, Not yet recruiting, Re...","Recruiting', 'Recruiting', 'Not yet recruiting...","Durham, Durham, Durham, Raleigh","United States, United States, United States, U...",500.0,"Child, Adult, Older Adult",All
489,1490,NCT04359277,A Randomized Trial of Anticoagulation Strategi...,June 2020,"April 21, 2020","April 21, 2021",NYU Langone Health,OTHER,,Recruiting,Recruiting,New York,United States,1000.0,"Adult, Older Adult",All


In [16]:
# combine all 5 dataframes into a single, large dataframe
DF_QQ = pd.concat([clean_Q1_tempDF1, clean_Q1_tempDF2, clean_Q1_tempDF3, clean_Q2_tempDF1, clean_Q2_tempDF2], ignore_index=True)
DF_QQ = DF_QQ.drop_duplicates()
DF_QQ

Unnamed: 0,Rank,NCTId,BriefTitle,StatusVerifiedDate,StartDate,CompletionDate,OrgFullName,OrgClass,Keyword,LocationFacility,LocationStatus,LocationCity,LocationCountry,EnrollmentCount,StdAge,Gender
0,1,NCT04395924,Maternal-foetal Transmission of SARS-Cov-2,June 2020,"May 5, 2020",May 2021,"""Centre Hospitalier Régional dOrléans""",OTHER,"Pregnancy, RT-PCR-COVID-19, SARS-CoV 2 serolog...",Recruiting,Recruiting,Orléans,France,50.0,Adult,Female
1,2,NCT04395482,Lung CT Scan Analysis of SARS-CoV2 Induced Lun...,May 2020,"April 15, 2020","October 15, 2020",University of Milano Bicocca,OTHER,"Lung injury, sars-covid-2, coronavirus infection","Recruiting, Recruiting, Recruiting, Recruiting...","Recruiting', 'Recruiting', 'Recruiting', 'Recr...","Bergamo, Bergamo, Ferrara, Lecco, Melzo, Monza...","Italy, Italy, Italy, Italy, Italy, Italy, Ital...",500.0,"Adult, Older Adult",All
2,3,NCT04476940,COVID-19 Breastfeeding Guideline for African-A...,July 2020,September 2020,June 2022,Meharry Medical College,OTHER,"COVID-19, Exclusive Breastfeeding, Breastfeedi...",,,Nashville,United States,200.0,Adult,Female
3,4,NCT04412265,Frailty in Elderly Patients With COVID-19,June 2020,"March 1, 2020","March 1, 2021",University of Milano Bicocca,OTHER,"Coronavirus, sars-covid-2, coronavirus infecti...",Recruiting,Recruiting,Monza,Italy,300.0,"Adult, Older Adult",All
4,5,NCT04427332,Smell and Taste Disorders in COVID-19 Patients,June 2020,"May 15, 2020","September 30, 2020",University of Milano Bicocca,OTHER,"Coronavirus, sars-covid-2, coronavirus infection",Recruiting,Recruiting,Monza,Italy,500.0,"Adult, Older Adult",All
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4435,1487,NCT04347941,Awake Prone Positioning to Reduce Invasive VEn...,April 2020,"April 11, 2020","May 11, 2021",University College Hospital Galway,OTHER,,,,Galway,Ireland,200.0,"Adult, Older Adult",All
4436,1488,NCT04390464,mulTi-Arm Therapeutic Study in Pre-ICu Patient...,May 2020,"May 8, 2020","May 1, 2022",Cambridge University Hospitals NHS Foundation ...,OTHER,,Recruiting,Recruiting,Cambridge,United Kingdom,1167.0,"Adult, Older Adult",All
4437,1489,NCT04335552,Pragmatic Factorial Trial of Hydroxychloroquin...,April 2020,"April 17, 2020","August 1, 2020",Duke University,OTHER,COVID-19,"Recruiting, Recruiting, Not yet recruiting, Re...","Recruiting', 'Recruiting', 'Not yet recruiting...","Durham, Durham, Durham, Raleigh","United States, United States, United States, U...",500.0,"Child, Adult, Older Adult",All
4438,1490,NCT04359277,A Randomized Trial of Anticoagulation Strategi...,June 2020,"April 21, 2020","April 21, 2021",NYU Langone Health,OTHER,,Recruiting,Recruiting,New York,United States,1000.0,"Adult, Older Adult",All


In [17]:
# check for dupes
dupe_dupe = DF_QQ.pivot_table(index=['NCTId'], aggfunc='size')
print (dupe_dupe)

NCTId
NCT01306084    1
NCT02517489    1
NCT02656381    1
NCT02735707    1
NCT02765191    1
              ..
NCT04493242    2
NCT04493268    2
NCT04493294    2
NCT04493307    2
NCT04493359    2
Length: 2919, dtype: int64


In [18]:
# drop duplicates
df_final = DF_QQ.drop_duplicates(subset='NCTId', keep="first")
df_final

Unnamed: 0,Rank,NCTId,BriefTitle,StatusVerifiedDate,StartDate,CompletionDate,OrgFullName,OrgClass,Keyword,LocationFacility,LocationStatus,LocationCity,LocationCountry,EnrollmentCount,StdAge,Gender
0,1,NCT04395924,Maternal-foetal Transmission of SARS-Cov-2,June 2020,"May 5, 2020",May 2021,"""Centre Hospitalier Régional dOrléans""",OTHER,"Pregnancy, RT-PCR-COVID-19, SARS-CoV 2 serolog...",Recruiting,Recruiting,Orléans,France,50.0,Adult,Female
1,2,NCT04395482,Lung CT Scan Analysis of SARS-CoV2 Induced Lun...,May 2020,"April 15, 2020","October 15, 2020",University of Milano Bicocca,OTHER,"Lung injury, sars-covid-2, coronavirus infection","Recruiting, Recruiting, Recruiting, Recruiting...","Recruiting', 'Recruiting', 'Recruiting', 'Recr...","Bergamo, Bergamo, Ferrara, Lecco, Melzo, Monza...","Italy, Italy, Italy, Italy, Italy, Italy, Ital...",500.0,"Adult, Older Adult",All
2,3,NCT04476940,COVID-19 Breastfeeding Guideline for African-A...,July 2020,September 2020,June 2022,Meharry Medical College,OTHER,"COVID-19, Exclusive Breastfeeding, Breastfeedi...",,,Nashville,United States,200.0,Adult,Female
3,4,NCT04412265,Frailty in Elderly Patients With COVID-19,June 2020,"March 1, 2020","March 1, 2021",University of Milano Bicocca,OTHER,"Coronavirus, sars-covid-2, coronavirus infecti...",Recruiting,Recruiting,Monza,Italy,300.0,"Adult, Older Adult",All
4,5,NCT04427332,Smell and Taste Disorders in COVID-19 Patients,June 2020,"May 15, 2020","September 30, 2020",University of Milano Bicocca,OTHER,"Coronavirus, sars-covid-2, coronavirus infection",Recruiting,Recruiting,Monza,Italy,500.0,"Adult, Older Adult",All
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3929,981,NCT04321421,Hyperimmune Plasma for Critical Patients With ...,May 2020,"March 17, 2020","May 7, 2020",Foundation IRCCS San Matteo Hospital,OTHER,"hyperimmune plasma, COVID-19",,,"Pavia, Mantova","Italy, Italy",49.0,"Adult, Older Adult",All
3933,985,NCT04489446,Sildenafil in COVID-19,July 2020,August 2020,March 2021,Universidad Nacional Andres Bello,OTHER,"COVID19, SARS-COV2 Infection, Sildenafil, Phos...",,,Viña Del Mar,Chile,40.0,"Adult, Older Adult",All
3936,988,NCT04438993,The COVID-19 Disease and CARdiac Events Study,June 2020,"May 29, 2020",November 2020,NHS Lanarkshire,OTHER_GOV,"Echocardiogram, Troponin, NT-proBNP, Cytokines",Recruiting,Recruiting,Glasgow,United Kingdom,100.0,"Adult, Older Adult",All
3945,997,NCT04445389,"Safety and Immunogenicity Study of GX-19, a CO...",July 2020,"June 17, 2020","June 17, 2022","Genexine, Inc.",INDUSTRY,,Recruiting,Recruiting,Seoul,"Korea, Republic of",210.0,Adult,All


In [19]:
# set the index to Study ID
df_final.set_index('NCTId', inplace=True)
df_final

Unnamed: 0_level_0,Rank,BriefTitle,StatusVerifiedDate,StartDate,CompletionDate,OrgFullName,OrgClass,Keyword,LocationFacility,LocationStatus,LocationCity,LocationCountry,EnrollmentCount,StdAge,Gender
NCTId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
NCT04395924,1,Maternal-foetal Transmission of SARS-Cov-2,June 2020,"May 5, 2020",May 2021,"""Centre Hospitalier Régional dOrléans""",OTHER,"Pregnancy, RT-PCR-COVID-19, SARS-CoV 2 serolog...",Recruiting,Recruiting,Orléans,France,50.0,Adult,Female
NCT04395482,2,Lung CT Scan Analysis of SARS-CoV2 Induced Lun...,May 2020,"April 15, 2020","October 15, 2020",University of Milano Bicocca,OTHER,"Lung injury, sars-covid-2, coronavirus infection","Recruiting, Recruiting, Recruiting, Recruiting...","Recruiting', 'Recruiting', 'Recruiting', 'Recr...","Bergamo, Bergamo, Ferrara, Lecco, Melzo, Monza...","Italy, Italy, Italy, Italy, Italy, Italy, Ital...",500.0,"Adult, Older Adult",All
NCT04476940,3,COVID-19 Breastfeeding Guideline for African-A...,July 2020,September 2020,June 2022,Meharry Medical College,OTHER,"COVID-19, Exclusive Breastfeeding, Breastfeedi...",,,Nashville,United States,200.0,Adult,Female
NCT04412265,4,Frailty in Elderly Patients With COVID-19,June 2020,"March 1, 2020","March 1, 2021",University of Milano Bicocca,OTHER,"Coronavirus, sars-covid-2, coronavirus infecti...",Recruiting,Recruiting,Monza,Italy,300.0,"Adult, Older Adult",All
NCT04427332,5,Smell and Taste Disorders in COVID-19 Patients,June 2020,"May 15, 2020","September 30, 2020",University of Milano Bicocca,OTHER,"Coronavirus, sars-covid-2, coronavirus infection",Recruiting,Recruiting,Monza,Italy,500.0,"Adult, Older Adult",All
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NCT04321421,981,Hyperimmune Plasma for Critical Patients With ...,May 2020,"March 17, 2020","May 7, 2020",Foundation IRCCS San Matteo Hospital,OTHER,"hyperimmune plasma, COVID-19",,,"Pavia, Mantova","Italy, Italy",49.0,"Adult, Older Adult",All
NCT04489446,985,Sildenafil in COVID-19,July 2020,August 2020,March 2021,Universidad Nacional Andres Bello,OTHER,"COVID19, SARS-COV2 Infection, Sildenafil, Phos...",,,Viña Del Mar,Chile,40.0,"Adult, Older Adult",All
NCT04438993,988,The COVID-19 Disease and CARdiac Events Study,June 2020,"May 29, 2020",November 2020,NHS Lanarkshire,OTHER_GOV,"Echocardiogram, Troponin, NT-proBNP, Cytokines",Recruiting,Recruiting,Glasgow,United Kingdom,100.0,"Adult, Older Adult",All
NCT04445389,997,"Safety and Immunogenicity Study of GX-19, a CO...",July 2020,"June 17, 2020","June 17, 2022","Genexine, Inc.",INDUSTRY,,Recruiting,Recruiting,Seoul,"Korea, Republic of",210.0,Adult,All
