In [1]:
# import necessary packages
import os
import xmltodict
import pprint
import pandas as pd
import re

In [2]:
_LOOKUP_ERRORS = (KeyError, IndexError, TypeError)  # raised when an xml node is missing or has an unexpected shape


def _lookup(node, *path):
    '''
    follow dictionary keys / list indexes down a parsed xml tree
    node: object to start from
    path: keys or indexes to follow, in order
    return: the value found, or pd.NA if any step of the path is absent
    '''
    try:
        for step in path:
            node = node[step]
    except _LOOKUP_ERRORS:
        return pd.NA
    return node


def _split_criteria(text):
    '''
    clean an eligibility text block and split it into sections of individual criteria
    text: raw criteria string
    return: list of sections (text before the first 'exclusion criteria' marker, then the text after each marker),
            each section being a list of non-empty criteria strings
    '''
    text = text.lower()  # make lowercase
    text = re.sub(r'\d+\.', ' ', text)  # remove numbering
    for each in ['-', '(', ')', "'", ':', 'i.e.', '.', 'inclusion criteria', 'inclusion', ',']:  # other punctuation/headers to remove
        text = text.replace(each, '')
    sections = text.split('exclusion criteria')
    if len(sections) == 1:
        # no 'exclusion criteria' header found, fall back to the bare word
        sections = sections[0].split('exclusion')
    # a run of two or more whitespace characters separates one criterion from the next;
    # re.split is used directly so that text which happens to contain '##' is not cut there
    return [[item for item in re.split(r'\s\s+', section) if item] for section in sections]


def pull_trial(filename, base_dir='/Users/meldrumapple/Desktop/Capstone/AllPublicXML/'):
    '''
    main function to pull information from one trial file and append it as a row to the global dataframe `df`
    filename: xml file name; its first 7 characters + 'xxxx' give the subfolder it is read from
    base_dir: folder that contains those subfolders
    return: None. The row [nct id, condition, intervention, age range, gender, healthy participants,
            inclusion criteria, exclusion criteria] is added to `df`; fields absent from the file are pd.NA
            (criteria fall back to ['NA'])
    raises: FileNotFoundError if the file does not exist; errors from xmltodict.parse are not caught here
    '''
    # build the full path rather than os.chdir-ing, so the working directory of the notebook is left untouched
    folder = filename[0:7] + 'xxxx'
    path = os.path.join(base_dir, folder, filename)
    # open file and convert to dictionary
    with open(path, 'r', encoding='utf-8') as doc:
        dct = xmltodict.parse(doc.read())
    study = _lookup(dct, 'clinical_study')

    # Extract trial info
    num = _lookup(study, 'id_info', 'nct_id')  # nct id
    condition = _lookup(study, 'condition')

    # pull intervention data - this can be formatted as list of dictionary or just dictionary;
    # when it is a list only the first entry is kept
    intervention = _lookup(study, 'intervention')
    if isinstance(intervention, list):
        intervention = _lookup(intervention, 0)
    try:
        intv = [intervention['intervention_name'], intervention['intervention_type']]
    except _LOOKUP_ERRORS:
        intv = pd.NA  # sometimes trial has no intervention category at all

    # pull age range (NA unless both bounds are present)
    try:
        eligibility = study['eligibility']
        ages = [eligibility['minimum_age'], eligibility['maximum_age']]
    except _LOOKUP_ERRORS:
        ages = pd.NA

    # pull gender
    gender = _lookup(study, 'eligibility', 'gender')

    # pull healthy
    healthy = _lookup(study, 'eligibility', 'healthy_volunteers')

    # Extract criteria and clean up
    text = _lookup(study, 'eligibility', 'criteria', 'textblock')
    if isinstance(text, str):
        criteria = _split_criteria(text)
    else:
        criteria = [['NA'], ['NA']]
    # if no exclusion criteria, add NA
    # (this check used to be len(criteria==1), which raised TypeError on every call)
    if len(criteria) == 1:
        criteria.append(['NA'])
    # mark inclusion and exclusion criteria
    in_criteria = criteria[0]
    ex_criteria = criteria[1]
    # add all variables to a row in initialized dataframe
    df.loc[len(df.index)] = [num, condition, intv, ages, gender, healthy, in_criteria, ex_criteria]
    return None

In [6]:
# start from a frame with no rows; one column per field collected for a trial
df = pd.DataFrame(
    columns=[
        'nct_id',
        'condition',
        'intervention',
        'ages',
        'gender',
        'healthy',
        'in_criteria',
        'ex_criteria',
    ]
)

In [7]:
#pull from NCTIds in batches of 100000; batches 50-56 cover ids 04900000 through 05599999 only
failed=[]    #(file name, error) for every file that was found but could not be processed
for k in range(50, 57):
    start=(k-1)*100000
    end=k*100000
    for i in range(start, end):
        name='NCT'+str(i).zfill(8)+'.xml'    #zero-pad the number to 8 digits
        try:
            pull_trial(name)
        except FileNotFoundError:
            continue    #not every id in the range has an xml file, so a missing file is expected
        except Exception as err:
            #keep going, but remember the failure instead of hiding it
            #(catching Exception rather than using a bare except still lets an interrupt stop the loop)
            failed.append((name, repr(err)))
    print('trials '+str(start)+'-'+str(end)+' done')
if failed:
    print(str(len(failed))+' files could not be processed; first failure: '+failed[0][0]+' '+failed[0][1])

trials 4900000-5000000 done
trials 5000000-5100000 done
trials 5100000-5200000 done
trials 5200000-5300000 done
trials 5300000-5400000 done
trials 5400000-5500000 done
trials 5500000-5600000 done


In [8]:
# write the collected rows to disk (note the target name carries no file extension)
df.to_csv(path_or_buf='/Users/meldrumapple/Desktop/Capstone/fullpull6')

NameError: name 'datmerge' is not defined