In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

In [2]:
# The kernel's current working directory is treated as the project root; all
# data paths below are built relative to it.
project_path = os.getcwd()

# File names of the study documents under <project_path>/data/XML.
# os.listdir returns entries in arbitrary order and includes everything in the
# directory (hidden files, checkpoint folders, ...), so keep only the *.xml
# entries and sort them to make the generated tables reproducible.
studies = sorted(
    entry
    for entry in os.listdir(os.path.join(project_path, "data", "XML"))
    if entry.lower().endswith(".xml")
)

In [3]:
def createFrame(studies, xml_dir=None):
    """Create the initial frame of study IDs for merging.

    Parameters
    ----------
    studies : iterable of str
        File names of the study XML documents, relative to ``xml_dir``.
    xml_dir : str, optional
        Directory that holds the XML files. When omitted, falls back to
        ``<project_path>/data/XML`` (``project_path`` is the notebook-level
        global), i.e. the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        A frame with a single column ``id`` containing the text of every
        ``nct_id`` element found below each document root, in the order the
        studies were given. Documents without an ``nct_id`` contribute no row.
    """
    if xml_dir is None:
        xml_dir = os.path.join(project_path, "data", "XML")

    ids = []
    for study in studies:
        root = ET.parse(os.path.join(xml_dir, study)).getroot()
        ids.extend(elm.text for elm in root.findall('.//nct_id'))

    return pd.DataFrame({'id': ids})


def buildVariables(studies, frame, targets, xml_dir=None):
    """Extract one value per study for every target and merge it onto ``frame``.

    Parameters
    ----------
    studies : iterable of str
        File names of the study XML documents, relative to ``xml_dir``.
    frame : pandas.DataFrame
        Frame with an ``id`` column to merge the extracted columns onto.
    targets : dict
        Maps the output column label to the ElementTree path used to locate
        the value inside each document.
    xml_dir : str, optional
        Directory that holds the XML files. When omitted, falls back to
        ``<project_path>/data/XML`` (``project_path`` is the notebook-level
        global).

    Returns
    -------
    pandas.DataFrame
        ``frame`` inner-merged on ``id`` with one additional column per
        target, in the iteration order of ``targets``. A value is the element
        text when the path matches exactly one element and ``""`` when it
        matches none or several.

    Raises
    ------
    IndexError
        If a document contains no ``nct_id`` element.
    """
    # Nothing to extract: return the frame untouched without reading any file.
    if not targets:
        return frame

    if xml_dir is None:
        xml_dir = os.path.join(project_path, "data", "XML")

    # Parse every study exactly once and collect all targets while its tree is
    # in memory, instead of re-parsing each file once per target.
    records = []
    for study in studies:
        filepath = os.path.join(xml_dir, study)
        root = ET.parse(filepath).getroot()

        id_matches = root.findall('.//nct_id')
        if not id_matches:
            raise IndexError("no <nct_id> element found in " + filepath)
        study_id = id_matches[0].text

        values = {}
        for label, loc in targets.items():
            matches = root.findall(loc)
            # Only an unambiguous single match is kept.
            values[label] = matches[0].text if len(matches) == 1 else ""

        records.append((study_id, values))

    # Merge label by label so the result matches a sequence of per-target
    # inner merges on 'id'.
    for label in targets:
        pairs = [[study_id, values[label]] for study_id, values in records]
        df = pd.DataFrame(pairs, columns=['id', label])
        frame = frame.merge(df, on='id')

    return frame


def buildTag(studies, tagName, tagLevels, xml_dir=None):
    """Extract one row per occurrence of a repeating tag across all studies.

    Parameters
    ----------
    studies : iterable of str
        File names of the study XML documents, relative to ``xml_dir``.
    tagName : str
        Name of the tag to collect; every descendant of the document root
        with this name produces one output row.
    tagLevels : sequence of str
        Names of the elements to read from inside each tag; each becomes a
        column. Any iterable is accepted, not just a list.
    xml_dir : str, optional
        Directory that holds the XML files. When omitted, falls back to
        ``<project_path>/data/XML`` (``project_path`` is the notebook-level
        global).

    Returns
    -------
    pandas.DataFrame
        Columns ``['id'] + tagLevels``. ``id`` is the text of the first
        ``nct_id`` element of the document. A level value is the element text
        when exactly one matching descendant exists inside the tag and ``""``
        when there is none or more than one.

    Raises
    ------
    IndexError
        If a document contains no ``nct_id`` element.
    """
    if xml_dir is None:
        xml_dir = os.path.join(project_path, "data", "XML")

    # Materialize once so tuples/generators work and can be iterated per tag.
    levels = list(tagLevels)
    colnames = ['id'] + levels

    rows = []
    for study in studies:
        filepath = os.path.join(xml_dir, study)
        root = ET.parse(filepath).getroot()

        id_matches = root.findall('.//nct_id')
        if not id_matches:
            raise IndexError("no <nct_id> element found in " + filepath)
        study_id = id_matches[0].text

        for tag in root.findall(".//" + tagName):
            # First cell is the study id, then one cell per requested level.
            row = [study_id]
            for level in levels:
                matches = tag.findall(".//" + level)
                row.append(matches[0].text if len(matches) == 1 else "")
            rows.append(row)

    return pd.DataFrame(rows, columns=colnames)

In [4]:
# Single-valued study attributes: output column label -> ElementTree path.
# The key order determines the column order of the details table.
targets = {
    "agency": ".//lead_sponsor/agency",
    "allocation": ".//allocation",
    "title": ".//brief_title",
    "status": ".//overall_status",
    "PI": ".//overall_official/last_name",
    "phase": ".//phase",
    "study_type": ".//study_type",
    "primary_outcome_measure": ".//primary_outcome/measure",
    "acronym": ".//acronym",
    "start_date": ".//start_date",
    "completion_date": ".//completion_date",
    "primary_completion_date": ".//primary_completion_date",
    "gender": ".//gender",
    "minimum_age": ".//minimum_age",
    "maximum_age": ".//maximum_age",
    "number_of_arms": ".//number_of_arms",
    "enrollment": ".//enrollment",
}

# Frame of study ids, then one merged column per target.
frame = createFrame(studies)
studydetails = buildVariables(studies, frame, targets)

# Repeating tags: one row per occurrence of the tag, keyed by study id.
tagIntervention = buildTag(
    studies,
    "intervention",
    ["intervention_type", "intervention_name", "description"],
)
tagLocation = buildTag(
    studies,
    "location",
    ["city", "state", "country"],
)
tagPrimaryOutcome = buildTag(
    studies,
    "primary_outcome",
    ["measure", "time_frame", "description"],
)
tagSecondaryOutcome = buildTag(
    studies,
    "secondary_outcome",
    ["measure", "time_frame", "description"],
)

In [8]:
# Write every extracted table to <project_path>/data as CSV, using the
# default to_csv options, in the same order as before.
exports = [
    ("CTD_Details.csv", studydetails),
    ("CTD_Intervention.csv", tagIntervention),
    ("CTD_Location.csv", tagLocation),
    ("CTD_Outcome1.csv", tagPrimaryOutcome),
    ("CTD_Outcome2.csv", tagSecondaryOutcome),
]
for filename, table in exports:
    table.to_csv(os.path.join(project_path, "data", filename))