In [81]:
import requests, re
import numpy as np
from lxml import etree
import xml.etree.ElementTree as ET
from tqdm import tqdm_notebook as tqdm
import io
import zipfile, urllib
import urllib.request
import pandas as pd
pd.set_option('display.max_columns', None)
from scipy import stats
import collections
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
def get_data_urls(year_range=None):
    """
    Build the list of ExPORTER project zip links, optionally restricted to a range of fiscal years.

    The 2019 links are scraped from the ExPORTER catalog page; the 1985-2018 links are generated
    from a fixed file-name pattern.

        Args:
            year_range - List: Parameter will take either a 2 value list or `None`. If parameter is a list, results will be restricted to years greater than or equal to the first value and less than or equal to the second.
                Example: [2003, 2010] - Files returned : 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
                Example: [2003, 2003] - Files returned : 2003
        Returns:
            all_data_links - List: List of links filtered by the requested `year_range`. Scraped 2019 links come first, followed by the yearly links from newest to oldest. When a `year_range` is given, links whose fiscal year cannot be read from the file name are left out.
    """
    url_base = 'https://exporter.nih.gov'
    url_search = url_base + '/ExPORTER_Catalog.aspx'

    ### A timeout keeps an unresponsive server from hanging the notebook forever
    url_text = requests.get(url_search, timeout=60).text

    ### recover=True asks lxml to keep parsing through markup that is not well-formed XML
    parser = etree.XMLParser(recover=True)
    root = ET.fromstring(url_text, parser=parser)

    ### Strip the '{namespace}' prefix from every tag so the path below can use bare tag names
    for elem in root.iter():
        ### Skip nodes whose tag is not string-like (it has no `.find`)
        if not hasattr(elem.tag, 'find'):
            continue
        brace_pos = elem.tag.find('}')
        if brace_pos >= 0:
            elem.tag = elem.tag[brace_pos + 1:]

    ### 2019 is published as several files, so those links are scraped from the catalog table
    hrefs = [anchor.get('href') for anchor in root.findall('.//tr[@class="row_bg"]/td//a')]
    data_2019 = [
        url_base + '/' + href
        for href in hrefs
        ### Anchors without an href come back as None and cannot be links
        if href is not None and ('XMLData' in href) and ('2019' in href)
    ]

    ### 2018 down to 1985 (newest first); each of these years is assumed to be a single file
    fy_links = [f'{url_base}/XMLData/final/RePORTER_PRJ_X_FY{year}.zip' for year in range(2018, 1984, -1)]

    all_data_links = data_2019 + fy_links
    if year_range is None:
        return all_data_links

    first_year, last_year = year_range[0], year_range[1]
    links_in_range = []
    for link in all_data_links:
        ### The fiscal year is the four digits after 'FY'; 2019 files carry an extra '_NNN' suffix
        year_match = re.search(r'FY(\d{4})', link)
        if year_match is None:
            continue
        if first_year <= int(year_match.group(1)) <= last_year:
            links_in_range.append(link)

    return links_in_range

In [4]:
def extract_xml_from_zip(zip_links, out_format = 'dict'):
    """
    Download one or more zip files that each hold an XML file and return the project rows they contain.

    Each archive must contain an XML member named after the zip file itself
    (`<name>.zip` -> `<name>.xml`). Every `row` element directly under the XML root becomes
    one record whose keys are the lower-cased child tag names.

        Args:
            zip_links - str or List: A single zip link, or a list of zip links.
            out_format - str: 'df' (case-insensitive) returns a DataFrame. Any other value returns the dictionary.
        Returns:
            projects_dict - Dict or DataFrame: Records keyed (dict) or indexed (DataFrame) by `application_id`.
                `project_terms` (or `project_termsx`) is stored under `project_terms` as a list of the child texts,
                and `data_source` holds the link the record came from. If the same `application_id` appears
                more than once, the last record read wins.
        Raises:
            KeyError: If an archive has no member with the expected XML name, or a row has no `application_id`.
    """
    if isinstance(zip_links, str):
        zip_links = [zip_links]
    projects_dict = {}
    for zip_link in zip_links:
        zip_file = zip_link.split('/')[-1]
        xml_name = zip_file.split('.')[0] + '.xml'

        ### Read the archive into memory: urlretrieve() would leave a temp file behind for every download
        with urllib.request.urlopen(zip_link) as response:
            zip_bytes = response.read()
        with zipfile.ZipFile(io.BytesIO(zip_bytes), 'r') as archive:
            xml_data = archive.read(xml_name)
        root = ET.fromstring(xml_data)

        for project in root.findall('./row'):
            project_dict = {}
            for column in project:
                column_header = column.tag.lower()
                ### Both spellings of the terms column are stored under the same key
                if column_header in ('project_terms', 'project_termsx'):
                    project_dict['project_terms'] = [term.text for term in column]
                else:
                    project_dict[column_header] = column.text

            project_dict['data_source'] = zip_link
            projects_dict[project_dict['application_id']] = project_dict

    if out_format.lower() == 'df':
        projects_dict = pd.DataFrame.from_dict(projects_dict, orient='index')
    return projects_dict

In [6]:
# Request only the fiscal-year 2019 download links, then display the resulting list.
urls = get_data_urls([2019, 2019])
urls

['https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_053.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_052.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_051.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_050.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_049.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_048.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_047.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_046.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_045.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_044.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_043.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_042.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_PRJ_X_FY2019_041.zip',
 'https://exporter.nih.gov/XMLData/final/RePORTER_P

In [7]:
# Download and parse only the first link into a DataFrame indexed by application_id.
example_df = extract_xml_from_zip(urls[0], out_format = 'df')

# Preview the first rows of the parsed projects.
example_df.head()

Unnamed: 0,application_id,activity,administering_ic,application_type,arra_funded,award_notice_date,budget_start,budget_end,cfda_code,core_project_num,ed_inst_type,foa_number,full_project_num,funding_ics,funding_mechanism,fy,ic_name,nih_spending_cats,org_city,org_country,org_dept,org_district,org_duns,org_fips,org_ipf_code,org_name,org_state,org_zipcode,phr,pis,program_officer_name,project_start,project_end,project_terms,project_title,serial_number,study_section,study_section_name,subproject_id,support_year,suffix,direct_cost_amt,indirect_cost_amt,total_cost,total_cost_sub_project,data_source
10000664,10000664,OT2,HL,1,N,09/24/2019,09/20/2019,02/17/2020,837.0,OT2HL152640,,RFA-OT-A1-007,1OT2HL152640-01,NHLBI:88401\,OTHERS,2019,"NATIONAL HEART, LUNG, AND BLOOD INSTITUTE",,BOSTON,UNITED STATES,,7,76593722,US,1504801,BOSTON CHILDREN'S HOSPITAL,MA,21155724,,,"WELNIAK, LISBETH A",09/20/2019,02/17/2020,"[Adult, Age, Biological Markers, Boston, Brain...",Boston Consortium to Cure SCD - Williams/Ellen...,152640,ZHL1,Special Emphasis Panel,,1,,88401,,88401.0,,https://exporter.nih.gov/XMLData/final/RePORTE...
10000666,10000666,OT2,HL,1,N,09/24/2019,09/20/2019,02/17/2020,837.0,OT2HL152639,,RFA-OT-A1-007,1OT2HL152639-01,NHLBI:84000\,OTHERS,2019,"NATIONAL HEART, LUNG, AND BLOOD INSTITUTE",,BOSTON,UNITED STATES,,7,76593722,US,1504801,BOSTON CHILDREN'S HOSPITAL,MA,21155724,,,"WELNIAK, LISBETH A",09/20/2019,02/17/2020,"[acute chest syndrome, Affinity, Attenuated, b...",Boston Consortium to Cure SCD - Williams/John ...,152639,ZHL1,Special Emphasis Panel,,1,,84000,,84000.0,,https://exporter.nih.gov/XMLData/final/RePORTE...
10000883,10000883,P20,CA,3,N,09/25/2019,09/01/2019,08/31/2020,,P20CA202925,,PAR-16-084,3P20CA202925-04S1,NCI:25392\,RESEARCH CENTERS,2019,NATIONAL CANCER INSTITUTE,,DURHAM,UNITED STATES,,1,44387793,US,2221101,DUKE UNIVERSITY,NC,277054673,This proposed NCI P20 collaborative partnershi...,,"BAILEY, LEEANN ODETTE",,,"[Address, Advisory Committees, African America...",Admin Core,202925,ZCA1,Special Emphasis Panel,7201.0,4,S1,15771,9621.0,,25392.0,https://exporter.nih.gov/XMLData/final/RePORTE...
10000885,10000885,P20,CA,3,N,09/25/2019,09/01/2019,08/31/2020,,P20CA202925,,PA-18-906,3P20CA202925-04S2,NCI:17237\,RESEARCH CENTERS,2019,NATIONAL CANCER INSTITUTE,,DURHAM,UNITED STATES,,1,44387793,US,2221101,DUKE UNIVERSITY,NC,277054673,This proposed NCI P20 collaborative partnershi...,,"BAILEY, LEEANN ODETTE",,,"[Address, Advisory Committees, African America...",Admin Core,202925,ZCA1,Special Emphasis Panel,7201.0,4,S2,11071,6166.0,,17237.0,https://exporter.nih.gov/XMLData/final/RePORTE...
10000887,10000887,P20,CA,3,N,09/25/2019,09/01/2019,08/31/2020,,P20CA202925,,PAR-16-084,3P20CA202925-04S1,NCI:25391\,RESEARCH CENTERS,2019,NATIONAL CANCER INSTITUTE,,DURHAM,UNITED STATES,,1,44387793,US,2221101,DUKE UNIVERSITY,NC,277054673,Project Narrative African Americans are diag...,,"BAILEY, LEEANN ODETTE",,,"[5'-AMP-activated protein kinase, abiraterone,...",Pilot Project 1,202925,ZCA1,Special Emphasis Panel,7202.0,4,S1,15771,9620.0,,25391.0,https://exporter.nih.gov/XMLData/final/RePORTE...
