In [77]:
# Import dependencies
import pandas
import mechanize
import re
from bs4 import BeautifulSoup
from dateutil import rrule
from datetime import datetime, timedelta

In [79]:
# CONFIGURATION

# Root of the site; relative profile links are resolved against it.
BASE_URL = "https://healthit.ahrq.gov"

# Search URL that returns every project on a single page (items_per_page=All),
# sorted ascending by project dates, so no pagination handling is required.
PROJECT_LIST_URL = BASE_URL + (
    "/ahrq-funded-projects/search"
    "?search_api_aggregation_4="
    "&sort_by=field_fp_dates_value"
    "&multiselect_edit-sort-by=field_fp_dates_value"
    "&sort_order=ASC"
    "&multiselect_edit-sort-order=ASC"
    "&items_per_page=All"
    "&multiselect_edit-items-per-page=All"
)

# Field slugs to collect for each project, one output column per slug.
PROJECT_FIELDS = [
    "abstract",
    "contract-number",
    "category",
    "funded-amount",
    "principal-investigator",
    "organization",
    "city",
    "location",
    "dates",
    "status",
    "technology",
    "community",
    "care-setting",
    "aspect-care",
    "target-population",
]

In [80]:
# HELPERS
def elements_to_semicolon_sep_string(elements):
    """Join the `.string` values of `elements` into one ';'-separated string.

    Args:
        elements: a collection of objects exposing a `.string` attribute;
            a falsy value (e.g. an empty list or None) is treated as
            "no elements".

    Returns:
        The truthy `.string` values joined with ";". Entries whose `.string`
        is None or empty are left out; the result is "" when nothing remains.
    """
    texts = [element.string for element in elements] if elements else []
    # Keep only truthy values so None / "" never reach the join.
    return ";".join(text for text in texts if text)

In [None]:
# GET THE DATA!
# Load the un-paginated project listing once, then visit every project profile
# page and collect the fields named in PROJECT_FIELDS into `projects`
# (one dict per project).
# Single-argument print(...) behaves identically on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(datetime.now())  # start timestamp

projects = []

# create mechanize browser and go to search form page
browser = mechanize.Browser()
# URL for only 10: "https://healthit.ahrq.gov/ahrq-funded-projects/search?search_api_aggregation_4=&sort_by=field_fp_dates_value&multiselect_edit-sort-by=field_fp_dates_value&sort_order=ASC&multiselect_edit-sort-order=ASC&items_per_page=10&multiselect_edit-items-per-page=10"
browser.open(PROJECT_LIST_URL)

# extract list of projects
b = browser.response()
# NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
# installed; consider pinning one for reproducible results.
soup = BeautifulSoup(b.read())
project_links_elements = soup.select("div.view-funded-project-search div.view-content div.views-row a")
# Skip anchors that have no href: BASE_URL + None would raise a TypeError.
project_urls = [BASE_URL + ple.get('href') for ple in project_links_elements if ple.get('href')]

for project_url in project_urls:
    project_data = {}
    project_data["profile_url"] = project_url

    browser.open(project_url)
    b = browser.response()
    soup = BeautifulSoup(b.read())

    # find() returns None when the element is absent; record a missing title
    # instead of letting an AttributeError abort the whole run and discard
    # every project collected so far.
    title_element = soup.find(id="page-title")
    project_data["title"] = title_element.string if title_element is not None else None

    for field_slug in PROJECT_FIELDS:
        project_data[field_slug] = elements_to_semicolon_sep_string(soup.select(".field.field-name-field-fp-{0} .field-item".format(field_slug)))
    projects.append(project_data)

print(datetime.now())  # end timestamp


In [76]:
# WORK WITH THE SCRAPED DATA

# Build a DataFrame from the collected project records and discard any row
# in which every column is missing.
projects_df = pandas.DataFrame(projects).dropna(how='all')

# Write the table out as UTF-8 CSV, leaving the index column out of the file
projects_df.to_csv('data/projects.csv', encoding='utf-8', index=False)
projects_df


Unnamed: 0,abstract,aspect-care,care-setting,category,city,community,contract-number,dates,funded-amount,location,organization,principal-investigator,profile_url,status,target-population,technology,title
0,,Care Coordination;Primary Care;Specialty Care,Ambulatory;Community Health Center;Emergency D...,State and Regional Demonstrations in Health In...,Nashville,Non-rural;Rural,290-04-0006,,"$5,189,496",Tennessee,Vanderbilt Center for Better Health,"Frisse, Mark E.",https://healthit.ahrq.gov/ahrq-funded-projects...,Closed,General,Devices;Electronic Medication Administration,State and Regional Demonstration in Health Inf...
1,,Specialty Care,Ambulatory;Ophthalmology;Rehabilitation;Specia...,Transforming Healthcare Quality Through Inform...,New York,Non-rural,,,"$1,442,113",New York,Lighthouse International,"Stuen, Cynthia",https://healthit.ahrq.gov/ahrq-funded-projects...,Closed,,Clinical Decision Support (CDS) System;Electro...,Creating an Evidence Base for Vision Rehabilit...
2,,Acute Care;Quality Measurement/Quality Improve...,Inpatient,Demonstrating the Value of Health Information ...,Iowa City,Rural,,,"$1,304,478",Iowa,University of Iowa,"Ward, Marcia",https://healthit.ahrq.gov/ahrq-funded-projects...,Closed,Rural Health,Clinical Information Systems;Web-based Reporti...,Health Information Technology Value in Rural H...
3,,Chronic Disease Management,Ambulatory,Health IT Grant,Seattle,Non-rural,,,"$99,992",Washington,Group Health Cooperative,"Ralston, James D.",https://healthit.ahrq.gov/ahrq-funded-projects...,Closed,,Consumer Informatics;Health Information Exchan...,Patient-Provider Electronic Messenger in Chron...
4,,Acute Care;Medication Management,Inpatient;Pharmacy,Transforming Healthcare Quality Through Inform...,Durham,Non-rural,,,"$1,423,869",North Carolina,Duke University,"Ferranti, Jeffrey",https://healthit.ahrq.gov/ahrq-funded-projects...,Closed,General,Clinical Decision Support (CDS) System;Compute...,Automated Adverse Drug Event Detection and Int...
5,,Improving Care Across Transitions of Care;Medi...,Inpatient,Demonstrating the Value of Health Information ...,Chicago,Non-rural;Rural,,,"$1,256,591",Illinois,University of Illinois at Chicago,"Graumlich, James",https://healthit.ahrq.gov/ahrq-funded-projects...,Closed,Adults,Automated Discharge Summary;Clinical Documenta...,Value of Technology to Transfer Discharge Info...
6,,Medication Management;Specialty Care,Ambulatory;Inpatient,Demonstrating the Value of Health Information ...,New Haven,Non-rural,,,"$1,474,873",Connecticut,Yale University,"Friedman, Amy",https://healthit.ahrq.gov/ahrq-funded-projects...,Closed,Adults;Kidney Disease;Low-SES/Low Income;Racia...,Electronic Medical Record (EMR)/Electronic Hea...,Web-based Renal Transplant Patient Medication ...
7,,Acute Care,Inpatient,Demonstrating the Value of Health Information ...,Ann Arbor,Non-rural,,,"$1,486,634",Michigan,University of Michigan at Ann Arbor,"Keenan, Gail",https://healthit.ahrq.gov/ahrq-funded-projects...,Closed,,Clinical Decision Support (CDS) System;Internet,Health Information Technology Support for Safe...
8,,Care Coordination;Protected Health Information...,Ambulatory;Community Health Center;Emergency D...,State and Regional Demonstrations in Health In...,Murray,Non-rural;Rural,290-04-0002,,"$4,939,058",Utah,Utah Health Information Network,"Root, Jan",https://healthit.ahrq.gov/ahrq-funded-projects...,Closed,General,Computerized Provider Order Entry (CPOE);Healt...,State and Regional Demonstration in Health Inf...
9,,Care Coordination;Protected Health Information...,Ambulatory;Community Health Center;Emergency D...,State and Regional Demonstrations in Health In...,Providence,Non-rural;Rural,290-04-0007,,"$5,000,000",Rhode Island,Rhode Island Department of Health,"Zimmerman, Amy",https://healthit.ahrq.gov/ahrq-funded-projects...,Closed,General,Bar Coding;E-Prescribing;Health Information Ex...,State and Regional Demonstration in Health Inf...


In [6]:
# Questions? Happy to help.. just email mark.silverberg@socrata.com