In [1]:
import os
import json
import numpy as np
import pandas as pd

from googleapiclient import discovery
from apiclient.discovery import build # libraries needed for google sheets API
from httplib2 import Http
from oauth2client import file, client, tools

import urllib
import requests

In [2]:
# reference using google API and Medium posts on how to access the Google API
# https://towardsdatascience.com/how-to-access-google-sheet-data-using-the-python-api-and-convert-to-pandas-dataframe-5ec020564f0e
# https://developers.google.com/sheets/api/guides/concepts#spreadsheet_id

# the following function was modified from Medium link sourced above and the quickstart.py file from google
def get_google_sheet(spreadsheet_id, range_name):
    """ Retrieve sheet data using OAuth credentials and Google Python API. """
    scopes = 'https://www.googleapis.com/auth/spreadsheets.readonly'
    # Setup the Sheets API
    store = file.Storage(os.getenv("HOME")+'/keys/'+'token.json')
    creds = store.get()
    if not creds or creds.invalid:
        flow = client.flow_from_clientsecrets(os.getenv("HOME")+'/keys/'+'credentials.json', scopes)
        creds = tools.run_flow(flow, store)
    service = build('sheets', 'v4', http=creds.authorize(Http()))

    # Call the Sheets API
    gsheet = service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_name).execute()
    return gsheet

In [14]:
# this is a google sheet that was downloaded from https://data.medicare.gov/Hospital-Compare/Hospital-General-Information/xubh-q36u
# we had to manually look up the link to each mastercharge file on each hospitals' website
# google sheet link https://docs.google.com/spreadsheets/d/1F8yPe-2uMcAOOzRmYnFenC77GiXnJ0afnRXtBUYe5TQ/edit?usp=sharing

SHEET_ID = '1F8yPe-2uMcAOOzRmYnFenC77GiXnJ0afnRXtBUYe5TQ' 
RANGE = 'NYChospitals' # this is the range

gsheet = get_google_sheet(SHEET_ID, RANGE)

In [15]:
gsheet.keys() # check the keys of the gsheet

dict_keys(['range', 'majorDimension', 'values'])

In [16]:
## turn the list of hospitals from gsheet into pd df
nycHospitals = pd.DataFrame.from_records(gsheet['values'][1:], columns=gsheet['values'][0]) 

In [17]:
nycHospitals.head() # to see the dataframe

Unnamed: 0,in_nyc,Provider ID,Hospital Name,Main webpage link,DRG webpage link,All hospital charges link,Remarks on chargemaster link,format,Address,City,...,Readmission national comparison footnote,Patient experience national comparison,Patient experience national comparison footnote,Effectiveness of care national comparison,Effectiveness of care national comparison footnote,Timeliness of care national comparison,Timeliness of care national comparison footnote,Efficient use of medical imaging national comparison,Efficient use of medical imaging national comparison footnote,Location
0,True,330204,BELLEVUE HOSPITAL CENTER,https://www.bellevuehospital.com/patient-pricing,,https://www.bellevuehospital.com/sites/default...,"csv file, no DRG",csv,462 FIRST AVENUE,NEW YORK,...,,Below the national average,,Same as the national average,,Below the national average,,Same as the national average,,"462 FIRST AVENUE NEW YORK, NY (40.740079, -73...."
1,True,330009,BRONX-LEBANON HOSPITAL CENTER,https://www.bronxcare.org/about-us/paying-for-...,https://www.bronxcare.org/fileadmin/SiteFiles/...,https://www.bronxcare.org/fileadmin/SiteFiles/...,"2017 data, Excel file; for DRG, split into \nA...",xls,1276 FULTON AVENUE,BRONX,...,,Below the national average,,Same as the national average,,Below the national average,,Same as the national average,,"1276 FULTON AVENUE BRONX, NY (40.83175, -73.90..."
2,True,330233,BROOKDALE HOSPITAL MEDICAL CENTER,http://www.brookdalehospital.org/charge-master...,http://www.brookdalehospital.org/assets/charge...,http://www.brookdalehospital.org/assets/brookd...,Excel file,xlsx,1 BROOKDALE PLAZA,BROOKLYN,...,,Below the national average,,Below the national average,,Below the national average,,Same as the national average,,"1 BROOKDALE PLAZA BROOKLYN, NY (40.6545, -73.9..."
3,True,330056,BROOKLYN HOSPITAL CENTER AT DOWNTOWN CAMPUS,https://www.tbh.org/patients-and-visitors/bill...,,https://www.tbh.org/sites/default/files/Brookl...,"csv file, no DRG",csv,121 DEKALB AVENUE,BROOKLYN,...,,Below the national average,,Same as the national average,,Below the national average,,Same as the national average,,"121 DEKALB AVENUE BROOKLYN, NY (40.689785, -73..."
4,True,330196,CONEY ISLAND HOSPITAL,https://www.nychealthandhospitals.org/coneyisl...,https://hhinternet.blob.core.windows.net/uploa...,https://hhinternet.blob.core.windows.net/uploa...,"2017 data, NYC Health+Hospital Group, Excel file",xlsx,2601 OCEAN PARKWAY,BROOKLYN,...,,Below the national average,,Below the national average,,Below the national average,,Not Available,Results are not available for this reporting p...,"2601 OCEAN PARKWAY BROOKLYN, NY (40.58494, -73..."


In [18]:
nycHospitals['url'] = np.where(nycHospitals['DRG webpage link']=='NA',
                               nycHospitals['All hospital charges link'], 
                               nycHospitals['DRG webpage link'])

In [19]:
nycHospitals.columns

Index(['in_nyc', 'Provider ID', 'Hospital Name', 'Main webpage link',
       'DRG webpage link', 'All hospital charges link',
       'Remarks on chargemaster link', 'format', 'Address', 'City', 'State',
       'ZIP Code', 'County Name', 'Phone Number', 'Hospital Type',
       'Hospital Ownership', 'Emergency Services',
       'Meets criteria for meaningful use of EHRs', 'Hospital overall rating',
       'Hospital overall rating footnote', 'Mortality national comparison',
       'Mortality national comparison footnote',
       'Safety of care national comparison',
       'Safety of care national comparison footnote',
       'Readmission national comparison',
       'Readmission national comparison footnote',
       'Patient experience national comparison',
       'Patient experience national comparison footnote',
       'Effectiveness of care national comparison',
       'Effectiveness of care national comparison footnote',
       'Timeliness of care national comparison',
       'Timeli

In [20]:
nycHospitals.drop(['in_nyc', 'Main webpage link',
       'DRG webpage link', 'All hospital charges link',
       'Remarks on chargemaster link', 'Address', 'City', 'State', 'ZIP Code',
       'County Name', 'Phone Number', 'Hospital Type', 'Hospital Ownership',
       'Emergency Services', 'Meets criteria for meaningful use of EHRs',
       'Hospital overall rating', 'Hospital overall rating footnote',
       'Mortality national comparison',
       'Mortality national comparison footnote',
       'Safety of care national comparison',
       'Safety of care national comparison footnote',
       'Readmission national comparison',
       'Readmission national comparison footnote',
       'Patient experience national comparison',
       'Patient experience national comparison footnote',
       'Effectiveness of care national comparison',
       'Effectiveness of care national comparison footnote',
       'Timeliness of care national comparison',
       'Timeliness of care national comparison footnote',
       'Efficient use of medical imaging national comparison',
       'Efficient use of medical imaging national comparison footnote',
       'Location'], axis = 1, inplace = True)

In [21]:
nycHospitals.columns = ['providerId', 'hospitalName', 'format','url']

In [22]:
nycHospitals

Unnamed: 0,providerId,hospitalName,format,url
0,330204,BELLEVUE HOSPITAL CENTER,csv,https://www.bellevuehospital.com/sites/default...
1,330009,BRONX-LEBANON HOSPITAL CENTER,xls,https://www.bronxcare.org/fileadmin/SiteFiles/...
2,330233,BROOKDALE HOSPITAL MEDICAL CENTER,xlsx,http://www.brookdalehospital.org/assets/charge...
3,330056,BROOKLYN HOSPITAL CENTER AT DOWNTOWN CAMPUS,csv,https://www.tbh.org/sites/default/files/Brookl...
4,330196,CONEY ISLAND HOSPITAL,xlsx,https://hhinternet.blob.core.windows.net/uploa...
5,330128,ELMHURST HOSPITAL CENTER,xlsx,https://hhinternet.blob.core.windows.net/uploa...
6,330193,FLUSHING HOSPITAL MEDICAL CENTER,,
7,330240,HARLEM HOSPITAL CENTER,xlsx,https://hhinternetauto.blob.core.windows.net/u...
8,330270,HOSPITAL FOR SPECIAL SURGERY,xlsx,https://www.hss.edu/files/HSS-Average-Inpatien...
9,330397,INTERFAITH MEDICAL CENTER,csv,http://www.interfaithmedical.com/patients-and-...


In [12]:
l = len(nycHospitals)

In [26]:
for i in range(0, l):
    url = nycHospitals['url'][i]
    name = nycHospitals['hospitalName'][i].replace(' ', '_').lower()
    
    if nycHospitals['format'][i] == 'csv':
        try:
            urllib.request.urlretrieve(url, name+nycHospitals['providerId'][i]+'.csv')
            os.system('mv ' + name+nycHospitals['providerId'][i]+'.csv ' + os.getcwd()+'/dataFiles/hospitalMasterCharges/')
            print(i, name)
        except:
            print('An error occurred in :', name)
    elif nycHospitals['format'][i] == 'xlsx':
        try:
            resp = requests.get(url)
            output = open(name+nycHospitals['providerId'][i]+'.xlsx', 'wb')
            output.write(resp.content)
            output.close()
            os.system('mv ' + name+nycHospitals['providerId'][i]+'.xlsx ' + os.getcwd()+'/dataFiles/hospitalMasterCharges/')
            print(i, name)
        except:
            print('An error occurred in :', name)
            !curl url > name+nycHospitals['providerId'][i]+'.xlsx'
    elif nycHospitals['format'][i] == 'xls':
        try:
            resp = requests.get(url)
            output = open(name+nycHospitals['providerId'][i]+'.xls', 'wb')
            output.write(resp.content)
            output.close()
            os.system('mv ' + name+nycHospitals['providerId'][i]+'.xls ' + os.getcwd()+'/dataFiles/hospitalMasterCharges/')
            print(i, name)
        except:
            print('An error occurred in :', name)
            !curl url > name+nycHospitals['providerId'][i]+'.xls'
    else:
        print(i, name, "NA")

0 bellevue_hospital_center
1 bronx-lebanon_hospital_center
2 brookdale_hospital_medical_center
3 brooklyn_hospital_center_at_downtown_campus
4 coney_island_hospital
5 elmhurst_hospital_center
6 flushing_hospital_medical_center NA
7 harlem_hospital_center
8 hospital_for_special_surgery
9 interfaith_medical_center
10 jacobi_medical_center
11 jamaica_hospital_medical_center NA
12 kings_county_hospital_center
13 kingsbrook_jewish_medical_center
An error occurred in : lenox_hill_hospital
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0curl: (6) Could not resolve host: url
15 lincoln_medical_&_mental_health_center
16 long_island_jewish_medical_center
17 maimonides_medical_center
18 metropolitan_hospital_center
19 montefiore_medical_center
20 mount_sinai_beth_israel NA
21 mount_sinai_hospital
22 mount_sinai_

'/projects/mls_hospital/shared'