In [12]:
import csv
import json
import os
import re
import urllib
import urllib.request  # the download loop calls urllib.request.urlretrieve; import the submodule explicitly

import numpy as np
import pandas as pd
import requests

from googleapiclient import discovery
from apiclient.discovery import build # libraries needed for google sheets API
from httplib2 import Http
from oauth2client import file, client, tools

# Record the pandas version this notebook was run with.
print(pd.__version__)

0.23.4


In [2]:
# reference using google API and Medium posts on how to access the Google API
# https://towardsdatascience.com/how-to-access-google-sheet-data-using-the-python-api-and-convert-to-pandas-dataframe-5ec020564f0e
# https://developers.google.com/sheets/api/guides/concepts#spreadsheet_id

# the following function was modified from Medium link sourced above and the quickstart.py file from google
def get_google_sheet(spreadsheet_id, range_name):
    """Retrieve sheet data using OAuth credentials and the Google Sheets v4 API.

    A stored token is read from ``~/keys/token.json``. When no token is stored,
    or the stored one is marked invalid, the OAuth flow is started from
    ``~/keys/credentials.json`` with read-only spreadsheet scope, passing the
    same token store to ``tools.run_flow``.

    Parameters
    ----------
    spreadsheet_id : str
        Identifier of the spreadsheet, passed as ``spreadsheetId``.
    range_name : str
        Range to read, passed as ``range``.

    Returns
    -------
    dict
        The response of ``spreadsheets().values().get(...).execute()``.
    """
    scopes = 'https://www.googleapis.com/auth/spreadsheets.readonly'

    # os.getenv("HOME") returns None when HOME is unset, and None + str raises
    # TypeError; expanduser resolves the home directory without that assumption.
    keys_dir = os.path.join(os.path.expanduser('~'), 'keys')

    # Setup the Sheets API
    store = file.Storage(os.path.join(keys_dir, 'token.json'))
    creds = store.get()
    if not creds or creds.invalid:
        flow = client.flow_from_clientsecrets(os.path.join(keys_dir, 'credentials.json'), scopes)
        creds = tools.run_flow(flow, store)
    service = build('sheets', 'v4', http=creds.authorize(Http()))

    # Call the Sheets API
    request = service.spreadsheets().values().get(spreadsheetId=spreadsheet_id, range=range_name)
    return request.execute()

In [13]:
# Source data: the Hospital General Information table downloaded from
# https://data.medicare.gov/Hospital-Compare/Hospital-General-Information/xubh-q36u
# The link to each hospital's chargemaster file was looked up manually on that
# hospital's website and added to the sheet.
# google sheet link https://docs.google.com/spreadsheets/d/1F8yPe-2uMcAOOzRmYnFenC77GiXnJ0afnRXtBUYe5TQ/edit?usp=sharing

SHEET_ID = '1F8yPe-2uMcAOOzRmYnFenC77GiXnJ0afnRXtBUYe5TQ'
RANGE = 'nycHospitalsProviderIds' # range requested from the sheet (presumably a tab name - confirm)

gsheet = get_google_sheet(spreadsheet_id=SHEET_ID, range_name=RANGE)

In [14]:
gsheet.keys() # top-level keys of the API response; 'values' holds the sheet rows

dict_keys(['range', 'majorDimension', 'values'])

In [15]:
## turn the list of hospitals from gsheet into pd df
# The first row of the sheet is the header; the remaining rows are the records.
sheet_header = gsheet['values'][0]
# The Sheets API leaves trailing empty cells out of each row, so a row can be
# shorter than the header. Pad short rows with None so the frame can still be
# built when every row is short (otherwise the column count would not match).
sheet_rows = [list(row) + [None] * (len(sheet_header) - len(row))
              for row in gsheet['values'][1:]]
nycHospitals = pd.DataFrame.from_records(sheet_rows, columns=sheet_header)

In [16]:
nycHospitals.head(3) # preview the first three rows of the hospital table

Unnamed: 0,in_nyc,Provider ID,fac_id,Hospital Name,Main webpage link,DRG webpage link,All hospital charges link,Remarks on chargemaster link,manualDownload,Address,...,Readmission national comparison footnote,Patient experience national comparison,Patient experience national comparison footnote,Effectiveness of care national comparison,Effectiveness of care national comparison footnote,Timeliness of care national comparison,Timeliness of care national comparison footnote,Efficient use of medical imaging national comparison,Efficient use of medical imaging national comparison footnote,Location
0,True,330009,1164,BRONX-LEBANON HOSPITAL CENTER,https://www.bronxcare.org/about-us/paying-for-...,https://www.bronxcare.org/fileadmin/SiteFiles/...,https://www.bronxcare.org/fileadmin/SiteFiles/...,"2017 data, Excel file; for DRG, split into \nA...",no,1276 FULTON AVENUE,...,,Below the national average,,Same as the national average,,Below the national average,,Same as the national average,,"1276 FULTON AVENUE BRONX, NY (40.83175, -73.90..."
1,True,330014,1629,JAMAICA HOSPITAL MEDICAL CENTER,https://jamaicahospital.org/paying-for-care/ou...,https://jamaicahospital.org/wp-content/uploads...,,for full list need to contact by contacting th...,yes,89TH AVENUE AND VAN WYCK EXPRESSWAY,...,,Below the national average,,Below the national average,,Below the national average,,Same as the national average,,"89TH AVENUE AND VAN WYCK EXPRESSWAY JAMAICA, N..."
2,True,330019,1293,"NEW YORK COMMUNITY HOSPITAL OF BROOKLYN, INC.",https://nych.com/about-us/hospital-charges/,,https://nych.com/wp-content/uploads/2019/01/CH...,Excel file,no,2525 KINGS HIGHWAY,...,,Below the national average,,Same as the national average,,Below the national average,,Not Available,Results are not available for this reporting p...,"2525 KINGS HIGHWAY BROOKLYN, NY (40.613783, -7..."


In [7]:
# Choose the download link per hospital: keep the DRG link unless it is the
# literal string 'NA', in which case use the full charges link.
nycHospitals['url'] = np.where(nycHospitals['DRG webpage link'] != 'NA',
                               nycHospitals['DRG webpage link'],
                               nycHospitals['All hospital charges link'])

In [8]:
# List the column labels currently in the table.
nycHospitals.columns

Index(['in_nyc', 'Provider ID', 'Hospital Name', 'Main webpage link',
       'DRG webpage link', 'All hospital charges link',
       'Remarks on chargemaster link', 'manualDownload', 'Address', 'City',
       'State', 'ZIP Code', 'County Name', 'Phone Number', 'Hospital Type',
       'Hospital Ownership', 'Emergency Services',
       'Meets criteria for meaningful use of EHRs', 'Hospital overall rating',
       'Hospital overall rating footnote', 'Mortality national comparison',
       'Mortality national comparison footnote',
       'Safety of care national comparison',
       'Safety of care national comparison footnote',
       'Readmission national comparison',
       'Readmission national comparison footnote',
       'Patient experience national comparison',
       'Patient experience national comparison footnote',
       'Effectiveness of care national comparison',
       'Effectiveness of care national comparison footnote',
       'Timeliness of care national comparison',
      

In [9]:
# Keep only the columns needed to download the charge files; every other
# column (links, address, ratings, footnotes, location, ...) is discarded.
# Selecting the wanted columns, instead of listing the unwanted ones, stays
# correct when the sheet carries extra columns (e.g. 'fac_id') and guarantees
# the column order that later cells rely on.
columns_to_keep = ['Provider ID', 'Hospital Name', 'manualDownload', 'url']

missing_columns = [col for col in columns_to_keep if col not in nycHospitals.columns]
if missing_columns:
    # Same exception type the original drop raised for unknown labels.
    raise KeyError('nycHospitals is missing expected columns: {}'.format(missing_columns))

# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger chained-assignment warnings.
nycHospitals = nycHospitals[columns_to_keep].copy()

In [10]:
# File extension of each url: the text after the last '.'.
# The [-1:] slice keeps it wrapped in a one-element list (e.g. ['csv']).
nycHospitals['format'] = (
    nycHospitals['url']
    .str.split('.')
    .str[-1:]
)
nycHospitals.head()

Unnamed: 0,Provider ID,Hospital Name,manualDownload,url,format
0,330204,BELLEVUE HOSPITAL CENTER,no,https://www.bellevuehospital.com/sites/default...,[csv]
1,330009,BRONX-LEBANON HOSPITAL CENTER,no,https://www.bronxcare.org/fileadmin/SiteFiles/...,[xlsx]
2,330233,BROOKDALE HOSPITAL MEDICAL CENTER,no,http://www.brookdalehospital.org/assets/charge...,[xlsx]
3,330056,BROOKLYN HOSPITAL CENTER AT DOWNTOWN CAMPUS,no,https://www.tbh.org/sites/default/files/Brookl...,[csv]
4,330196,CONEY ISLAND HOSPITAL,no,https://hhinternet.blob.core.windows.net/uploa...,[xlsx]


In [11]:
# Rename by label instead of assigning a positional list to .columns: a
# positional list raises when the column count differs and silently mislabels
# columns when their order differs. 'manualDownload', 'url' and 'format'
# already carry their final names.
nycHospitals = nycHospitals.rename(columns={
    'Provider ID': 'providerId',
    'Hospital Name': 'hospitalName',
})

In [12]:
# Split each hospital name into title-cased words: the listed punctuation
# characters are replaced by spaces, the text is title-cased, then split on
# whitespace. The raw string holds the same characters as before (including
# the backslash) without relying on the invalid "\|" escape.
nycHospitals['tempName'] = (
    nycHospitals['hospitalName']
    .str.translate({ord(c): " " for c in r"!@#$%^&*()[]{};:,./<>?\|`~-=_+"})
    .str.title()
    .str.split()
)

In [13]:
# Number of rows (hospitals) in the table.
l = len(nycHospitals)

In [14]:
# Download each hospital's charge file into destFol and record, in row order,
# the filename each hospital's file has (or should be given when it has to be
# downloaded manually). `filenames` always ends up with one entry per row.
destFol = os.getcwd()+'/dataFiles/rawHospitalChargeData/'
os.makedirs(destFol, exist_ok=True)  # the download fails if the folder is missing
filenames = []

# Extensions that can be fetched automatically; anything else is manual.
DOWNLOADABLE_FORMATS = ('csv', 'xlsx', 'xls')

for i in range(len(nycHospitals)):
    row = nycHospitals.iloc[i]
    url = row['url']
    # camelCase file stem: first word lower-cased, remaining words appended.
    name_parts = row['tempName']
    name = name_parts[0].lower() + ''.join(name_parts[1:])
    pid = str(row['providerId'])

    # 'format' normally holds a one-element list such as ['csv']; unwrap it.
    # A missing value (NaN) is left as-is and falls through to manual download.
    fmt = row['format']
    if isinstance(fmt, list) and fmt:
        fmt = fmt[0]

    if fmt not in DOWNLOADABLE_FORMATS:
        print(i, name+pid, "_____Requires Manual Download")
        filenames.append(name+pid+'.xlsx')
        continue

    filename = name + pid + '.' + fmt
    # Record the name whether or not the download succeeds, so the list stays
    # aligned with the rows (the csv branch used to skip this on failure).
    filenames.append(filename)
    try:
        # Save straight into destFol: no shell `mv`, so apostrophes in a name
        # or spaces in the working directory cannot break the command.
        urllib.request.urlretrieve(url, os.path.join(destFol, filename))
    except Exception as err:
        # Best effort: report and carry on with the next hospital. Unlike a
        # bare `except:`, this lets KeyboardInterrupt stop the loop.
        print('An error occurred in :  Download Manually ', filename, '\n' + str(url))
        print('    reason:', err)

An error occurred in :  Download Manually  hospitalForSpecialSurgery330270.xlsx 
https://www.hss.edu/files/HSS-Average-Inpatient-Charges-by-MS-DRG-as-of-January-1-2019.xlsx
An error occurred in :  Download Manually  lenoxHillHospital330119.xlsx 
https://www.northwell.edu/sites/northwell.edu/files/inline-files/Northwell%20Health%20-%20CMS%20Mandate%20UPLOAD%20Files%2012192018_2.xlsx
An error occurred in :  Download Manually  longIslandJewishMedicalCenter330195.xlsx 
https://www.northwell.edu/sites/northwell.edu/files/inline-files/Northwell%20Health%20-%20CMS%20Mandate%20UPLOAD%20Files%2012192018_2.xlsx
An error occurred in :  Download Manually  mountSinaiBethIsrael330169.xlsx 
https://www.mountsinai.org/files/MSHealth/Assets/HS/About/Chargemaster_MSH.xlsx
An error occurred in :  Download Manually  mountSinaiHospital330024.xlsx 
https://www.mountsinai.org/files/MSHealth/Assets/HS/About/Average%20Charges%20by%20DRG_MSH.xlsx
An error occurred in :  Download Manually  mountSinaiWest330046.x