# Data Loading to BigQuery

This notebook takes the Price Transparency files we have collected, converts them into a standard format, and loads them into BigQuery.

We start by loading a single hospital's file end to end; the remaining files will be loaded afterward. See https://github.com/pauldria/ncssm-2022-jterm-price-transparency#data for more.

In [1]:
from tools import tools

from google.cloud import bigquery

import datetime
import json
import os
import pandas as pd
import urllib

# Raise pandas' display limits so frames shown in cell output can render
# up to 500 rows and 500 columns before being elided.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
# Point the Google client libraries at the service-account key file.
# A GOOGLE_APPLICATION_CREDENTIALS value that is already exported wins, so the
# notebook runs unchanged on other machines and in CI; otherwise fall back to
# the key under the current user's ~/.ssh rather than a path tied to one
# person's home directory.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    os.path.expanduser("~/.ssh/ncssm-price-transparency-d58392a32442.json"),
)

env: GOOGLE_APPLICATION_CREDENTIALS=/home/raff/.ssh/ncssm-price-transparency-d58392a32442.json


In [3]:
# Parse config.json (resolved against the working directory) into `config`;
# the processing loop reads its "data" list, one entry per hospital file.
with open("config.json") as config_file:
    config = json.load(config_file)

In [4]:
# Run date as YYYY-MM-DD; passed to standardize() as the date the data was obtained.
date_obtained = datetime.datetime.now().strftime("%Y-%m-%d")

# NOTE: every iteration rebinds df / df_processed / df_transformed /
# df_standardized, so after the loop only the last entry's frames remain.
for i, entry in enumerate(config["data"]):
    # The entry's position in config["data"] doubles as its identifier.
    identifier = i
    hospital_name = entry["hospital_name"]

    # Use the local copy of the file when the entry lists one.
    if "data_url_local" in entry:
        filepath = entry["data_url_local"]
    else:
        filepath = entry["data_url"]

    skiprows = entry["skiprows"]
    date_provided = entry["date_provided"]

    # Per-hospital names for the summary charge columns, handed to standardize().
    gross_charge_name = entry["gross_charge_name"]
    min_inpatient_name = entry["min_inpatient_name"]
    max_inpatient_name = entry["max_inpatient_name"]
    min_outpatient_name = entry["min_outpatient_name"]
    max_outpatient_name = entry["max_outpatient_name"]

    print(f"Processing {hospital_name}")

    # Read every column as a string; only the empty string is added as an extra NA marker.
    df = pd.read_csv(filepath, skiprows=skiprows, dtype=str, na_values="")
    df_processed = tools.process(df)

    # Presumably columns 0-3 stay fixed while the rest are unpivoted into
    # payer/cost rows -- confirm against tools.transform.
    df_transformed = tools.transform(df_processed, fixed_start=0, fixed_end=4)
    df_transformed.columns = ["code", "ndc", "rev_code", "description", "payer", "cost"]

    df_standardized = tools.standardize(
        df_transformed,
        cols=["code", "description", "payer", "cost"],
        id=identifier,
        date_obtained=date_obtained,
        date_provided=date_provided,
        hospital_name=hospital_name,
        gross_charge_name=gross_charge_name,
        min_inpatient_name=min_inpatient_name,
        max_inpatient_name=max_inpatient_name,
        min_outpatient_name=min_outpatient_name,
        max_outpatient_name=max_outpatient_name,
    )

Processing Alamance Regional Medical Center


In [5]:
# (rows, columns) after each pipeline stage, in order:
# raw -> processed -> transformed -> standardized.
for frame in (df, df_processed, df_transformed, df_standardized):
    print(frame.shape)

(113385, 115)
(113385, 115)
(12585735, 6)
(12585735, 9)


In [6]:
# Preview the first rows of the standardized frame to eyeball the column layout.
df_standardized.head()

Unnamed: 0,id,date_obtained,date_provided,hospital_name,code,code_type,description,payer,cost
0,0,2021-11-28,2021-09-28,Alamance Regional Medical Center,MS001,,Heart Transplant Or Implant Of Heart Assist Sy...,_GROSS_CHARGE,"$490,771.98"
1,0,2021-11-28,2021-09-28,Alamance Regional Medical Center,MS002,,Heart Transplant Or Implant Of Heart Assist Sy...,_GROSS_CHARGE,"$540,615.96"
2,0,2021-11-28,2021-09-28,Alamance Regional Medical Center,MS003,,Ecmo Or Tracheostomy With Mv >96 Hours Or Prin...,_GROSS_CHARGE,"$256,535.52"
3,0,2021-11-28,2021-09-28,Alamance Regional Medical Center,MS004,,Tracheostomy With Mv >96 Hours Or Principal Di...,_GROSS_CHARGE,"$256,923.78"
4,0,2021-11-28,2021-09-28,Alamance Regional Medical Center,MS011,,"Tracheostomy For Face, Mouth And Neck Diagnose...",_GROSS_CHARGE,"$58,037.92"


In [7]:
# Show column dtypes and memory usage; the schema cell below types the BigQuery
# columns by name (id, date_obtained, date_provided) and treats the rest as strings.
df_standardized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12585735 entries, 0 to 12585734
Data columns (total 9 columns):
 #   Column         Dtype         
---  ------         -----         
 0   id             int64         
 1   date_obtained  datetime64[ns]
 2   date_provided  datetime64[ns]
 3   hospital_name  object        
 4   code           object        
 5   code_type      object        
 6   description    object        
 7   payer          object        
 8   cost           object        
dtypes: datetime64[ns](2), int64(1), object(6)
memory usage: 864.2+ MB


In [8]:
# Create the BigQuery client used for the load job below.
# Presumably it authenticates via GOOGLE_APPLICATION_CREDENTIALS set earlier -- confirm.
client = bigquery.Client()

In [9]:
# Columns whose BigQuery type is not STRING; every other column defaults to STRING.
type_overrides = {
    "id": bigquery.enums.SqlTypeNames.INT64,
    "date_obtained": bigquery.enums.SqlTypeNames.DATE,
    "date_provided": bigquery.enums.SqlTypeNames.DATE,
}

# One SchemaField per column of the standardized frame, in frame order.
schema = [
    bigquery.SchemaField(
        column,
        type_overrides.get(column, bigquery.enums.SqlTypeNames.STRING),
    )
    for column in df_standardized
]
schema

[SchemaField('id', 'INTEGER', 'NULLABLE', None, (), None),
 SchemaField('date_obtained', 'DATE', 'NULLABLE', None, (), None),
 SchemaField('date_provided', 'DATE', 'NULLABLE', None, (), None),
 SchemaField('hospital_name', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('code', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('code_type', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('description', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('payer', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('cost', 'STRING', 'NULLABLE', None, (), None)]

In [10]:
# Load-job settings: use the explicit schema built above, and request the
# WRITE_TRUNCATE disposition (per the BigQuery docs, existing table data is
# overwritten rather than appended to).
job_config = bigquery.LoadJobConfig()
job_config.schema = schema
job_config.write_disposition = "WRITE_TRUNCATE"

In [11]:
# Destination table, written as project.dataset.table.
table_id = "ncssm-price-transparency.hospital_data.hospital_0"

# Start loading the standardized frame into that table.
job = client.load_table_from_dataframe(df_standardized, table_id, job_config=job_config)

In [12]:
# Block until the load job finishes; the value returned is shown as the cell output.
job.result()  # Wait for the job to complete.

LoadJob<project=ncssm-price-transparency, location=US, id=2e61d15d-8695-46d5-96c6-187f3660daa3>