In [52]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

In [53]:
# Let wide DataFrames render up to 500 columns before pandas truncates them.
pd.options.display.max_columns = 500

## * Load SQLite Table 1: cbsa *
### Filter to top tot_ratio, then to Nashville cbsa

# Read the ZIP -> CBSA crosswalk with ZIP codes kept as text.
# The converter has to be keyed on the header as it appears in the workbook.
# The column is only renamed from 'ZIP' to 'zip' afterwards, so a converter
# keyed on 'zip' alone never ran and the ZIP codes were left to type inference.
# Both spellings are listed so the read works whichever header the file carries.
zips_raw = pd.read_excel(
    "../data/ZIP_CBSA_122020.xlsx",
    converters={'ZIP': str, 'zip': str},
    engine='openpyxl'
)

# Lower-case the columns the ranking query uses and drop the ratio columns it
# does not need. (No reset_index/drop('index') round-trip: it added a column
# only to remove it again.)
zips = (
    zips_raw
    .rename(columns={
        'ZIP': 'zip',
        'CBSA': 'cbsa',
        'TOT_RATIO': 'tot_ratio'
    })
    .drop(columns=['RES_RATIO', 'BUS_RATIO', 'OTH_RATIO'])
)

# Rank every ZIP's CBSA assignments by tot_ratio inside SQLite, keep the
# top-ranked rows that belong to CBSA 34980, and store them as the `cbsa` table.
with sqlite3.connect('../data/hcbb.sqlite') as db:
    # Stage the crosswalk in a scratch table so the window function can run on it
    zips.to_sql('temp_zips', db, index=False, if_exists='replace')

    # rnk = 1 selects the CBSA(s) with the highest tot_ratio for each zip
    query = ''' 
    WITH ratio AS (
        SELECT 
            zip, 
            cbsa, 
            RANK() OVER(PARTITION BY zip ORDER BY tot_ratio DESC) AS rnk 
        FROM temp_zips
    )
    SELECT 
        zip, 
        cbsa
    FROM ratio
    WHERE rnk = 1 AND cbsa = '34980';
    '''
    cbsa_df = pd.read_sql(query, db)

    # The scratch table is not needed once the result is in memory
    cursor = db.cursor()
    cursor.execute("DROP TABLE temp_zips")
    print("Temporary table dropped...")

    # Persist the filtered ZIP list as the permanent table
    cbsa_df.to_sql('cbsa', db, index=False, if_exists='replace')
    print("cbsa table created.")

In [None]:
# Sanity check: read the cbsa table back and report its dimensions.
query = """
    SELECT *
    FROM cbsa
    """

with sqlite3.connect('../data/hcbb.sqlite') as db:
    test = pd.read_sql(query, db)

test.shape

## * Load SQLite Table 2: npidata *
### Find Primary Taxonomy; Clean Columns

# Load the NPPES provider file into the `npidata` table, chunk by chunk.
#
# Postal codes are identifiers, not numbers. They are read as text so that
# leading zeros survive and no int/float rendering leaks into the ZIP5 prefix.
# With per-chunk type inference, a chunk whose codes parsed as numbers lost any
# leading zero before the first five characters were taken.
postal_code_column = 'Provider Business Practice Location Address Postal Code'

with sqlite3.connect('../data/hcbb.sqlite') as db:

    npidata_raw = pd.read_csv(
        "../data/npidata_pfile_20050523-20210207.csv",
        chunksize = 10000,
        dtype = {postal_code_column: str}
    )

    # NOTE: every chunk is appended, so re-running this cell against an existing
    # npidata table duplicates its rows.
    for chunk in tqdm(npidata_raw):

        npidata = pd.concat([
            chunk[['NPI']],
            # Entity Type Code: 1 = Provider (doctors, nurses, etc.) / 2 = Facility (Hospitals, Urgent Care, Doctors Offices)
            chunk[['Entity Type Code']],
            # Entity Name: Either First/Last or Organization or Other Organization Name contained in the following fields:
            chunk.loc[:, 'Provider Organization Name (Legal Business Name)':'Provider Credential Text'],
            # Address: Business Practice Location (not mailing), contained in the following fields:
            chunk.loc[:, 'Provider First Line Business Practice Location Address':postal_code_column],
        ], axis=1)

        # The provider's taxonomy code lives in one of the numbered
        # 'Healthcare Provider Taxonomy Code_*' columns, each paired with a
        # 'Healthcare Provider Primary Taxonomy Switch_*' column.
        is_taxonomy_code = chunk.columns.str.startswith('Healthcare Provider Taxonomy Code_')
        is_primary_switch = chunk.columns.str.startswith('Healthcare Provider Primary Taxonomy Switch_')
        npi_taxonomy = pd.concat([
            chunk[['NPI']],
            chunk.loc[:, is_taxonomy_code],
            chunk.loc[:, is_primary_switch]
        ], axis=1)

        # Pivot from wide to long format: one row per (NPI, taxonomy slot)
        npi_taxonomy = pd.wide_to_long(
            npi_taxonomy,
            stubnames=['Healthcare Provider Taxonomy Code', 'Healthcare Provider Primary Taxonomy Switch'],
            i=['NPI'],
            j='primary_taxonomy_index',
            sep="_"
        )

        # Only keep the primary taxonomy
        npi_taxonomy = npi_taxonomy[npi_taxonomy['Healthcare Provider Primary Taxonomy Switch'] == 'Y']

        # Housekeeping: back to a flat frame holding just NPI and taxonomy_code
        npi_taxonomy = npi_taxonomy.reset_index()\
            .drop(columns=['primary_taxonomy_index', 'Healthcare Provider Primary Taxonomy Switch'])\
            .rename({ 'Healthcare Provider Taxonomy Code': 'taxonomy_code' }, axis=1)

        # Left merge so providers with no taxonomy flagged 'Y' are still kept
        npidata = npidata.merge(
            npi_taxonomy,
            how='left',
            on='NPI'
        )

        # Rename columns
        npidata = npidata.rename(columns={
            'NPI': 'npi',
            'Entity Type Code': 'entity_type_code',
            'Provider Organization Name (Legal Business Name)': 'provider_org_name',
            'Provider Last Name (Legal Name)': 'provider_last_name',
            'Provider First Name': 'provider_first_name',
            'Provider Middle Name': 'provider_middle_name',
            'Provider Name Prefix Text': 'provider_name_prefix',
            'Provider Name Suffix Text': 'provider_name_suffix',
            'Provider Credential Text': 'provider_credential',
            'Provider First Line Business Practice Location Address': 'provider_business_address_1',
            'Provider Second Line Business Practice Location Address': 'provider_business_address_2',
            'Provider Business Practice Location Address City Name': 'provider_business_city',
            'Provider Business Practice Location Address State Name': 'provider_business_state',
            postal_code_column: 'provider_business_zip'
        })

        # Correct data types. astype(str) turns missing values into the text
        # 'nan'; split('.') strips a trailing '.0' left by float-typed columns.
        npidata['npi'] = npidata['npi'].astype(str)
        npidata['entity_type_code'] = npidata['entity_type_code'].astype(str).str.split('.').str[0]
        npidata['provider_business_zip'] = npidata['provider_business_zip'].astype(str).str.split('.').str[0]

        # Create Zip5 column to merge down the road
        npidata['provider_business_zip5'] = npidata['provider_business_zip'].str[:5]

        npidata.to_sql('npidata', db, if_exists = 'append', index = False)

    print('task done')

In [93]:
# Sanity check: count the rows that landed in the npidata table.
query = """
    SELECT COUNT(*)
    FROM npidata
    """

with sqlite3.connect('../data/hcbb.sqlite') as db:
    test = pd.read_sql(query, db)

test

Unnamed: 0,COUNT(*)
0,6714038


### Drop the `referrals` table so that it can be reloaded with updated data.

# Remove the referrals table so it can be rebuilt from an updated file.
# Connect the same way the other cells do (the `with` block commits on exit).
with sqlite3.connect('../data/hcbb.sqlite') as db:
    # IF EXISTS keeps this cell from raising on a database where the table has
    # not been created yet, e.g. a fresh top-to-bottom run.
    db.execute("DROP TABLE IF EXISTS referrals")
print("Table dropped...")

## * Load SQLite Table 3: taxonomy *

# Load the NUCC taxonomy lookup (code -> grouping / classification /
# specialization) into the `taxonomy` table.
taxonomy = (
    pd.read_csv("../data/nucc_taxonomy_210.csv")
    [['Code', 'Grouping', 'Classification', 'Specialization']]
    .rename(columns={
        'Code': 'taxonomy_code',
        'Grouping': 'grouping',
        'Classification': 'classification',
        'Specialization': 'specialization'
    })
)

with sqlite3.connect('../data/hcbb.sqlite') as db:
    # 'replace' rather than 'append': this is a lookup table, and appending on a
    # re-run would store every code twice, multiplying the rows of any query
    # that joins on taxonomy_code.
    taxonomy.to_sql('taxonomy', db, if_exists = 'replace', index = False)

    print('task done')

## * Load SQLite Table 4, hop_teaming *

# Stream the hop-teaming file into the `hop_teaming` table in 10,000-row chunks.
# The connection is opened here so the cell does not depend on a `db` object
# left behind by whichever cell happened to run before it.
# NOTE: chunks are appended, so re-running this cell against an existing
# hop_teaming table duplicates its rows.
with sqlite3.connect('../data/hcbb.sqlite') as db:
    for chunk in tqdm(pd.read_csv("../data/DocGraph_Hop_Teaming_2017.csv", chunksize = 10000)):
        # Append the chunk to a hop_teaming table
        chunk.to_sql(
            'hop_teaming', # The table name
            db, # The database
            if_exists = 'append',
            index = False # Do not include the pandas index column
        )

#When done, print done
print('Task done.')

In [136]:
# Show the user tables present in the database (SQLite's internal
# sqlite_* tables are excluded by the query).
query = """
        SELECT name
        FROM sqlite_master 
        WHERE type ='table' 
        AND name NOT LIKE 'sqlite_%';
        """

with sqlite3.connect('../data/hcbb.sqlite') as db:
    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,name
0,taxonomy
1,hop_teaming
2,cbsa
3,npidata
4,npidata_nashville
5,filtered_hop_teaming
6,hospitals
7,referrals


In [125]:
# Peek at a single row of filtered_hop_teaming to check its columns.
query = """
        SELECT *
        From filtered_hop_teaming
        LIMIT 1
        """

with sqlite3.connect('../data/hcbb.sqlite') as db:
    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1003963976,1003028770,2535,3945,0.0,0.0


In [127]:
# Count the rows stored in filtered_hop_teaming.
query = """
        SELECT COUNT(*)
        From filtered_hop_teaming
        """

with sqlite3.connect('../data/hcbb.sqlite') as db:
    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,COUNT(*)
0,41418


## * Load SQLite Table 5, npidata_nashville *

# Build npidata_nashville: npidata rows whose ZIP5 appears in the cbsa table,
# with taxonomy columns attached wherever taxonomy_code has a match.
query = ''' 
    SELECT *
    FROM npidata
    INNER JOIN cbsa
    ON npidata.provider_business_zip5 = cbsa.zip
    LEFT JOIN taxonomy USING(taxonomy_code)
    
    '''

with sqlite3.connect('../data/hcbb.sqlite') as db:
    # Pull the join result into pandas, then write it back as its own table
    npidata_nashville = pd.read_sql(query, db)
    npidata_nashville.to_sql('npidata_nashville', db, index=False, if_exists='replace')
    print("npidata_nashville table created.")

## * Filter by entity type *
### Filter hop_teaming (SQLite Table 4) so that from_npi is entity type 1 and to_npi is entity type 2; the result feeds SQLite Table 6 (filtered_hop_teaming).
#### NOTE: The following code cell takes ~5-10 minutes to run. Set it to markdown for safety.

In [117]:
# Keep hop_teaming rows whose from_npi is an entity-type-1 NPI in
# npidata_nashville and whose to_npi is an entity-type-2 NPI in
# npidata_nashville.
query = """
    WITH npi_entity_type_1 AS (
        SELECT npi
        FROM npidata_nashville 
        WHERE entity_type_code = 1
    ), npi_entity_type_2 AS (
        SELECT npi
        FROM npidata_nashville 
        WHERE entity_type_code = 2
    )
    SELECT *
    FROM hop_teaming
    WHERE from_npi IN npi_entity_type_1
    AND to_npi IN npi_entity_type_2
    """

with sqlite3.connect('../data/hcbb.sqlite') as db:
    filtered_hop_teaming = pd.read_sql(query, db)

# Dimensions first, then a preview of the result
display(filtered_hop_teaming.shape, filtered_hop_teaming.head())

(217273, 6)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1043232879,1003028770,24,24,112.333,80.894
1,1043302466,1003028770,24,26,98.192,97.772
2,1033297429,1003028770,56,62,53.145,58.831
3,1043206329,1003028770,173,177,97.864,81.756
4,1003855537,1003028770,15,16,84.25,77.117


In [120]:
# Re-display the dimensions and first rows of the entity-type-filtered frame.
display(filtered_hop_teaming.shape, filtered_hop_teaming.head())

(217273, 6)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1043232879,1003028770,24,24,112.333,80.894
1,1043302466,1003028770,24,26,98.192,97.772
2,1033297429,1003028770,56,62,53.145,58.831
3,1043206329,1003028770,173,177,97.864,81.756
4,1003855537,1003028770,15,16,84.25,77.117


## * Filter by transaction count and average day wait *

In [121]:
# Keep pairs with transaction_count >= 50 and average_day_wait < 50
# (the wait bound is strict: rows at exactly 50 are dropped).
has_enough_transactions = filtered_hop_teaming["transaction_count"] >= 50
has_short_wait = filtered_hop_teaming["average_day_wait"] < 50
filtered_hop_teaming = filtered_hop_teaming[has_enough_transactions & has_short_wait]

display(filtered_hop_teaming.shape, filtered_hop_teaming.head())

(41418, 6)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
9,1003963976,1003028770,2535,3945,0.0,0.0
27,1033246640,1003863580,58,58,45.603,56.574
31,1033215157,1003863580,124,126,22.833,53.329
34,1023223898,1003863580,1739,1872,0.169,5.185
38,1023253549,1003863580,34,53,31.887,50.676


## * Load SQLite Table 6, filtered_hop_teaming *

### IMPORTANT! This load into the database should only be run once. If you run it multiple times, it will create duplicate entries in the database. To avoid re-running this code by accident, the code here is converted into markdown. If you need to rebuild the database, delete the data/hcbb.sqlite file and re-run this cell as code. You will also need to re-run any other related scripts that build other tables in the database.

# Persist the filtered frame as the filtered_hop_teaming table.
# Rows are appended, so running this more than once stores them more than once.
with sqlite3.connect('../data/hcbb.sqlite') as db:
    filtered_hop_teaming.to_sql('filtered_hop_teaming', db, index=False, if_exists='append')

    # Signal completion
    print('Task done.')

In [None]:
# Preview the first rows of the filtered frame.
filtered_hop_teaming.head()

## * Load SQLite Table 7, hospitals *


In [129]:
# Read the normalised hospital list and preview it before loading.
hospitals_csv = '../data/nashville_hospitals_normalized.csv'
hospitals = pd.read_csv(hospitals_csv)
hospitals.head()

Unnamed: 0,to_facility,to_facility_group,to_facility_name_normalised,to_address,to_grouping,to_classification,to_npi,total_patients,total_transactions,Unnamed: 9
0,WILLIAMSON COUNTY HOSPITAL DISTRICT,Williamson Medical Center,Williamson County Hospital,4321 CAROTHERS PARKWAY,Hospitals,General Acute Care Hospital,1265445506,71272,98355,
1,VANDERBILT UNIVERSITY MEDICAL CENTER,Vanderbilt University Medical Center,Vanderbilt University Medical Center,1601 23RD AVE S,Hospitals,General Acute Care Hospital,1558408633,1745,2598,
2,VANDERBILT UNIVERSITY MEDICAL CENTER,Vanderbilt University Medical Center,Vanderbilt University Medical Center,1211 MEDICAL CENTER DRIVE,Hospitals,General Acute Care Hospital,1396882205,404973,603123,
3,VANDERBILT UNIVERSITY MEDICAL CENTER,Vanderbilt University Medical Center,Vanderbilt University Lebanon Medical Center,1411 W. BADDOUR PARKWAY,Hospitals,General Acute Care Hospital,1306889597,34548,48142,
4,TROUSDALE MEDICAL CENTER LLC,Trousdale Medical Center,Trousdale Medical Center,500 CHURCH ST,Hospitals,General Acute Care Hospital,1467763458,1905,4245,Trouser


In [130]:
# Load the normalised hospital list into the `hospitals` table.
# The file is read here so the cell does not depend on an earlier preview cell.
hospitals = pd.read_csv("../data/nashville_hospitals_normalized.csv")

with sqlite3.connect('../data/hcbb.sqlite') as db:
    # 'replace' rather than 'append': re-running the cell rebuilds the table
    # instead of storing every hospital a second time.
    hospitals.to_sql('hospitals', db, if_exists = 'replace', index = False)

    print('task done')

task done


  sql.to_sql(


## * Load SQLite Table 8, referrals *

In [138]:
# Read the hospitals-only referrals file into memory.
# NOTE(review): `hospital_referrals` is not referenced again in the visible
# cells, and the database load below reads a different file
# (..._only_hospitals_any_avg_day_wait.csv). Confirm whether this read is
# still needed.
hospital_referrals = pd.read_csv('../data/nashville_referrals_normalised_only_hospitals.csv')

In [144]:
# Load the hospital referrals (any average day wait) into the `referrals` table.
referrals = pd.read_csv("../data/nashville_referrals_normalised_only_hospitals_any_avg_day_wait.csv")

with sqlite3.connect('../data/hcbb.sqlite') as db:
    # 'replace' rather than 'append': reloading an updated file rebuilds the
    # table instead of stacking the new rows on top of the old ones.
    referrals.to_sql('referrals', db, if_exists = 'replace', index = False)

    print('task done')

task done


  sql.to_sql(
