In [2]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

In [3]:
# Let wide frames show up to 500 columns before pandas truncates the display
pd.options.display.max_columns = 500

## SUMMARY
### 4 SQLite Tables:
#### 1 - npidata - filtered to Nashville CBSA zip codes in TN; only each provider's primary taxonomy code is kept
#### 2 - taxonomy - no filtering
#### 3 - hop_teaming - no filtering
#### 4 - filtered_hop_teaming - Filtered to correct entity types, transaction counts, and wait times

## VERONICA - Only pay attention to cells directly below a header with * Title Here *

## * Load SQLite Table 1: npidata *
### Filter to Nashville zip code and Primary Taxonomy; Clean Columns
### VERONICA - The taxonomy code is complex, but correct. Maeva did it.

In [None]:
# Build the `npidata` SQLite table: one pass over the NPPES file in chunks, keeping
# identity/name/practice-address columns plus the primary taxonomy code, restricted to
# providers whose practice location is in a Nashville CBSA zip code in Tennessee.

# Read in Zip codes (keep leading zeros, filter to zip codes in the Nashville CBSA)
zips = pd.read_excel("../data/ZIP_CBSA_122017.xlsx", converters={'zip': str})
zips = zips[zips['cbsa'] == 34980]

# NPPES column name -> column name used in the SQLite table (built once, reused for every chunk)
npidata_column_names = {
    'NPI': 'npi',
    'Entity Type Code': 'entity_type_code',
    'Provider Organization Name (Legal Business Name)': 'provider_org_name',
    'Provider Last Name (Legal Name)': 'provider_last_name',
    'Provider First Name': 'provider_first_name',
    'Provider Middle Name': 'provider_middle_name',
    'Provider Name Prefix Text': 'provider_name_prefix',
    'Provider Name Suffix Text': 'provider_name_suffix',
    'Provider Credential Text': 'provider_credential',
    'Provider First Line Business Practice Location Address': 'provider_business_address_1',
    'Provider Second Line Business Practice Location Address': 'provider_business_address_2',
    'Provider Business Practice Location Address City Name': 'provider_business_city',
    'Provider Business Practice Location Address State Name': 'provider_business_state',
    'Provider Business Practice Location Address Postal Code': 'provider_business_zip'
}

# The sqlite3 connection context manager commits on success / rolls back on error
with sqlite3.connect('../data/hcbb.sqlite') as db:

    # Start from an empty table: the chunks below are appended, so without this
    # a second run of the cell would duplicate every row
    db.execute("DROP TABLE IF EXISTS npidata")

    npidata_raw = pd.read_csv(
        "../data/npidata_pfile_20050523-20210207.csv",
        chunksize = 10000,
        # Read postal codes as text so they are never parsed as numbers
        # (numeric parsing drops leading zeros and adds a '.0' when a chunk has missing values)
        dtype = {'Provider Business Practice Location Address Postal Code': str}
    )
    for chunk in tqdm(npidata_raw):

        npidata = pd.concat([
            chunk[['NPI']],
            # Entity Type Code: 1 = Provider (doctors, nurses, etc.) / 2 = Facility (Hospitals, Urgent Care, Doctors Offices)
            chunk[['Entity Type Code']],
            # Entity Name: Either First/Last or Organization or Other Organization Name contained in the following fields:
            chunk.loc[:, 'Provider Organization Name (Legal Business Name)':'Provider Credential Text'],
            # Address: Business Practice Location (not mailing), contained in the following fields:
            chunk.loc[:, 'Provider First Line Business Practice Location Address':'Provider Business Practice Location Address Postal Code'],
        ], axis=1)

        # The provider's taxonomy codes and their "is primary" switches live in numbered column pairs
        # ('Healthcare Provider Taxonomy Code_<k>' / 'Healthcare Provider Primary Taxonomy Switch_<k>')
        is_taxonomy_code = chunk.columns.str.startswith('Healthcare Provider Taxonomy Code_')
        is_primary_switch = chunk.columns.str.startswith('Healthcare Provider Primary Taxonomy Switch_')
        npi_taxonomy = pd.concat([
            chunk[['NPI']],
            chunk.loc[:, is_taxonomy_code],
            chunk.loc[:, is_primary_switch]
        ], axis=1)

        # Pivot from wide to long format: one row per (NPI, numbered slot)
        npi_taxonomy = pd.wide_to_long(
            npi_taxonomy,
            stubnames=['Healthcare Provider Taxonomy Code', 'Healthcare Provider Primary Taxonomy Switch'],
            i=['NPI'],
            j='primary_taxonomy_index',
            sep="_"
        )

        # Only keep the primary taxonomy
        npi_taxonomy = npi_taxonomy[npi_taxonomy['Healthcare Provider Primary Taxonomy Switch'] == 'Y']

        # Housekeeping
        npi_taxonomy = npi_taxonomy.reset_index()\
            .drop(columns=['primary_taxonomy_index', 'Healthcare Provider Primary Taxonomy Switch'])\
            .rename({ 'Healthcare Provider Taxonomy Code': 'taxonomy_code' }, axis=1)

        # NOTE(review): a provider flagged 'Y' on more than one slot would come out of this
        # left merge with more than one row per NPI - confirm whether that occurs in the data
        npidata = npidata.merge(
            npi_taxonomy,
            how='left',
            on='NPI'
        )

        # Rename columns
        npidata = npidata.rename(columns=npidata_column_names)

        # Correct data types
        npidata['npi'] = npidata['npi'].astype(str)
        npidata['entity_type_code'] = npidata['entity_type_code'].astype(str).str.split('.').str[0]
        npidata['provider_business_zip'] = npidata['provider_business_zip'].astype(str)

        # Create Zip5 column to merge down the road (first five characters of the postal code)
        npidata['provider_business_zip5'] = npidata['provider_business_zip'].str[:5]

        # Filter to Nashville zips, TN state
        in_nashville_zip = npidata['provider_business_zip5'].isin(zips['zip'])
        in_tennessee = npidata['provider_business_state'].isin(['TN', 'TENNESSEE'])
        npidata = npidata[in_nashville_zip & in_tennessee]

        npidata.to_sql('npidata', db, if_exists = 'append', index = False)

    print('task done')

### Removed Table to make an update.

#create a database or connect to an existing one
db = sqlite3.connect('../data/hcbb.sqlite')
#if you need to edit the database...
cursor = db.cursor()
#Drop the table; IF EXISTS keeps this from raising when the table is not there (fresh database, or already dropped)
cursor.execute("DROP TABLE IF EXISTS filtered_hop_teaming")
#Persist the change explicitly; the connection is left open on purpose
db.commit()
print("Table dropped...")

In [None]:
# Sanity check: count the rows that ended up in the npidata table.
query = """
    SELECT COUNT(*) AS count_all
    FROM npidata;
    """

with sqlite3.connect('../data/hcbb.sqlite') as db:
    test = pd.read_sql(sql=query, con=db)

test

## * Load SQLite Table 2: taxonomy *

In [None]:
# Load the NUCC taxonomy lookup: keep the four columns we use and give them snake_case names
taxonomy = pd.read_csv("../data/nucc_taxonomy_210.csv")
taxonomy = taxonomy[['Code', 'Grouping', 'Classification', 'Specialization']].rename(columns={
    'Code': 'taxonomy_code',
    'Grouping': 'grouping',
    'Classification': 'classification',
    'Specialization': 'specialization'
})

with sqlite3.connect('../data/hcbb.sqlite') as db:
    # 'replace' rather than 'append': the whole lookup is written in one call, so replacing
    # keeps the cell safe to re-run (appending would duplicate every taxonomy code and
    # fan out any later join on taxonomy_code)
    taxonomy.to_sql('taxonomy', db, if_exists = 'replace', index = False)

    print('task done')

## * Load SQLite Table 3, hop_teaming *

In [None]:
# Load the full hop-teaming file into the hop_teaming table, 10,000 rows at a time.
# The connection is opened here so the cell does not depend on a `db` variable
# left behind by whichever cell happened to run before it.
with sqlite3.connect('../data/hcbb.sqlite') as db:

    # Start from an empty table: chunks are appended, so without this a second run
    # of the cell would store every row twice
    db.execute("DROP TABLE IF EXISTS hop_teaming")

    for chunk in tqdm(pd.read_csv("../data/DocGraph_Hop_Teaming_2017.csv", chunksize = 10000)):
        # Append the chunk to a hop_teaming table
        chunk.to_sql(
            'hop_teaming', # The table name
            db, # The database
            if_exists = 'append',
            index = False # Do not include the pandas index column
        )

#When done, print done
print('Task done.')

In [None]:
# Show which user tables exist in the database right now
# (the query skips SQLite's own sqlite_* bookkeeping tables)
query = """
        SELECT name
        FROM sqlite_master 
        WHERE type ='table' 
        AND name NOT LIKE 'sqlite_%';
        """

with sqlite3.connect('../data/hcbb.sqlite') as db:
    test_df = pd.read_sql(sql=query, con=db)

display(test_df)

## * Filter by entity type *
### Filter from_npi to be entity type 1 and to_npi to be entity type 2, to be used for SQLite Table 4.
#### NOTE: The following cell's code runs in ~5-10 minutes. Set the cell to markdown to guard against re-running it by accident.

In [None]:
# Pull the hop_teaming rows whose from_npi is an entity type 1 NPI in npidata
# and whose to_npi is an entity type 2 NPI in npidata.
query = """
    WITH npi_entity_type_1 AS (
        SELECT npi
        FROM npidata 
        WHERE entity_type_code = 1
    ), npi_entity_type_2 AS (
        SELECT npi
        FROM npidata 
        WHERE entity_type_code = 2
    )
    SELECT *
    FROM hop_teaming
    WHERE from_npi IN npi_entity_type_1
    AND to_npi IN npi_entity_type_2
    """

with sqlite3.connect('../data/hcbb.sqlite') as db:
    filtered_hop_teaming = pd.read_sql(sql=query, con=db)

# Size of the result, then a peek at its first rows
display(filtered_hop_teaming.shape, filtered_hop_teaming.head())

## * Filter by transaction count and average day wait *

In [None]:
# Keep only rows with transaction_count of at least 50 and average_day_wait of at most 50
has_enough_transactions = filtered_hop_teaming["transaction_count"] >= 50
has_short_wait = filtered_hop_teaming["average_day_wait"] <= 50
filtered_hop_teaming = filtered_hop_teaming[has_enough_transactions & has_short_wait]

# Size of the result, then a peek at its first rows
display(filtered_hop_teaming.shape, filtered_hop_teaming.head())

## * Load SQLite Table 4, filtered_hop_teaming *

### IMPORTANT! This load into the database should only be run once. Running it multiple times creates duplicate entries in the database. To guard against re-running it by accident, the code here has been converted to markdown. If you need to rebuild the database, delete the data/hcbb.sqlite file and re-run this cell as code. You will also need to re-run any other related scripts that build other tables in the database.

with sqlite3.connect('../data/hcbb.sqlite') as db:
    # Append the filtered frame to the filtered_hop_teaming table (pandas index not written)
    filtered_hop_teaming.to_sql(name='filtered_hop_teaming', con=db, if_exists='append', index=False)

    # Signal completion
    print('Task done.')

In [None]:
# Preview the first rows of the filtered_hop_teaming frame
filtered_hop_teaming.head()