In [1]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

In [2]:
pd.set_option("display.max_columns", 500)

#Load npidata, filtered to Nashville zip codes, into SQLITE

zips = pd.read_excel("../data/ZIP_CBSA_122017.xlsx", converters={'zip': lambda x: str(x)})
zips = zips[zips['cbsa'] == 34980]

with sqlite3.connect('../data/hcbb.sqlite') as db:

    npidata_raw = pd.read_csv("../data/npidata_pfile_20050523-20210207.csv", chunksize = 10000)
    for chunk in tqdm(npidata_raw):

        npidata = pd.concat([
            chunk[['NPI']],
            # Entity Type Code: 1 = Provider (doctors, nurses, etc.) / 2 = Facility (Hospitals, Urgent Care, Doctors Offices) 
            chunk[['Entity Type Code']],
            # Entity Name: Either First/Last or Organization or Other Organization Name contained in the following fields:
            chunk.loc[:, 'Provider Organization Name (Legal Business Name)':'Provider Credential Text'],
            # Address: Business Practice Location (not mailing), contained in the following fields:
            chunk.loc[:, 'Provider First Line Business Practice Location Address':'Provider Business Practice Location Address Postal Code'],
        ], axis=1)

        npi_taxonomy = pd.concat([
            chunk[['NPI']],
            # The provider's taxonomy code, which is contained in one of the 'Healthcare Provider Taxonomy Code*' columns
            chunk[chunk.columns[pd.Series(chunk.columns).str.startswith('Healthcare Provider Taxonomy Code_')]],
            chunk[chunk.columns[pd.Series(chunk.columns).str.startswith('Healthcare Provider Primary Taxonomy Switch_')]]
        ], axis=1)

        # Pivot from widet to long format
        npi_taxonomy = pd.wide_to_long(
            npi_taxonomy,
            stubnames=['Healthcare Provider Taxonomy Code', 'Healthcare Provider Primary Taxonomy Switch'],
            i=['NPI'], 
            j='primary_taxonomy_index',
            sep="_"
        )

        # Only keep the primary taxonomy
        npi_taxonomy = npi_taxonomy[npi_taxonomy['Healthcare Provider Primary Taxonomy Switch'] == 'Y']

        # Housekeeping
        npi_taxonomy = npi_taxonomy.reset_index()\
            .drop(columns=['primary_taxonomy_index', 'Healthcare Provider Primary Taxonomy Switch'])\
            .rename({ 'Healthcare Provider Taxonomy Code': 'taxonomy_code' }, axis=1)

        npidata = npidata.merge(
            npi_taxonomy,
            how='left',
            on='NPI'
        )

        # Rename columns
        npidata = npidata.reset_index().rename({
            'NPI': 'npi',
            'Entity Type Code': 'entity_type_code',
            'Provider Organization Name (Legal Business Name)': 'provider_org_name',
            'Provider Last Name (Legal Name)': 'provider_last_name',
            'Provider First Name': 'provider_first_name',
            'Provider Middle Name': 'provider_middle_name',
            'Provider Name Prefix Text': 'provider_name_prefix',
            'Provider Name Suffix Text': 'provider_name_suffix',
            'Provider Credential Text': 'provider_credential',
            'Provider First Line Business Practice Location Address': 'provider_business_address_1',
            'Provider Second Line Business Practice Location Address': 'provider_business_address_2',
            'Provider Business Practice Location Address City Name': 'provider_business_city',
            'Provider Business Practice Location Address State Name': 'provider_business_state',
            'Provider Business Practice Location Address Postal Code': 'provider_business_zip'
        }, axis=1)

        # Create Zip5 column to merge down the road
        npidata['provider_business_zip5'] = [str(i)[0:5] for i in npidata['provider_business_zip']]
        
        # Correct data types
        npidata['npi'] = npidata['npi'].astype(str)
        npidata['entity_type_code'] = npidata['entity_type_code'].astype(str).str.split('.').str[0]
        npidata['provider_business_zip'] = npidata['provider_business_zip'].astype(str).str.split('.').str[0]
        npidata['provider_business_zip5'] = npidata['provider_business_zip5'].astype(str).str.split('.').str[0]
        npidata = npidata[npidata['provider_business_zip5'].isin(zips['zip'])]
        
        # Remove unneeded columns
        npidata = npidata.drop('index', axis=1)
        
        # Filter to Nashville zips
        npidata = npidata[npidata['provider_business_zip5'].isin(zips['zip'])]

        npidata.to_sql('npidata', db, if_exists = 'append', index = False)                           

    print('task done')

In [20]:
# create a database or connect to an existing one
db = sqlite3.connect('../data/hcbb.sqlite')
# if you need to edit the database...
cursor = db.cursor()
# Drop the table and return a line that says that it's gone
cursor.execute("DROP TABLE npidata")
print("Table dropped...")

Table dropped...


In [23]:
#Test
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    SELECT COUNT(*) AS count_all
    FROM npidata;
    """ 
    
    test = pd.read_sql(query, db)

test

Unnamed: 0,count_all
0,41401


#Load Taxonomy to SQLITE
with sqlite3.connect('../data/hcbb.sqlite') as db:   
    taxonomy = pd.read_csv("../data/nucc_taxonomy_210.csv")
    taxonomy = taxonomy[['Code', 'Grouping', 'Classification', 'Specialization']]
    taxonomy.columns = ['taxonomy_code', 'grouping', 'classification', 'specialization']
    taxonomy.to_sql('taxonomy', db, if_exists = 'append', index = False)  
    
    print('task done')

In [6]:
# Load Hop Teaming to SQLITE
for chunk in tqdm(pd.read_csv("../data/DocGraph_Hop_Teaming_2017.csv", chunksize = 10000)):
    # Append the chunk to a hop_teaming table
    chunk.to_sql(
        'hop_teaming', # The table name
        db, # The database
        if_exists = 'append', 
        index = False # Do not include the pandas index column
    )

# When done, print done
print('Task done.')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Task done.


In [18]:
#Listing currently existing tables in the database
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
        SELECT name
        FROM sqlite_master 
        WHERE type ='table' 
        AND name NOT LIKE 'sqlite_%';
        """ 

    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,name
0,npidata
1,taxonomy
2,hop_teaming


In [14]:
#Listing currently existing tables in the database
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
        SELECT DISTINCT to_npi
            FROM hop_teaming
            WHERE to_npi NOT IN (
                SELECT from_npi
                FROM hop_teaming
            );
        """ 
    
    test = pd.read_sql(query, db)
    
test

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1023046604,1003981598,13,13,120.462,86.479
1,1013950807,1013148030,12,25,23.440,34.864
2,1003810375,1013306133,11,11,32.364,25.621
3,1013995133,1013322155,15,41,26.537,41.173
4,1023046604,1023105665,13,13,142.615,98.607
...,...,...,...,...,...,...
10655,1235307034,1245561513,11,14,33.071,24.895
10656,1255684460,1245699529,14,14,120.929,80.365
10657,1255468328,1245754225,13,16,86.813,96.649
10658,1265493001,1255346318,15,17,91.588,74.816


In [57]:
npidata_1['entity_type_code']

64      1.0
73      1.0
292     1.0
503     1.0
815     1.0
       ... 
9192    2.0
9262    1.0
9268    1.0
9569    1.0
9717    1.0
Name: entity_type_code, Length: 92, dtype: object

## Filter from_npi to be entity type 1 and to_npi to be entity type 2
### NOTE: The following cell codes runs in ~5-10 minutes. Set to markdown for security.

In [24]:
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    WITH npi_entity_type_1 AS (
        SELECT npi
        FROM npidata 
        WHERE entity_type_code = 1
    ), npi_entity_type_2 AS (
        SELECT npi
        FROM npidata 
        WHERE entity_type_code = 2
    )
    SELECT *
    FROM hop_teaming
    WHERE from_npi IN npi_entity_type_1
    AND to_npi IN npi_entity_type_2
    """
    
    filtered_hop_teaming = pd.read_sql(query, db)

display(filtered_hop_teaming.shape)
display(filtered_hop_teaming.head())

(233546, 6)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1043215882,1003028770,29,44,47.455,56.183
1,1043232879,1003028770,24,24,112.333,80.894
2,1043302466,1003028770,24,26,98.192,97.772
3,1033297429,1003028770,56,62,53.145,58.831
4,1043206329,1003028770,173,177,97.864,81.756


In [25]:
# Filter so that the transaction_count is >= 50 and average_day_wait <= 50
filtered_hop_teaming = filtered_hop_teaming[
    (filtered_hop_teaming["transaction_count"] >= 50) |
    (filtered_hop_teaming["average_day_wait"] <= 50)
]

display(filtered_hop_teaming.shape)
display(filtered_hop_teaming.head())

(132000, 6)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1043215882,1003028770,29,44,47.455,56.183
3,1033297429,1003028770,56,62,53.145,58.831
4,1043206329,1003028770,173,177,97.864,81.756
9,1003922881,1003028770,17,21,45.619,41.519
10,1003963976,1003028770,2535,3945,0.0,0.0


## Creating a new table called filtered_hop_teaming

### IMPORTANT! This loading into the database should only be run once. If you run this multiple times, it will create duplicate entries in the database. For the security of not re-running this code by accident, the code here is converted into markdown. If you need to rebuild the database, delete the data/hcbb.sqlite file and re-run this cell as code. You will also need to make sure to re-run any other related scripts that builds other tables in the database.

In [26]:
with sqlite3.connect('../data/hcbb.sqlite') as db:
    filtered_hop_teaming.to_sql(
        'filtered_hop_teaming', 
        db, 
        if_exists = 'append', 
        index = False)

    # When done, print done
    print('Task done.')

Task done.


In [None]:
nodes = list(set(ht.from_npi.tolist() + ht.to_npi.tolist()))
node_df = pd.DataFrame({'npi:ID': nodes})
node_df[':LABEL'] = "Provider"
node_df.to_csv('import/nodes.csv', index = False)
edges = pd.DataFrame({':START_ID' : ht.from_npi, 'patient_count': ht.patient_count, 
                      'transaction_count': ht.transaction_count, 
                     ':END_ID' : ht.to_npi})
edges[':TYPE'] = 'REFERRED_TO'
edges.to_csv('import/edges.csv', index = False)