## Import Libraries

In [None]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm
import numpy as np

## Hop Teaming Data

In [None]:
# Read just the first 100,000 rows of the hop teaming file as an exploration sample
hop_sample = pd.read_csv(
    '../data/DocGraph_Hop_Teaming_2018.csv',
    nrows=100000,
)

In [None]:
# Print the column names, dtypes and non-null counts of the hop teaming sample
hop_sample.info()

In [None]:
# Try out the pre-filter on the sample: keep rows with at least 25 transactions
# and an average wait of fewer than 90 days
enough_volume = hop_sample['transaction_count'] >= 25
short_wait = hop_sample['average_day_wait'] < 90
hop_sample.loc[enough_volume & short_wait]

# Connect to the SQLite file that holds the hop_team database
db = sqlite3.connect('../data/hop_team_db.sqlite')

# Build the referrals table by streaming the full CSV in 10,000-row chunks
referral_reader = pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', chunksize=10000)
for chunk in tqdm(referral_reader):
    # Screen out likely "accidental" referrals: require at least 25 transactions
    # and an average wait of fewer than 90 days
    enough_volume = chunk['transaction_count'] >= 25
    short_wait = chunk['average_day_wait'] < 90
    chunk = chunk.loc[enough_volume & short_wait]
    # Add the surviving rows to the referrals table
    chunk.to_sql(name='referrals', con=db, if_exists='append', index=False)

## NPI/NPPES Data

In [None]:
# Columns to read from the NPPES file: identity, name and mailing-address fields,
# followed by the 15 numbered taxonomy code / primary-switch column pairs
select_cols = [
    'NPI',
    'Entity Type Code',
    'Provider Organization Name (Legal Business Name)',
    'Provider Last Name (Legal Name)',
    'Provider First Name',
    'Provider Middle Name',
    'Provider Name Prefix Text',
    'Provider Name Suffix Text',
    'Provider Credential Text',
    'Provider First Line Business Mailing Address',
    'Provider Second Line Business Mailing Address',
    'Provider Business Mailing Address City Name',
    'Provider Business Mailing Address State Name',
    'Provider Business Mailing Address Postal Code',
] + [
    # Yields Code_1, Switch_1, Code_2, Switch_2, ... Code_15, Switch_15
    f'Healthcare Provider {field}_{slot}'
    for slot in range(1, 16)
    for field in ('Taxonomy Code', 'Primary Taxonomy Switch')
]

In [None]:
# Load an exploration sample of the NPPES file: only the selected columns,
# skipping file rows 1-599 (row 0, the header, is kept) and reading the next 100,000 rows
npi_sample = pd.read_csv(
    '../data/npidata_pfile_20050523-20230212.csv',
    usecols=select_cols,
    skiprows=range(1, 600),
    nrows=100000,
)

In [None]:
# Print the column names, dtypes and non-null counts of the NPPES sample
npi_sample.info()

In [None]:
# Count the values in the first primary-taxonomy switch column to see how many
# providers do not indicate a primary taxonomy code (designated by a value of 'X')
npi_sample['Healthcare Provider Primary Taxonomy Switch_1'].value_counts()
# Observation from looking at a few chunks: the share appears to be around 4%

In [None]:
# For providers that have indicated a primary taxonomy code, pull that code into a new column.
# The column is created with object dtype: it will hold string codes, and writing strings
# into a float64 column (what a bare `= np.nan` creates) is deprecated in recent pandas.
npi_sample['taxonomy_code'] = pd.Series(np.nan, index = npi_sample.index, dtype = object)

# Walk the 15 switch/code column pairs in order; if more than one switch is 'Y' for a
# provider, the highest-numbered pair is the one that remains in taxonomy_code.
for n in range(1, 16):
    is_primary = npi_sample[f'Healthcare Provider Primary Taxonomy Switch_{n}'] == 'Y'
    npi_sample.loc[is_primary, 'taxonomy_code'] = npi_sample.loc[is_primary, f'Healthcare Provider Taxonomy Code_{n}']

In [None]:
# Providers still without a taxonomy_code (no primary switch was set) fall back to
# whatever is in the first taxonomy code column
no_primary = npi_sample['taxonomy_code'].isna()
npi_sample.loc[no_primary, 'taxonomy_code'] = npi_sample.loc[no_primary, 'Healthcare Provider Taxonomy Code_1']

In [None]:
# The 15 numbered taxonomy code / primary-switch column pairs have been collapsed
# into taxonomy_code, so remove them from the sample
taxonomy_cols = [
    f'Healthcare Provider {field}_{slot}'
    for slot in range(1, 16)
    for field in ('Taxonomy Code', 'Primary Taxonomy Switch')
]
npi_sample = npi_sample.drop(columns=taxonomy_cols)

In [None]:
# Create a function to deal with the leading zeroes that are missing from zip codes

def zip_zeroes(x):
    """Normalize a postal code to a 5-character, zero-padded string.

    Values longer than 5 characters are treated as ZIP+4 codes that may have
    lost leading zeroes: they are left-padded to 9 characters and the first
    5 are kept. Shorter values are left-padded to 5 characters.

    Args:
        x: The postal code, normally a string produced by ``.astype(str)``.
            Other scalars (ints, floats, None) are converted with ``str()``.

    Returns:
        The 5-character code, or ``np.nan`` when the value is missing. Missing
        means None, a float NaN, an empty or whitespace-only string, or the
        text ``'nan'``, ``'none'`` or ``'<na>'`` (any case) that string
        conversion leaves behind for missing entries.
    """
    # Real missing values (not yet stringified)
    if x is None or (isinstance(x, float) and x != x):
        return np.nan

    text = str(x).strip()

    # A column parsed as float renders 12345 as '12345.0'; drop that suffix so it
    # neither inflates the length check nor ends up inside the result
    if text.endswith('.0') and text[:-2].isdigit():
        text = text[:-2]

    # Stringified missing values would otherwise be padded into e.g. '00nan'
    if not text or text.lower() in ('nan', 'none', '<na>'):
        return np.nan

    if len(text) > 5:
        return text.zfill(9)[:5]
    return text.zfill(5)

In [None]:
# Convert the postal code column to strings and run every value through zip_zeroes
zip_col = 'Provider Business Mailing Address Postal Code'
npi_sample[zip_col] = npi_sample[zip_col].astype(str).map(zip_zeroes)

In [None]:
# Simplify the column names: lower-case them, strip the repeated filler words,
# then turn the remaining spaces into underscores. Order matters, so the
# substitutions are applied in sequence.
name_substitutions = [
    ('provider ', ''),
    ('business ', ''),
    (' text', ''),
    (' (legal name)', ''),
    (' ', '_'),
]
tidy_names = []
for original_name in npi_sample.columns:
    tidy = original_name.lower()
    for old, new in name_substitutions:
        tidy = tidy.replace(old, new)
    tidy_names.append(tidy)
npi_sample.columns = tidy_names

In [None]:
# Display the cleaned sample to check the result of the steps above
npi_sample

# create the providers table from npi/nppes data and add it to the database

# Names that are identical for every chunk are built once, outside the loop.
zip_col = 'Provider Business Mailing Address Postal Code'
taxonomy_cols = [f'Healthcare Provider {field}_{slot}'
                 for slot in range(1, 16)
                 for field in ('Taxonomy Code', 'Primary Taxonomy Switch')]

# The postal code is read as text. Left to per-chunk inference, a chunk that is all digits
# plus missing values comes back as float, and '12345.0' then breaks the length logic
# in zip_zeroes.
for chunk in tqdm(pd.read_csv('../data/npidata_pfile_20050523-20230212.csv',
                              usecols = select_cols,
                              chunksize = 10000,
                              low_memory = False,
                              dtype = {zip_col: str})):

    # For providers that have indicated a primary taxonomy code, pull that code into a new column.
    # The column is created with object dtype because it will hold string codes; writing
    # strings into a float64 column (what a bare `= np.nan` creates) is deprecated in pandas.
    chunk['taxonomy_code'] = pd.Series(np.nan, index = chunk.index, dtype = object)
    # If more than one switch is 'Y', the highest-numbered pair is the one that remains.
    for n in range(1, 16):
        is_primary = chunk[f'Healthcare Provider Primary Taxonomy Switch_{n}'] == 'Y'
        chunk.loc[is_primary, 'taxonomy_code'] = chunk.loc[is_primary, f'Healthcare Provider Taxonomy Code_{n}']

    # For providers that do not indicate a primary taxonomy code, pull the code from the first taxonomy column
    no_primary = chunk['taxonomy_code'].isna()
    chunk.loc[no_primary, 'taxonomy_code'] = chunk.loc[no_primary, 'Healthcare Provider Taxonomy Code_1']

    # Drop the columns no longer needed
    chunk = chunk.drop(columns = taxonomy_cols)

    # clean up zip code column by putting missing leading zeroes back and getting the 9-digit entries down to 5.
    # astype(str) turns a missing value into the text 'nan', so rows that were missing
    # in the file are reset to NaN afterwards instead of being stored as a padded string.
    raw_zip = chunk[zip_col]
    chunk[zip_col] = raw_zip.astype(str).apply(zip_zeroes).where(raw_zip.notna())

    # Clean up the column names
    chunk.columns = [x.lower()
                     .replace('provider ', '')
                     .replace('business ', '')
                     .replace(' text', '')
                     .replace(' (legal name)', '')
                     .replace(' ', '_') for x in chunk.columns]

    # append to provider table
    chunk.to_sql('provider', db, if_exists = 'append', index = False)

## Taxonomy Details

In [None]:
# Load the NUCC taxonomy reference file
tax_deets = pd.read_csv(
    '../data/nucc_taxonomy_230.csv',
)

In [None]:
# Print the column names, dtypes and non-null counts of the taxonomy file
tax_deets.info()

In [None]:
# Match the naming style of the other tables: lower-case with underscores for spaces
tax_deets.columns = tax_deets.columns.str.lower().str.replace(' ', '_', regex=False)
# Name the code column taxonomy_code, the same name used in the provider table
tax_deets = tax_deets.rename(columns={'code': 'taxonomy_code'})

In [None]:
# Display the taxonomy frame to check the renamed columns
tax_deets

# Write the taxonomy frame to the 'taxonomy' table in the database
# (rows are appended if the table already exists; the index is not stored)
tax_deets.to_sql(
    name='taxonomy',
    con=db,
    if_exists='append',
    index=False,
)

## Zip Code/CBSA Data

In [None]:
# Load the ZIP-to-CBSA crosswalk workbook
zip_cbsa = pd.read_excel(
    '../data/ZIP_CBSA_122021.xlsx',
)

In [None]:
# Print the column names, dtypes and non-null counts of the crosswalk as loaded
zip_cbsa.info()

In [None]:
# Shorten the city/state column names, then convert zip to text and left-pad it
# to five characters so the leading zeroes are back in place
short_names = {'usps_zip_pref_city': 'city', 'usps_zip_pref_state': 'state'}
zip_cbsa = (
    zip_cbsa
    .rename(columns=short_names)
    .assign(zip=lambda frame: frame['zip'].astype(str).str.zfill(5))
)

In [None]:
# Re-check the columns and dtypes after the rename and zip padding
zip_cbsa.info()

# Write the crosswalk to the 'zip_cbsa' table in the database
# (rows are appended if the table already exists; the index is not stored)
zip_cbsa.to_sql(
    name='zip_cbsa',
    con=db,
    if_exists='append',
    index=False,
)

In [None]:
# db.close()