## Import Libraries

In [1]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm
import numpy as np

## NPI/NPPES Data

In [2]:
# list of columns needed for our project
select_cols = ['NPI',
               'Entity Type Code',
               'Provider Organization Name (Legal Business Name)',
               'Provider Last Name (Legal Name)',
               'Provider First Name',
               'Provider Middle Name',
               'Provider Name Prefix Text',
               'Provider Name Suffix Text',
               'Provider Credential Text',
               'Provider First Line Business Mailing Address',
               'Provider Second Line Business Mailing Address',
               'Provider Business Mailing Address City Name',
               'Provider Business Mailing Address State Name',
               'Provider Business Mailing Address Postal Code',
               'Healthcare Provider Taxonomy Code_1', 'Healthcare Provider Primary Taxonomy Switch_1',
               'Healthcare Provider Taxonomy Code_2', 'Healthcare Provider Primary Taxonomy Switch_2',
               'Healthcare Provider Taxonomy Code_3', 'Healthcare Provider Primary Taxonomy Switch_3',
               'Healthcare Provider Taxonomy Code_4', 'Healthcare Provider Primary Taxonomy Switch_4',
               'Healthcare Provider Taxonomy Code_5', 'Healthcare Provider Primary Taxonomy Switch_5',
               'Healthcare Provider Taxonomy Code_6', 'Healthcare Provider Primary Taxonomy Switch_6',
               'Healthcare Provider Taxonomy Code_7', 'Healthcare Provider Primary Taxonomy Switch_7',
               'Healthcare Provider Taxonomy Code_8', 'Healthcare Provider Primary Taxonomy Switch_8',
               'Healthcare Provider Taxonomy Code_9', 'Healthcare Provider Primary Taxonomy Switch_9',
               'Healthcare Provider Taxonomy Code_10', 'Healthcare Provider Primary Taxonomy Switch_10',
               'Healthcare Provider Taxonomy Code_11', 'Healthcare Provider Primary Taxonomy Switch_11',
               'Healthcare Provider Taxonomy Code_12', 'Healthcare Provider Primary Taxonomy Switch_12',
               'Healthcare Provider Taxonomy Code_13', 'Healthcare Provider Primary Taxonomy Switch_13',
               'Healthcare Provider Taxonomy Code_14', 'Healthcare Provider Primary Taxonomy Switch_14',
               'Healthcare Provider Taxonomy Code_15', 'Healthcare Provider Primary Taxonomy Switch_15']

In [3]:
# Pull a sample of the data to explore
npi_sample = pd.read_csv('../data/npidata_pfile_20050523-20230212.csv',
                         usecols = select_cols,
                         skiprows = range(1, 100600),
                         nrows = 100000)

  npi_sample = pd.read_csv('../data/npidata_pfile_20050523-20230212.csv',


In [4]:
npi_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 44 columns):
 #   Column                                            Non-Null Count   Dtype  
---  ------                                            --------------   -----  
 0   NPI                                               100000 non-null  int64  
 1   Entity Type Code                                  100000 non-null  int64  
 2   Provider Organization Name (Legal Business Name)  0 non-null       float64
 3   Provider Last Name (Legal Name)                   99996 non-null   object 
 4   Provider First Name                               100000 non-null  object 
 5   Provider Middle Name                              75143 non-null   object 
 6   Provider Name Prefix Text                         55211 non-null   object 
 7   Provider Name Suffix Text                         3020 non-null    object 
 8   Provider Credential Text                          94403 non-null   object 
 9   Provi

In [5]:
# How many providers don't indicate a primary taxonomy code, designated by a value of 'X'?
npi_sample['Healthcare Provider Primary Taxonomy Switch_1'].value_counts()
# Based on looking at a few chunks, it appears to be around 4%

Y    95584
X     3908
N      508
Name: Healthcare Provider Primary Taxonomy Switch_1, dtype: int64

In [6]:
# For providers that have indicated a primary taxonomy code, pull that code into the primary_taxonomy column
npi_sample['taxonomy_code'] = np.nan

for n in range(1, 16):
    x = str(n)
    npi_sample.loc[npi_sample[f'Healthcare Provider Primary Taxonomy Switch_{x}'] == 'Y', 
                   'taxonomy_code'] = npi_sample[f'Healthcare Provider Taxonomy Code_{x}']

In [7]:
# For providers that do not indicate a primary taxonomy code, pull the code from the first taxonomy column
npi_sample.loc[npi_sample['taxonomy_code'].isna(), 'taxonomy_code'] = npi_sample['Healthcare Provider Taxonomy Code_1']

In [8]:
npi_sample = npi_sample.drop(columns = ['Healthcare Provider Taxonomy Code_1', 'Healthcare Provider Primary Taxonomy Switch_1',
                                  'Healthcare Provider Taxonomy Code_2', 'Healthcare Provider Primary Taxonomy Switch_2',
                                  'Healthcare Provider Taxonomy Code_3', 'Healthcare Provider Primary Taxonomy Switch_3',
                                  'Healthcare Provider Taxonomy Code_4', 'Healthcare Provider Primary Taxonomy Switch_4',
                                  'Healthcare Provider Taxonomy Code_5', 'Healthcare Provider Primary Taxonomy Switch_5',
                                  'Healthcare Provider Taxonomy Code_6', 'Healthcare Provider Primary Taxonomy Switch_6',
                                  'Healthcare Provider Taxonomy Code_7', 'Healthcare Provider Primary Taxonomy Switch_7',
                                  'Healthcare Provider Taxonomy Code_8', 'Healthcare Provider Primary Taxonomy Switch_8',
                                  'Healthcare Provider Taxonomy Code_9', 'Healthcare Provider Primary Taxonomy Switch_9',
                                  'Healthcare Provider Taxonomy Code_10', 'Healthcare Provider Primary Taxonomy Switch_10',
                                  'Healthcare Provider Taxonomy Code_11', 'Healthcare Provider Primary Taxonomy Switch_11',
                                  'Healthcare Provider Taxonomy Code_12', 'Healthcare Provider Primary Taxonomy Switch_12',
                                  'Healthcare Provider Taxonomy Code_13', 'Healthcare Provider Primary Taxonomy Switch_13',
                                  'Healthcare Provider Taxonomy Code_14', 'Healthcare Provider Primary Taxonomy Switch_14',
                                  'Healthcare Provider Taxonomy Code_15', 'Healthcare Provider Primary Taxonomy Switch_15'])

In [9]:
npi_sample

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Mailing Address,Provider Second Line Business Mailing Address,Provider Business Mailing Address City Name,Provider Business Mailing Address State Name,Provider Business Mailing Address Postal Code,taxonomy_code
0,1013955939,1,,MCCLURE-MARING,LYDIA,JO,MS.,,CRNP,220 LOWRY RD,,ERIE,PA,165111327,363L00000X
1,1659319572,1,,ESTES,JANE,,,,LCSW,PO BOX 1589,,BENTON,AR,720181589,1041C0700X
2,1093753915,1,,BUCHELE,BONNIE,B.,DR.,,PH.D.,411 NICHOLS RD,SUITE 194,KANSAS CITY,MO,641122000,103TC0700X
3,1811935737,1,,TENORIO,ROSALINE,R,,,CNNP,5901 HARPER DR NE,PROVIDER ENROLLMENT,ALBUQUERQUE,NM,871093587,363LN0000X
4,1720026644,1,,PRYOR,TERRI,L,,,PA,16706 NATIONAL HWY SW,,FROSTBURG,MD,215323304,363AM0700X
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1669580163,1,,EVANS,ARTHUR,F,DR.,,DDS,PO BOX 186,,GRANETTE,AR,72736,1223G0001X
99996,1487762985,1,,STIGALL,LARRY,EVERETT,MR.,,DDS,240 DOCTORS DR,,BOONE,NC,28607,122300000X
99997,1396853792,1,,KURZ,EDWARD,GEORGE,DR.,,DDS,330 N CHESTNUT ST,,RAVENNA,OH,442662216,122300000X
99998,1023126422,1,,MURATA,FARA,DIANE,,,LCSW,5738 OLDE WADSWORTH BLVD,,ARVADA,CO,800022535,1041C0700X


In [37]:
zips = []
for x in npi_sample['Provider Business Mailing Address Postal Code']:
    if len(x.astype(str)) > 5:
        zips.append(x.zfill(9)[:5])
    elif len(x) > 0:
        zips.append(x.zfill(5))
    else: zips.append(np.nan)

zips

AttributeError: 'str' object has no attribute 'astype'

In [32]:
npi_sample

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Mailing Address,Provider Second Line Business Mailing Address,Provider Business Mailing Address City Name,Provider Business Mailing Address State Name,Provider Business Mailing Address Postal Code,taxonomy_code
0,1013955939,1,,MCCLURE-MARING,LYDIA,JO,MS.,,CRNP,220 LOWRY RD,,ERIE,PA,16511,363L00000X
1,1659319572,1,,ESTES,JANE,,,,LCSW,PO BOX 1589,,BENTON,AR,72018,1041C0700X
2,1093753915,1,,BUCHELE,BONNIE,B.,DR.,,PH.D.,411 NICHOLS RD,SUITE 194,KANSAS CITY,MO,64112,103TC0700X
3,1811935737,1,,TENORIO,ROSALINE,R,,,CNNP,5901 HARPER DR NE,PROVIDER ENROLLMENT,ALBUQUERQUE,NM,87109,363LN0000X
4,1720026644,1,,PRYOR,TERRI,L,,,PA,16706 NATIONAL HWY SW,,FROSTBURG,MD,21532,363AM0700X
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1669580163,1,,EVANS,ARTHUR,F,DR.,,DDS,PO BOX 186,,GRANETTE,AR,,1223G0001X
99996,1487762985,1,,STIGALL,LARRY,EVERETT,MR.,,DDS,240 DOCTORS DR,,BOONE,NC,,122300000X
99997,1396853792,1,,KURZ,EDWARD,GEORGE,DR.,,DDS,330 N CHESTNUT ST,,RAVENNA,OH,,122300000X
99998,1023126422,1,,MURATA,FARA,DIANE,,,LCSW,5738 OLDE WADSWORTH BLVD,,ARVADA,CO,,1041C0700X


# Create the hop_team SQLite database
db = sqlite3.connect('../data/hop_team.sqlite')

# create the providers table from npi/nppes data and add it to the database
for chunk in tqdm(pd.read_csv('../data/npidata_pfile_20050523-20230212.csv', 
                              usecols = select_cols,
                              chunksize = 10000)):
    
    # For providers that have indicated a primary taxonomy code, pull that code into the primary_taxonomy column
    npi_sample['taxonomy_code'] = np.nan
    for n in range(1, 16):
        x = str(n)
        npi_sample.loc[npi_sample[f'Healthcare Provider Primary Taxonomy Switch_{x}'] == 'Y', 
                       'taxonomy_code'] = npi_sample[f'Healthcare Provider Taxonomy Code_{x}']
        
    # For providers that do not indicate a primary taxonomy code, pull the code from the first taxonomy column
    npi_sample.loc[npi_sample['taxonomy_code'].isna(), 'taxonomy_code'] = npi_sample['Healthcare Provider Taxonomy Code_1']
    
    # Drop the columns no longer needed
    chunk = chunk.drop(columns = ['Healthcare Provider Taxonomy Code_1', 'Healthcare Provider Primary Taxonomy Switch_1',
                                  'Healthcare Provider Taxonomy Code_2', 'Healthcare Provider Primary Taxonomy Switch_2',
                                  'Healthcare Provider Taxonomy Code_3', 'Healthcare Provider Primary Taxonomy Switch_3',
                                  'Healthcare Provider Taxonomy Code_4', 'Healthcare Provider Primary Taxonomy Switch_4',
                                  'Healthcare Provider Taxonomy Code_5', 'Healthcare Provider Primary Taxonomy Switch_5',
                                  'Healthcare Provider Taxonomy Code_6', 'Healthcare Provider Primary Taxonomy Switch_6',
                                  'Healthcare Provider Taxonomy Code_7', 'Healthcare Provider Primary Taxonomy Switch_7',
                                  'Healthcare Provider Taxonomy Code_8', 'Healthcare Provider Primary Taxonomy Switch_8',
                                  'Healthcare Provider Taxonomy Code_9', 'Healthcare Provider Primary Taxonomy Switch_9',
                                  'Healthcare Provider Taxonomy Code_10', 'Healthcare Provider Primary Taxonomy Switch_10',
                                  'Healthcare Provider Taxonomy Code_11', 'Healthcare Provider Primary Taxonomy Switch_11',
                                  'Healthcare Provider Taxonomy Code_12', 'Healthcare Provider Primary Taxonomy Switch_12',
                                  'Healthcare Provider Taxonomy Code_13', 'Healthcare Provider Primary Taxonomy Switch_13',
                                  'Healthcare Provider Taxonomy Code_14', 'Healthcare Provider Primary Taxonomy Switch_14',
                                  'Healthcare Provider Taxonomy Code_15', 'Healthcare Provider Primary Taxonomy Switch_15'])
    
    chunk.columns = [x.lower()
                     .replace('provider ', '')
                     .replace('business ', '')
                     .replace(' text', '')
                     .replace(' (legal name)', '')
                     .replace(' ', '_') for x in chunk.columns]                  # Clean up the column names
    
    chunk.to_sql('provider', db, if_exists = 'append', index = False)            # append to provider table

## Taxonomy Details

In [10]:
tax_deets = pd.read_csv('../data/nucc_taxonomy_230.csv')

In [11]:
tax_deets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 873 entries, 0 to 872
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Code            873 non-null    object
 1   Grouping        873 non-null    object
 2   Classification  873 non-null    object
 3   Specialization  632 non-null    object
 4   Definition      865 non-null    object
 5   Notes           557 non-null    object
 6   Display Name    873 non-null    object
 7   Section         873 non-null    object
dtypes: object(8)
memory usage: 54.7+ KB


In [12]:
tax_deets.columns = [x.lower() for x in tax_deets.columns]
tax_deets = tax_deets.rename(columns = {'code' : 'taxonomy_code'})
tax_deets

Unnamed: 0,taxonomy_code,grouping,classification,specialization,definition,notes,display name,section
0,193200000X,Group,Multi-Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Multi-Specialty Group,Individual
1,193400000X,Group,Single Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Single Specialty Group,Individual
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,,An allergist-immunologist is trained in evalua...,"Source: American Board of Medical Specialties,...",Allergy & Immunology Physician,Individual
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy,"A physician who specializes in the diagnosis, ...",Source: National Uniform Claim Committee,Allergy Physician,Individual
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology,An allergy and immunology physician who specia...,"Source: National Uniform Claim Committee, 2022...",Clinical & Laboratory Immunology (Allergy & Im...,Individual
...,...,...,...,...,...,...,...,...
868,343800000X,Transportation Services,Secured Medical Transport (VAN),,A public or privately owned transportation ser...,,Secured Medical Transport (VAN),Non-Individual
869,344600000X,Transportation Services,Taxi,,A land commercial vehicle used for the transpo...,,Taxi,Non-Individual
870,347D00000X,Transportation Services,Train,,An organization or business licensed to provid...,,Train,Non-Individual
871,347E00000X,Transportation Services,Transportation Broker,,An organization that provides transportation f...,Source: Section 6083 of the Deficit Reduction ...,Transportation Broker,Non-Individual


# create the taxonomy table and add it to the database
tax_deets.to_sql('taxonomy', db, if_exists = 'append', index = False)

## Zip Code/CBSA Date

In [17]:
zip_cbsa = pd.read_excel('../data/ZIP_CBSA_122021.xlsx')

In [18]:
zip_cbsa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47484 entries, 0 to 47483
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   zip                  47484 non-null  int64  
 1   cbsa                 47484 non-null  int64  
 2   usps_zip_pref_city   47484 non-null  object 
 3   usps_zip_pref_state  47484 non-null  object 
 4   res_ratio            47484 non-null  float64
 5   bus_ratio            47484 non-null  float64
 6   oth_ratio            47484 non-null  float64
 7   tot_ratio            47484 non-null  float64
dtypes: float64(4), int64(2), object(2)
memory usage: 2.9+ MB


In [19]:
zip_cbsa = zip_cbsa.rename(columns = {'usps_zip_pref_city' : 'city',
                              'usps_zip_pref_state' : 'state'})
zip_cbsa['zip'] = zip_cbsa['zip'].astype(str).str.zfill(5)

In [24]:
zip_cbsa

Unnamed: 0,zip,cbsa,city,state,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,00683,41900,SAN GERMAN,PR,0.999842,1.0,1.0,0.999855
1,00683,32420,SAN GERMAN,PR,0.000158,0.0,0.0,0.000145
2,00923,41980,SAN JUAN,PR,1.000000,1.0,1.0,1.000000
3,01010,44140,BRIMFIELD,MA,0.976896,1.0,1.0,0.977816
4,01010,49340,BRIMFIELD,MA,0.023104,0.0,0.0,0.022184
...,...,...,...,...,...,...,...,...
47479,60684,16980,CHICAGO,IL,0.000000,1.0,0.0,1.000000
47480,33945,15980,PINELAND,FL,0.000000,0.0,1.0,1.000000
47481,78144,99999,PANNA MARIA,TX,0.000000,1.0,0.0,1.000000
47482,12257,10580,ALBANY,NY,0.000000,1.0,0.0,1.000000


# create the zip_cbsa table and add it to the database
zip_cbsa.to_sql('zip_cbsa', db, if_exists = 'append', index = False)

## Hop Teaming Data

In [25]:
hop_sample = pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', nrows = 10000)

In [26]:
hop_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   from_npi           10000 non-null  int64  
 1   to_npi             10000 non-null  int64  
 2   patient_count      10000 non-null  int64  
 3   transaction_count  10000 non-null  int64  
 4   average_day_wait   10000 non-null  float64
 5   std_day_wait       10000 non-null  float64
dtypes: float64(2), int64(4)
memory usage: 468.9 KB


In [27]:
hop_sample

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1508062167,1730166109,350,370,53.922,72.612
1,1508065640,1730166109,25,25,49.800,55.006
2,1508052093,1730166109,16,16,109.500,70.593
3,1508172545,1730166109,14,14,103.357,75.483
4,1508285131,1730166109,20,21,89.952,89.880
...,...,...,...,...,...,...
9995,1508026980,1730477589,36,46,64.239,77.845
9996,1497997050,1730477589,38,43,40.395,60.289
9997,1497977268,1730477761,14,14,37.857,72.533
9998,1508024217,1730477811,12,12,59.000,46.043


# create the referrals table and add it to the database
for chunk in tqdm(pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', chunksize = 10000)):
    chunk.to_sql('referrals', db, if_exists = 'append', index = False)            # append to referrals table