## Libraries and setup

In [1]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

In [2]:
pd.set_option("display.max_columns", 500)
pd.set_option('display.max_rows', 1000)

## Datasets

- `npidata_pfile`: Data File - File contains the FOIA-disclosable NPPES provider data
- `othername_pfile`: Other Name Reference File - File contains additional Other Names associated with Type 2 NPIs
- `pl_pfile`: Practice Location Reference File - File contains all of the non-primary Practice Locations associated with Type 1 and Type 2 NPIs
- `endpoint_pfile`: Endpoint Reference File - File contains all Endpoints associated with Type 1 and Type 2 NPIs

- Taxonomy code to classification crosswalk: `nucc_taxonomy_210.csv`
- Zip code to CBSA crosswalk: `ZIP_CBSA_122020.xlsx`

Read-in the data from file
**TODO: Remove `nrows` to get all the data**

In [11]:
npidata_raw = pd.read_csv("../data/npidata_pfile_20050523-20210207.csv", nrows=1000)

In [4]:
taxonomy_codes = pd.read_csv("../data/nucc_taxonomy_210.csv")

In [9]:
zips = pd.read_excel("../data/ZIP_CBSA_122017.xlsx", converters={'zip': lambda x: str(x)}, engine='openpyxl')

## Subsetting `npidata`

In [6]:
display(npidata_raw.shape)
display(npidata_raw.head())

(1000, 330)

Unnamed: 0,NPI,Entity Type Code,Replacement NPI,Employer Identification Number (EIN),Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider Other Organization Name,Provider Other Organization Name Type Code,Provider Other Last Name,Provider Other First Name,Provider Other Middle Name,Provider Other Name Prefix Text,Provider Other Name Suffix Text,Provider Other Credential Text,Provider Other Last Name Type Code,Provider First Line Business Mailing Address,Provider Second Line Business Mailing Address,Provider Business Mailing Address City Name,Provider Business Mailing Address State Name,Provider Business Mailing Address Postal Code,Provider Business Mailing Address Country Code (If outside U.S.),Provider Business Mailing Address Telephone Number,Provider Business Mailing Address Fax Number,Provider First Line Business Practice Location Address,Provider Second Line Business Practice Location Address,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Postal Code,Provider Business Practice Location Address Country Code (If outside U.S.),Provider Business Practice Location Address Telephone Number,Provider Business Practice Location Address Fax Number,Provider Enumeration Date,Last Update Date,NPI Deactivation Reason Code,NPI Deactivation Date,NPI Reactivation Date,Provider Gender Code,Authorized Official Last Name,Authorized Official First Name,Authorized Official Middle Name,Authorized Official Title or Position,Authorized Official Telephone Number,Healthcare Provider Taxonomy Code_1,Provider License Number_1,Provider License Number State Code_1,Healthcare Provider Primary Taxonomy Switch_1,Healthcare Provider Taxonomy Code_2,Provider License Number_2,Provider License Number State Code_2,Healthcare Provider Primary Taxonomy Switch_2,Healthcare Provider Taxonomy Code_3,Provider License Number_3,Provider License Number State Code_3,Healthcare Provider Primary Taxonomy Switch_3,Healthcare Provider Taxonomy Code_4,Provider License Number_4,Provider License Number State Code_4,Healthcare Provider Primary Taxonomy Switch_4,Healthcare Provider Taxonomy Code_5,Provider License Number_5,Provider License Number State Code_5,Healthcare Provider Primary Taxonomy Switch_5,Healthcare Provider Taxonomy Code_6,Provider License Number_6,Provider License Number State Code_6,Healthcare Provider Primary Taxonomy Switch_6,Healthcare Provider Taxonomy Code_7,Provider License Number_7,Provider License Number State Code_7,Healthcare Provider Primary Taxonomy Switch_7,Healthcare Provider Taxonomy Code_8,Provider License Number_8,Provider License Number State Code_8,Healthcare Provider Primary Taxonomy Switch_8,Healthcare Provider Taxonomy Code_9,Provider License Number_9,Provider License Number State Code_9,Healthcare Provider Primary Taxonomy Switch_9,Healthcare Provider Taxonomy Code_10,Provider License Number_10,Provider License Number State Code_10,Healthcare Provider Primary Taxonomy Switch_10,Healthcare Provider Taxonomy Code_11,Provider License Number_11,Provider License Number State Code_11,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Taxonomy Code_12,Provider License Number_12,Provider License Number State Code_12,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Taxonomy Code_13,Provider License Number_13,Provider License Number State Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Provider License Number_14,Provider License Number State Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Provider License Number_15,Provider License Number State Code_15,Healthcare Provider Primary Taxonomy Switch_15,Other Provider Identifier_1,Other Provider Identifier Type Code_1,Other Provider Identifier State_1,Other Provider Identifier Issuer_1,Other Provider Identifier_2,Other Provider Identifier Type Code_2,Other Provider Identifier State_2,Other Provider Identifier Issuer_2,Other Provider Identifier_3,Other Provider Identifier Type Code_3,Other Provider Identifier State_3,Other Provider Identifier Issuer_3,Other Provider Identifier_4,Other Provider Identifier Type Code_4,Other Provider Identifier State_4,Other Provider Identifier Issuer_4,Other Provider Identifier_5,Other Provider Identifier Type Code_5,Other Provider Identifier State_5,Other Provider Identifier Issuer_5,Other Provider Identifier_6,Other Provider Identifier Type Code_6,Other Provider Identifier State_6,Other Provider Identifier Issuer_6,Other Provider Identifier_7,Other Provider Identifier Type Code_7,Other Provider Identifier State_7,Other Provider Identifier Issuer_7,Other Provider Identifier_8,Other Provider Identifier Type Code_8,Other Provider Identifier State_8,Other Provider Identifier Issuer_8,Other Provider Identifier_9,Other Provider Identifier Type Code_9,Other Provider Identifier State_9,Other Provider Identifier Issuer_9,Other Provider Identifier_10,Other Provider Identifier Type Code_10,Other Provider Identifier State_10,Other Provider Identifier Issuer_10,Other Provider Identifier_11,Other Provider Identifier Type Code_11,Other Provider Identifier State_11,Other Provider Identifier Issuer_11,Other Provider Identifier_12,Other Provider Identifier Type Code_12,Other Provider Identifier State_12,Other Provider Identifier Issuer_12,Other Provider Identifier_13,Other Provider Identifier Type Code_13,Other Provider Identifier State_13,Other Provider Identifier Issuer_13,Other Provider Identifier_14,Other Provider Identifier Type Code_14,Other Provider Identifier State_14,Other Provider Identifier Issuer_14,Other Provider Identifier_15,Other Provider Identifier Type Code_15,Other Provider Identifier State_15,Other Provider Identifier Issuer_15,Other Provider Identifier_16,Other Provider Identifier Type Code_16,Other Provider Identifier State_16,Other Provider Identifier Issuer_16,Other Provider Identifier_17,Other Provider Identifier Type Code_17,Other Provider Identifier State_17,Other Provider Identifier Issuer_17,Other Provider Identifier_18,Other Provider Identifier Type Code_18,Other Provider Identifier State_18,Other Provider Identifier Issuer_18,Other Provider Identifier_19,Other Provider Identifier Type Code_19,Other Provider Identifier State_19,Other Provider Identifier Issuer_19,Other Provider Identifier_20,Other Provider Identifier Type Code_20,Other Provider Identifier State_20,Other Provider Identifier Issuer_20,Other Provider Identifier_21,Other Provider Identifier Type Code_21,Other Provider Identifier State_21,Other Provider Identifier Issuer_21,Other Provider Identifier_22,Other Provider Identifier Type Code_22,Other Provider Identifier State_22,Other Provider Identifier Issuer_22,Other Provider Identifier_23,Other Provider Identifier Type Code_23,Other Provider Identifier State_23,Other Provider Identifier Issuer_23,Other Provider Identifier_24,Other Provider Identifier Type Code_24,Other Provider Identifier State_24,Other Provider Identifier Issuer_24,Other Provider Identifier_25,Other Provider Identifier Type Code_25,Other Provider Identifier State_25,Other Provider Identifier Issuer_25,Other Provider Identifier_26,Other Provider Identifier Type Code_26,Other Provider Identifier State_26,Other Provider Identifier Issuer_26,Other Provider Identifier_27,Other Provider Identifier Type Code_27,Other Provider Identifier State_27,Other Provider Identifier Issuer_27,Other Provider Identifier_28,Other Provider Identifier Type Code_28,Other Provider Identifier State_28,Other Provider Identifier Issuer_28,Other Provider Identifier_29,Other Provider Identifier Type Code_29,Other Provider Identifier State_29,Other Provider Identifier Issuer_29,Other Provider Identifier_30,Other Provider Identifier Type Code_30,Other Provider Identifier State_30,Other Provider Identifier Issuer_30,Other Provider Identifier_31,Other Provider Identifier Type Code_31,Other Provider Identifier State_31,Other Provider Identifier Issuer_31,Other Provider Identifier_32,Other Provider Identifier Type Code_32,Other Provider Identifier State_32,Other Provider Identifier Issuer_32,Other Provider Identifier_33,Other Provider Identifier Type Code_33,Other Provider Identifier State_33,Other Provider Identifier Issuer_33,Other Provider Identifier_34,Other Provider Identifier Type Code_34,Other Provider Identifier State_34,Other Provider Identifier Issuer_34,Other Provider Identifier_35,Other Provider Identifier Type Code_35,Other Provider Identifier State_35,Other Provider Identifier Issuer_35,Other Provider Identifier_36,Other Provider Identifier Type Code_36,Other Provider Identifier State_36,Other Provider Identifier Issuer_36,Other Provider Identifier_37,Other Provider Identifier Type Code_37,Other Provider Identifier State_37,Other Provider Identifier Issuer_37,Other Provider Identifier_38,Other Provider Identifier Type Code_38,Other Provider Identifier State_38,Other Provider Identifier Issuer_38,Other Provider Identifier_39,Other Provider Identifier Type Code_39,Other Provider Identifier State_39,Other Provider Identifier Issuer_39,Other Provider Identifier_40,Other Provider Identifier Type Code_40,Other Provider Identifier State_40,Other Provider Identifier Issuer_40,Other Provider Identifier_41,Other Provider Identifier Type Code_41,Other Provider Identifier State_41,Other Provider Identifier Issuer_41,Other Provider Identifier_42,Other Provider Identifier Type Code_42,Other Provider Identifier State_42,Other Provider Identifier Issuer_42,Other Provider Identifier_43,Other Provider Identifier Type Code_43,Other Provider Identifier State_43,Other Provider Identifier Issuer_43,Other Provider Identifier_44,Other Provider Identifier Type Code_44,Other Provider Identifier State_44,Other Provider Identifier Issuer_44,Other Provider Identifier_45,Other Provider Identifier Type Code_45,Other Provider Identifier State_45,Other Provider Identifier Issuer_45,Other Provider Identifier_46,Other Provider Identifier Type Code_46,Other Provider Identifier State_46,Other Provider Identifier Issuer_46,Other Provider Identifier_47,Other Provider Identifier Type Code_47,Other Provider Identifier State_47,Other Provider Identifier Issuer_47,Other Provider Identifier_48,Other Provider Identifier Type Code_48,Other Provider Identifier State_48,Other Provider Identifier Issuer_48,Other Provider Identifier_49,Other Provider Identifier Type Code_49,Other Provider Identifier State_49,Other Provider Identifier Issuer_49,Other Provider Identifier_50,Other Provider Identifier Type Code_50,Other Provider Identifier State_50,Other Provider Identifier Issuer_50,Is Sole Proprietor,Is Organization Subpart,Parent Organization LBN,Parent Organization TIN,Authorized Official Name Prefix Text,Authorized Official Name Suffix Text,Authorized Official Credential Text,Healthcare Provider Taxonomy Group_1,Healthcare Provider Taxonomy Group_2,Healthcare Provider Taxonomy Group_3,Healthcare Provider Taxonomy Group_4,Healthcare Provider Taxonomy Group_5,Healthcare Provider Taxonomy Group_6,Healthcare Provider Taxonomy Group_7,Healthcare Provider Taxonomy Group_8,Healthcare Provider Taxonomy Group_9,Healthcare Provider Taxonomy Group_10,Healthcare Provider Taxonomy Group_11,Healthcare Provider Taxonomy Group_12,Healthcare Provider Taxonomy Group_13,Healthcare Provider Taxonomy Group_14,Healthcare Provider Taxonomy Group_15,Certification Date
0,1679576722,1.0,,,,WIEBE,DAVID,A,,,M.D.,,,,,,,,,,PO BOX 2168,,KEARNEY,NE,688482168.0,US,3088652512,3088653000.0,3500 CENTRAL AVE,,KEARNEY,NE,688472944.0,US,3088653000.0,3088653000.0,05/23/2005,07/08/2007,,,,M,,,,,,207X00000X,12637,NE,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,645540,1.0,KS,FIRSTGUARD,46969.0,1.0,KS,BCBS,1553.0,1.0,NE,BCBS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,X,,,,,,,,,,,,,,,,,,,,,,
1,1588667638,1.0,,,,PILCHER,WILLIAM,C,DR.,,MD,,,,,,,,,,1824 KING STREET,SUITE 300,JACKSONVILLE,FL,322044736.0,US,9043881820,9043882000.0,1824 KING STREET,SUITE 300,JACKSONVILLE,FL,322044736.0,US,9043882000.0,9043882000.0,05/23/2005,05/29/2014,,,,M,,,,,,207RC0000X,032024,GA,N,207RC0000X,ME68414,FL,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,00532485C,5.0,GA,,510265.0,1.0,GA,BCBS,251286600.0,5.0,FL,,27888.0,1.0,FL,BCBS,208143.0,1.0,FL,AVMED,00706626A,5.0,GA,,897705.0,1.0,FL,AETNA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,,,,,,,,
2,1497758544,2.0,,<UNAVAIL>,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,CAPE FEAR VALLEY HOME HEALTH AND HOSPICE,3.0,,,,,,,,3418 VILLAGE DR,,FAYETTEVILLE,NC,283044552.0,US,9106096740,,3418 VILLAGE DR,,FAYETTEVILLE,NC,283044552.0,US,9106097000.0,,05/23/2005,09/26/2011,,,,,NAGOWSKI,MICHAEL,,CEO,9106097000.0,251G00000X,HC0283,NC,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3401562,5.0,NC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,,,MR.,,,,,,,,,,,,,,,,,,
3,1306849450,1.0,,,,SMITSON,HAROLD,LEROY,DR.,II,M.D.,,,,,,,,,,810 LUCAS DR,,ATHENS,TX,757513446.0,US,9036756778,9036752000.0,810 LUCAS DR,,ATHENS,TX,757513446.0,US,9036757000.0,9036752000.0,05/23/2005,01/03/2008,,,,M,,,,,,2085R0202X,E5444,TX,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,132476603,5.0,TX,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,,,,,,,,
4,1215930367,1.0,,,,GRESSOT,LAURENT,,DR.,,M.D.,,,,,,,,,,17323 RED OAK DR,,HOUSTON,TX,770901243.0,US,2814405006,2814406000.0,17323 RED OAK DR,,HOUSTON,TX,770901243.0,US,2814405000.0,2814406000.0,05/23/2005,11/25/2014,,,,M,,,,,,174400000X,H6257,TX,N,207RH0003X,H6257,TX,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,,,,,,,,


Only keep the needed columns

In [7]:
npidata = pd.concat([
    npidata_raw[['NPI']],
    # Entity Type Code: 1 = Provider (doctors, nurses, etc.) / 2 = Facility (Hospitals, Urgent Care, Doctors Offices) 
    npidata_raw[['Entity Type Code']],
    # Entity Name: Either First/Last or Organization or Other Organization Name contained in the following fields:
    npidata_raw.loc[:, 'Provider Organization Name (Legal Business Name)':'Provider Credential Text'],
    # Address: Business Practice Location (not mailing), contained in the following fields:
    npidata_raw.loc[:, 'Provider First Line Business Practice Location Address':'Provider Business Practice Location Address Postal Code'],
], axis=1)

Check the current shape of the dataframe

In [8]:
display(npidata.shape)
display(npidata.head())

(1000, 14)

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,Provider Second Line Business Practice Location Address,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Postal Code
0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,,KEARNEY,NE,688472944.0
1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,SUITE 300,JACKSONVILLE,FL,322044736.0
2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,,FAYETTEVILLE,NC,283044552.0
3,1306849450,1.0,,SMITSON,HAROLD,LEROY,DR.,II,M.D.,810 LUCAS DR,,ATHENS,TX,757513446.0
4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,,HOUSTON,TX,770901243.0


**We will handle the Primary Taxonomy separately and left-join back to `npidata` later**

## Handling The Primary Taxonomy


A provider can have up to 15 taxonomy codes, but we want the one which has Primary Switch = Y in the associated 'Healthcare Provider Primary Taxonomy Switch*' field. Note that this does not always occur in spot 1.

In [9]:
npi_taxonomy = pd.concat([
    npidata_raw[['NPI']],
    # The provider's taxonomy code, which is contained in one of the 'Healthcare Provider Taxonomy Code*' columns
    npidata_raw[npidata_raw.columns[pd.Series(npidata_raw.columns).str.startswith('Healthcare Provider Taxonomy Code_')]],
    npidata_raw[npidata_raw.columns[pd.Series(npidata_raw.columns).str.startswith('Healthcare Provider Primary Taxonomy Switch_')]]
], axis=1)

In [10]:
display(npi_taxonomy.shape)
display(npi_taxonomy.head())

(1000, 31)

Unnamed: 0,NPI,Healthcare Provider Taxonomy Code_1,Healthcare Provider Taxonomy Code_2,Healthcare Provider Taxonomy Code_3,Healthcare Provider Taxonomy Code_4,Healthcare Provider Taxonomy Code_5,Healthcare Provider Taxonomy Code_6,Healthcare Provider Taxonomy Code_7,Healthcare Provider Taxonomy Code_8,Healthcare Provider Taxonomy Code_9,Healthcare Provider Taxonomy Code_10,Healthcare Provider Taxonomy Code_11,Healthcare Provider Taxonomy Code_12,Healthcare Provider Taxonomy Code_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_1,Healthcare Provider Primary Taxonomy Switch_2,Healthcare Provider Primary Taxonomy Switch_3,Healthcare Provider Primary Taxonomy Switch_4,Healthcare Provider Primary Taxonomy Switch_5,Healthcare Provider Primary Taxonomy Switch_6,Healthcare Provider Primary Taxonomy Switch_7,Healthcare Provider Primary Taxonomy Switch_8,Healthcare Provider Primary Taxonomy Switch_9,Healthcare Provider Primary Taxonomy Switch_10,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Primary Taxonomy Switch_15
0,1679576722,207X00000X,,,,,,,,,,,,,,,Y,,,,,,,,,,,,,,
1,1588667638,207RC0000X,207RC0000X,,,,,,,,,,,,,,N,Y,,,,,,,,,,,,,
2,1497758544,251G00000X,,,,,,,,,,,,,,,Y,,,,,,,,,,,,,,
3,1306849450,2085R0202X,,,,,,,,,,,,,,,Y,,,,,,,,,,,,,,
4,1215930367,174400000X,207RH0003X,,,,,,,,,,,,,,N,Y,,,,,,,,,,,,,


### Which NPI has no primary taxonomy?

In [11]:
npi_no_primary_taxonomy = npi_taxonomy[(npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_1'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_2'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_3'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_4'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_5'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_6'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_7'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_8'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_9'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_10'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_11'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_12'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_13'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_14'] != 'Y') &\
            (npi_taxonomy['Healthcare Provider Primary Taxonomy Switch_15'] != 'Y')]

In [12]:
display(npi_no_primary_taxonomy.shape)
display(npi_no_primary_taxonomy)

(63, 31)

Unnamed: 0,NPI,Healthcare Provider Taxonomy Code_1,Healthcare Provider Taxonomy Code_2,Healthcare Provider Taxonomy Code_3,Healthcare Provider Taxonomy Code_4,Healthcare Provider Taxonomy Code_5,Healthcare Provider Taxonomy Code_6,Healthcare Provider Taxonomy Code_7,Healthcare Provider Taxonomy Code_8,Healthcare Provider Taxonomy Code_9,Healthcare Provider Taxonomy Code_10,Healthcare Provider Taxonomy Code_11,Healthcare Provider Taxonomy Code_12,Healthcare Provider Taxonomy Code_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_1,Healthcare Provider Primary Taxonomy Switch_2,Healthcare Provider Primary Taxonomy Switch_3,Healthcare Provider Primary Taxonomy Switch_4,Healthcare Provider Primary Taxonomy Switch_5,Healthcare Provider Primary Taxonomy Switch_6,Healthcare Provider Primary Taxonomy Switch_7,Healthcare Provider Primary Taxonomy Switch_8,Healthcare Provider Primary Taxonomy Switch_9,Healthcare Provider Primary Taxonomy Switch_10,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Primary Taxonomy Switch_15
35,1891798849,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
43,1780687731,103T00000X,103TC0700X,103TC2200X,103TF0000X,,,,,,,,,,,,X,X,X,X,,,,,,,,,,,
52,1770586729,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
86,1659374684,261QM1200X,261QR0200X,,,,,,,,,,,,,,X,X,,,,,,,,,,,,,
89,1386647311,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
110,1427051457,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
119,1508869652,314000000X,310400000X,,,,,,,,,,,,,,X,X,,,,,,,,,,,,,
123,1043213192,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
141,1730182783,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
142,1649273699,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Apply `wide_to_long` to get the matrix of taxonomy in a long format and filter for only the primary taxonomies (**warning: This would get rid of NPI's without primary taxonomies. Do a left-join on `npidata` to fix this**)

In [13]:
# Pivot from wide to long format
npi_taxonomy = pd.wide_to_long(
    npi_taxonomy,
    stubnames=['Healthcare Provider Taxonomy Code', 'Healthcare Provider Primary Taxonomy Switch'],
    i=['NPI'], 
    j='primary_taxonomy_index',
    sep="_"
)

# Only keep the primary taxonomy
npi_taxonomy = npi_taxonomy[npi_taxonomy['Healthcare Provider Primary Taxonomy Switch'] == 'Y']

# Housekeeping
npi_taxonomy = npi_taxonomy.reset_index()\
    .drop(columns=['primary_taxonomy_index', 'Healthcare Provider Primary Taxonomy Switch'])\
    .rename({ 'Healthcare Provider Taxonomy Code': 'primary_taxonomy' }, axis=1)

In [14]:
display(npi_taxonomy.shape)
display(npi_taxonomy)

(937, 2)

Unnamed: 0,NPI,primary_taxonomy
0,1679576722,207X00000X
1,1497758544,251G00000X
2,1306849450,2085R0202X
3,1023011178,251G00000X
4,1932102084,207RC0000X
5,1841293990,231H00000X
6,1750384806,207R00000X
7,1669475711,208000000X
8,1578566626,207Q00000X
9,1487657532,207V00000X


## Merging `npidata` and `npi_taxonomy`

Finally, merge back `npi_taxonomy` to `npidata`. Account for npis without `primary_taxonomy` by doing a left-join

In [15]:
npidata = npidata.merge(
    npi_taxonomy,
    how='left',
    on='NPI'
)

In [16]:
display(npidata.shape)
display(npidata)

(1000, 15)

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,Provider Second Line Business Practice Location Address,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Postal Code,primary_taxonomy
0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,,KEARNEY,NE,688472944.0,207X00000X
1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,SUITE 300,JACKSONVILLE,FL,322044736.0,207RC0000X
2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,,FAYETTEVILLE,NC,283044552.0,251G00000X
3,1306849450,1.0,,SMITSON,HAROLD,LEROY,DR.,II,M.D.,810 LUCAS DR,,ATHENS,TX,757513446.0,2085R0202X
4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,,HOUSTON,TX,770901243.0,207RH0003X
5,1023011178,2.0,COLLABRIA CARE,,,,,,,414 S JEFFERSON ST,,NAPA,CA,945594515.0,251G00000X
6,1932102084,1.0,,ADUSUMILLI,RAVI,K,,,MD,2940 N MCCORD RD,,TOLEDO,OH,436151753.0,207RC0000X
7,1841293990,1.0,,WORTSMAN,SUSAN,,,,MA-CCC,425 E 25TH ST,,NEW YORK,NY,100102547.0,231H00000X
8,1750384806,1.0,,BISBEE,ROBERT,,DR.,,MD,808 JOLIET AVE UNIT 120,,LUBBOCK,TX,794151148.0,207R00000X
9,1669475711,1.0,,SUNG,BIN,SHENG,,,M. D.,7629 TIKI DR,,FULSHEAR,TX,774411548.0,208000000X


## Renaming Columns

In [17]:
npidata = npidata.reset_index().rename({
    'NPI': 'npi',
    'Entity Type Code': 'entity_type_code',
    'Provider Organization Name (Legal Business Name)': 'provider_org_name',
    'Provider Last Name (Legal Name)': 'provider_last_name',
    'Provider First Name': 'provider_first_name',
    'Provider Middle Name': 'provider_middle_name',
    'Provider Name Prefix Text': 'provider_name_prefix',
    'Provider Name Suffix Text': 'provider_name_suffix',
    'Provider Credential Text': 'provider_credential',
    'Provider First Line Business Practice Location Address': 'provider_business_address_1',
    'Provider Second Line Business Practice Location Address': 'provider_business_address_2',
    'Provider Business Practice Location Address City Name': 'provider_business_city',
    'Provider Business Practice Location Address State Name': 'provider_business_state',
    'Provider Business Practice Location Address Postal Code': 'provider_business_zip'
}, axis=1)

In [18]:
display(npidata.shape)
display(npidata.head())

(1000, 16)

Unnamed: 0,index,npi,entity_type_code,provider_org_name,provider_last_name,provider_first_name,provider_middle_name,provider_name_prefix,provider_name_suffix,provider_credential,provider_business_address_1,provider_business_address_2,provider_business_city,provider_business_state,provider_business_zip,primary_taxonomy
0,0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,,KEARNEY,NE,688472944.0,207X00000X
1,1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,SUITE 300,JACKSONVILLE,FL,322044736.0,207RC0000X
2,2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,,FAYETTEVILLE,NC,283044552.0,251G00000X
3,3,1306849450,1.0,,SMITSON,HAROLD,LEROY,DR.,II,M.D.,810 LUCAS DR,,ATHENS,TX,757513446.0,2085R0202X
4,4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,,HOUSTON,TX,770901243.0,207RH0003X


**Note: Not all NPIs has a primary taxonomy**

In [19]:
display(npidata[npidata['primary_taxonomy'].isna()].shape)
display(npidata[npidata['primary_taxonomy'].isna()])

(63, 16)

Unnamed: 0,index,npi,entity_type_code,provider_org_name,provider_last_name,provider_first_name,provider_middle_name,provider_name_prefix,provider_name_suffix,provider_credential,provider_business_address_1,provider_business_address_2,provider_business_city,provider_business_state,provider_business_zip,primary_taxonomy
35,35,1891798849,,,,,,,,,,,,,,
43,43,1780687731,1.0,,STAUBSINGER,ARLENE,BETH,DR.,,PH.D.,8100 OSWEGO RD,STE 235,LIVERPOOL,NY,130901660.0,
52,52,1770586729,,,,,,,,,,,,,,
86,86,1659374684,2.0,NYDIC OPEN MRI OF AMERICA-BOARDMAN,,,,,,,1449 BOARDMAN CANFIELD RD,STE 140,BOARDMAN,OH,445128070.0,
89,89,1386647311,,,,,,,,,,,,,,
110,110,1427051457,,,,,,,,,,,,,,
119,119,1508869652,2.0,"LAFAYETTE MANOR, INC.",,,,,,,147 LAFAYETTE MANOR RD,,UNIONTOWN,PA,154018900.0,
123,123,1043213192,,,,,,,,,,,,,,
141,141,1730182783,,,,,,,,,,,,,,
142,142,1649273699,,,,,,,,,,,,,,


## Add Taxonomy Code to Classification Crosswalk

In [20]:
display(taxonomy_codes.shape)
display(taxonomy_codes.head())

(865, 10)

Unnamed: 0,Code,Grouping,Classification,Specialization,Definition,Effective Date,Deactivation Date,Last Modified Date,Notes,Display Name
0,193200000X,Group,Multi-Specialty,,A business group of one or more individual pra...,10/1/2003,,,[7/1/2003: new],Multi-Specialty Group
1,193400000X,Group,Single Specialty,,A business group of one or more individual pra...,10/1/2003,,,[7/1/2003: new],Single Specialty Group
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,,An allergist-immunologist is trained in evalua...,4/1/2003,,7/1/2007,"Source: American Board of Medical Specialties,...",Allergy & Immunology Physician
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy,Definition to come...,4/1/2003,,,,Allergy Physician
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology,Definition to come...,4/1/2003,,,,Clinical & Laboratory Immunology (Allergy & Im...


Keep only the columns we need

In [21]:
taxonomy_codes = taxonomy_codes[['Code','Specialization', 'Grouping', 'Classification']]

In [22]:
display(taxonomy_codes.shape)
display(taxonomy_codes)

(865, 4)

Unnamed: 0,Code,Specialization,Grouping,Classification
0,193200000X,,Group,Multi-Specialty
1,193400000X,,Group,Single Specialty
2,207K00000X,,Allopathic & Osteopathic Physicians,Allergy & Immunology
3,207KA0200X,Allergy,Allopathic & Osteopathic Physicians,Allergy & Immunology
4,207KI0005X,Clinical & Laboratory Immunology,Allopathic & Osteopathic Physicians,Allergy & Immunology
5,207L00000X,,Allopathic & Osteopathic Physicians,Anesthesiology
6,207LA0401X,Addiction Medicine,Allopathic & Osteopathic Physicians,Anesthesiology
7,207LC0200X,Critical Care Medicine,Allopathic & Osteopathic Physicians,Anesthesiology
8,207LH0002X,Hospice and Palliative Medicine,Allopathic & Osteopathic Physicians,Anesthesiology
9,207LP2900X,Pain Medicine,Allopathic & Osteopathic Physicians,Anesthesiology


In [23]:
taxonomy_codes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 865 entries, 0 to 864
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Code            865 non-null    object
 1   Specialization  625 non-null    object
 2   Grouping        865 non-null    object
 3   Classification  865 non-null    object
dtypes: object(4)
memory usage: 27.2+ KB


In [24]:
npidata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   index                        1000 non-null   int64  
 1   npi                          1000 non-null   int64  
 2   entity_type_code             950 non-null    float64
 3   provider_org_name            192 non-null    object 
 4   provider_last_name           758 non-null    object 
 5   provider_first_name          758 non-null    object 
 6   provider_middle_name         601 non-null    object 
 7   provider_name_prefix         480 non-null    object 
 8   provider_name_suffix         28 non-null     object 
 9   provider_credential          749 non-null    object 
 10  provider_business_address_1  950 non-null    object 
 11  provider_business_address_2  393 non-null    object 
 12  provider_business_city       950 non-null    object 
 13  provider_business_s

## Merge `npidata` and `taxonomy`

Merge to `npidata`, matching on primary_taxonomy/Code

In [25]:
npidata = npidata.merge(
    taxonomy_codes, 
    how='left', 
    left_on = 'primary_taxonomy', 
    right_on = 'Code', 
copy=False).drop('Code', axis=1)

In [26]:
display(npidata.shape)
display(npidata)

(1000, 19)

Unnamed: 0,index,npi,entity_type_code,provider_org_name,provider_last_name,provider_first_name,provider_middle_name,provider_name_prefix,provider_name_suffix,provider_credential,provider_business_address_1,provider_business_address_2,provider_business_city,provider_business_state,provider_business_zip,primary_taxonomy,Specialization,Grouping,Classification
0,0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,,KEARNEY,NE,688472944.0,207X00000X,,Allopathic & Osteopathic Physicians,Orthopaedic Surgery
1,1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,SUITE 300,JACKSONVILLE,FL,322044736.0,207RC0000X,Cardiovascular Disease,Allopathic & Osteopathic Physicians,Internal Medicine
2,2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,,FAYETTEVILLE,NC,283044552.0,251G00000X,,Agencies,"Hospice Care, Community Based"
3,3,1306849450,1.0,,SMITSON,HAROLD,LEROY,DR.,II,M.D.,810 LUCAS DR,,ATHENS,TX,757513446.0,2085R0202X,Diagnostic Radiology,Allopathic & Osteopathic Physicians,Radiology
4,4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,,HOUSTON,TX,770901243.0,207RH0003X,Hematology & Oncology,Allopathic & Osteopathic Physicians,Internal Medicine
5,5,1023011178,2.0,COLLABRIA CARE,,,,,,,414 S JEFFERSON ST,,NAPA,CA,945594515.0,251G00000X,,Agencies,"Hospice Care, Community Based"
6,6,1932102084,1.0,,ADUSUMILLI,RAVI,K,,,MD,2940 N MCCORD RD,,TOLEDO,OH,436151753.0,207RC0000X,Cardiovascular Disease,Allopathic & Osteopathic Physicians,Internal Medicine
7,7,1841293990,1.0,,WORTSMAN,SUSAN,,,,MA-CCC,425 E 25TH ST,,NEW YORK,NY,100102547.0,231H00000X,,"Speech, Language and Hearing Service Providers",Audiologist
8,8,1750384806,1.0,,BISBEE,ROBERT,,DR.,,MD,808 JOLIET AVE UNIT 120,,LUBBOCK,TX,794151148.0,207R00000X,,Allopathic & Osteopathic Physicians,Internal Medicine
9,9,1669475711,1.0,,SUNG,BIN,SHENG,,,M. D.,7629 TIKI DR,,FULSHEAR,TX,774411548.0,208000000X,,Allopathic & Osteopathic Physicians,Pediatrics


## Add Zip Code to CBSA Crosswalk

Match each provider to a CBSA using the Business Zip code. A zipcode can be associated to multiple CBSAs, so match with the CBSA with the highest `TOT_RATIO`

In [27]:
display(zips.shape)
display(zips)

(46833, 6)

Unnamed: 0,zip,cbsa,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,00612,11640,0.996207,0.999588,0.995662,0.996515
1,00627,11640,1.000000,1.000000,1.000000,1.000000
2,00638,41980,1.000000,1.000000,1.000000,1.000000
3,00676,10380,1.000000,1.000000,1.000000,1.000000
4,00719,41980,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...
46828,59354,99999,0.000000,0.000000,1.000000,1.000000
46829,61332,36860,0.000000,0.000000,1.000000,1.000000
46830,99903,99999,0.000000,0.000000,1.000000,1.000000
46831,28274,16740,0.000000,1.000000,0.000000,1.000000


In [28]:
zips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46833 entries, 0 to 46832
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   zip        46833 non-null  object 
 1   cbsa       46833 non-null  int64  
 2   res_ratio  46833 non-null  float64
 3   bus_ratio  46833 non-null  float64
 4   oth_ratio  46833 non-null  float64
 5   tot_ratio  46833 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 2.1+ MB


In [29]:
zips[zips['tot_ratio'] != 1.0]

Unnamed: 0,zip,cbsa,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,00612,11640,0.996207,0.999588,0.995662,0.996515
7,00757,42180,0.997146,0.968447,1.000000,0.995783
27,01756,49340,0.998886,1.000000,1.000000,0.999004
65,03104,31700,0.996969,1.000000,1.000000,0.997259
67,03243,18180,0.958963,0.909091,1.000000,0.958420
...,...,...,...,...,...,...
46794,72416,99999,0.000383,0.000000,0.000000,0.000362
46797,79707,36220,0.000057,0.000000,0.000000,0.000054
46813,68820,99999,0.006135,0.000000,0.000000,0.006098
46818,40391,34460,0.000122,0.000000,0.000000,0.000111


Filter to zips within the Nashville cbsa

In [30]:
zips = zips[zips['cbsa'] == 34980]

In [31]:
display(zips.shape)
display(zips)

(163, 6)

Unnamed: 0,zip,cbsa,res_ratio,bus_ratio,oth_ratio,tot_ratio
1201,37032,34980,0.994333,1.0,0.0,0.994505
1202,37033,34980,0.98754,0.995789,1.0,0.988556
1205,37013,34980,1.0,1.0,1.0,1.0
1207,37026,34980,0.978705,1.0,1.0,0.979508
1208,37035,34980,0.991842,1.0,0.0,0.991973
1210,37069,34980,1.0,1.0,1.0,1.0
1211,37010,34980,0.4,0.974359,1.0,0.41163
1212,37121,34980,1.0,1.0,1.0,1.0
1213,37085,34980,1.0,1.0,0.0,1.0
1214,37153,34980,0.947344,0.944444,0.0,0.947274


In [None]:
zips

Each zip code can be in multiple counties, though we only want to match zips to the county with the greatest representation. To do this, groupby `county`, and keep only the rows with the max `tot_ratio`.

In [None]:
zips = zips.groupby('zip')['tot_ratio'].max().reset_index()

In [None]:
display(zips)

Create a list of counties we will use as a filter for the nationwide npidata

In [None]:
zips_list = zips['zip'].to_list()

In [None]:
print(zips_list)

Filter npidata to only where zipcodes are within Nashville cbsa

In [None]:
# Start by converting provider_business_zip to string and indexing to first 5 digits
npidata['provider_business_zip5'] = [str(i)[0:5] for i in npidata['provider_business_zip']]

In [None]:
# Filter rows to 'provider_business_zip5' being in zips_list
npidata_nashville = npidata[npidata['provider_business_zip5'].isin(zips_list)]

In [None]:
display(npidata.shape)
display(npidata_nashville.shape)
display(npidata_nashville.head())

## Now re-applying all this at once and import into sqlite

**IMPORTANT! This loading into the database should only be run once.** If you run this multiple times, it will create duplicate entries in the database. For the security of not re-running this code by accident, the code here is converted into markdown. **If you need to rebuild the database, delete the `data/hcbb.sqlite` file and re-run this cell as code. You will also need to make sure to re-run any other related scripts that builds other tables in the database.**

taxonomy_codes = pd.read_csv("../data/nucc_taxonomy_210.csv")
zips = pd.read_excel("../data/ZIP_CBSA_122017.xlsx", converters={'zip': lambda x: str(x)}, engine='openpyxl')
zips = zips[zips['cbsa'] == 34980]

with sqlite3.connect('../data/hcbb.sqlite') as db:

    # Loading the nppes dataset
    npidata_raw = pd.read_csv("../data/npidata_pfile_20050523-20210207.csv", chunksize = 10000)
    
    # Looping over chuncks of nppes
    for chunk in tqdm(npidata_raw):

        npidata = pd.concat([
            chunk[['NPI']],
            # Entity Type Code: 1 = Provider (doctors, nurses, etc.) / 2 = Facility (Hospitals, Urgent Care, Doctors Offices) 
            chunk[['Entity Type Code']],
            # Entity Name: Either First/Last or Organization or Other Organization Name contained in the following fields:
            chunk.loc[:, 'Provider Organization Name (Legal Business Name)':'Provider Credential Text'],
            # Address: Business Practice Location (not mailing), contained in the following fields:
            chunk.loc[:, 'Provider First Line Business Practice Location Address':'Provider Business Practice Location Address Postal Code'],
        ], axis=1)

        npi_taxonomy = pd.concat([
            chunk[['NPI']],
            # The provider's taxonomy code, which is contained in one of the 'Healthcare Provider Taxonomy Code*' columns
            chunk[chunk.columns[pd.Series(chunk.columns).str.startswith('Healthcare Provider Taxonomy Code_')]],
            chunk[chunk.columns[pd.Series(chunk.columns).str.startswith('Healthcare Provider Primary Taxonomy Switch_')]]
        ], axis=1)

        # Pivot from wide to long format
        npi_taxonomy = pd.wide_to_long(
            npi_taxonomy,
            stubnames=['Healthcare Provider Taxonomy Code', 'Healthcare Provider Primary Taxonomy Switch'],
            i=['NPI'], 
            j='primary_taxonomy_index',
            sep="_"
        )

        # Only keep the primary taxonomy
        npi_taxonomy = npi_taxonomy[npi_taxonomy['Healthcare Provider Primary Taxonomy Switch'] == 'Y']

        # Housekeeping
        npi_taxonomy = npi_taxonomy.reset_index()\
            .drop(columns=['primary_taxonomy_index', 'Healthcare Provider Primary Taxonomy Switch'])\
            .rename({ 'Healthcare Provider Taxonomy Code': 'taxonomy_code' }, axis=1)

        # Merge npidata with npi_taxonomy
        npidata = npidata.merge(
            npi_taxonomy,
            how='left',
            on='NPI'
        )

        # Rename columns
        npidata = npidata.reset_index().rename({
            'NPI': 'npi',
            'Entity Type Code': 'entity_type_code',
            'Provider Organization Name (Legal Business Name)': 'provider_org_name',
            'Provider Last Name (Legal Name)': 'provider_last_name',
            'Provider First Name': 'provider_first_name',
            'Provider Middle Name': 'provider_middle_name',
            'Provider Name Prefix Text': 'provider_name_prefix',
            'Provider Name Suffix Text': 'provider_name_suffix',
            'Provider Credential Text': 'provider_credential',
            'Provider First Line Business Practice Location Address': 'provider_business_address_1',
            'Provider Second Line Business Practice Location Address': 'provider_business_address_2',
            'Provider Business Practice Location Address City Name': 'provider_business_city',
            'Provider Business Practice Location Address State Name': 'provider_business_state',
            'Provider Business Practice Location Address Postal Code': 'provider_business_zip'
        }, axis=1)

        # Create Zip5 column to merge down the road
        npidata['provider_business_zip5'] = [str(i)[0:5] for i in npidata['provider_business_zip']]

        # Correct data types
        npidata['npi'] = npidata['npi'].astype(str)
        npidata['entity_type_code'] = npidata['entity_type_code'].astype(str).str.split('.').str[0]
        npidata['provider_business_zip'] = npidata['provider_business_zip'].astype(str).str.split('.').str[0]
        npidata['provider_business_zip5'] = npidata['provider_business_zip5'].astype(str).str.split('.').str[0]
        
        # Filter by zips
        npidata = npidata[npidata['provider_business_zip5'].isin(zips['zip'])]
        
        # Remove unneeded columns
        npidata = npidata.drop('index', axis=1)
        
        # Load to sqlite db
        npidata.to_sql('npidata', db, if_exists = 'append', index = False)

    print('task done')

In [None]:
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    SELECT * FROM taxonomy
    LIMIT 5;
    """ 
    
    test_df = pd.read_sql(query, db)

display(test_df)

**Quick Fix for dropping tables (DO NOT RUN UNLESS FOR RECREATING TABLES)**

In [None]:
# with sqlite3.connect('../data/hcbb.sqlite') as db :
#     cursor = db.cursor()
#     cursor.execute("DROP TABLE npidata")
#     print("Table dropped...")