In [1]:
import pandas as pd
import polars as pl

Cleaning strategy for NPPES file:
- read in only columns relevant to this project
- process the taxonomy switch/code columns to determine each provider's primary taxonomy code
- drop taxonomy code columns (keep only added primary code column)
- join classification crosswalk to get taxonomy name
- join CBSA crosswalk to match provider on zip code
- rename columns to SQL-friendly names (lowercase, no spaces)
- load the cleaned table into SQLite

In [2]:
# taxonomy columns: one (switch, code) pair for each of the 15 taxonomy slots,
# kept in switch-then-code order
tax_cols = [
    col_name
    for slot in range(1, 16)
    for col_name in (
        f'Healthcare Provider Primary Taxonomy Switch_{slot}',
        f'Healthcare Provider Taxonomy Code_{slot}',
    )
]

# identifying / name / practice-location columns, followed by the taxonomy columns
cols = [
    'NPI',
    'Entity Type Code',
    'Provider Organization Name (Legal Business Name)',
    'Provider Last Name (Legal Name)',
    'Provider First Name',
    'Provider Middle Name',
    'Provider Name Prefix Text',
    'Provider Name Suffix Text',
    'Provider Credential Text',
    'Provider First Line Business Practice Location Address',
    'Provider Second Line Business Practice Location Address',
    'Provider Business Practice Location Address City Name',
    'Provider Business Practice Location Address State Name',
    'Provider Business Practice Location Address Postal Code',
    *tax_cols,
]

In [3]:
# load only the selected columns; every field is read as text and blank cells
# are kept as empty strings instead of being converted to NaN
npi = pd.read_csv(
    '../data/npidata_pfile_20050523-20240211.csv',
    usecols = cols,
    dtype = str,
    keep_default_na = False,
)

In [4]:
# TODO: modify read in to get necessary columns and rename at the same time
# preview the first rows of the loaded frame
npi.head()

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,...,Healthcare Provider Taxonomy Code_11,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Taxonomy Code_12,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Taxonomy Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15
0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,...,,,,,,,,,,
1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,...,,,,,,,,,,
2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,...,,,,,,,,,,
3,1306849450,,,,,,,,,,...,,,,,,,,,,
4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,...,,,,,,,,,,


In [13]:
# pull primary taxonomy codes into one column
# taking the first 'X' when there is no 'Y' is opt-in via fallback_to_x = True
# TODO: add unified flag column (X or Y)
# TODO: pull all rows with 'X' into separate dataset (in addition to main)
# TODO: don't drop non-flag

def extract_taxonomy_code(row, fallback_to_x = False):
    """Return the primary taxonomy code for a single provider row.

    The 15 'Healthcare Provider Primary Taxonomy Switch_{i}' fields are scanned
    in order; the code paired with the first switch equal to 'Y' is returned.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must support ``row[name]`` for the 15 switch columns and the 15
        'Healthcare Provider Taxonomy Code_{i}' columns.
    fallback_to_x : bool, default False
        When True and no switch is 'Y', return the code paired with the first
        switch equal to 'X' instead of None. The default keeps the original
        behaviour (only 'Y' switches are considered).

    Returns
    -------
    The matching taxonomy code value, or None when no qualifying switch exists.
    """
    first_x_slot = None

    # single pass: return on the first 'Y', remembering the first 'X' on the way
    for i in range(1, 16):
        switch = row[f'Healthcare Provider Primary Taxonomy Switch_{i}']
        if switch == 'Y':
            return row[f'Healthcare Provider Taxonomy Code_{i}']
        if fallback_to_x and first_x_slot is None and switch == 'X':
            first_x_slot = i

    if first_x_slot is not None:
        return row[f'Healthcare Provider Taxonomy Code_{first_x_slot}']

    return None # for rows with no 'Y' (or, with the fallback enabled, no 'X') switch

In [14]:
# row-wise pass: collapse the switch/code pairs into a single primary-code column
npi['extracted_taxonomy_code'] = npi.apply(extract_taxonomy_code, axis = 'columns')

In [15]:
# preview the frame with the new extracted_taxonomy_code column
npi.head()

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,...,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Taxonomy Code_12,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Taxonomy Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15,extracted_taxonomy_code
0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,...,,,,,,,,,,207X00000X
1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,...,,,,,,,,,,207RC0000X
2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,...,,,,,,,,,,251G00000X
3,1306849450,,,,,,,,,,...,,,,,,,,,,
4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,...,,,,,,,,,,207RH0003X


In [16]:
npi.info()
# NOTE: after double-checking the values in the switch columns, some rows have no 'Y' switch at all
# open question: should those rows be dropped, or could the code in Code_1 be inferred to be the primary one?
# some rows with no 'Y' switch have one or more 'X' switches

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8180697 entries, 0 to 8180696
Data columns (total 45 columns):
 #   Column                                                   Dtype 
---  ------                                                   ----- 
 0   NPI                                                      object
 1   Entity Type Code                                         object
 2   Provider Organization Name (Legal Business Name)         object
 3   Provider Last Name (Legal Name)                          object
 4   Provider First Name                                      object
 5   Provider Middle Name                                     object
 6   Provider Name Prefix Text                                object
 7   Provider Name Suffix Text                                object
 8   Provider Credential Text                                 object
 9   Provider First Line Business Practice Location Address   object
 10  Provider Second Line Business Practice Location Addres

In [17]:
# frequency of each extracted code; dropna = False also counts rows where no code was extracted
npi['extracted_taxonomy_code'].value_counts(dropna = False)

106S00000X    334562
None          313177
1041C0700X    286286
183500000X    275069
101YM0800X    268346
               ...  
2471Q0001X         2
2472V0600X         2
2278P3800X         2
3418M1130X         1
341800000X         1
Name: extracted_taxonomy_code, Length: 864, dtype: int64

In [None]:
# if we decide to drop rows with no 'Y' switch
# npi.dropna(subset = ['extracted_taxonomy_code'], inplace = True)

In [18]:
# the per-slot switch/code columns are no longer needed once the primary code is extracted
npi = npi.drop(columns = tax_cols)

In [19]:
# read in classification crosswalk, keeping only the code and its descriptive columns

class_cross = pd.read_csv(
    '../data/nucc_taxonomy_240.csv',
    usecols = ['Code', 'Grouping', 'Classification', 'Display Name'],
)

In [20]:
# preview the crosswalk columns that were read in
class_cross.head()

Unnamed: 0,Code,Grouping,Classification,Display Name
0,193200000X,Group,Multi-Specialty,Multi-Specialty Group
1,193400000X,Group,Single Specialty,Single Specialty Group
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy & Immunology Physician
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy Physician
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology (Allergy & Im...


In [21]:
# attach the crosswalk columns to each provider by taxonomy code;
# the left join keeps every npi row, matched or not
npi = npi.merge(
    class_cross,
    how = 'left',
    left_on = 'extracted_taxonomy_code',
    right_on = 'Code',
)

In [23]:
# re-check the row count and column list after the merge
npi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8180697 entries, 0 to 8180696
Data columns (total 19 columns):
 #   Column                                                   Dtype 
---  ------                                                   ----- 
 0   NPI                                                      object
 1   Entity Type Code                                         object
 2   Provider Organization Name (Legal Business Name)         object
 3   Provider Last Name (Legal Name)                          object
 4   Provider First Name                                      object
 5   Provider Middle Name                                     object
 6   Provider Name Prefix Text                                object
 7   Provider Name Suffix Text                                object
 8   Provider Credential Text                                 object
 9   Provider First Line Business Practice Location Address   object
 10  Provider Second Line Business Practice Location Addres

In [None]:
# convert column names to a SQL-friendly format
# move to end of process after both joins

# lowercase, turn every run of non-alphanumeric characters (spaces, parentheses, ...)
# into a single underscore, and trim underscores left at either end, so names such as
# 'Provider Organization Name (Legal Business Name)' can be used in SQL without quoting
npi.columns = (
    npi.columns
    .str.lower()
    .str.replace(r'[^0-9a-z]+', '_', regex = True)
    .str.strip('_')
)