In [56]:
import pandas as pd
# import polars as pl
import sqlite3

Cleaning strategy for NPPES file:
- read in only columns relevant to this project
- process taxonomy code columns to determine primary
- drop taxonomy code columns (keep only added primary code column)
- join classification crosswalk to get taxonomy name
- join CBSA crosswalk to match provider on zip code
- make column names better for SQL
- put into sqlite

In [57]:
# generate list of taxonomy columns
tax_cols = []

for i in range(1, 16):
    switch_col_name = f'Healthcare Provider Primary Taxonomy Switch_{i}'
    tax_cols.append(switch_col_name)
    code_col_name = f'Healthcare Provider Taxonomy Code_{i}'
    tax_cols.append(code_col_name)

# other relevant columns
cols = ['NPI', 'Entity Type Code', 'Provider Organization Name (Legal Business Name)', 'Provider Last Name (Legal Name)',
        'Provider First Name', 'Provider Middle Name', 'Provider Name Prefix Text', 'Provider Name Suffix Text',
        'Provider Credential Text', 'Provider First Line Business Practice Location Address',
        'Provider Second Line Business Practice Location Address', 'Provider Business Practice Location Address City Name',
        'Provider Business Practice Location Address State Name', 'Provider Business Practice Location Address Postal Code']

# combine column name lists
cols.extend(tax_cols)

In [58]:
npi = pd.read_csv('../data/npidata_pfile_20050523-20240211.csv', usecols = cols, dtype = str)

In [59]:
# pull primary taxonomy codes into one column
# modify to take first 'X' if there is no 'Y'
# add unified flag column (X or Y)
# pull all rows with 'X' into separate dataset (in addition to main)
# don't drop non-flag

def extract_taxonomy_code(row):

#    row['extracted_switch_col'] = None

    # to catch all rows with a 'Y' switch
    for i in range(1, 16):
        switch_col_name = f'Healthcare Provider Primary Taxonomy Switch_{i}'
        if row[switch_col_name] == 'Y':
            code_col_name = f'Healthcare Provider Taxonomy Code_{i}'
            # row['extracted_switch_col'] = 'Y'
            return row[code_col_name]
        
    # if no 'Y' switch was found, capture first 'X'
    # this second chunk makes it incredibly inefficient, find a better way
    # if row['extracted_switch_col'] is None:
    #     for i in range(1, 16):
    #         switch_col_name = f'Healthcare Provider Primary Taxonomy Switch_{i}'
    #         if row[switch_col_name] == 'X':
    #             code_col_name = f'Healthcare Provider Taxonomy Code_{i}'
    #             if 'extracted_switch_col' not in row:
    #                 row['extracted_switch_col'] = 'X'
    #                 return row[code_col_name]
    
    return None # for rows with no 'Y' or 'X' switch

In [60]:
npi['extracted_taxonomy_code'] = npi.apply(extract_taxonomy_code, axis = 1)

In [61]:
npi.head()

Unnamed: 0,NPI,Entity Type Code,Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,Provider Credential Text,Provider First Line Business Practice Location Address,...,Healthcare Provider Primary Taxonomy Switch_11,Healthcare Provider Taxonomy Code_12,Healthcare Provider Primary Taxonomy Switch_12,Healthcare Provider Taxonomy Code_13,Healthcare Provider Primary Taxonomy Switch_13,Healthcare Provider Taxonomy Code_14,Healthcare Provider Primary Taxonomy Switch_14,Healthcare Provider Taxonomy Code_15,Healthcare Provider Primary Taxonomy Switch_15,extracted_taxonomy_code
0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,...,,,,,,,,,,207X00000X
1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,...,,,,,,,,,,207RC0000X
2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,...,,,,,,,,,,251G00000X
3,1306849450,,,,,,,,,,...,,,,,,,,,,
4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,...,,,,,,,,,,207RH0003X


In [62]:
npi.info()
# having doublechecked the values in the switch columns, it appears that there are rows with no 'Y' switch
# should those rows be dropped, or possibly the taxonomy code in Code_1 could be inferred to be the correct code?
# some rows with no 'Y' switch have one or more 'X' switches

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8180697 entries, 0 to 8180696
Data columns (total 45 columns):
 #   Column                                                   Dtype 
---  ------                                                   ----- 
 0   NPI                                                      object
 1   Entity Type Code                                         object
 2   Provider Organization Name (Legal Business Name)         object
 3   Provider Last Name (Legal Name)                          object
 4   Provider First Name                                      object
 5   Provider Middle Name                                     object
 6   Provider Name Prefix Text                                object
 7   Provider Name Suffix Text                                object
 8   Provider Credential Text                                 object
 9   Provider First Line Business Practice Location Address   object
 10  Provider Second Line Business Practice Location Addres

In [63]:
npi['Provider Business Practice Location Address Postal Code'].value_counts(dropna = False)

NaN          269457
483345312      9634
559050001      7454
910165239      6066
483751803      6036
              ...  
891281045         1
207212709         1
481873716         1
956181629         1
190475353         1
Name: Provider Business Practice Location Address Postal Code, Length: 1810141, dtype: int64

In [64]:
# truncate zipcode to 5 digits (match CBSA crosswalk)
npi['Provider Business Practice Location Address Postal Code'] = npi['Provider Business Practice Location Address Postal Code'].str.slice(0, 5)

In [65]:
# if we decide to drop rows with no 'Y' switch
# npi.dropna(subset = ['extracted_taxonomy_code'], inplace = True)

In [66]:
# drop taxonomy columns
npi = npi.drop(tax_cols, axis = 1)

In [87]:
# convert column names to better format
npi.columns = [x.lower().replace(' ', '_').replace('(', '').replace(')', '') for x in npi.columns]

In [89]:
# npi data to sqlite
# create sqlite database
db = sqlite3.connect('../data/provider.sqlite')

npi.to_sql('npi',
           db,
           if_exists = 'replace',
           index = False)

db.close()

In [70]:
# read in classification crosswalk
class_cross = pd.read_csv('../data/nucc_taxonomy_240.csv', usecols = ['Code', 'Grouping', 'Classification', 'Display Name'])

In [71]:
class_cross.head()

Unnamed: 0,Code,Grouping,Classification,Display Name
0,193200000X,Group,Multi-Specialty,Multi-Specialty Group
1,193400000X,Group,Single Specialty,Single Specialty Group
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy & Immunology Physician
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy Physician
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology (Allergy & Im...


In [72]:
# convert column names to better format
class_cross.columns = [x.lower().replace(' ', '_') for x in class_cross.columns]

In [73]:
# classification crosswalk data to sqlite
# create sqlite database
db = sqlite3.connect('../data/provider.sqlite')

class_cross.to_sql('class_cross',
           db,
           if_exists = 'replace',
           index = False)

db.close()

In [91]:
# read in CBSA crosswalk

cbsa_cross = pd.read_csv('../data/ZIP_CBSA_122023.xlsx - Export Worksheet.csv', usecols = ['ZIP', 'CBSA', 'USPS_ZIP_PREF_CITY', 'USPS_ZIP_PREF_STATE'], dtype = str)

In [92]:
cbsa_cross.head()

Unnamed: 0,ZIP,CBSA,USPS_ZIP_PREF_CITY,USPS_ZIP_PREF_STATE
0,501,35620,HOLTSVILLE,NY
1,601,38660,ADJUNTAS,PR
2,602,10380,AGUADA,PR
3,603,10380,AGUADILLA,PR
4,604,10380,AGUADILLA,PR


In [93]:
cbsa_cross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47598 entries, 0 to 47597
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ZIP                  47598 non-null  object
 1   CBSA                 47598 non-null  object
 2   USPS_ZIP_PREF_CITY   47598 non-null  object
 3   USPS_ZIP_PREF_STATE  47598 non-null  object
dtypes: object(4)
memory usage: 1.5+ MB


In [94]:
# convert column names to better format
cbsa_cross.columns = [x.lower().replace(' ', '_') for x in cbsa_cross.columns]

In [95]:
# CBSA crosswalk data to sqlite
# create sqlite database
db = sqlite3.connect('../data/provider.sqlite')

cbsa_cross.to_sql('cbsa_cross',
           db,
           if_exists = 'replace',
           index = False)

db.close()

In [79]:
# read in hop teaming data
hop = pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv')

In [80]:
hop.head()

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1508062167,1730166109,350,370,53.922,72.612
1,1508065640,1730166109,25,25,49.8,55.006
2,1508052093,1730166109,16,16,109.5,70.593
3,1508172545,1730166109,14,14,103.357,75.483
4,1508285131,1730166109,20,21,89.952,89.88


In [81]:
hop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217911308 entries, 0 to 217911307
Data columns (total 6 columns):
 #   Column             Dtype  
---  ------             -----  
 0   from_npi           int64  
 1   to_npi             int64  
 2   patient_count      int64  
 3   transaction_count  int64  
 4   average_day_wait   float64
 5   std_day_wait       float64
dtypes: float64(2), int64(4)
memory usage: 9.7 GB


In [82]:
hop = hop[hop['transaction_count'] >= 50]

In [83]:
hop = hop[hop['average_day_wait'] <= 50]

In [84]:
hop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34184634 entries, 7 to 217911304
Data columns (total 6 columns):
 #   Column             Dtype  
---  ------             -----  
 0   from_npi           int64  
 1   to_npi             int64  
 2   patient_count      int64  
 3   transaction_count  int64  
 4   average_day_wait   float64
 5   std_day_wait       float64
dtypes: float64(2), int64(4)
memory usage: 1.8 GB


In [85]:
# hop data to sqlite
# create sqlite database
db = sqlite3.connect('../data/provider.sqlite')

hop.to_sql('hop',
           db,
           if_exists = 'replace',
           index = False)

db.close()

In [96]:
db = sqlite3.connect('../data/provider.sqlite')

db.execute('CREATE INDEX npi_index ON npi(npi)')
db.execute('CREATE INDEX to_npi ON hop(to_npi)')
db.execute('CREATE INDEX from_npi ON hop(from_npi)')

db.close()