# Get the Primary Taxonomy for each NPI

### Import libraries and select the column names

In [1]:
# import libraries
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

In [2]:
# Import the file header for the NPI data as a dataframe
nppes_header = pd.read_csv('../data/nppes/npidata_pfile_20050523-20210207_FileHeader.csv')

In [3]:
# Write the file header columns to a list
nppes_headers_list = nppes_header.columns.to_list()

In [4]:
# Write that list back to a dataframe in a column called 'variables'
nppes_headers_list = pd.DataFrame(nppes_headers_list, columns = ['variables'])

In [5]:
# Check out the variables generated
nppes_headers_list

Unnamed: 0,variables
0,NPI
1,Entity Type Code
2,Replacement NPI
3,Employer Identification Number (EIN)
4,Provider Organization Name (Legal Business Name)
...,...
325,Healthcare Provider Taxonomy Group_12
326,Healthcare Provider Taxonomy Group_13
327,Healthcare Provider Taxonomy Group_14
328,Healthcare Provider Taxonomy Group_15


In [6]:
# Select the Taxonomy variables
nppes_headers_vars = nppes_headers_list[(nppes_headers_list['variables'].str.contains('Taxonomy') == True)
                              & (nppes_headers_list['variables'].str.contains('Group') == False)]['variables'].tolist()

In [7]:
# Add in the NPI
nppes_headers_vars.insert(0, 'NPI')

In [8]:
# Take a look at the variables selected
nppes_headers_vars

['NPI',
 'Healthcare Provider Taxonomy Code_1',
 'Healthcare Provider Primary Taxonomy Switch_1',
 'Healthcare Provider Taxonomy Code_2',
 'Healthcare Provider Primary Taxonomy Switch_2',
 'Healthcare Provider Taxonomy Code_3',
 'Healthcare Provider Primary Taxonomy Switch_3',
 'Healthcare Provider Taxonomy Code_4',
 'Healthcare Provider Primary Taxonomy Switch_4',
 'Healthcare Provider Taxonomy Code_5',
 'Healthcare Provider Primary Taxonomy Switch_5',
 'Healthcare Provider Taxonomy Code_6',
 'Healthcare Provider Primary Taxonomy Switch_6',
 'Healthcare Provider Taxonomy Code_7',
 'Healthcare Provider Primary Taxonomy Switch_7',
 'Healthcare Provider Taxonomy Code_8',
 'Healthcare Provider Primary Taxonomy Switch_8',
 'Healthcare Provider Taxonomy Code_9',
 'Healthcare Provider Primary Taxonomy Switch_9',
 'Healthcare Provider Taxonomy Code_10',
 'Healthcare Provider Primary Taxonomy Switch_10',
 'Healthcare Provider Taxonomy Code_11',
 'Healthcare Provider Primary Taxonomy Switch_11'

### Create a dataframe with the primary taxonomy code for each NPI in the NPPES dataset

In [9]:
# Create a function to iterate over the taxonomy data and return only the primary one
def get_primary_tax(row):
    tax = ''
    for i in range (1, 16):
        if row['Healthcare Provider Primary Taxonomy Switch_' + str(i)] == 'Y':
            tax = row['Healthcare Provider Taxonomy Code_' + str(i)]
    return tax

In [10]:
# Create an empty dataframe to contain the results
npi_specialty = []

In [11]:
# Iterate over the csv in chunks of 50,000 lines
# Import from each row the NPI and the primary taxonomy
# Append it to the empty list variable npi_specialty
# This will require 135 iterations to complete
# This produces a lot of warnings if you run it without the low_memory parameter set
for chunk in tqdm(pd.read_csv('../data/nppes/npidata_pfile_20050523-20210207.csv', usecols = nppes_headers_vars, chunksize = 50000, low_memory=False)):
    for index, row in chunk.iterrows():
        npi = row['NPI']
        primary_tax = get_primary_tax(row)
        npi_specialty.append([npi, primary_tax])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [12]:
# Take a look at the list of lists produced
npi_specialty

[[1679576722, '207X00000X'],
 [1588667638, '207RC0000X'],
 [1497758544, '251G00000X'],
 [1306849450, '2085R0202X'],
 [1215930367, '207RH0003X'],
 [1023011178, '251G00000X'],
 [1932102084, '207RC0000X'],
 [1841293990, '231H00000X'],
 [1750384806, '207R00000X'],
 [1669475711, '208000000X'],
 [1578566626, '207Q00000X'],
 [1487657532, '207V00000X'],
 [1396748448, '363A00000X'],
 [1205839354, '207X00000X'],
 [1114920261, '207R00000X'],
 [1023011079, '251E00000X'],
 [1932102985, '207Q00000X'],
 [1841293891, '208600000X'],
 [1750384707, '174400000X'],
 [1669475612, '363LW0102X'],
 [1578566527, '207X00000X'],
 [1487657433, '293D00000X'],
 [1396748349, '207VE0102X'],
 [1205839255, '207RC0000X'],
 [1114920162, '207R00000X'],
 [1922001973, '1041C0700X'],
 [1831192889, '207R00000X'],
 [1740283795, '313M00000X'],
 [1659374601, '207Q00000X'],
 [1568465516, '208G00000X'],
 [1477556421, '152W00000X'],
 [1255334207, '207RC0000X'],
 [1164425112, '1835P1200X'],
 [1073516027, '207W00000X'],
 [1982607933, 

In [13]:
# Convert the list into a pandas dataframe
npi_specialty = pd.DataFrame(npi_specialty)

# And name the columns accordingly
npi_specialty.columns = ['npi','primary_taxonomy']

### Get the description of each taxonomy code in the NPPES dataset and join it to the list of providers

In [14]:
# Read in the taxonomy code-to-description crosswalk table
tax_desc = pd.read_csv('../data/nucc_taxonomy_210.csv', usecols = ['Code','Classification'])

In [15]:
# Join the specialty descriptions to the NPPES dataset
npi_specialty = npi_specialty.merge(tax_desc, how='left', left_on='primary_taxonomy', right_on = 'Code')

In [16]:
# Drop the repeated code
npi_specialty.drop(columns = ['Code'], inplace=True)

In [17]:
# Export to CSV
npi_specialty.to_csv('../data/npi_specialty.csv',index=False)

In [18]:
#npi_specialty = pd.read_csv('../data/npi_specialty.csv')

In [19]:
# Take a look at the first few rows of the newly-merged dataset
npi_specialty.head(10)

Unnamed: 0,npi,primary_taxonomy,Classification
0,1679576722,207X00000X,Orthopaedic Surgery
1,1588667638,207RC0000X,Internal Medicine
2,1497758544,251G00000X,"Hospice Care, Community Based"
3,1306849450,2085R0202X,Radiology
4,1215930367,207RH0003X,Internal Medicine
5,1023011178,251G00000X,"Hospice Care, Community Based"
6,1932102084,207RC0000X,Internal Medicine
7,1841293990,231H00000X,Audiologist
8,1750384806,207R00000X,Internal Medicine
9,1669475711,208000000X,Pediatrics


### Read in the cleaned up NPI specialty dataset into the database

In [20]:
# Connect to the database
db = sqlite3.connect('../data/hop_teaming.sqlite')

In [21]:
# If you need to rewrite the table...
cursor = db.cursor()

In [22]:
# Drop the table and return a line that says that it's gone
cursor.execute("DROP TABLE specialty")
print("Table dropped...")

Table dropped...


In [23]:
# Read the cleaned up specialty file into the specialty table
for chunk in tqdm(pd.read_csv('../data/npi_specialty.csv', chunksize = 50000)):
    chunk.to_sql('specialty', db, if_exists = 'append', index = False)            # Append the chunk to a calls table

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [24]:
# Create a test query to make sure the database was set up properly
query = '''
select * from specialty limit 10
'''

In [28]:
# Read the test query results into a dataframe
test = pd.read_sql(query,db)

In [29]:
# Return the test results
test
# test.shape

Unnamed: 0,count(*),count(distinct npi)
0,6714038,6714038


In [27]:
# Create another test query to ensure there is no duplication in the dataset
query = '''
select count(*), count(distinct npi) from specialty
'''