# Cleaning ___npidata___ (Taxonomy Switch Problem)

## 1. Importing Stuff

In [1]:
# importing stuff
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

# widening pandas' display limits so wide / long frames are not truncated
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 999)

# silencing every Python warning so the pink warning boxes never render
# (note: this is a blanket filter, so warnings raised by pandas are hidden too)
import warnings
warnings.filterwarnings(action="ignore")

## 2. Reading in as many rows as possible

In [2]:
#testing how much I can load in (only the first 150,000 rows are read)
#
# The taxonomy-code, license-number and license-state fields are identifiers, not
# quantities, so they are read as text. Left to dtype inference, an all-digit
# license number such as '0101228375' can be parsed as a number and lose its
# leading zero, and pandas' default chunked parsing (low_memory=True) can leave
# a mix of numbers and strings inside a single column.
# Slots 1-15 match the numbered switch columns scanned later in this notebook.
identifier_dtypes = {
    prefix + str(slot): str
    for prefix in ("Healthcare Provider Taxonomy Code_",
                   "Provider License Number_",
                   "Provider License Number State Code_")
    for slot in range(1, 16)
}

npidata = pd.read_csv('data/NPPES_Data_Dissemination_February_2021/npidata_pfile_20050523-20210207.csv',
                      nrows = 150000,
                      dtype = identifier_dtypes)

## 3. Making a new column that gives the number (1–15) of the first Taxonomy_Switch column equal to 'Y'

In [3]:
# Record, for every provider row, which numbered "Primary Taxonomy Switch" column
# (1-15) is the first one flagged 'Y'; rows with no 'Y' anywhere get 0.
#
# This is computed on whole columns instead of writing one cell at a time with
# npidata['taxonomy_switch_num'].iloc[i] = j. That chained assignment targets a
# temporary Series, so under pandas Copy-on-Write it never reaches npidata
# (every row would stay 0), and a Python loop over every row and slot is slow.
switch_columns = ["Healthcare Provider Primary Taxonomy Switch_" + str(slot)
                  for slot in range(1, 16)]

# Boolean matrix: one row per provider, one column per taxonomy slot.
is_primary = npidata[switch_columns].eq('Y').to_numpy()

# argmax gives the position of the first True in each row (0 when there is none),
# so shift to 1-based slot numbers and then reset the rows that have no 'Y' at all.
first_primary_slot = is_primary.argmax(axis=1) + 1
first_primary_slot[~is_primary.any(axis=1)] = 0

npidata['taxonomy_switch_num'] = first_primary_slot

## 4. Getting rid of all rows that did not have a Taxonomy_Switch = 'Y' anywhere

In [4]:
# keep only the providers for which a primary taxonomy slot was found (non-zero)
has_primary_taxonomy = npidata['taxonomy_switch_num'].ne(0)
npidata = npidata.loc[has_primary_taxonomy]

## 5. Creating 3 new columns with the provider info to keep

In [5]:
# Adding the three columns that will eventually be kept, each pre-filled with a
# placeholder string until the real values are copied in
for placeholder_column in ('Healthcare_Provider_Taxonomy_Code',
                           'Provider_License_Number',
                           'Provider_License_Number_State'):
    npidata[placeholder_column] = 'Nothing Yet'

## 6. Filling in new columns with the correct info based on the taxonomy_switch number

In [6]:
# Copy the taxonomy code, license number and license state from the numbered slot
# that taxonomy_switch_num points at into the three un-numbered columns.
#
# Each column is built slot by slot and assigned in one go, rather than one cell
# at a time with npidata[col].iloc[k] = value. That chained assignment writes to a
# temporary Series, so under pandas Copy-on-Write the 'Nothing Yet' placeholders
# would never be replaced, and looping over every row is slow.
source_prefix_by_target = {
    'Healthcare_Provider_Taxonomy_Code': "Healthcare Provider Taxonomy Code_",
    'Provider_License_Number': "Provider License Number_",
    'Provider_License_Number_State': "Provider License Number State Code_",
}

# One boolean row mask per slot number that actually occurs in the data.
slot_numbers = npidata['taxonomy_switch_num']
rows_by_slot = {slot: (slot_numbers == slot).to_numpy() for slot in slot_numbers.unique()}

for target_column, source_prefix in source_prefix_by_target.items():
    # object dtype so text, numbers and NaN from the source columns are all accepted
    picked = pd.Series(index=npidata.index, dtype=object)
    for slot, in_slot in rows_by_slot.items():
        picked.iloc[in_slot] = npidata[source_prefix + str(slot)].to_numpy()[in_slot]
    # values are already in row order, so assign positionally (no index alignment)
    npidata[target_column] = picked.to_numpy()

## 7. Subsetting the df for only the needed columns

In [7]:
# Keeping only the columns needed downstream.
# Middle name, Replacement NPI and (EIN) are left out on purpose because they are
# mostly NaN; they can be added back later if needed.
npidata = npidata.loc[:, [
    'NPI',
    'Entity Type Code',
    'Provider Organization Name (Legal Business Name)',
    'Provider Last Name (Legal Name)',
    'Provider First Name',
    'Provider Credential Text',
    'Provider Gender Code',
    'taxonomy_switch_num',
    'Healthcare_Provider_Taxonomy_Code',
    'Provider_License_Number',
    'Provider_License_Number_State',
]]

## 8. Renaming the columns so the df is easier to work with

In [8]:
# Swapping the spaced / parenthesised NPPES headers for underscore-style names
npidata = npidata.rename(
    {
        "Entity Type Code": "Entity_Type_Code",
        "Provider Organization Name (Legal Business Name)": "Provider_Organization_Name",
        "Provider Last Name (Legal Name)": "Provider_Last_Name",
        "Provider First Name": "Provider_First_Name",
        "Provider Credential Text": "Provider_Credential_Text",
        "Provider Gender Code": "Gender",
    },
    axis="columns",
)

## 9. Taking a look at the clean data

In [9]:
# Previewing the last five rows of the cleaned frame
npidata.tail(n=5)

Unnamed: 0,NPI,Entity_Type_Code,Provider_Organization_Name,Provider_Last_Name,Provider_First_Name,Provider_Credential_Text,Gender,taxonomy_switch_num,Healthcare_Provider_Taxonomy_Code,Provider_License_Number,Provider_License_Number_State
149995,1760479257,1.0,,NEELY,BRIAN,MD,M,1,207Q00000X,0101228375,VA
149996,1679560163,1.0,,HAMILTON,DWIGHT,MD,M,1,208G00000X,MD2009-0566,NM
149997,1588651079,2.0,BOSTON CHILDRENS HEALTH PHYSICIANS LLP,,,,,1,291U00000X,=========,NY
149998,1396732889,1.0,,CARTER,COY,CRNA,M,1,367500000X,C00561,AR
149999,1205823796,1.0,,MENDOZA,GLENN,MD,M,1,2080N0001X,148338,NY


## 10. Exporting Data to CSV so we have something to start working with

In [10]:
# Writing the cleaned frame (with its header row) next to the raw NPPES file
npidata.to_csv(
    r'data/NPPES_Data_Dissemination_February_2021/npidata_clean.csv',
    header=True,
)