# Cleaning ___npidata___ (Taxonomy Switch Problem)

## 1. Importing Stuff

In [1]:
# importing stuff
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm
import re

# Raise pandas' display limits so wide/long frames are shown in full
# (up to 999 columns and 999 rows) instead of being truncated.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

# Suppress warning output for the whole session.
# NOTE(review): "ignore" silences every Python warning, including pandas
# diagnostics, so genuine problems in the cells below may go unnoticed.
import warnings
warnings.filterwarnings("ignore")

## 2. Defining a Function (`Get_Taxonomy_Code`) that Pulls the Primary Taxonomy Code

In [2]:
# defining a function to get the taxonomy code
def Get_Taxonomy_Code(df_row):
    """Return the taxonomy code that belongs to the first switch flagged 'Y'.

    The row is scanned through its fifteen numbered
    "Healthcare Provider Primary Taxonomy Switch_<n>" fields in order
    (n = 1..15). As soon as one of them equals 'Y', the value stored under
    the matching "Healthcare Provider Taxonomy Code_<n>" field is returned.

    Parameters
    ----------
    df_row : mapping-like
        Anything indexable by column name (e.g. a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The taxonomy code of the first flagged slot, or -1 when none of the
    fifteen switches is 'Y'.
    """
    for slot in range(1, 16):
        # Stop at the first flagged switch; later slots are never inspected.
        if df_row["Healthcare Provider Primary Taxonomy Switch_" + str(slot)] == 'Y':
            return df_row["Healthcare Provider Taxonomy Code_" + str(slot)]
    # Sentinel used by the caller to drop rows without a primary taxonomy.
    return -1

## 3. Setting up an Empty Database

In [7]:
# Open a connection to the SQLite database file that the cells below fill
# with tables. `db` is passed to every to_sql call and closed in the last cell.
# NOTE(review): presumably the file does not exist yet, so the database starts empty - confirm.
db = sqlite3.connect('data/NPPES_Data_Dissemination_February_2021/npidata2.sqlite')

## 4. Looping to Fill the Tables in Database and Clean Data 

### ------------------------ Npidata Table ------------------------

In [4]:
# Filling in the database: stream the NPPES provider file in 10,000-row chunks,
# keep the Tennessee providers that have a primary taxonomy code, trim the
# frame to the columns of interest and append it to the `npidata` table.
# NOTE: every chunk is written with if_exists='append', so running this cell
# again against the same database adds the same rows a second time.
for chunk in tqdm(pd.read_csv('data/NPPES_Data_Dissemination_February_2021/npidata_pfile_20050523-20210207.csv',
                              chunksize = 10000,
                              # Read postal codes as text: going through int raises on missing or
                              # non-numeric values and drops leading zeros before the 5-digit extract.
                              dtype = {'Provider Business Practice Location Address Postal Code': str})):

    # Keeping only if in TN (.copy() so the assignments below work on an
    # independent frame rather than on a slice of the original chunk)
    chunk = chunk[chunk['Provider Business Practice Location Address State Name'] == 'TN'].copy()

    # Nothing to clean or store when a chunk holds no TN providers; this also
    # avoids a row-wise apply on an empty frame, which does not yield a Series.
    if chunk.empty:
        continue

    # Applying the function to get the primary taxonomy code (-1 when none is flagged)
    chunk['Healthcare_Provider_Taxonomy_Code'] = chunk.apply(Get_Taxonomy_Code, axis = 1)

    # Only keeping if it has a taxonomy code
    chunk = chunk[chunk['Healthcare_Provider_Taxonomy_Code'] != -1]

    # Cleaning column names
    chunk.columns = [x.replace(' ', '_') for x in chunk.columns]

    # Only keeping some columns
    chunk = chunk[['NPI','Entity_Type_Code','Provider_Organization_Name_(Legal_Business_Name)',
                   'Provider_Last_Name_(Legal_Name)','Provider_First_Name',
                   'Provider_First_Line_Business_Practice_Location_Address',
                   'Provider_Second_Line_Business_Practice_Location_Address',
                   'Provider_Business_Practice_Location_Address_City_Name',
                   'Provider_Business_Practice_Location_Address_Postal_Code',
                   'Provider_Credential_Text','Provider_Gender_Code',
                   'Healthcare_Provider_Taxonomy_Code']]

    # Renaming columns (columns whose name stays the same are simply not listed)
    chunk = chunk.rename(columns = {"Provider_Organization_Name_(Legal_Business_Name)":"Provider_Organization_Name",
                                    "Provider_Last_Name_(Legal_Name)":"Provider_Last_Name",
                                    "Provider_Gender_Code":"Gender",
                                    'Provider_First_Line_Business_Practice_Location_Address':'First_Line_Address',
                                    'Provider_Second_Line_Business_Practice_Location_Address':'Second_Line_Address',
                                    'Provider_Business_Practice_Location_Address_City_Name':'Address_City',
                                    'Provider_Business_Practice_Location_Address_Postal_Code':'Address_Postal_Codes'})

    # Regex to isolate the first five digits of the postal code; values that do
    # not start with five digits become missing. Raw string avoids the invalid
    # "\d" escape, and expand=False returns a Series instead of a DataFrame.
    chunk['Address_Postal_Codes'] = chunk['Address_Postal_Codes'].str.extract(r'^(\d{5})', expand = False)

    chunk.to_sql('npidata', db, if_exists = 'append', index = False)            # Append the chunk to the npidata table

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




### ------------------------ Taxonomy Table ------------------------

In [5]:
# Adding the taxonomy table to our database: the whole CSV is read in one go
# and written to the `taxonomy` table without the DataFrame index.
# NOTE: if_exists='append' adds to an existing table, so re-running this cell
# against the same database inserts the rows again.
pd.read_csv('data/nucc_taxonomy_210.csv').to_sql('taxonomy', db, if_exists = 'append', index = False)

### ------------------------ CBSA Table ------------------------

In [None]:
# Adding the CBSA table to our database: the Excel crosswalk is read in one go
# and written to the `CBSA` table without the DataFrame index.
# NOTE: if_exists='append' adds to an existing table, so re-running this cell
# against the same database inserts the rows again.
pd.read_excel('data/ZIP_CBSA Crosswalk.xlsx').to_sql('CBSA', db, if_exists = 'append', index = False)

### ------------------------ Hops Table ------------------------

In [8]:
# Adding the hops table to our database: the DocGraph file is streamed in
# 10,000-row chunks and each chunk is written unchanged to the `hops` table.
# NOTE: chunks are appended, so an interrupted run leaves the rows written so
# far in the table and a re-run against the same database adds them again.
for chunk in tqdm(pd.read_csv('data/DocGraph_Hop_Teaming_2017_Non_Commercial/DocGraph_Hop_Teaming_2017.csv', chunksize = 10000)):
    chunk.to_sql('hops', db, if_exists = 'append', index = False)            # Append the chunk to the hops table

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# Close the SQLite connection opened earlier in the notebook.
db.close()