# NPPES to SQLite Table

## Step By Step (For Testing Only, Check Below for Production)

In [1]:
import sqlite3
from contextlib import closing

import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Raise pandas' display limits so that wide / long NPPES frames are rendered
# without being truncated in the notebook output.
for option_name, option_value in {
    "display.max_columns": 500,
    "display.max_rows": 1000,
}.items():
    pd.set_option(option_name, option_value)

**Read in the npidata file (npidata_pfile_20050523-20210207.csv); for testing, only the first 1,000 rows are loaded. The relevant columns are selected in the next step**

In [3]:
# Load a 1,000-row sample of the NPPES dissemination file for step-by-step testing.
# The practice-location postal code is forced to text: if pandas infers a numeric
# dtype for it, ZIP codes lose their leading zeros, and the 5-digit ZIP that is later
# built from the first five characters of this column comes out wrong.
npidata_raw = pd.read_csv(
    "../data/npidata_pfile_20050523-20210207.csv",
    nrows=1000,
    dtype={'Provider Business Practice Location Address Postal Code': str},
)

**Only keep the needed columns**

In [4]:
# Provider identifier.
npi_column = npidata_raw[['NPI']]

# Entity Type Code: 1 = Provider (doctors, nurses, etc.) / 2 = Facility (Hospitals, Urgent Care, Doctors Offices)
entity_type_column = npidata_raw[['Entity Type Code']]

# Entity name: First/Last, Organization or Other Organization Name. These fields are
# contiguous in the file, so they are taken as a label slice (both ends inclusive).
entity_name_columns = npidata_raw.loc[
    :, 'Provider Organization Name (Legal Business Name)':'Provider Credential Text'
]

# Address: Business Practice Location (not mailing), also taken as a contiguous label slice.
practice_address_columns = npidata_raw.loc[
    :,
    'Provider First Line Business Practice Location Address':
    'Provider Business Practice Location Address Postal Code',
]

# Stitch the selected pieces back together side by side.
npidata = pd.concat(
    [npi_column, entity_type_column, entity_name_columns, practice_address_columns],
    axis=1,
)

**Handling Primary Taxonomy Separately**

A provider can have up to 15 taxonomy codes, but we want the one which has Primary Switch = Y in the associated 'Healthcare Provider Primary Taxonomy Switch*' field. Note that this does not always occur in spot 1.

In [5]:
# Boolean masks over the raw column names, one per family of numbered taxonomy columns.
raw_column_names = npidata_raw.columns
# The provider's taxonomy code, which is contained in one of the 'Healthcare Provider Taxonomy Code*' columns
is_taxonomy_code = raw_column_names.str.startswith('Healthcare Provider Taxonomy Code_')
# The matching primary-taxonomy switch columns
is_primary_switch = raw_column_names.str.startswith('Healthcare Provider Primary Taxonomy Switch_')

# NPI first, then all taxonomy code columns, then all primary switch columns.
npi_taxonomy = pd.concat(
    [
        npidata_raw[['NPI']],
        npidata_raw.loc[:, is_taxonomy_code],
        npidata_raw.loc[:, is_primary_switch],
    ],
    axis=1,
)

Apply `wide_to_long` to reshape the taxonomy columns into long format, then keep only the primary taxonomies (**warning: this drops NPIs that have no primary taxonomy. Left-join the result onto `npidata` to keep them**)

In [6]:
# Reshape: one row per (NPI, taxonomy slot) instead of one column pair per slot.
# The numeric suffix after "_" in the column names becomes 'primary_taxonomy_index'.
taxonomy_long = pd.wide_to_long(
    npi_taxonomy,
    stubnames=['Healthcare Provider Taxonomy Code', 'Healthcare Provider Primary Taxonomy Switch'],
    i=['NPI'],
    j='primary_taxonomy_index',
    sep="_",
)

# Rows whose switch is 'Y' carry the primary taxonomy.
is_primary = taxonomy_long['Healthcare Provider Primary Taxonomy Switch'] == 'Y'

# Keep only the primary rows, bring NPI back as a column, discard the helper
# columns and give the taxonomy code its final name.
npi_taxonomy = (
    taxonomy_long[is_primary]
    .reset_index()
    .drop(columns=['primary_taxonomy_index', 'Healthcare Provider Primary Taxonomy Switch'])
    .rename(columns={'Healthcare Provider Taxonomy Code': 'primary_taxonomy'})
)

**Merge `npidata` and `npi_taxonomy`**

Finally, merge back `npi_taxonomy` to `npidata`. Account for npis without `primary_taxonomy` by doing a left-join

In [7]:
# Attach the primary taxonomy to each provider row. The left join keeps every row of
# `npidata`, including NPIs that have no primary taxonomy.
npidata = pd.merge(npidata, npi_taxonomy, on='NPI', how='left')

**We are only concerned with Entity Types 1 or 2. Drop the rows with an NA Entity Type** (note: the filter in the cell below is currently commented out, so these rows are kept)

In [8]:
# NOTE(review): this filter is disabled, so rows with a missing Entity Type Code stay
# in `npidata` for all following steps — confirm that is intended before re-enabling it.
# npidata = npidata[~npidata['Entity Type Code'].isna()]

**Rename columns**

In [9]:
# Verbose NPPES header -> short snake_case column name.
column_renames = {
    'NPI': 'npi',
    'Entity Type Code': 'entity_type_code',
    'Provider Organization Name (Legal Business Name)': 'provider_org_name',
    'Provider Last Name (Legal Name)': 'provider_last_name',
    'Provider First Name': 'provider_first_name',
    'Provider Middle Name': 'provider_middle_name',
    'Provider Name Prefix Text': 'provider_name_prefix',
    'Provider Name Suffix Text': 'provider_name_suffix',
    'Provider Credential Text': 'provider_credential',
    'Provider First Line Business Practice Location Address': 'provider_business_address_1',
    'Provider Second Line Business Practice Location Address': 'provider_business_address_2',
    'Provider Business Practice Location Address City Name': 'provider_business_city',
    'Provider Business Practice Location Address State Name': 'provider_business_state',
    'Provider Business Practice Location Address Postal Code': 'provider_business_zip',
}

# reset_index() turns the current row index into an 'index' column before renaming.
npidata = npidata.reset_index().rename(columns=column_renames)

**Add a new column with the 5-digit zip (the original zip code column is dropped further below)**

In [10]:
# Derive the 5-digit ZIP from the first five characters of the postal code.
npidata['provider_business_zip5'] = [
    str(postal_code)[0:5] for postal_code in npidata['provider_business_zip']
]

# str(NaN) is 'nan', so a missing postal code shows up here as the literal string 'nan'.
# Reset only the zip5 cell of those rows: indexing the frame with the boolean mask alone
# (`npidata[mask] = None`) would overwrite EVERY column of the matching rows.
zip5_is_missing = npidata['provider_business_zip5'] == 'nan'
npidata.loc[zip5_is_missing, 'provider_business_zip5'] = None

**Correcting datatypes**

In [11]:
# Store the identifier columns as digit strings.
# astype(str) renders float-typed values like '1.0', so everything from the first '.'
# onwards is cut off; it also turns NaN into the literal string 'nan'.
for id_column in ('npi', 'entity_type_code'):
    as_text = npidata[id_column].astype(str).str.split('.').str[0]
    npidata[id_column] = as_text
    # Turn 'nan' back into a missing value in THIS column only. Assigning through the
    # boolean mask alone (`npidata[mask] = None`) would blank every column of the
    # matching rows, e.g. wiping the npi of a row that merely lacks an entity type.
    npidata.loc[as_text == 'nan', id_column] = None

**Dropping unneeded columns**

In [12]:
# Remove the helper 'index' column and the full-length postal code, which has been
# replaced by 'provider_business_zip5'.
npidata = npidata.drop(columns=['index', 'provider_business_zip'])

## Now re-applying all of that at once and batch-import into sqlite (For Production)

**IMPORTANT! This database load should only be run once.** Running it multiple times will create duplicate entries in the database. To guard against accidentally re-running it, the code here has been converted to markdown. **If you need to rebuild the database, delete the `data/hcbb_group_reviews.sqlite` file and re-run this cell as code. You will also need to re-run any other related scripts that build other tables in the database.**

## Testing Final DB Load

In [13]:
# Get list of currently existing tables (SQLite's internal 'sqlite_*' tables excluded).
# `with sqlite3.connect(...)` on its own only commits / rolls back the transaction and
# leaves the connection open; closing() makes sure the handle is released after the query.
with closing(sqlite3.connect('../data/hcbb_group_reviews.sqlite')) as db:
    query = """
    SELECT name
    FROM sqlite_master 
    WHERE type ='table' 
    AND name NOT LIKE 'sqlite_%';
    """

    db_table_list = pd.read_sql(query, db)

display(db_table_list)

Unnamed: 0,name
0,cbsa
1,npidata


**This should return 6,714,038 rows each representing a unique npi**

In [14]:
# Total number of rows loaded into the npidata table.
# `with sqlite3.connect(...)` on its own only commits / rolls back the transaction and
# leaves the connection open; closing() makes sure the handle is released after the query.
with closing(sqlite3.connect('../data/hcbb_group_reviews.sqlite')) as db:
    query = """
    SELECT COUNT(*) 
    FROM npidata;
    """

    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,COUNT(*)
0,6714038


**This should return 5,077,318 of entity_type_code = 1 and 1,449,471 of entity_type_code = 2 and 187,249 of entity_type_code nan**

In [15]:
# Row count per entity_type_code (missing codes form their own group).
# `with sqlite3.connect(...)` on its own only commits / rolls back the transaction and
# leaves the connection open; closing() makes sure the handle is released after the query.
with closing(sqlite3.connect('../data/hcbb_group_reviews.sqlite')) as db:
    query = """
    SELECT entity_type_code, COUNT(*) 
    FROM npidata
    GROUP BY entity_type_code;
    """

    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,entity_type_code,COUNT(*)
0,1.0,5077318
1,2.0,1449471
2,,187249


In [16]:
# Preview the first 10 rows of the npidata table as a visual sanity check.
# `with sqlite3.connect(...)` on its own only commits / rolls back the transaction and
# leaves the connection open; closing() makes sure the handle is released after the query.
with closing(sqlite3.connect('../data/hcbb_group_reviews.sqlite')) as db:
    query = """
    SELECT * 
    FROM npidata
    LIMIT 10;
    """

    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,npi,entity_type_code,provider_org_name,provider_last_name,provider_first_name,provider_middle_name,provider_name_prefix,provider_name_suffix,provider_credential,provider_business_address_1,provider_business_address_2,provider_business_city,provider_business_state,primary_taxonomy,provider_business_zip5
0,1679576722,1,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,,KEARNEY,NE,207X00000X,68847
1,1588667638,1,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,SUITE 300,JACKSONVILLE,FL,207RC0000X,32204
2,1497758544,2,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,,FAYETTEVILLE,NC,251G00000X,28304
3,1306849450,1,,SMITSON,HAROLD,LEROY,DR.,II,M.D.,810 LUCAS DR,,ATHENS,TX,2085R0202X,75751
4,1215930367,1,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,,HOUSTON,TX,207RH0003X,77090
5,1023011178,2,COLLABRIA CARE,,,,,,,414 S JEFFERSON ST,,NAPA,CA,251G00000X,94559
6,1932102084,1,,ADUSUMILLI,RAVI,K,,,MD,2940 N MCCORD RD,,TOLEDO,OH,207RC0000X,43615
7,1841293990,1,,WORTSMAN,SUSAN,,,,MA-CCC,425 E 25TH ST,,NEW YORK,NY,231H00000X,10010
8,1750384806,1,,BISBEE,ROBERT,,DR.,,MD,808 JOLIET AVE UNIT 120,,LUBBOCK,TX,207R00000X,79415
9,1669475711,1,,SUNG,BIN,SHENG,,,M. D.,7629 TIKI DR,,FULSHEAR,TX,208000000X,77441


**Quick Fix for dropping tables (DO NOT RUN UNLESS FOR RECREATING TABLES)**