In [1]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

In [2]:
pd.set_option("display.max_columns", 500)

In [None]:
zips = pd.read_excel("../data/ZIP_CBSA_122017.xlsx", converters={'zip': lambda x: str(x)})
zips = zips[zips['cbsa'] == 34980]

with sqlite3.connect('../data/hcbb.sqlite') as db:

    npidata_raw = pd.read_csv("../data/npidata_pfile_20050523-20210207.csv", chunksize = 10000)
    for chunk in tqdm(npidata_raw):

        npidata = pd.concat([
            chunk[['NPI']],
            # Entity Type Code: 1 = Provider (doctors, nurses, etc.) / 2 = Facility (Hospitals, Urgent Care, Doctors Offices) 
            chunk[['Entity Type Code']],
            # Entity Name: Either First/Last or Organization or Other Organization Name contained in the following fields:
            chunk.loc[:, 'Provider Organization Name (Legal Business Name)':'Provider Credential Text'],
            # Address: Business Practice Location (not mailing), contained in the following fields:
            chunk.loc[:, 'Provider First Line Business Mailing Address':'Provider Business Mailing Address Postal Code'],
        ], axis=1)

        npi_taxonomy = pd.concat([
            chunk[['NPI']],
            # The provider's taxonomy code, which is contained in one of the 'Healthcare Provider Taxonomy Code*' columns
            chunk[chunk.columns[pd.Series(chunk.columns).str.startswith('Healthcare Provider Taxonomy Code_')]],
            chunk[chunk.columns[pd.Series(chunk.columns).str.startswith('Healthcare Provider Primary Taxonomy Switch_')]]
        ], axis=1)

        # Pivot from widet to long format
        npi_taxonomy = pd.wide_to_long(
            npi_taxonomy,
            stubnames=['Healthcare Provider Taxonomy Code', 'Healthcare Provider Primary Taxonomy Switch'],
            i=['NPI'], 
            j='primary_taxonomy_index',
            sep="_"
        )

        # Only keep the primary taxonomy
        npi_taxonomy = npi_taxonomy[npi_taxonomy['Healthcare Provider Primary Taxonomy Switch'] == 'Y']

        # Housekeeping
        npi_taxonomy = npi_taxonomy.reset_index()\
            .drop(columns=['primary_taxonomy_index', 'Healthcare Provider Primary Taxonomy Switch'])\
            .rename({ 'Healthcare Provider Taxonomy Code': 'taxonomy_code' }, axis=1)

        npidata = npidata.merge(
            npi_taxonomy,
            how='left',
            on='NPI'
        )

        # Rename columns
        npidata = npidata.reset_index().rename({
            'NPI': 'npi',
            'Entity Type Code': 'entity_type_code',
            'Provider Organization Name (Legal Business Name)': 'provider_org_name',
            'Provider Last Name (Legal Name)': 'provider_last_name',
            'Provider First Name': 'provider_first_name',
            'Provider Middle Name': 'provider_middle_name',
            'Provider Name Prefix Text': 'provider_name_prefix',
            'Provider Name Suffix Text': 'provider_name_suffix',
            'Provider Credential Text': 'provider_credential',
            'Provider First Line Business Mailing Address': 'provider_business_address_1',
            'Provider Second Line Business Mailing Address': 'provider_business_address_2',
            'Provider Business Mailing Address City Name': 'provider_business_city',
            'Provider Business Mailing Address State Name': 'provider_business_state',
            'Provider Business Mailing Address Postal Code': 'provider_business_zip'
        }, axis=1)

        # Create Zip5 column to merge down the road
        npidata['provider_business_zip5'] = [str(i)[0:5] for i in npidata['provider_business_zip']]

        # Correct data types
        
        
        npidata = npidata[npidata['provider_business_zip5'].isin(zips['zip'])]

        npidata.to_sql('npidata', db, if_exists = 'append', index = False)                           

    print('task done')

In [11]:
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    SELECT *
    FROM npidata
    LIMIT 5;
    """ 
    
    test = pd.read_sql(query, db)

test

Unnamed: 0,index,npi,entity_type_code,provider_org_name,provider_last_name,provider_first_name,provider_middle_name,provider_name_prefix,provider_name_suffix,provider_credential,provider_business_address_1,provider_business_address_2,provider_business_city,provider_business_state,provider_business_zip,primary_taxonomy,provider_business_zip5
0,67,1003819046,1.0,,NYLANDER,BARBARA,H,,,M.D.,3024 BUSINESS PARK CIR,,GOODLETTSVILLE,TN,370723132.0,207VG0400X,37072
1,76,1750384780,1.0,,PERRIGIN,JULIE,A,DR.,,MD,PO BOX 778,,DICKSON,TN,370560778.0,207Q00000X,37056
2,105,1073516001,1.0,,ROSS,DAVID,L,DR.,,MD,127 CRESTVIEW PARK DR,,DICKSON,TN,370552850.0,207R00000X,37055
3,117,1780687830,1.0,,MANI,VENK,,DR.,,MD,127 CRESTVIEW PARK DR,,DICKSON,TN,370552850.0,207ZC0500X,37055
4,171,1437152485,1.0,,MORGAN,LISA,BROOKS,,,MD,2201 MURPHY AVE STE 407,,NASHVILLE,TN,372031864.0,207V00000X,37203


In [13]:
with sqlite3.connect('../data/hcbb.sqlite') as db:   
    taxonomy = pd.read_csv("../data/nucc_taxonomy_210.csv")
    taxonomy = taxonomy[['Code', 'Grouping', 'Classification', 'Specialization']]
    taxonomy.columns = ['taxonomy_code', 'grouping', 'classification', 'specialization']
    taxonomy.to_sql('taxonomy', db, if_exists = 'append', index = False)  
    
    print('task done')

task done


In [14]:
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    SELECT *
    FROM taxonomy
    LIMIT 5;
    """ 
    
    test = pd.read_sql(query, db)

test

Unnamed: 0,Code,Grouping,Classification,Specialization
0,193200000X,Group,Multi-Specialty,
1,193400000X,Group,Single Specialty,
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology


In [56]:
npidata_test = pd.read_csv("../data/npidata_pfile_20050523-20210207.csv", chunksize = 10000, nrows = 50000)
for chunk in tqdm(npidata_test):

    npidata = pd.concat([
        chunk[['NPI']],
        # Entity Type Code: 1 = Provider (doctors, nurses, etc.) / 2 = Facility (Hospitals, Urgent Care, Doctors Offices) 
        chunk[['Entity Type Code']],
        # Entity Name: Either First/Last or Organization or Other Organization Name contained in the following fields:
        chunk.loc[:, 'Provider Organization Name (Legal Business Name)':'Provider Credential Text'],
        # Address: Business Practice Location (not mailing), contained in the following fields:
        chunk.loc[:, 'Provider First Line Business Mailing Address':'Provider Business Mailing Address Postal Code'],
    ], axis=1)

    npi_taxonomy = pd.concat([
        chunk[['NPI']],
        # The provider's taxonomy code, which is contained in one of the 'Healthcare Provider Taxonomy Code*' columns
        chunk[chunk.columns[pd.Series(chunk.columns).str.startswith('Healthcare Provider Taxonomy Code_')]],
        chunk[chunk.columns[pd.Series(chunk.columns).str.startswith('Healthcare Provider Primary Taxonomy Switch_')]]
    ], axis=1)

    # Pivot from widet to long format
    npi_taxonomy = pd.wide_to_long(
        npi_taxonomy,
        stubnames=['Healthcare Provider Taxonomy Code', 'Healthcare Provider Primary Taxonomy Switch'],
        i=['NPI'], 
        j='primary_taxonomy_index',
        sep="_"
    )

    # Only keep the primary taxonomy
    npi_taxonomy = npi_taxonomy[npi_taxonomy['Healthcare Provider Primary Taxonomy Switch'] == 'Y']

    # Housekeeping
    npi_taxonomy = npi_taxonomy.reset_index()\
        .drop(columns=['primary_taxonomy_index', 'Healthcare Provider Primary Taxonomy Switch'])\
        .rename({ 'Healthcare Provider Taxonomy Code': 'taxonomy_code' }, axis=1)

    npidata = npidata.merge(
        npi_taxonomy,
        how='left',
        on='NPI'
    )

    # Rename columns
    npidata = npidata.reset_index(drop = True).rename({
        'NPI': 'npi',
        'Entity Type Code': 'entity_type_code',
        'Provider Organization Name (Legal Business Name)': 'provider_org_name',
        'Provider Last Name (Legal Name)': 'provider_last_name',
        'Provider First Name': 'provider_first_name',
        'Provider Middle Name': 'provider_middle_name',
        'Provider Name Prefix Text': 'provider_name_prefix',
        'Provider Name Suffix Text': 'provider_name_suffix',
        'Provider Credential Text': 'provider_credential',
        'Provider First Line Business Mailing Address': 'provider_business_address_1',
        'Provider Second Line Business Mailing Address': 'provider_business_address_2',
        'Provider Business Mailing Address City Name': 'provider_business_city',
        'Provider Business Mailing Address State Name': 'provider_business_state',
        'Provider Business Mailing Address Postal Code': 'provider_business_zip'
    }, axis=1)

    # Create Zip5 column to merge down the road
    npidata['provider_business_zip5'] = [str(i)[0:5] for i in npidata['provider_business_zip']]

    # Correct data types
    npidata['npi'] = npidata['npi'].astype(str)
    npidata['entity_type_code'] = npidata['entity_type_code'].astype(str).str.split(',').str[0]

    npidata_1 = npidata[npidata['provider_business_zip5'].isin(zips['zip'])]

npidata_1


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)
  await eval(code_obj, self.user_global_ns, self.user_ns)





Unnamed: 0,npi,entity_type_code,provider_org_name,provider_last_name,provider_first_name,provider_middle_name,provider_name_prefix,provider_name_suffix,provider_credential,provider_business_address_1,provider_business_address_2,provider_business_city,provider_business_state,provider_business_zip,taxonomy_code,provider_business_zip5
64,1164428785,1.0,,ROSSELLO,REBECCA,JILL,,,M.D.,3400 LEBANON RD,,MURFREESBORO,TN,371291237.0,2084P0800X,37129
73,1063418689,1.0,,NUNN,PAULA,SAHAKIAN,,,M.D.,2015 TERRACE PL,,NASHVILLE,TN,372032412.0,2084P0804X,37203
292,1144227778,1.0,,ROTH,JAMES,M,DR.,,M.D.,125 CRESTVIEW PARK DR,SUITE 2,DICKSON,TN,370552850.0,207Y00000X,37055
503,1043217623,1.0,,CRAWFORD,WALTER,,,,M.D.,110 29TH AVE N,STE 202,NASHVILLE,TN,372031448.0,207L00000X,37203
815,1124025747,1.0,,AMMERMAN,CATHY,E,,,FNP,353 NEW SHACKLE ISLAND RD,SUITE 122B,HENDERSONVILLE,TN,37075.0,363LF0000X,37075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9192,1003813908,2.0,"AMENITY HEALTHCARE, LLC",,,,,,,10 CADILLAC DR,SUITE 400,BRENTWOOD,TN,370275078.0,251G00000X,37027
9262,1982601829,1.0,,PROVENCE,TIMOTHY,D,DR.,,DDS,207 23RD AVE N,,NASHVILLE,TN,372031501.0,1223S0112X,37203
9268,1245237189,1.0,,BOLLENBAUGH,JASON,,DR.,,D.C.,2191 HILLSBORO RD,SUITE A,FRANKLIN,TN,370696223.0,111N00000X,37069
9569,1891792602,1.0,,GARRARD,CLIFFORD,LOUIS,,,,3601 TVC,,NASHVILLE,TN,372320001.0,2086S0129X,37232


In [57]:
npidata_1['entity_type_code']

64      1.0
73      1.0
292     1.0
503     1.0
815     1.0
       ... 
9192    2.0
9262    1.0
9268    1.0
9569    1.0
9717    1.0
Name: entity_type_code, Length: 92, dtype: object