In [1]:
import sqlite3
import pandas as pd
from tqdm.notebook import tqdm

## NPPES data filtering
The NPPES dataset contains a large number of fields, only a few of which are relevant to this project:

'NPI'
Entity Type, indicated by the 'Entity Type Code' field:
1 = Provider (doctors, nurses, etc.)
2 = Facility (Hospitals, Urgent Care, Doctors Offices)
Entity Name: Either First/Last or Organization or Other Organization Name contained in the following fields:
'Provider Organization Name (Legal Business Name)'
'Provider Last Name (Legal Name)'
'Provider First Name'
'Provider Middle Name'
'Provider Name Prefix Text'
'Provider Name Suffix Text'
'Provider Credential Text'
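Address: Business Practice Location (not mailing) is the intended address; note, however, that the fields listed below (and selected by the query in this notebook) are the Business Mailing Address fields: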
'Provider First Line Business Mailing Address'
'Provider Second Line Business Mailing Address'
'Provider Business Mailing Address City Name'
'Provider Business Mailing Address State Name'
'Provider Business Mailing Address Postal Code'
The provider's primary taxonomy code. A provider can have up to 15 taxonomy codes, stored in the 'Healthcare Provider Taxonomy Code*' columns; we want the one whose associated 'Healthcare Provider Primary Taxonomy Switch*' field is 'Y'. Note that the primary code is not always in position 1.

In [2]:
# Open the SQLite database that contains the `nppes_raw` table queried below.
db = sqlite3.connect('data/nppes_lite.sqlite')

# Start from an empty frame; the filtered provider rows are collected into `nppes` below.
nppes = pd.DataFrame()


def run_query(n):
    """Return the providers whose primary taxonomy code sits in taxonomy slot ``n``.

    Selects the NPI, entity type, name fields, business mailing address fields
    and ``healthcare_provider_taxonomy_code_<n>`` (aliased to
    ``healthcare_provider_taxonomy_code``) from ``nppes_raw`` for rows where
    ``healthcare_provider_primary_taxonomy_switch_<n>`` is ``'Y'`` and
    ``entity_type_code`` is 1 or 2. The query runs on the module-level ``db``
    connection.

    Parameters
    ----------
    n : int
        Taxonomy slot number, 1 through 15.

    Returns
    -------
    pandas.DataFrame
        One row per matching provider.

    Raises
    ------
    ValueError
        If ``n`` is not a whole number between 1 and 15.
    """
    # Column names cannot be bound as SQL parameters, so the slot number has to
    # be interpolated into the statement. Validate it first so that nothing but
    # a plain integer in the known slot range can reach the SQL text.
    slot_text = str(n).strip()
    if not slot_text.isdecimal() or not 1 <= int(slot_text) <= 15:
        raise ValueError(
            'taxonomy slot must be an integer from 1 to 15, got {!r}'.format(n)
        )
    slot = int(slot_text)

    query = '''
    SELECT npi, 
    entity_type_code,
    [provider_organization_name_(legal_business_name)],
    [provider_last_name_(legal_name)],
    provider_first_name,
    provider_middle_name,
    provider_name_prefix_text,
    provider_name_suffix_text,
    provider_credential_text,
    provider_first_line_business_mailing_address,
    provider_second_line_business_mailing_address,
    provider_business_mailing_address_city_name,
    provider_business_mailing_address_state_name,
    provider_business_mailing_address_postal_code,
    healthcare_provider_taxonomy_code_{0} AS healthcare_provider_taxonomy_code
    FROM nppes_raw
    WHERE healthcare_provider_primary_taxonomy_switch_{0} = 'Y' AND
    entity_type_code IN (1,2)
    '''.format(slot)

    # Local name deliberately differs from the module-level `nppes` frame.
    slot_rows = pd.read_sql(query, db)
    return slot_rows

# Query each of the 15 taxonomy slots and stack the per-slot results in one
# step. `DataFrame.append` was removed in pandas 2.0, and appending inside a
# loop re-copies the accumulated frame on every iteration; a single
# `pd.concat` over the list of results avoids both problems.
nppes = pd.concat(
    [run_query(slot) for slot in range(1, 16)],
    ignore_index=True,
)


#### Converting the DataFrame to a table in the database

In [3]:
# Write the filtered frame to the `nppes` table. 'replace' (rather than
# 'append') keeps this cell idempotent: re-running the notebook rebuilds the
# table instead of inserting every row a second time.
nppes.to_sql('nppes', db, if_exists = 'replace', index = False) 

In [9]:
# Close the current database connection; the cells below open their own.
db.close()

## Checking that all tables made it into the database

In [5]:
# Preview the first rows of `nppes_raw`.
db = sqlite3.connect('data/nppes_lite.sqlite')

query= '''
SELECT *
FROM nppes_raw
LIMIT 5
'''
try:
    df = pd.read_sql(query, db)
finally:
    # Close the connection opened by this cell so that it is not leaked when
    # `db` is rebound later.
    db.close()
df.head()

Unnamed: 0,npi,entity_type_code,replacement_npi,employer_identification_number_(ein),provider_organization_name_(legal_business_name),provider_last_name_(legal_name),provider_first_name,provider_middle_name,provider_name_prefix_text,provider_name_suffix_text,...,healthcare_provider_taxonomy_group_7,healthcare_provider_taxonomy_group_8,healthcare_provider_taxonomy_group_9,healthcare_provider_taxonomy_group_10,healthcare_provider_taxonomy_group_11,healthcare_provider_taxonomy_group_12,healthcare_provider_taxonomy_group_13,healthcare_provider_taxonomy_group_14,healthcare_provider_taxonomy_group_15,certification_date
0,1841293891,1.0,,,,GIBBS,ELMER,RICKEY,DR.,,...,,,,,,,,,,
1,1659374601,1.0,,,,OBERDICK,WENDY,TIPTON,,,...,,,,,,,,,,
2,1699778647,2.0,,<UNAVAIL>,"VISITING HOME HEALTH SERVICES, INC",,,,,,...,,,,,,,,,,
3,1134122187,1.0,,,,RUDNICKE,CHERYL,DENISE,MRS.,,...,,,,,,,,,,
4,1003819046,1.0,,,,NYLANDER,BARBARA,H,,,...,,,,,,,,,,


In [6]:
# Preview the first rows of `hop_team`.
db = sqlite3.connect('data/nppes_lite.sqlite')

query= '''
SELECT *
FROM hop_team
LIMIT 5
'''
try:
    df = pd.read_sql(query, db)
finally:
    # Close the connection opened by this cell so that it is not leaked when
    # `db` is rebound later.
    db.close()
df.head()

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1033142146,1000000004,491,535,10.232,36.558
1,1013977990,1003000126,134,145,27.352,51.137
2,1013996669,1003000126,91,92,35.152,68.009
3,1033102504,1003000126,52,64,15.328,38.3
4,1003029620,1003000126,111,121,33.058,58.981


In [7]:
# Preview the first rows of `nucc_taxonomy`.
db = sqlite3.connect('data/nppes_lite.sqlite')

query= '''
SELECT *
FROM nucc_taxonomy
LIMIT 5
'''
try:
    df = pd.read_sql(query, db)
finally:
    # Close the connection opened by this cell so that it is not leaked when
    # `db` is rebound later.
    db.close()
df.head()

Unnamed: 0,code,grouping,classification,specialization,definition,effective_date,deactivation_date,last_modified_date,notes,display_name
0,193200000X,Group,Multi-Specialty,,A business group of one or more individual pra...,10/1/2003,,,[7/1/2003: new],Multi-Specialty Group
1,193400000X,Group,Single Specialty,,A business group of one or more individual pra...,10/1/2003,,,[7/1/2003: new],Single Specialty Group
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,,An allergist-immunologist is trained in evalua...,4/1/2003,,7/1/2007,"Source: American Board of Medical Specialties,...",Allergy & Immunology Physician
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy,Definition to come...,4/1/2003,,,,Allergy Physician
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology,Definition to come...,4/1/2003,,,,Clinical & Laboratory Immunology (Allergy & Im...


In [8]:
# Preview the first rows of `zip_cbsa`.
db = sqlite3.connect('data/nppes_lite.sqlite')

query= '''
SELECT *
FROM zip_cbsa
LIMIT 5
'''
try:
    df = pd.read_sql(query, db)
finally:
    # Close the connection opened by this cell so that it is not leaked when
    # `db` is rebound later.
    db.close()
df.head()

Unnamed: 0,zip,cbsa,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,501,35620,0.0,1.0,0.0,1.0
1,601,38660,1.0,1.0,1.0,1.0
2,602,10380,1.0,1.0,1.0,1.0
3,603,10380,1.0,1.0,1.0,1.0
4,604,10380,1.0,1.0,1.0,1.0


In [10]:
# Preview the first rows of the filtered `nppes` table.
db = sqlite3.connect('data/nppes_lite.sqlite')

query= '''
SELECT *
FROM nppes
LIMIT 5
'''
try:
    df = pd.read_sql(query, db)
finally:
    # Close the connection opened by this cell so that it is not left open at
    # the end of the notebook.
    db.close()
df.head()

Unnamed: 0,npi,entity_type_code,provider_organization_name_(legal_business_name),provider_last_name_(legal_name),provider_first_name,provider_middle_name,provider_name_prefix_text,provider_name_suffix_text,provider_credential_text,provider_first_line_business_mailing_address,provider_second_line_business_mailing_address,provider_business_mailing_address_city_name,provider_business_mailing_address_state_name,provider_business_mailing_address_postal_code,healthcare_provider_taxonomy_code
0,1841293891,1.0,,GIBBS,ELMER,RICKEY,DR.,,M.D.,49 CLEVELAND ST 310,,CROSSVILLE,TN,385552854.0,208600000X
1,1659374601,1.0,,OBERDICK,WENDY,TIPTON,,,MD,PO BOX 9,,KINGSPORT,TN,376620009.0,207Q00000X
2,1699778647,2.0,"VISITING HOME HEALTH SERVICES, INC",,,,,,,3001 KEITH ST NW,,CLEVELAND,TN,373123713.0,251E00000X
3,1134122187,1.0,,RUDNICKE,CHERYL,DENISE,MRS.,,CRNP,PO BOX 440100,,NASHVILLE,TN,372440100.0,363L00000X
4,1003819046,1.0,,NYLANDER,BARBARA,H,,,M.D.,3024 BUSINESS PARK CIR,,GOODLETTSVILLE,TN,370723132.0,207VG0400X
