In [1]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm
import warnings

# Ignore warnings throughout the code.
# This installs a blanket "ignore" filter with no category or module restriction,
# so every warning raised after this cell runs (including any raised by the
# pandas calls in later cells) is silenced for the rest of the kernel session.
warnings.filterwarnings("ignore")

In [2]:
# Chunking hop_teaming csv and adding it as a table to a SQLite DB.
# The CSV is streamed 100,000 rows at a time; only rows with
# transaction_count >= 50 and average_day_wait < 50 are written to the `hop` table.
db = sqlite3.connect('data/healthcare_bluebook.sqlite')

try:
    hop_chunks = pd.read_csv('data/DocGraph_Hop_Teaming_2018.csv', chunksize = 100000)
    for chunk_number, chunk in enumerate(tqdm(hop_chunks)):
        # Normalise headers to lower_snake_case so they are easy to use as SQL column names.
        chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
        chunk = chunk[(chunk['transaction_count'] >= 50) & (chunk['average_day_wait'] < 50)]
        # The first chunk replaces any table left behind by an earlier run, so
        # re-running this cell rebuilds `hop` instead of appending a second copy
        # of every row; later chunks append to the freshly created table.
        write_mode = 'replace' if chunk_number == 0 else 'append'
        chunk.to_sql('hop', db, if_exists = write_mode, index = False)
finally:
    # Release the database file even when reading or writing a chunk fails.
    db.close()

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [3]:
# Chunking npi csv and adding it as a table to a SQLite DB.
# For each row, the taxonomy code whose "Primary Taxonomy Switch" is 'Y' is copied
# into a single derived column, 'Healthcare Provider Taxonomy Code'; slots 1-15 are
# checked in order, so a later slot flagged 'Y' overwrites an earlier one.
# NOTE(review): the displayed npi table shows postal codes stored as numbers
# (e.g. 76602390 for an NJ address), which suggests leading zeros are lost on read.
# Confirm whether this column should be read as text before joining on it.
db = sqlite3.connect('data/healthcare_bluebook.sqlite')

# Registry columns copied unchanged into the `npi` table (hoisted out of the loop:
# the list is the same for every chunk).
npi_keep_columns = ['NPI',
 'Entity Type Code',
 'Provider Organization Name (Legal Business Name)',
 'Provider Last Name (Legal Name)',
 'Provider First Name',
 'Provider Middle Name',
 'Provider Name Prefix Text',
 'Provider Name Suffix Text',
 'Provider Credential Text',
 'Provider First Line Business Practice Location Address',
 'Provider Second Line Business Practice Location Address',
 'Provider Business Practice Location Address City Name',
 'Provider Business Practice Location Address State Name',
 'Provider Business Practice Location Address Postal Code']

# Numbered taxonomy slots 1-15: a code column and its primary-switch column.
taxonomy_code_columns = ['Healthcare Provider Taxonomy Code_{}'.format(counter) for counter in range(1, 16)]
taxonomy_switch_columns = ['Healthcare Provider Primary Taxonomy Switch_{}'.format(counter) for counter in range(1, 16)]

try:
    # Parse only the columns this cell actually reads; every other CSV column
    # would be discarded by the selection below anyway.
    npi_chunks = pd.read_csv('data/npidata_pfile_20050523-20220213.csv',
                             chunksize = 100000,
                             usecols = npi_keep_columns + taxonomy_code_columns + taxonomy_switch_columns)

    for chunk_number, chunk in enumerate(tqdm(npi_chunks)):

        # Create the derived column explicitly so it exists (as NULL) even for a
        # chunk in which no slot is flagged as primary.
        chunk['Healthcare Provider Taxonomy Code'] = None

        for taxonomy_code_check, taxonomy_switch_check in zip(taxonomy_code_columns, taxonomy_switch_columns):
            taxonomy_switch = chunk[taxonomy_switch_check] == 'Y'
            taxonomy_code = chunk.loc[taxonomy_switch, taxonomy_code_check]
            chunk.loc[taxonomy_switch, 'Healthcare Provider Taxonomy Code'] = taxonomy_code

        chunk = chunk[npi_keep_columns + ['Healthcare Provider Taxonomy Code']]

        # Normalise headers to lower_snake_case (spaces only; parentheses are kept).
        chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
        # The first chunk replaces any table left behind by an earlier run, so
        # re-running this cell rebuilds `npi` instead of duplicating every row.
        write_mode = 'replace' if chunk_number == 0 else 'append'
        chunk.to_sql('npi', db, if_exists = write_mode, index = False)
finally:
    # Release the database file even when reading or writing a chunk fails.
    db.close()

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [4]:
# Chunking taxonomy csv and adding it as a table to a SQLite DB.
# Rows are written unfiltered to the `taxonomy_classification` table.
db = sqlite3.connect('data/healthcare_bluebook.sqlite')

try:
    taxonomy_chunks = pd.read_csv('data/nucc_taxonomy_220.csv', chunksize = 100000)
    for chunk_number, chunk in enumerate(tqdm(taxonomy_chunks)):
        # Normalise headers to lower_snake_case so they are easy to use as SQL column names.
        chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
        # The first chunk replaces any table left behind by an earlier run, so
        # re-running this cell rebuilds the table instead of duplicating every row.
        write_mode = 'replace' if chunk_number == 0 else 'append'
        chunk.to_sql('taxonomy_classification', db, if_exists = write_mode, index = False)
finally:
    # Release the database file even when reading or writing a chunk fails.
    db.close()

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [5]:
# Chunking cbsa csv and adding it as a table to a SQLite DB.
# The ZIP_TRACT file is streamed 10,000 rows at a time and written unfiltered
# to the `cbsa` table.
db = sqlite3.connect('data/healthcare_bluebook.sqlite')

try:
    cbsa_chunks = pd.read_csv('data/ZIP_TRACT_122021.csv', chunksize = 10000)
    for chunk_number, chunk in enumerate(tqdm(cbsa_chunks)):
        # Normalise headers to lower_snake_case so they are easy to use as SQL column names.
        chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]
        # The first chunk replaces any table left behind by an earlier run, so
        # re-running this cell rebuilds `cbsa` instead of duplicating every row.
        write_mode = 'replace' if chunk_number == 0 else 'append'
        chunk.to_sql('cbsa', db, if_exists = write_mode, index = False)
finally:
    # Release the database file even when reading or writing a chunk fails.
    db.close()

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [17]:
# Join each `hop` row to the `npi` table twice (n1 = from_npi, n2 = to_npi) and to
# `taxonomy_classification` twice (t1 describes n1's taxonomy code, t2 describes
# n2's), keeping only rows where the from-provider has entity type code 1.0 and
# the to-provider has entity type code 2.0.
db = sqlite3.connect('data/healthcare_bluebook.sqlite')

# t1 must be matched on n1's taxonomy code: joining it on n2 (as before) filled the
# from_npi_taxonomy_* columns with the to-provider's grouping/classification.
# The WHERE clause filters on the underlying columns rather than on SELECT aliases,
# which is equivalent here and does not rely on SQLite's alias resolution.
query = """
SELECT 
    h.from_npi,
    h.to_npi,
    h.patient_count,
    h.transaction_count,
    h.average_day_wait,
    h.std_day_wait,
    n1.entity_type_code AS from_npi_entity_code,
    n2.entity_type_code AS to_npi_entity_code,
    n1.healthcare_provider_taxonomy_code AS from_npi_taxonomy_code,
    n2.healthcare_provider_taxonomy_code AS to_npi_taxonomy_code,
    t1.grouping AS from_npi_taxonomy_grouping,
    t1.classification AS from_npi_taxonomy_classification,
    t1.specialization AS from_npi_taxonomy_specialization,
    t2.grouping AS to_npi_taxonomy_grouping,
    t2.classification AS to_npi_taxonomy_classification,
    t2.specialization AS to_npi_taxonomy_specialization
FROM hop AS h
INNER JOIN npi AS n1
ON h.from_npi = n1.npi
INNER JOIN npi AS n2
ON h.to_npi = n2.npi
INNER JOIN taxonomy_classification AS t1
ON t1.code = n1.healthcare_provider_taxonomy_code
INNER JOIN taxonomy_classification AS t2
ON t2.code = n2.healthcare_provider_taxonomy_code
WHERE n1.entity_type_code = 1.0
    AND n2.entity_type_code = 2.0
"""

try:
    hop_npi_taxonomy_sqlite = pd.read_sql(query, db)
finally:
    # Release the database file even when the query fails.
    db.close()

In [6]:
# Read the whole `npi` table back from SQLite into a DataFrame.
db = sqlite3.connect('data/healthcare_bluebook.sqlite')

query = "SELECT * FROM npi"

try:
    npi_sqlite = pd.read_sql(query, db)
finally:
    # Release the database file even when the query fails.
    db.close()

In [14]:
# Read the whole `taxonomy_classification` table back from SQLite into a DataFrame.
db = sqlite3.connect('data/healthcare_bluebook.sqlite')

query = "SELECT * FROM taxonomy_classification"

try:
    taxonomy_sqlite = pd.read_sql(query, db)
finally:
    # Release the database file even when the query fails.
    db.close()

In [13]:
# Show the rows whose from-provider entity code is 1.0 and whose to-provider
# entity code is 2.0.
# NOTE(review): hop_npi_sqlite is not assigned in any cell visible in this
# notebook; confirm where it is created before relying on Restart & Run All.
hop_npi_sqlite.loc[
    lambda frame: frame['from_npi_entity_code'].eq(1.0) & frame['to_npi_entity_code'].eq(2.0)
]

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait,from_npi_entity_code,to_npi_entity_code,from_npi_taxonomy_code,to_npi_taxonomy_code
4,1508011040,1730166224,132,145,8.579,28.053,1.0,2.0,207RC0001X,282N00000X
6,1508163577,1730166224,67,127,8.346,26.050,1.0,2.0,207X00000X,282N00000X
7,1508159674,1730166224,145,212,2.939,10.660,1.0,2.0,207RP1001X,282N00000X
8,1508205808,1730166224,48,64,14.156,35.859,1.0,2.0,207V00000X,282N00000X
26,1508863630,1730167974,33,61,23.787,23.102,1.0,2.0,213E00000X,332BX2000X
...,...,...,...,...,...,...,...,...,...,...
34176075,1417900523,1497934517,576,741,0.000,0.000,1.0,2.0,367500000X,367500000X
34176078,1417115205,1497935506,43,56,37.107,66.309,1.0,2.0,2086S0129X,213ES0103X
34176083,1417082173,1497935936,58,66,15.591,28.403,1.0,2.0,363A00000X,207R00000X
34176099,1417194903,1497939599,22,70,16.629,30.598,1.0,2.0,363LF0000X,2084P0800X


In [9]:
# Display hop_sqlite as the cell output.
# NOTE(review): hop_sqlite is not assigned in any cell visible in this notebook;
# confirm where it is loaded before relying on Restart & Run All.
hop_sqlite

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1508085911,1730166125,58,67,23.925,43.923
1,1508167040,1730166125,51,51,28.196,52.876
2,1508863549,1730166125,340,391,18.302,42.422
3,1508867870,1730166125,50,79,12.658,26.402
4,1508011040,1730166224,132,145,8.579,28.053
...,...,...,...,...,...,...
34176933,1417037664,1497939599,36,106,19.330,42.407
34176934,1417194903,1497939599,22,70,16.629,30.598
34176935,1417406372,1497939599,21,65,20.123,37.750
34176936,1417064825,1497940605,75,79,10.418,34.744


In [7]:
# Display npi_sqlite (the DataFrame read from the `npi` table) as the cell output.
npi_sqlite

Unnamed: 0,npi,entity_type_code,provider_organization_name_(legal_business_name),provider_last_name_(legal_name),provider_first_name,provider_middle_name,provider_name_prefix_text,provider_name_suffix_text,provider_credential_text,provider_first_line_business_practice_location_address,provider_second_line_business_practice_location_address,provider_business_practice_location_address_city_name,provider_business_practice_location_address_state_name,provider_business_practice_location_address_postal_code,healthcare_provider_taxonomy_code
0,1679576722,1.0,,WIEBE,DAVID,A,,,M.D.,3500 CENTRAL AVE,,KEARNEY,NE,688472944.0,207X00000X
1,1588667638,1.0,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,SUITE 300,JACKSONVILLE,FL,322044736.0,207RC0000X
2,1497758544,2.0,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,,FAYETTEVILLE,NC,283044552.0,251G00000X
3,1306849450,,,,,,,,,,,,,,
4,1215930367,1.0,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,,HOUSTON,TX,770901243.0,207RH0003X
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7189073,1649925215,1.0,,MARTIN,ASHLEY,,,,LMT,1703 DELONEY ST UNIT B,,AUSTIN,TX,787211111,225700000X
7189074,1528713195,1.0,,HONG,RYAN,,,,PHARMD,2 BERGEN TPKE,,RIDGEFIELD PARK,NJ,76602390,183500000X
7189075,1437804002,1.0,,BROWN,CAMERON,DAVID,,,,233 AUGUSTINE DR,,SPARTANBURG,SC,293066927,390200000X
7189076,1851046437,1.0,,MOHAMED,WALEED,,,,,5021 S 13TH ST,,MILWAUKEE,WI,532213600,1041C0700X


In [15]:
# Display taxonomy_sqlite (the DataFrame read from the `taxonomy_classification`
# table) as the cell output.
taxonomy_sqlite

Unnamed: 0,code,grouping,classification,specialization,definition,notes,display_name,section
0,193200000X,Group,Multi-Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Multi-Specialty Group,Individual
1,193400000X,Group,Single Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Single Specialty Group,Individual
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,,An allergist-immunologist is trained in evalua...,"Source: American Board of Medical Specialties,...",Allergy & Immunology Physician,Individual
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy,Definition to come...,,Allergy Physician,Individual
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology,Definition to come...,,Clinical & Laboratory Immunology (Allergy & Im...,Individual
...,...,...,...,...,...,...,...,...
863,343800000X,Transportation Services,Secured Medical Transport (VAN),,A public or privately owned transportation ser...,,Secured Medical Transport (VAN),Non-Individual
864,344600000X,Transportation Services,Taxi,,A land commercial vehicle used for the transpo...,,Taxi,Non-Individual
865,347D00000X,Transportation Services,Train,,An organization or business licensed to provid...,,Train,Non-Individual
866,347E00000X,Transportation Services,Transportation Broker,,An organization that provides transportation f...,Source: Section 6083 of the Deficit Reduction ...,Transportation Broker,Non-Individual


In [22]:
# Write the joined hop / npi / taxonomy frame to CSV, leaving out the DataFrame index.
hop_npi_taxonomy_csv_path = 'data/hop_npi_taxonomy_sqlite.csv'
hop_npi_taxonomy_sqlite.to_csv(hop_npi_taxonomy_csv_path, index=False)