In [2]:
import sqlite3
import warnings

import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Peek at the first 10,000 rows of the hop-teaming file to try out the filter
# before running it over the whole file.
chunks = pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', chunksize = 10000)
test_chunk = next(chunks)

# Keep rows with at least 50 transactions and an average wait under 50 days.
enough_transactions = test_chunk['transaction_count'] >= 50
short_wait = test_chunk['average_day_wait'] < 50
test_chunk.loc[enough_transactions & short_wait]

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
7,1508085911,1730166125,58,67,23.925,43.923
10,1508167040,1730166125,51,51,28.196,52.876
15,1508863549,1730166125,340,391,18.302,42.422
18,1508867870,1730166125,50,79,12.658,26.402
25,1508011040,1730166224,132,145,8.579,28.053
...,...,...,...,...,...,...
9951,1508804113,1730476490,97,102,32.882,54.294
9976,1508834862,1730476979,53,60,34.983,54.899
9979,1508118555,1730476995,84,91,22.791,38.622
9982,1508131087,1730477225,96,323,3.613,21.439


In [3]:
# Build the `hop_teaming` table from the 2018 DocGraph Hop Teaming CSV. The file
# is streamed in 10,000-row chunks so it never has to fit in memory at once.
db = sqlite3.connect('../data/hop_teaming_database.sqlite')

try:
    # Start from a clean table so that re-running this cell does not append a
    # second copy of every row. Dropping the table also drops its index.
    db.execute('DROP TABLE IF EXISTS hop_teaming')

    for chunk in tqdm(pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', chunksize = 10000)):
        # filter the required conditions (trying to eliminate accidental referrals)
        chunk = chunk.loc[(chunk['transaction_count'] >= 50) & (chunk['average_day_wait'] < 50)]
        # Append the filtered chunk to the hop_teaming table
        chunk.to_sql('hop_teaming', db, if_exists = 'append', index = False)

    # IF NOT EXISTS keeps this statement from raising if the index is already there.
    db.execute('CREATE INDEX IF NOT EXISTS from_to_npi ON hop_teaming(from_npi, to_npi)')
    db.commit()
finally:
    # Release the database file even if reading or writing a chunk fails.
    db.close()

0it [00:00, ?it/s]

In [4]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [7]:
def find_taxonomy(col):
    """Return the taxonomy code flagged as primary for one provider record.

    Checks the numbered slots 1 through 15 in order and looks for the first
    slot whose 'Healthcare Provider Primary Taxonomy Switch_<n>' value is 'Y'.

    Parameters
    ----------
    col : mapping-like
        Any object with a ``.get(key)`` method keyed by the NPPES column
        names (for example a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The value stored under 'Healthcare Provider Taxonomy Code_<n>' for the
    first slot flagged 'Y', or the string 'no primary taxonomy' when no
    slot is flagged.
    """
    flagged_slots = (
        slot
        for slot in range(1, 16)
        if col.get(f'Healthcare Provider Primary Taxonomy Switch_{slot}') == 'Y'
    )
    first_flagged = next(flagged_slots, None)

    if first_flagged is None:
        return 'no primary taxonomy'
    return col.get(f'Healthcare Provider Taxonomy Code_{first_flagged}')

In [8]:
# Build the `nnpes` table from the NPPES provider file, streamed in 10,000-row chunks.
#
# Only pandas' DtypeWarning (mixed column types) is silenced here, so that dozens
# of column dtypes do not have to be set by hand. Unlike a blanket %%capture, this
# keeps the progress bar and any other message visible.
db = sqlite3.connect('../data/hop_teaming_database.sqlite')

try:
    # Start from a clean table so that re-running this cell does not append a
    # second copy of every row. Dropping the table also drops its index.
    db.execute('DROP TABLE IF EXISTS nnpes')

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=pd.errors.DtypeWarning)

        for chunk in tqdm(pd.read_csv('../data/npidata_pfile_20050523-20230212.csv', chunksize = 10000, dtype={'Provider Business Practice Location Address Postal Code': object})):

            # first extract the primary taxonomy (find_taxonomy is called once per row)
            chunk['Primary Taxonomy'] = chunk.apply(find_taxonomy, axis=1)

            # Take only first 5 digits from postal code column
            chunk['Provider Business Practice Location Address Postal Code'] = chunk['Provider Business Practice Location Address Postal Code'].str[:5]

            # next, only keep columns we're interested in and rename so that there are no ()
            chunk = (
                chunk
                [['NPI',
                  'Entity Type Code',
                  'Provider Organization Name (Legal Business Name)',
                  'Provider Last Name (Legal Name)',
                  'Provider First Name',
                  'Provider Middle Name',
                  'Provider Name Prefix Text',
                  'Provider Name Suffix Text',
                  'Provider Credential Text',
                  'Provider First Line Business Practice Location Address',
                  'Provider Second Line Business Practice Location Address',
                  'Provider Business Practice Location Address City Name',
                  'Provider Business Practice Location Address State Name',
                  'Provider Business Practice Location Address Postal Code',
                  'Primary Taxonomy']]
                .rename(columns={'Provider Organization Name (Legal Business Name)': 'Organization Name',
                                 'Provider Last Name (Legal Name)': 'Last Name',
                                 'Provider Name Prefix Text': 'Name Prefix',
                                 'Provider Name Suffix Text': 'Name Suffix',
                                 'Provider Business Practice Location Address City Name': 'City',
                                 'Provider Business Practice Location Address State Name': 'State',
                                 'Provider Business Practice Location Address Postal Code': 'Postal Code'})
            )

            # Then clean up the column names: lowercase, spaces replaced by underscores
            chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]

            # Finally, append the chunk to the nnpes table
            chunk.to_sql('nnpes', db, if_exists = 'append', index = False)

    # IF NOT EXISTS keeps this statement from raising if the index is already there.
    db.execute('CREATE INDEX IF NOT EXISTS npi ON nnpes(npi)')
    db.commit()
finally:
    # Release the database file even if reading or writing a chunk fails.
    db.close()

In [9]:
# Pull every row of the nnpes table whose city is NASHVILLE and state is TN.
query = """
SELECT *
FROM nnpes
WHERE city = 'NASHVILLE'
AND state = 'TN'
"""
# A sqlite3 connection used as a context manager only commits or rolls back; it
# does not close the connection. Close it explicitly so the handle is not leaked.
db = sqlite3.connect('../data/hop_teaming_database.sqlite')
try:
    npidata_nashville = pd.read_sql_query(query, db)
finally:
    db.close()
npidata_nashville

Unnamed: 0,npi,entity_type_code,organization_name,last_name,provider_first_name,provider_middle_name,name_prefix,name_suffix,provider_credential_text,provider_first_line_business_practice_location_address,provider_second_line_business_practice_location_address,city,state,postal_code,primary_taxonomy
0,1922001957,1.0,,PRESLEY,RICHARD,E,,,M.D.,2011 MURPHY AVE,STE 302,NASHVILLE,TN,37203,207V00000X
1,1760485817,1.0,,MORAN,SAM,HOUSTON,,,M.D.,329 21ST AVE N,STE 4,NASHVILLE,TN,37203,207V00000X
2,1154324192,1.0,,OLDFIELD,ELIZABETH,L,,,M.D.,2011 MURPHY AVE,STE 200,NASHVILLE,TN,37203,207V00000X
3,1770586786,1.0,,RICHARDS,SHERRIE,ANDERSON,,,M.D.,2201 MURPHY AVE,STE 410,NASHVILLE,TN,37203,207V00000X
4,1629071832,1.0,,ADKINS,ROYCE,T,,,M.D.,2011 MURPHY AVE,STE 200,NASHVILLE,TN,37203,207V00000X
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22569,1245690585,1.0,,HARRISON,ASHLEY,,MS.,,,1005 DR. D.B. TODD JR. BLVD,,NASHVILLE,TN,37208,122300000X
22570,1710638523,2.0,"VORI MEDICAL NY, PLLC",,,,,,,100 POWELL PL # 1441,,NASHVILLE,TN,37204,208100000X
22571,1053753541,1.0,,LESUEUR,JESSICA,J,DR.,,OD,3443 DICKERSON PIKE,STE. 240,NASHVILLE,TN,37207,152W00000X
22572,1922039346,1.0,,KOSTAMAA,HEIKKI,E,DR.,,M.D.,28 WHITE BRIDGE PIKE,STE.208,NASHVILLE,TN,37205,207W00000X


Connecting Taxonomy Codes to SQL Database

In [10]:
# Load the NUCC taxonomy CSV into the `nucc_taxonomy` table.
# NOTE(review): encoding 'unicode_escape' is kept as-is; confirm it is the intended
# encoding for this file.
nucc_taxonomy = pd.read_csv('../data/nucc_taxonomy_230.csv', encoding = 'unicode_escape')

# lowercase column names and replace spaces
nucc_taxonomy.columns = [x.lower().replace(' ', '_') for x in nucc_taxonomy.columns]

db = sqlite3.connect('../data/hop_teaming_database.sqlite')
try:
    # 'replace' rebuilds the table on every run, so re-running this cell does
    # not append a duplicate copy of the data.
    nucc_taxonomy.to_sql('nucc_taxonomy', db, if_exists = 'replace', index = False)

    # create index (IF NOT EXISTS keeps this from raising if it is already there)
    db.execute('CREATE INDEX IF NOT EXISTS code ON nucc_taxonomy(code)')
    db.commit()
finally:
    # Release the database file even if the write fails.
    db.close()

nucc_taxonomy

Unnamed: 0,code,grouping,classification,specialization,definition,notes,display_name,section
0,193200000X,Group,Multi-Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Multi-Specialty Group,Individual
1,193400000X,Group,Single Specialty,,A business group of one or more individual pra...,[7/1/2003: new],Single Specialty Group,Individual
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,,An allergist-immunologist is trained in evalua...,"Source: American Board of Medical Specialties,...",Allergy & Immunology Physician,Individual
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy,"A physician who specializes in the diagnosis, ...",Source: National Uniform Claim Committee,Allergy Physician,Individual
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology,An allergy and immunology physician who specia...,"Source: National Uniform Claim Committee, 2022...",Clinical & Laboratory Immunology (Allergy & Im...,Individual
...,...,...,...,...,...,...,...,...
868,343800000X,Transportation Services,Secured Medical Transport (VAN),,A public or privately owned transportation ser...,,Secured Medical Transport (VAN),Non-Individual
869,344600000X,Transportation Services,Taxi,,A land commercial vehicle used for the transpo...,,Taxi,Non-Individual
870,347D00000X,Transportation Services,Train,,An organization or business licensed to provid...,,Train,Non-Individual
871,347E00000X,Transportation Services,Transportation Broker,,An organization that provides transportation f...,Source: Section 6083 of the Deficit Reduction ...,Transportation Broker,Non-Individual


Connecting ZIP CBSA to SQL Database

In [11]:
# Load the ZIP-to-CBSA crosswalk workbook into the `zip_cbsa` table.
# `zip` is read as object so the values are not parsed as numbers.
zip_cbsa = pd.read_excel('../data/ZIP_CBSA_122021.xlsx', index_col = None, header = 0, dtype={'zip': object})

db = sqlite3.connect('../data/hop_teaming_database.sqlite')
try:
    # 'replace' rebuilds the table on every run, so re-running this cell does
    # not append a duplicate copy of the data.
    zip_cbsa.to_sql('zip_cbsa', db, if_exists = 'replace', index = False)

    # create index (IF NOT EXISTS keeps this from raising if it is already there)
    db.execute('CREATE INDEX IF NOT EXISTS zip ON zip_cbsa(zip)')
    db.commit()
finally:
    # Release the database file even if the write fails.
    db.close()

zip_cbsa

Unnamed: 0,zip,cbsa,usps_zip_pref_city,usps_zip_pref_state,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,00683,41900,SAN GERMAN,PR,0.999842,1.0,1.0,0.999855
1,00683,32420,SAN GERMAN,PR,0.000158,0.0,0.0,0.000145
2,00923,41980,SAN JUAN,PR,1.000000,1.0,1.0,1.000000
3,01010,44140,BRIMFIELD,MA,0.976896,1.0,1.0,0.977816
4,01010,49340,BRIMFIELD,MA,0.023104,0.0,0.0,0.022184
...,...,...,...,...,...,...,...,...
47479,60684,16980,CHICAGO,IL,0.000000,1.0,0.0,1.000000
47480,33945,15980,PINELAND,FL,0.000000,0.0,1.0,1.000000
47481,78144,99999,PANNA MARIA,TX,0.000000,1.0,0.0,1.000000
47482,12257,10580,ALBANY,NY,0.000000,1.0,0.0,1.000000


Connecting Facility Affiliations to SQL Database

In [8]:
# Load the facility affiliation CSV into the `facility_affiliation` table.
# NOTE(review): encoding 'unicode_escape' is kept as-is; confirm it is the intended
# encoding for this file.
facility_affiliation = pd.read_csv('../data/Facility_Affiliation.csv', encoding = 'unicode_escape')

# lowercase column names and replace spaces
facility_affiliation.columns = [x.lower().replace(' ', '_') for x in facility_affiliation.columns]

db = sqlite3.connect('../data/hop_teaming_database.sqlite')
try:
    # 'replace' rebuilds the table on every run, so re-running this cell does
    # not append a duplicate copy of the data.
    facility_affiliation.to_sql('facility_affiliation', db, if_exists = 'replace', index = False)

    # Index the provider identifier. This table has no `code` column, so the
    # index is built on `npi` under a name not used by any other index here.
    db.execute('CREATE INDEX IF NOT EXISTS facility_affiliation_npi ON facility_affiliation(npi)')
    db.commit()
finally:
    # Release the database file even if the write fails.
    db.close()

facility_affiliation

Unnamed: 0,npi,ind_pac_id,lst_nm,frst_nm,mid_nm,suff,facility_type,facility_afl_ccn,parent_ccn
0,1003000126,7517003643,ENKESHAFI,ARDALAN,,,Hospital,210003,
1,1003000126,7517003643,ENKESHAFI,ARDALAN,,,Hospital,210022,
2,1003000126,7517003643,ENKESHAFI,ARDALAN,,,Hospital,490063,
3,1003000126,7517003643,ENKESHAFI,ARDALAN,,,Hospital,490145,
4,1003000134,4284706367,CIBULL,THOMAS,L,,Hospital,140010,
...,...,...,...,...,...,...,...,...,...
1563884,1992999270,9436230539,BENNETT,STEPHANIE,R,,Hospital,100030,
1563885,1992999551,42376873,MOLAI,INDIRA,,,Home health agency,057505,
1563886,1992999817,7113191032,TAKENISHI,GREG,S,,Hospital,050748,
1563887,1992999825,143414284,DESCHENES,GEOFFREY,R,,Hospital,500005,


Connecting TN General Hospital Info to SQL Database

In [7]:
# Load the TN hospital general-information CSV into the `hospital_info` table.
# NOTE(review): encoding 'unicode_escape' is kept as-is; confirm it is the intended
# encoding for this file.
hospital_info = pd.read_csv('../data/TN_Hospital_General_Info.csv', encoding = 'unicode_escape')

# lowercase column names and replace spaces
hospital_info.columns = [x.lower().replace(' ', '_') for x in hospital_info.columns]

db = sqlite3.connect('../data/hop_teaming_database.sqlite')
try:
    # 'replace' rebuilds the table on every run, so re-running this cell does
    # not append a duplicate copy of the data. No index is created for this table.
    hospital_info.to_sql('hospital_info', db, if_exists = 'replace', index = False)
    db.commit()
finally:
    # Release the database file even if the write fails.
    db.close()

hospital_info

Unnamed: 0,facility_id,facility_name,address,city,state,zip_code,county_name,phone_number,hospital_type,hospital_ownership,...,count_of_readm_measures_better,count_of_readm_measures_no_different,count_of_readm_measures_worse,readm_group_footnote,pt_exp_group_measure_count,count_of_facility_pt_exp_measures,pt_exp_group_footnote,te_group_measure_count,count_of_facility_te_measures,te_group_footnote
0,440001,UNICOI COUNTY HOSPITAL,2030 TEMPLE HILL ROAD,ERWIN,TN,37650,UNICOI,(423) 743-3141,Acute Care Hospitals,Voluntary non-profit - Private,...,0,4,0,,8,Not Available,5.0,12,5,
1,440002,JACKSON-MADISON COUNTY GENERAL HOSPITAL,620 SKYLINE DRIVE,JACKSON,TN,38301,MADISON,(731) 541-5000,Acute Care Hospitals,Government - Hospital District or Authority,...,1,7,3,,8,8,,12,6,
2,440003,SUMNER REGIONAL MEDICAL CENTER,555 HARTSVILLE PIKE,GALLATIN,TN,37066,SUMNER,(615) 452-4210,Acute Care Hospitals,Proprietary,...,0,7,1,,8,8,,12,9,
3,440006,TRISTAR SKYLINE MEDICAL CENTER,3441 DICKERSON PIKE,NASHVILLE,TN,37207,DAVIDSON,(615) 769-2000,Acute Care Hospitals,Voluntary non-profit - Private,...,0,6,2,,8,8,,12,8,
4,440007,UNITY MEDICAL CENTER,481 INTERSTATE DRIVE,MANCHESTER,TN,37355,COFFEE,(931) 728-6354,Acute Care Hospitals,Voluntary non-profit - Private,...,1,5,0,,8,8,,12,4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,444026,"ERLANGER BEHAVIORAL HOSPITAL, LLC",804 NORTH HOLTZCLAW AVENUE,CHATTANOOGA,TN,37404,HAMILTON,(877) 249-2737,Psychiatric,Proprietary,...,Not Available,Not Available,Not Available,19.0,Not Available,Not Available,19.0,Not Available,Not Available,19.0
114,444027,CREEKSIDE BEHAVIORAL HEALTH,1025 EXECUTIVE PARK BLVD,KINGSPORT,TN,37660,SULLIVAN,(423) 830-8207,Psychiatric,Proprietary,...,Not Available,Not Available,Not Available,19.0,Not Available,Not Available,19.0,Not Available,Not Available,19.0
115,444028,"BEHAVIORAL HEALTH OF ROCKY TOP, LLC",210 NDUSTRIAL PARK DRIVE,ROCKY TOP,TN,37769,ANDERSON,(865) 630-9200,Psychiatric,Proprietary,...,Not Available,Not Available,Not Available,19.0,Not Available,Not Available,19.0,Not Available,Not Available,19.0
116,444029,PINEWOOD SPRINGS,1001 N JAMES CAMPBELL BLVD,COLUMBIA,TN,38401,MAURY,(931) 777-6000,Psychiatric,Proprietary,...,Not Available,Not Available,Not Available,19.0,Not Available,Not Available,19.0,Not Available,Not Available,19.0
