In [10]:
import pandas as pd
from tqdm.notebook import tqdm
import sqlite3

In [12]:
# Open the project SQLite database. The connection is intentionally left open
# here: the following cells create an index on it and then close it.
db = sqlite3.connect('../data/hop_teaming_database.sqlite')

# Stream the hop-teaming CSV in 10,000-row chunks so the whole file never has
# to fit in memory at once.
for chunk in tqdm(pd.read_csv('../data/DocGraph_Hop_Teaming_2018.csv', chunksize = 10000)):
    # Keep only rows meeting the required conditions (trying to eliminate
    # accidental referrals). The filtered frame must be captured: `.loc[...]`
    # returns a new DataFrame and does not modify `chunk` in place.
    filtered_chunk = chunk.loc[(chunk['transaction_count'] >= 50) & (chunk['average_day_wait'] < 50)]
    # Append the filtered rows to the hop_teaming table.
    # NOTE: if_exists='append' means re-running this cell adds the rows again.
    filtered_chunk.to_sql('hop_teaming', db, if_exists = 'append', index = False)

0it [00:00, ?it/s]

KeyboardInterrupt: 

In [None]:
# Index the (from_npi, to_npi) pair on hop_teaming. IF NOT EXISTS keeps the
# cell re-runnable: a plain CREATE INDEX raises once the index already exists.
db.execute('CREATE INDEX IF NOT EXISTS from_to_npi ON hop_teaming(from_npi, to_npi)')

In [None]:
# Close the SQLite connection currently bound to `db`.
db.close()

In [None]:
# Remove the cap on how many DataFrame columns pandas renders, so wide frames
# are shown in full.
pd.options.display.max_columns = None

In [None]:
# Open the NPI data file as a chunked reader (10,000 rows per chunk) instead of
# loading it whole.
npi_csv_path = '../data/npidata_pfile_20050523-20230212.csv'
chunks = pd.read_csv(npi_csv_path, chunksize=10000)

# Pull just the first chunk to explore the columns before the full load.
test_chunk = next(chunks)

In [None]:
# Show the rows whose first primary-taxonomy switch is neither 'Y' nor 'N'
# (missing values included).
switch_1 = test_chunk['Healthcare Provider Primary Taxonomy Switch_1']
test_chunk.loc[~switch_1.isin(['Y', 'N'])]

In [None]:
# For each of the 15 taxonomy slots, print how many rows flag that slot as
# primary ('Y') and also have a non-null taxonomy code in it.
for slot in range(1, 16):
    is_primary = test_chunk[f'Healthcare Provider Primary Taxonomy Switch_{slot}'] == 'Y'
    primary_codes = test_chunk.loc[is_primary, f'Healthcare Provider Taxonomy Code_{slot}']
    print(primary_codes.count())

In [None]:
def find_taxonomy(col):
    """Return the taxonomy code of the first slot flagged as primary.

    Slots 1 through 15 are checked in order. For the first slot whose
    'Healthcare Provider Primary Taxonomy Switch_<n>' entry equals 'Y', the
    matching 'Healthcare Provider Taxonomy Code_<n>' entry is returned as-is
    (whatever `.get` yields, including a missing value).

    Parameters
    ----------
    col : mapping-like
        Any object with a `.get(key)` method; the caller in this notebook
        passes one row of the NPI data via `DataFrame.apply(..., axis=1)`.

    Returns
    -------
    The taxonomy code of the first primary slot, or the string
    'no primary taxonomy' when no slot is flagged 'Y'.
    """
    primary_codes = (
        col.get(f'Healthcare Provider Taxonomy Code_{slot}')
        for slot in range(1, 16)
        if col.get(f'Healthcare Provider Primary Taxonomy Switch_{slot}') == 'Y'
    )
    # Lazily evaluated: stops at the first flagged slot, otherwise falls back.
    return next(primary_codes, 'no primary taxonomy')

In [None]:
%%capture [--no-stderr]
# the capture above is here so that it doesn't show warnings about columns types and so that I avoid manually setting dozens of columns dtypes!

db = sqlite3.connect('../data/hop_teaming_database.sqlite')

for chunk in tqdm(pd.read_csv('../data/npidata_pfile_20050523-20230212.csv', chunksize = 10000, dtype={'Provider Business Practice Location Address Postal Code': object})):

    # first extract the primary taxonomy
    chunk['Primary Taxonomy'] = chunk.apply(lambda col: find_taxonomy(col), axis=1)

    # Take only first 5 digits from postal code column
    chunk['Provider Business Practice Location Address Postal Code'] = chunk['Provider Business Practice Location Address Postal Code'].str[:5]

    # next, only keep columns we're interested in and renaming so that there are no ()
    chunk = (
        chunk 
        [['NPI',
        'Entity Type Code',
        'Provider Organization Name (Legal Business Name)',
        'Provider Last Name (Legal Name)',
        'Provider First Name',
        'Provider Middle Name',
        'Provider Name Prefix Text',
        'Provider Name Suffix Text',
        'Provider Credential Text',
        'Provider First Line Business Practice Location Address',
        'Provider Second Line Business Practice Location Address',
        'Provider Business Practice Location Address City Name',
        'Provider Business Practice Location Address State Name',
        'Provider Business Practice Location Address Postal Code',
        'Primary Taxonomy']]
        .rename(columns={'Provider Organization Name (Legal Business Name)': 'Provider Organization Name',
        'Provider Last Name (Legal Name)': 'Provider Last Name',
        'Provider Name Prefix Text': 'Provider Name Prefix',
        'Provider Name Suffix Text': 'Provider Name Suffix',
        'Provider Business Practice Location Address City Name': 'City',
        'Provider Business Practice Location Address State Name': 'State',
        'Provider Business Practice Location Address Postal Code': 'Postal Code'})
    )

    # Then clean up the column names
    chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]

    # Finally, the chunk to a calls table
    chunk.to_sql('npidata_pfile', db, if_exists = 'append', index = False)

In [None]:
# Select the providers whose practice location is Nashville, TN.
# The load cell writes npidata_pfile with the renamed, lower-cased columns
# `city` and `state`, so those names (not the original long NPPES column
# names) must be used in the filter.
query = """
SELECT *
FROM npidata_pfile
WHERE city = 'NASHVILLE'
AND state = 'TN'
"""
# sqlite3's connection context manager only commits or rolls back on exit; it
# does not close the connection. `db` is therefore still open after this block.
with sqlite3.connect('../data/hop_teaming_database.sqlite') as db:
    npidata_nashville = pd.read_sql_query(query, db)
npidata_nashville

In [None]:
# Index npidata_pfile on npi. IF NOT EXISTS keeps the cell re-runnable: a plain
# CREATE INDEX raises once the index already exists.
db.execute('CREATE INDEX IF NOT EXISTS npi ON npidata_pfile(npi)')

In [None]:
# Close the SQLite connection currently bound to `db`.
db.close()

In [None]:
db = sqlite3.connect('../data/hop_teaming_database.sqlite')

# try/finally guarantees the connection is closed even if reading the CSV,
# writing the table, or creating the index raises.
try:
    nucc_taxonomy = pd.read_csv('../data/nucc_taxonomy_230.csv', encoding = 'unicode_escape')

    # lowercase column names and replace spaces with underscores
    nucc_taxonomy.columns = [x.lower().replace(' ', '_') for x in nucc_taxonomy.columns]

    # add table to database
    # NOTE: if_exists='append' means re-running this cell adds the rows again.
    nucc_taxonomy.to_sql('nucc_taxonomy', db, if_exists = 'append', index = False)

    # create index; IF NOT EXISTS keeps the cell re-runnable, since a plain
    # CREATE INDEX raises once the index already exists
    db.execute('CREATE INDEX IF NOT EXISTS code ON nucc_taxonomy(code)')
finally:
    db.close()

nucc_taxonomy

In [None]:
db = sqlite3.connect('../data/hop_teaming_database.sqlite')

# try/finally guarantees the connection is closed even if reading the workbook,
# writing the table, or creating the index raises.
try:
    # `zip` is read as object rather than being parsed as a number.
    zip_cbsa = pd.read_excel('../data/ZIP_CBSA_122021.xlsx', index_col = None, header = 0, dtype={'zip': object})

    # add table to database
    # NOTE: if_exists='append' means re-running this cell adds the rows again.
    zip_cbsa.to_sql('zip_cbsa', db, if_exists = 'append', index = False)

    # create index; IF NOT EXISTS keeps the cell re-runnable, since a plain
    # CREATE INDEX raises once the index already exists
    db.execute('CREATE INDEX IF NOT EXISTS zip ON zip_cbsa(zip)')
finally:
    db.close()

zip_cbsa