In [1]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

We want to eliminate "accidental" referrals, so filter the hop teaming data so that the transaction_count is at least 50 and the average_day_wait is less than 50.

In [15]:
db = sqlite3.connect('data/NPPES_Data_Dissemination.sqlite')

chunks = pd.read_csv('data/DocGraph_Hop_Teaming_2018.csv', chunksize = 10000)
hop = []
for chunk in chunks: 
    chunk = chunk[(chunk['transaction_count'] >= 50) & (chunk['average_day_wait'] < 50)]
    hop.append(chunk)
    chunk.to_sql('hop', db, if_exists = 'append', index = False) 

The NPPES dataset contains a large number of fields, only a few of which are relevant to this project:

In [2]:
cols = ['NPI','Entity Type Code','Provider Organization Name (Legal Business Name)',
        'Provider Last Name (Legal Name)','Provider First Name', 'Provider Middle Name',
        'Provider Name Prefix Text', 'Provider Name Suffix Text','Provider Credential Text', 
        'Provider First Line Business Practice Location Address', 
        'Provider Second Line Business Practice Location Address',
        'Provider Business Practice Location Address City Name',
        'Provider Business Practice Location Address State Name',
        'Provider Business Practice Location Address Postal Code', 
        'Healthcare Provider Taxonomy Code_1', 
        'Healthcare Provider Primary Taxonomy Switch_1', 
        'Healthcare Provider Taxonomy Code_2', 
        'Healthcare Provider Primary Taxonomy Switch_2', 
        'Healthcare Provider Taxonomy Code_3', 
        'Healthcare Provider Primary Taxonomy Switch_3', 
        'Healthcare Provider Taxonomy Code_4', 
        'Healthcare Provider Primary Taxonomy Switch_4', 
        'Healthcare Provider Taxonomy Code_5', 
        'Healthcare Provider Primary Taxonomy Switch_5', 
        'Healthcare Provider Taxonomy Code_6', 
        'Healthcare Provider Primary Taxonomy Switch_6', 
        'Healthcare Provider Taxonomy Code_7', 
        'Healthcare Provider Primary Taxonomy Switch_7', 
        'Healthcare Provider Taxonomy Code_8', 
        'Healthcare Provider Primary Taxonomy Switch_8', 
        'Healthcare Provider Taxonomy Code_9', 
        'Healthcare Provider Primary Taxonomy Switch_9', 
        'Healthcare Provider Taxonomy Code_10', 
        'Healthcare Provider Primary Taxonomy Switch_10', 
        'Healthcare Provider Taxonomy Code_11', 
        'Healthcare Provider Primary Taxonomy Switch_11',
        'Healthcare Provider Taxonomy Code_12', 
        'Healthcare Provider Primary Taxonomy Switch_12', 
        'Healthcare Provider Taxonomy Code_13', 
        'Healthcare Provider Primary Taxonomy Switch_13', 
        'Healthcare Provider Taxonomy Code_14', 
        'Healthcare Provider Primary Taxonomy Switch_14', 
        'Healthcare Provider Taxonomy Code_15', 
        'Healthcare Provider Primary Taxonomy Switch_15']

nppes_chunks = pd.read_csv('data/NPPES_Data_Dissemination_February_2023/npidata_pfile_20050523-20230212.csv', usecols=cols, iterator=True, chunksize=1000)

In [3]:
state_list = ['TN', 'AR', 'NC', 'MS', 'GA']
nppes = pd.concat([chunk[chunk['Provider Business Practice Location Address State Name'].isin(state_list)] for chunk in nppes_chunks])

In [4]:
var_cols = ['Healthcare Provider Taxonomy Code', 'Healthcare Provider Primary Taxonomy Switch']

In [5]:
state_list = ['TN', 'AR', 'NC', 'MS', 'GA']
nppes = pd.DataFrame()
id_cols = ['NPI','Entity Type Code','Provider Organization Name (Legal Business Name)', 'Provider Last Name (Legal Name)','Provider First Name', 'Provider Middle Name','Provider Name Prefix Text', 'Provider Name Suffix Text','Provider Credential Text', 'Provider First Line Business Practice Location Address', 'Provider Second Line Business Practice Location Address','Provider Business Practice Location Address City Name','Provider Business Practice Location Address State Name','Provider Business Practice Location Address Postal Code']

for chunk in nppes_chunks:
    filtered_chunk = chunk[chunk['Provider Business Practice Location Address State Name'].isin(state_list)]
    filtered_chunk = pd.wide_to_long(filtered_chunk, var_cols, i = id_cols , j = "", sep = "_")
    
    nppes = pd.concat([nppes, filtered_chunk])

In [8]:
nppes.reset_index()

Unnamed: 0,index


In [9]:
nppes.head()

Converting the data into a sqlite database

In [10]:
db = sqlite3.connect('data/NPPES_Data_Dissemination.sqlite')

cols = ['NPI','Entity Type Code','Provider Organization Name (Legal Business Name)', 'Provider Last Name (Legal Name)','Provider First Name', 'Provider Middle Name','Provider Name Prefix Text',  'Provider Name Suffix Text','Provider Credential Text', 
        'Provider First Line Business Practice Location Address', 
        'Provider Second Line Business Practice Location Address',
        'Provider Business Practice Location Address City Name',
        'Provider Business Practice Location Address State Name',
        'Provider Business Practice Location Address Postal Code', 
        'Healthcare Provider Taxonomy Code_1', 
        'Healthcare Provider Primary Taxonomy Switch_1', 
        'Healthcare Provider Taxonomy Code_2', 'Healthcare Provider Primary Taxonomy Switch_2', 'Healthcare Provider Taxonomy Code_3', 'Healthcare Provider Primary Taxonomy Switch_3', 'Healthcare Provider Taxonomy Code_4', 'Healthcare Provider Primary Taxonomy Switch_4', 'Healthcare Provider Taxonomy Code_5', 'Healthcare Provider Primary Taxonomy Switch_5', 'Healthcare Provider Taxonomy Code_6', 'Healthcare Provider Primary Taxonomy Switch_6', 'Healthcare Provider Taxonomy Code_7', 'Healthcare Provider Primary Taxonomy Switch_7', 'Healthcare Provider Taxonomy Code_8', 'Healthcare Provider Primary Taxonomy Switch_8', 'Healthcare Provider Taxonomy Code_9', 'Healthcare Provider Primary Taxonomy Switch_9', 'Healthcare Provider Taxonomy Code_10', 'Healthcare Provider Primary Taxonomy Switch_10', 'Healthcare Provider Taxonomy Code_11', 'Healthcare Provider Primary Taxonomy Switch_11', 'Healthcare Provider Taxonomy Code_12', 'Healthcare Provider Primary Taxonomy Switch_12', 'Healthcare Provider Taxonomy Code_13', 'Healthcare Provider Primary Taxonomy Switch_13', 'Healthcare Provider Taxonomy Code_14', 'Healthcare Provider Primary Taxonomy Switch_14', 'Healthcare Provider Taxonomy Code_15', 'Healthcare Provider Primary Taxonomy Switch_15']

nppes_chunks = pd.read_csv('data/NPPES_Data_Dissemination_February_2023/npidata_pfile_20050523-20230212.csv', usecols=cols, iterator=True, chunksize=100000, low_memory=False)
var_cols = ['Healthcare Provider Taxonomy Code', 'Healthcare Provider Primary Taxonomy Switch']

nppes_full = []

state_list = ['TN', 'AR', 'NC', 'MS', 'GA']
nppes = pd.DataFrame()
id_cols = ['NPI','Entity Type Code','Provider Organization Name (Legal Business Name)', 'Provider Last Name (Legal Name)','Provider First Name', 'Provider Middle Name','Provider Name Prefix Text', 'Provider Name Suffix Text','Provider Credential Text', 'Provider First Line Business Practice Location Address', 'Provider Second Line Business Practice Location Address','Provider Business Practice Location Address City Name','Provider Business Practice Location Address State Name','Provider Business Practice Location Address Postal Code']

for chunk in nppes_chunks:
    filtered_chunk = chunk[chunk['Provider Business Practice Location Address State Name'].isin(state_list)]
    filtered_chunk = pd.wide_to_long(filtered_chunk, var_cols, i = id_cols , j = "", sep = "_")
    
    nppes = pd.concat([nppes, filtered_chunk])

nppes = nppes.reset_index()
nppes.columns = nppes.columns.str.replace(" ", "")
nppes = nppes.drop(columns = "")


nppes.to_sql('nppes', con = db, if_exists = 'append', index = False) 


db.close()

In [11]:
db.close()

Verifying the data.

In [16]:
db = sqlite3.connect('data/NPPES_Data_Dissemination.sqlite')
query = "SELECT * FROM nppes"

In [17]:
with sqlite3.connect('data/NPPES_Data_Dissemination.sqlite') as db: 
    nppes_sqlite = pd.read_sql(query, db)

In [18]:
nppes_sqlite

Unnamed: 0,NPI,EntityTypeCode,ProviderOrganizationName(LegalBusinessName),ProviderLastName(LegalName),ProviderFirstName,ProviderMiddleName,ProviderNamePrefixText,ProviderNameSuffixText,ProviderCredentialText,ProviderFirstLineBusinessPracticeLocationAddress,ProviderSecondLineBusinessPracticeLocationAddress,ProviderBusinessPracticeLocationAddressCityName,ProviderBusinessPracticeLocationAddressStateName,ProviderBusinessPracticeLocationAddressPostalCode,HealthcareProviderTaxonomyCode,HealthcareProviderPrimaryTaxonomySwitch
0,1750384210,1.0,,GILMER,CARISSIA,,,,PHARMD,1410 S 4TH ST,,NASHVILLE,AR,718523009,183500000X,Y
1,1750384210,1.0,,GILMER,CARISSIA,,,,PHARMD,1410 S 4TH ST,,NASHVILLE,AR,718523009,,
2,1750384210,1.0,,GILMER,CARISSIA,,,,PHARMD,1410 S 4TH ST,,NASHVILLE,AR,718523009,,
3,1750384210,1.0,,GILMER,CARISSIA,,,,PHARMD,1410 S 4TH ST,,NASHVILLE,AR,718523009,,
4,1750384210,1.0,,GILMER,CARISSIA,,,,PHARMD,1410 S 4TH ST,,NASHVILLE,AR,718523009,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9760360,1104850130,2.0,HEIKKI E KOSTAMAA MD PC,,,,,,,28 WHITE BRIDGE PIKE,STE. 208,NASHVILLE,TN,37205,,
9760361,1104850130,2.0,HEIKKI E KOSTAMAA MD PC,,,,,,,28 WHITE BRIDGE PIKE,STE. 208,NASHVILLE,TN,37205,,
9760362,1104850130,2.0,HEIKKI E KOSTAMAA MD PC,,,,,,,28 WHITE BRIDGE PIKE,STE. 208,NASHVILLE,TN,37205,,
9760363,1104850130,2.0,HEIKKI E KOSTAMAA MD PC,,,,,,,28 WHITE BRIDGE PIKE,STE. 208,NASHVILLE,TN,37205,,


In [20]:
db = sqlite3.connect('data/NPPES_Data_Dissemination.sqlite')
query = "SELECT * FROM hop"

with sqlite3.connect('data/NPPES_Data_Dissemination.sqlite') as db: 
    hop_sqlite = pd.read_sql(query, db)
    
hop_sqlite

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1508085911,1730166125,58,67,23.925,43.923
1,1508167040,1730166125,51,51,28.196,52.876
2,1508863549,1730166125,340,391,18.302,42.422
3,1508867870,1730166125,50,79,12.658,26.402
4,1508011040,1730166224,132,145,8.579,28.053
...,...,...,...,...,...,...
34176933,1417037664,1497939599,36,106,19.330,42.407
34176934,1417194903,1497939599,22,70,16.629,30.598
34176935,1417406372,1497939599,21,65,20.123,37.750
34176936,1417064825,1497940605,75,79,10.418,34.744


Using the primary taxonomy code, match each provider to a classification (from the Classification column


- load it in a separate table, nppes merge and have classification. 

In [None]:
db = sqlite3.connect('data/nucc_taxonomy.sqlite')

chunks = pd.read_csv('data/nucc_taxonomy_230.csv', chunksize = 10000)


Match each provider to a CBSA using the Business Zip code. Narrow down to providers in other areas. Based on the zip code I can check what is contain in the nnpes. Merge in front end. 

In [None]:
db = sqlite3.connect('data/CBSA.sqlite')
CBSA = pd.read_excel('data/ZIP_CBSA_122021.xlsx')
CBSA