# Chunking and SQLite

In this notebook, we'll see a couple of techniques that can be used when working with large file in Python.

In [1]:
import pandas as pd
import numpy as np
import sqlite3
from tqdm.notebook import tqdm

Before we do anything else, let's see how many rows are contained in the Calls for Service file. 

In [None]:
#using terminal  "wc -l file.txt"

In [None]:
npi_cols=['NPI',
 'Entity Type Code',
 'Replacement NPI',
 'Employer Identification Number (EIN)',
 'Provider Organization Name (Legal Business Name)',
 'Provider Last Name (Legal Name)',
 'Provider First Name',
 'Provider Middle Name',
 'Provider Name Prefix Text',
 'Provider Name Suffix Text',
 'Provider Credential Text',
 'Provider First Line Business Practice Location Address',
 'Provider Second Line Business Practice Location Address',
 'Provider Business Practice Location Address City Name',
 'Provider Business Practice Location Address State Name',
 'Provider Business Practice Location Address Postal Code',
 'Provider Gender Code',
 'Healthcare Provider Taxonomy Code_1',
 'Healthcare Provider Primary Taxonomy Switch_1',
 'Healthcare Provider Taxonomy Code_2',
 'Healthcare Provider Primary Taxonomy Switch_2',
 'Healthcare Provider Taxonomy Code_3',
 'Healthcare Provider Primary Taxonomy Switch_3',
 'Healthcare Provider Taxonomy Code_4',
 'Healthcare Provider Primary Taxonomy Switch_4',
 'Healthcare Provider Taxonomy Code_5',
 'Healthcare Provider Primary Taxonomy Switch_5',
 'Healthcare Provider Taxonomy Code_6',
 'Healthcare Provider Primary Taxonomy Switch_6',
 'Healthcare Provider Taxonomy Code_7',
 'Healthcare Provider Primary Taxonomy Switch_7',
 'Healthcare Provider Taxonomy Code_8',
 'Healthcare Provider Primary Taxonomy Switch_8',
 'Healthcare Provider Taxonomy Code_9',
 'Healthcare Provider Primary Taxonomy Switch_9',
 'Healthcare Provider Taxonomy Code_10',
 'Healthcare Provider Primary Taxonomy Switch_10',
 'Healthcare Provider Taxonomy Code_11',
 'Healthcare Provider Primary Taxonomy Switch_11',
 'Healthcare Provider Taxonomy Code_12',
 'Healthcare Provider Primary Taxonomy Switch_12',
 'Healthcare Provider Taxonomy Code_13',
 'Healthcare Provider Primary Taxonomy Switch_13',
 'Healthcare Provider Taxonomy Code_14',
 'Healthcare Provider Primary Taxonomy Switch_14',
 'Healthcare Provider Taxonomy Code_15',
 'Healthcare Provider Primary Taxonomy Switch_15']

First, we need to connect to our database. The connect function will either create a new database if one does not already exist or connect to an existing one.

In [None]:
db = sqlite3.connect('nppes.sqlite')

Now, we can chunk through the data and for each row, add the rows to a table in our sqlite database. 
To keep track of how much progress has been made, we can use the `tqdm` library.

In [None]:
for chunk in tqdm(pd.read_csv('NPPES_Data_Dissemination_February_2021/npidata_pfile_20050523-20210207.csv', usecols=npi_cols, chunksize = 10000)):
    chunk.columns = [x.lower().replace(' ', '_').replace('(', '').replace(')','').replace('*','') for x in chunk.columns]      # Clean up the column names
    conditions = [
    (chunk['healthcare_provider_primary_taxonomy_switch_1']=='Y'),
    (chunk['healthcare_provider_primary_taxonomy_switch_2']=='Y'),
    (chunk['healthcare_provider_primary_taxonomy_switch_3']=='Y'),
    (chunk['healthcare_provider_primary_taxonomy_switch_4']=='Y'),
    (chunk['healthcare_provider_primary_taxonomy_switch_5']=='Y'),
    (chunk['healthcare_provider_primary_taxonomy_switch_6']=='Y'),
    (chunk['healthcare_provider_primary_taxonomy_switch_7']=='Y'),
    (chunk['healthcare_provider_primary_taxonomy_switch_8']=='Y')]

# create a list of the values we want to assign for each condition
    values = [chunk['healthcare_provider_taxonomy_code_1'],
          chunk['healthcare_provider_taxonomy_code_2'],
          chunk['healthcare_provider_taxonomy_code_3'],
          chunk['healthcare_provider_taxonomy_code_4'],
          chunk['healthcare_provider_taxonomy_code_5'],
          chunk['healthcare_provider_taxonomy_code_6'],
          chunk['healthcare_provider_taxonomy_code_7'],
          chunk['healthcare_provider_taxonomy_code_8']]

# create a new column and use np.select to assign values to it using our lists as arguments
    chunk['taxonomy'] = np.select(conditions, values)
    chunk[chunk.columns.drop(list(chunk.filter(regex='healthcare_provider_')))]
    chunk.to_sql('npi_data', db, if_exists = 'append', index = False)

#figure out this later
#def find_primary_code(row):
    #col = 'Healthcare Provider Primary Taxonomy Switch_'
    #for i in range(1, 16):
        #if row[col + str(i)] == 'Y':
            #return row['Healthcare Provider Taxonomy Code_' + str(i)]

In [None]:
db.close()

In [None]:
db = sqlite3.connect('nppes.sqlite')
query='''
select *
from npi_data
order by npi;
'''
npi_data = pd.read_sql(query, db)
npi_data

In [None]:
db.close()

In [None]:
npi_data.duplicated(subset=['npi']).value_counts()

In [None]:
npi_data=npi_data.drop_duplicates(subset=['npi'])

In [None]:
db = sqlite3.connect('nppes.sqlite')
nucc_tax=pd.read_csv('NPPES_Data_Dissemination_February_2021/nucc_taxonomy_210.csv')
nucc_tax.columns = [x.lower().replace(' ', '_') for x in nucc_tax.columns]
nucc_tax.to_sql('nucc_tax', db,  if_exists='append', index = False)    


In [None]:
query='''
select *
from nucc_tax;
'''
nucc_tax = pd.read_sql(query, db)
nucc_tax

In [None]:
db.close()

In [None]:
nucc_tax.info()

To speed up queries which use a specific column, we can create an **index** on that column. This causes the database to store that column in a way that helps it to retrieve rows quicker.

In [None]:
db = sqlite3.connect('nppes.sqlite')
db.execute('CREATE INDEX taxonomy ON npi_data(taxonomy)')
db.execute('CREATE INDEX code ON nucc_tax(code)')

In [None]:
db = sqlite3.connect('nppes.sqlite')
cbsa=pd.read_csv('NPPES_Data_Dissemination_February_2021/ZIP_CBSA_122020.csv')
cbsa.columns = [x.lower().replace(' ', '_') for x in cbsa.columns]
cbsa.to_sql('cbsa', db,  if_exists='append', index = False)

In [None]:
db = sqlite3.connect('nppes.sqlite')
query= '''
select *       
from cbsa 
'''
cbsa = pd.read_sql(query, db)
cbsa

In [None]:
db = sqlite3.connect('nppes.sqlite')
query= '''
select *      
from npi_data as np
left join nucc_tax as nu
on np.taxonomy=nu.code 
join cbsa as c
on np.provider_business_practice_location_address_postal_code=c.zip 
'''
npp_nucc_tax = pd.read_sql(query, db)
npp_nucc_tax

In [None]:
df=npp_nucc_tax[['npi', 'provider_business_practice_location_address_postal_code', 'specialization', 'zip','cbsa']]
df.sort_values(by=['cbsa'])

In [None]:
cbsa.info()

In [None]:
df.duplicated(subset=['npi'], ).value_counts()

In [None]:
npp_nucc=pd.merge(npi_data, nucc_tax,left_on='taxonomy', right_on='code', how='left')

In [None]:
npp_nucc.duplicated(subset=['npi']).value_counts()

In [None]:
npp_nucc.head(50)


In [None]:
npp_nucc['postal_code']=npp_nucc['provider_business_practice_location_address_postal_code'].astype(str).str[0:5]

cbsa['zip1']=cbsa['zip'].astype(str).str[0:5]


In [None]:
npp_nucc_cbsa=pd.merge(npp_nucc, cbsa,left_on= 'postal_code', right_on='zip1', how='left')

In [None]:
npp_nucc_cbsa.duplicated(subset=['npi']).value_counts()

In [None]:
npp_nucc_cbsa_nash=npp_nucc_cbsa[npp_nucc_cbsa['cbsa']==34980]


In [None]:
npp_nucc_cbsa_nash.to_csv('npp_taxo_cbsa_nash.csv',index=False)

In [None]:
npp_nucc_cbsa_nash.duplicated(subset=['npi']).value_counts()

In [None]:
npp_nucc_cbsa[['npi','specialization','zip']].head(50)

In [None]:
pd.read_csv('DocGraph_Hop_Teaming_2017_Non_Commercial/DocGraph_Hop_Teaming_2017.csv', nrows = 1000).info()

In [2]:
db = sqlite3.connect('nppes.sqlite')
for chunk in tqdm(pd.read_csv('DocGraph_Hop_Teaming_2017_Non_Commercial/DocGraph_Hop_Teaming_2017.csv', chunksize = 100000)):
    chunk[(chunk['transaction_count']>=50) | (chunk['average_day_wait']<=50)]
    chunk.to_sql('hop', db, if_exists ='append', index = False)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [3]:
db.close()

In [4]:
db = sqlite3.connect('nppes.sqlite')
query='''
select *
from hop;
'''
hop_team = pd.read_sql(query, db)
hop_team 

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1003863580,1000000004,19,19,108.895,84.598
1,1043250400,1000000004,20,20,87.000,77.173
2,1033239413,1000000004,20,20,58.800,76.982
3,1033142146,1000000004,491,535,10.232,36.558
4,1013957562,1000000004,25,26,78.692,59.305
...,...,...,...,...,...,...
203330902,1235291360,1255367132,12,13,44.769,96.969
203330903,1235180266,1255367132,23,23,13.739,21.020
203330904,1265448617,1255367132,15,16,10.938,25.702
203330905,1265437644,1255367132,78,82,26.561,51.304


In [6]:
npp_nucc_cbsa_nash=pd.read_csv('npp_taxo_cbsa_nash.csv') 

In [None]:
#npp_nucc_cbsa_nash['npp_npi']=npp_nucc_cbsa_nash['npi'].astype(str)
#hop_team['hop_from_npi']=hop_team['from_npi'].astype(str)
#hop_team['hop_to_npi']=hop_team['to_npi'].astype(str)

In [7]:
npp_nucc_cbsa_nash_hop=pd.merge(npp_nucc_cbsa_nash, hop_team, left_on= 'npi', right_on='from_npi', how='inner')

In [8]:
npp_nucc_cbsa_nash_hop1=pd.merge(npp_nucc_cbsa_nash, hop_team, left_on= 'npi', right_on='to_npi', how='inner')

In [12]:
npp_nucc_cbsa_nash_hop1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1562518 entries, 0 to 1562517
Data columns (total 42 columns):
 #   Column                                                   Non-Null Count    Dtype  
---  ------                                                   --------------    -----  
 0   npi                                                      1562518 non-null  int64  
 1   entity_type_code                                         1562518 non-null  float64
 2   replacement_npi                                          0 non-null        float64
 3   employer_identification_number_ein                       650980 non-null   object 
 4   provider_organization_name_legal_business_name           650980 non-null   object 
 5   provider_last_name_legal_name                            911402 non-null   object 
 6   provider_first_name                                      911538 non-null   object 
 7   provider_middle_name                                     731230 non-null   object 
 8   pr

In [24]:
npp_nucc_cbsa_nash_hop=npp_nucc_cbsa_nash_hop[npp_nucc_cbsa_nash_hop['entity_type_code']==1]

In [25]:
npp_nucc_cbsa_nash_hop.to_csv('npp_nucc_cbsa_nash_hop_from.csv',index=False)

In [26]:
npp_nucc_cbsa_nash_hop1=npp_nucc_cbsa_nash_hop1[npp_nucc_cbsa_nash_hop1['entity_type_code']==2]

In [27]:
npp_nucc_cbsa_nash_hop1.to_csv('npp_nucc_cbsa_nash_hop_to.csv',index=False)

In [28]:
npp_hop_dfs=[npp_nucc_cbsa_nash_hop,npp_nucc_cbsa_nash_hop1]

In [29]:
npp_nucc_cbsa_nash_hop_all=pd.concat(npp_hop_dfs, keys=['from', 'to'])

Unnamed: 0,npi,entity_type_code,replacement_npi,employer_identification_number_ein,provider_organization_name_legal_business_name,provider_last_name_legal_name,provider_first_name,provider_middle_name,provider_name_prefix_text,provider_name_suffix_text,...,bus_ratio,oth_ratio,tot_ratio,zip1,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1003013160,1.0,,,,GRABENSTEIN,WILLIAM,P.,DR.,,...,0.001398,0.0,0.003284,37043,1003013160,1023055126,13,23,31.870,36.145
1,1003013160,1.0,,,,GRABENSTEIN,WILLIAM,P.,DR.,,...,0.001398,0.0,0.003284,37043,1003013160,1053316265,14,17,57.294,59.985
2,1003013160,1.0,,,,GRABENSTEIN,WILLIAM,P.,DR.,,...,0.001398,0.0,0.003284,37043,1003013160,1073585774,22,32,48.531,65.231
3,1003013160,1.0,,,,GRABENSTEIN,WILLIAM,P.,DR.,,...,0.001398,0.0,0.003284,37043,1003013160,1083657829,16,24,28.042,35.471
4,1003013160,1.0,,,,GRABENSTEIN,WILLIAM,P.,DR.,,...,0.001398,0.0,0.003284,37043,1003013160,1083819668,21,24,57.125,66.059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1571414,1992987085,1.0,,,,HORTON,SUSANNE,M,,,...,1.000000,1.0,1.000000,37232,1992987085,1962606939,19,19,0.895,2.536
1571415,1992987085,1.0,,,,HORTON,SUSANNE,M,,,...,1.000000,1.0,1.000000,37232,1992987085,1962768663,15,15,23.800,57.407
1571416,1992998439,1.0,,,,DERLETH,CHRISTINA,,,,...,1.000000,1.0,1.000000,37232,1992998439,1104202761,18,44,0.159,1.055
1571417,1992998439,1.0,,,,DERLETH,CHRISTINA,,,,...,1.000000,1.0,1.000000,37232,1992998439,1396882205,19,45,0.000,0.000


In [None]:
npp_nucc_cbsa_nash_hop_all.to_csv('npp_nucc_cbsa_nash_hopall.csv',index=False)