# Get NPPES 2020 Data

In [1]:
# import libraries
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

### Identify the columns we need

In [14]:
# import the file header for the NPI data as a dataframe
nppes_header = pd.read_csv('../data/nppes/npidata_pfile_20050523-20210207_FileHeader.csv')

In [15]:
# write the file header columns to a list
nppes_headers_list = nppes_header.columns.to_list()

In [16]:
# write that list back to a dataframe in a column called 'variables'
nppes_headers_list = pd.DataFrame(nppes_headers_list, columns = ['variables'])

In [17]:
# Check out the variables generated
nppes_headers_list

Unnamed: 0,variables
0,NPI
1,Entity Type Code
2,Replacement NPI
3,Employer Identification Number (EIN)
4,Provider Organization Name (Legal Business Name)
...,...
325,Healthcare Provider Taxonomy Group_12
326,Healthcare Provider Taxonomy Group_13
327,Healthcare Provider Taxonomy Group_14
328,Healthcare Provider Taxonomy Group_15


In [18]:
# Select only the columns we want
nppes_headers_vars = nppes_headers_list[nppes_headers_list['variables'].isin([
    'NPI',
    'Provider Organization Name (Legal Business Name)',
    'Provider Last Name (Legal Name)',
    'Provider First Name',
    'Provider Middle Name',
    'Provider Name Prefix Text',
    'Provider Name Suffix Text',
    'Provider Credential Text',
    'Provider First Line Business Mailing Address',
    'Provider Second Line Business Mailing Address',
    'Provider Business Mailing Address City Name',
    'Provider Business Mailing Address State Name',
    'Provider Business Mailing Address Postal Code']) == True]
# Maggi will get the ‘Healthcare Provider Taxonomy Code*’ fields, with primary type, per NPI and we'll join that later

In [19]:
# Check out the variables
nppes_headers_vars

Unnamed: 0,variables
0,NPI
4,Provider Organization Name (Legal Business Name)
5,Provider Last Name (Legal Name)
6,Provider First Name
7,Provider Middle Name
8,Provider Name Prefix Text
9,Provider Name Suffix Text
10,Provider Credential Text
20,Provider First Line Business Mailing Address
21,Provider Second Line Business Mailing Address


In [20]:
# Write it to a list
nppes_headers_vars = nppes_headers_vars.variables.to_list()

In [21]:
# Check out that list
nppes_headers_vars

['NPI',
 'Provider Organization Name (Legal Business Name)',
 'Provider Last Name (Legal Name)',
 'Provider First Name',
 'Provider Middle Name',
 'Provider Name Prefix Text',
 'Provider Name Suffix Text',
 'Provider Credential Text',
 'Provider First Line Business Mailing Address',
 'Provider Second Line Business Mailing Address',
 'Provider Business Mailing Address City Name',
 'Provider Business Mailing Address State Name',
 'Provider Business Mailing Address Postal Code']

### Create a database to hold all the things

In [22]:
# create a database or connect to an existing one
db = sqlite3.connect('../data/hop_teaming.sqlite')

In [23]:
# if you need to rewrite the database...
cursor = db.cursor()

In [24]:
# Drop the table and return a line that says that it's gone
cursor.execute("DROP TABLE nppes")
print("Table dropped...")

Table dropped...


In [25]:
# Iterate over the dataframe, using only the selected columns, and build a table
# With a chunksize of 5000, it will take 1343 loops to build the database
# There will be a pink warning re: column types which can probably be disregarded
for chunk in tqdm(pd.read_csv('../data/nppes/npidata_pfile_20050523-20210207.csv', usecols = nppes_headers_vars, chunksize = 5000)):
    chunk.columns = [x.lower().replace(' ', '_') for x in chunk.columns]      # Clean up the column names
    chunk.to_sql('nppes', db, if_exists = 'append', index = False)            # Append the chunk to a calls table

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  await eval(code_obj, self.user_global_ns, self.user_ns)





In [26]:
# Write a test query
query = '''
SELECT *
FROM nppes
LIMIT 10
'''

In [27]:
# Write the results to a test dataframe
test = pd.read_sql(query, db)

In [28]:
# Check out the test dataframe
test

Unnamed: 0,npi,provider_organization_name_(legal_business_name),provider_last_name_(legal_name),provider_first_name,provider_middle_name,provider_name_prefix_text,provider_name_suffix_text,provider_credential_text,provider_first_line_business_mailing_address,provider_second_line_business_mailing_address,provider_business_mailing_address_city_name,provider_business_mailing_address_state_name,provider_business_mailing_address_postal_code
0,1679576722,,WIEBE,DAVID,A,,,M.D.,PO BOX 2168,,KEARNEY,NE,688482168.0
1,1588667638,,PILCHER,WILLIAM,C,DR.,,MD,1824 KING STREET,SUITE 300,JACKSONVILLE,FL,322044736.0
2,1497758544,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,,3418 VILLAGE DR,,FAYETTEVILLE,NC,283044552.0
3,1306849450,,SMITSON,HAROLD,LEROY,DR.,II,M.D.,810 LUCAS DR,,ATHENS,TX,757513446.0
4,1215930367,,GRESSOT,LAURENT,,DR.,,M.D.,17323 RED OAK DR,,HOUSTON,TX,770901243.0
5,1023011178,COLLABRIA CARE,,,,,,,414 S JEFFERSON ST,,NAPA,CA,945594515.0
6,1932102084,,ADUSUMILLI,RAVI,K,,,MD,2940 N MCCORD RD,,TOLEDO,OH,436151753.0
7,1841293990,,WORTSMAN,SUSAN,,,,MA-CCC,68 ROCKLEDGE RD,APT 1C,HARTSDALE,NY,105303455.0
8,1750384806,,BISBEE,ROBERT,,DR.,,MD,5219 CITY BANK PKWY STE 214,,LUBBOCK,TX,794073537.0
9,1669475711,,SUNG,BIN,SHENG,,,M. D.,600 JEFFERSON ST STE 301,,LAFAYETTE,LA,705016987.0


In [29]:
db.close()