# Get NPPES 2020 Data

In [None]:
# import libraries
import pandas as pd
import re
import sqlite3
from tqdm.notebook import tqdm

### Identify the columns we need

In [None]:
# Read the NPPES "FileHeader" CSV into a dataframe; its column labels are the
# names of the fields available in the NPI data file
nppes_header = pd.read_csv('../data/nppes/npidata_pfile_20050523-20210207_FileHeader.csv')

In [None]:
# Collect the header dataframe's column labels into a plain Python list
nppes_headers_list = list(nppes_header.columns)

In [None]:
# Turn the list of labels into a one-column dataframe whose column is named 'variables'
nppes_headers_list = pd.DataFrame({'variables': nppes_headers_list})

In [None]:
# Display the dataframe of available field names (one row per column label)
nppes_headers_list

In [None]:
# Keep only the rows of the 'variables' dataframe that name the fields we need:
# the NPI and entity type, the provider/organization name parts, and the
# business practice location address
nppes_headers_vars = nppes_headers_list.loc[
    nppes_headers_list['variables'].isin([
        'NPI',
        'Entity Type Code',
        'Provider Organization Name (Legal Business Name)',
        'Provider Last Name (Legal Name)',
        'Provider First Name',
        'Provider Middle Name',
        'Provider Name Prefix Text',
        'Provider Name Suffix Text',
        'Provider Credential Text',
        'Provider First Line Business Practice Location Address',
        'Provider Second Line Business Practice Location Address',
        'Provider Business Practice Location Address City Name',
        'Provider Business Practice Location Address State Name',
        'Provider Business Practice Location Address Postal Code',
    ])
]
# The 'Healthcare Provider Taxonomy Code *' fields (with primary type, per NPI)
# are being pulled separately by Maggi
# They will go in a different table that gets joined later

In [None]:
# Display the filtered dataframe to confirm only the selected field names remain
nppes_headers_vars

In [None]:
# Replace the filtered dataframe with a plain list of the selected field names
nppes_headers_vars = nppes_headers_vars['variables'].tolist()

In [None]:
# Display the list of selected field names (passed to read_csv as usecols below)
nppes_headers_vars

### Create a database to hold all the things

In [None]:
# Open a connection to the SQLite database file
# (sqlite3 creates the file if it does not exist yet, otherwise it connects to the existing one)
db = sqlite3.connect('../data/hop_teaming.sqlite')

In [None]:
# Get a cursor on the connection; it is used to run the DROP / ALTER / UPDATE
# statements below (e.g. if you need to rewrite the table)
cursor = db.cursor()

In [None]:
# Drop the nppes table so it can be rebuilt from scratch, then print a confirmation line
# IF EXISTS keeps this cell from raising sqlite3.OperationalError ("no such table")
# when it is run against a fresh database or re-run after the table is already gone
cursor.execute("DROP TABLE IF EXISTS nppes")
print("Table dropped...")

In [None]:
# Stream the NPI data file in chunks, using only the selected columns, and build the nppes table
# With a chunksize of 5,000, it will take 1,343 loops to build the database
# The postal code column is read as text on purpose: left to type inference, all-digit
# codes are parsed as numbers, which drops leading zeros (e.g. '021391234' -> 21391234)
# and makes any 5-digit zip derived from the first five characters wrong
# There may still be a pink warning re: column types for other columns, which can probably be disregarded
for chunk in tqdm(pd.read_csv('../data/nppes/npidata_pfile_20050523-20210207.csv',
                              usecols = nppes_headers_vars,
                              dtype = {'Provider Business Practice Location Address Postal Code': str},
                              chunksize = 5000)):
    chunk.columns = [re.sub(r'\ \(|\)|\ ', '_', x.lower()) for x in chunk.columns]      # Clean up the column names
    chunk.to_sql('nppes', db, if_exists = 'append', index = False)            # Append the chunk to the nppes table

### Rename the column headers

In [None]:
# Read the table's current column names (element 1 of each PRAGMA table_info row)
temp = db.execute("PRAGMA table_info(nppes)").fetchall()
old_headers = [column_info[1] for column_info in temp]

# Short replacement names; the pairing below is positional, so these must be
# listed in the same order as the table's existing columns
new_headers = [
    'npi',
    'entity_type_code',
    'provider_organization_name',
    'provider_last_name',
    'provider_first_name',
    'provider_middle_name',
    'provider_name_prefix',
    'provider_name_suffix',
    'provider_credential',
    'address_01',
    'address_02',
    'city',
    'state',
    'zip_9',
]

# Map each existing column name to its replacement and display the mapping
headers = {old_name: new_name for old_name, new_name in zip(old_headers, new_headers)}
headers

In [None]:
# Preview the (old name, new name) pairs held in the mapping
headers.items()

In [None]:
# Rename the columns in place, issuing one ALTER TABLE statement per (old, new) pair
for old_name, new_name in headers.items():
    cursor.execute("ALTER TABLE nppes RENAME COLUMN "+old_name+" TO "+new_name+";")

In [None]:
# Re-read the table's column info to check that the column names have been updated
db.execute("PRAGMA table_info(nppes)").fetchall()

### Add a 5-digit zip code column

In [None]:
# Add a new, initially empty zip_5 column to the nppes table
add_zip = "ALTER TABLE nppes ADD COLUMN zip_5 varchar(5)"

cursor.execute(add_zip)

In [None]:
# Fill zip_5 on every row with the first five characters of zip_9
calc_zip = "UPDATE nppes SET zip_5 = SUBSTR(zip_9, 1, 5)"

cursor.execute(calc_zip)

### Before closing the database, commit all of the changes

In [None]:
# Commit the pending transaction
# This will ensure that the changes stick after closing the connection
db.commit()

In [None]:
# Write a test query: up to 10 rows from nppes that have zip_5 populated
query = '''
SELECT *
FROM nppes
WHERE zip_5 IS NOT NULL
LIMIT 10
'''

In [None]:
# Run the test query on the open connection and load the results into a test dataframe
test = pd.read_sql(query, db)

In [None]:
# Display the test dataframe to eyeball the renamed columns and the zip_5 values
test

In [None]:
# Close the database connection (the changes were committed above)
db.close()