In [15]:
import pandas as pd
import sqlite3
import sys

In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Pull in zip_cbsa table

In [3]:
db = sqlite3.connect('../data/nppes_lite.sqlite')

query = """
SELECT * 
FROM zip_cbsa
"""
zip_cbsa = pd.read_sql(query,db)

db.close()

In [4]:
zip_cbsa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47424 entries, 0 to 47423
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   zip        47424 non-null  object 
 1   cbsa       47424 non-null  int64  
 2   res_ratio  47424 non-null  float64
 3   bus_ratio  47424 non-null  float64
 4   oth_ratio  47424 non-null  float64
 5   tot_ratio  47424 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 2.2+ MB


In [5]:
#drop unecessary columns
zip_cbsa = zip_cbsa.drop(["res_ratio", "bus_ratio", "oth_ratio"], axis=1)

In [6]:
# sort records by zip and zip ratio
zip_cbsa = zip_cbsa.sort_values(['zip', 'tot_ratio'], 
                                ascending=[False, False])

In [7]:
#where a given zip code is in multiple CBSAs
#keep only the zip CBSA records with the greatest % of that zip
zip_cbsa = zip_cbsa.drop_duplicates(subset='zip', keep='first')

In [36]:
zip_cbsa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39451 entries, 47423 to 0
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   zip        39451 non-null  object 
 1   cbsa       39451 non-null  int64  
 2   tot_ratio  39451 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.2+ MB


### Pull in nppes table

In [46]:
#query= """
#    SELECT sql 
#    FROM sqlite_master 
#    WHERE name = 'nppes'
#    """
nppes_info.sql.unique()

array(['CREATE TABLE "nppes" (\n"npi" INTEGER,\n  "entity_type_code" REAL,\n  "org_name" TEXT,\n  "last_name" TEXT,\n  "first_name" TEXT,\n  "middle_name" TEXT,\n  "name_prefix" TEXT,\n  "name_suffix" TEXT,\n  "provider_credential" TEXT,\n  "address_1" TEXT,\n  "address_2" TEXT,\n  "city" TEXT,\n  "state" TEXT,\n  "zip" REAL,\n  "taxonomy_code" TEXT\n)'],
      dtype=object)

In [10]:
db = sqlite3.connect('../data/nppes_lite.sqlite') #reopen the connection

query = """
SELECT * 
FROM nppes
"""
nppes = pd.read_sql(query,db)

db.close()

In [13]:
#Random sample of 5 zip codes from table.
#Records will have either 5 or 9 digit zip codes.
nppes.zip.sample(5)

114423    371865060.0
113075    372032504.0
9497      381343895.0
34947     381053678.0
45794         37909.0
Name: zip, dtype: float64

In [116]:
#change column from float to string & keep only the first 5 characters
nppes['zip'] = nppes['zip'].astype(str).str[:5]

In [120]:
nppes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115486 entries, 0 to 115485
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   npi                  115486 non-null  int64  
 1   entity_type_code     115486 non-null  float64
 2   org_name             24501 non-null   object 
 3   last_name            90982 non-null   object 
 4   first_name           90985 non-null   object 
 5   middle_name          62237 non-null   object 
 6   name_prefix          34723 non-null   object 
 7   name_suffix          2915 non-null    object 
 8   provider_credential  74398 non-null   object 
 9   address_1            115486 non-null  object 
 10  address_2            26571 non-null   object 
 11  city                 115486 non-null  object 
 12  state                115486 non-null  object 
 13  zip                  115486 non-null  object 
 14  taxonomy_code        115486 non-null  object 
dtypes: float64(1), in

In [122]:
nppes.sample(2)

Unnamed: 0,npi,entity_type_code,org_name,last_name,first_name,middle_name,name_prefix,name_suffix,provider_credential,address_1,address_2,city,state,zip,taxonomy_code
77605,1801348982,1.0,,MILLER,VALERIE,JEAN,MRS.,,FNP,11416 GRIGSBY CHAPEL RD STE 104,,KNOXVILLE,TN,37934,363LF0000X
67892,1194127902,1.0,,WILLOUGHBY,ANESA,R.,,,FNP-C,166 E MAIN ST,,HENDERSONVILLE,TN,37075,363LF0000X


### Merge nppes and zip_cbsa dataframes, filter for Nashville CBSA

In [117]:
nppes_cbsa = nppes.merge(zip_cbsa, on="zip")

nppes_cbsa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114907 entries, 0 to 114906
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   npi                  114907 non-null  int64  
 1   entity_type_code     114907 non-null  float64
 2   org_name             24481 non-null   object 
 3   last_name            90423 non-null   object 
 4   first_name           90426 non-null   object 
 5   middle_name          61882 non-null   object 
 6   name_prefix          34498 non-null   object 
 7   name_suffix          2904 non-null    object 
 8   provider_credential  73963 non-null   object 
 9   address_1            114907 non-null  object 
 10  address_2            26314 non-null   object 
 11  city                 114907 non-null  object 
 12  state                114907 non-null  object 
 13  zip                  114907 non-null  object 
 14  taxonomy_code        114907 non-null  object 
 15  cbsa             

In [118]:
#filter for Nashville CBSA code
nppes_cbsa[nppes_cbsa.cbsa==34980].zip.nunique()

110

In [119]:
nash_nppes=nppes_cbsa[nppes_cbsa.cbsa==34980]

#### Save nash_nppes as table in database

In [123]:
db = sqlite3.connect('../data/nppes_lite.sqlite') #open connection

nash_nppes.to_sql('nash_nppes', db, if_exists = 'append', index = False) #save nashville CBSA nppes

db.close() #close connection

#### Verify table data

In [126]:
db = sqlite3.connect('../data/nppes_lite.sqlite') #reopen the connection

query= """
    SELECT *
    FROM nash_nppes
    LIMIT 10
    """
nash_nppes = pd.read_sql(query,db)

db.close

<function Connection.close>

In [127]:
nash_nppes

Unnamed: 0,npi,entity_type_code,org_name,last_name,first_name,middle_name,name_prefix,name_suffix,provider_credential,address_1,address_2,city,state,zip,taxonomy_code,cbsa,tot_ratio
0,1134122187,1.0,,RUDNICKE,CHERYL,DENISE,MRS.,,CRNP,250 25TH AVE N,STE 412,NASHVILLE,TN,37203,363L00000X,34980,1.0
1,1003819046,1.0,,NYLANDER,BARBARA,H,,,M.D.,345 23RD AVE N,SUITE 209,NASHVILLE,TN,37203,207VG0400X,34980,1.0
2,1922001957,1.0,,PRESLEY,RICHARD,E,,,M.D.,2011 MURPHY AVE,STE 302,NASHVILLE,TN,37203,207V00000X,34980,1.0
3,1760485817,1.0,,MORAN,SAM,HOUSTON,,,M.D.,329 21ST AVE N,STE 4,NASHVILLE,TN,37203,207V00000X,34980,1.0
4,1437152485,1.0,,MORGAN,LISA,BROOKS,,,MD,2201 MURPHY AVE STE 407,,NASHVILLE,TN,37203,207V00000X,34980,1.0
5,1336142363,1.0,,REYNOLDS,MELISSA,G,,,M.D.,330 23RD AVE N,SUITE 604,NASHVILLE,TN,37203,207V00000X,34980,1.0
6,1154324192,1.0,,OLDFIELD,ELIZABETH,L,,,M.D.,2011 MURPHY AVE,STE 200,NASHVILLE,TN,37203,207V00000X,34980,1.0
7,1871596080,1.0,,ALTENBERN,DARRINGTON,PHILLIPS,,,M.D.,"329 23RD AVE NO, SUITE 328",,NASHVILLE,TN,37203,207V00000X,34980,1.0
8,1770586786,1.0,,RICHARDS,SHERRIE,ANDERSON,,,M.D.,2201 MURPHY AVE,STE 410,NASHVILLE,TN,37203,207V00000X,34980,1.0
9,1285637447,1.0,,BRESSMAN,PHILLIP,L,,,M.D.,300 20TH AVE N STE 302,,NASHVILLE,TN,37203,207V00000X,34980,1.0


#### Check memory usage of zip codes as float/int/str

https://stackoverflow.com/questions/20970279/how-to-do-a-left-right-and-mid-of-a-string-in-a-pandas-dataframe/20970328

https://towardsdatascience.com/the-strange-size-of-python-objects-in-memory-ce87bdfbb97f

https://www.geeksforgeeks.org/how-to-find-size-of-an-object-in-python/

https://sdsawtelle.github.io/blog/output/large-data-files-pandas-sqlite.html

In [48]:
nppes.memory_usage()

Index                     128
npi                    923888
entity_type_code       923888
org_name               923888
last_name              923888
first_name             923888
middle_name            923888
name_prefix            923888
name_suffix            923888
provider_credential    923888
address_1              923888
address_2              923888
city                   923888
state                  923888
zip                    923888
taxonomy_code          923888
dtype: int64

In [16]:
sys.getsizeof(nppes)

72233395

In [24]:
print(sys.getsizeof(nppes['zip'])) #zip column as float64

print(sys.getsizeof(nppes['zip'].astype(str))) #zip column as string

print(sys.getsizeof(nppes['zip'].astype(str).str[:5])) #just first 5 characters of zip col as str

924032
7805687


#### Isolate Nashville-Davidson CBSA zip codes
CBSA id is 34980

In [None]:
nash_zip_cbsa = zip_cbsa.copy()
nash_zip_cbsa = nash_zip_cbsa[nash_zip_cbsa.cbsa==34980]
nash_zip_cbsa.info()

#### Alternately, do datatype conversion & string manipulation in SQL query

In [98]:
db = sqlite3.connect('../data/nppes_lite.sqlite') #reopen the connection

query= """
    SELECT
        npi,
        entity_type_code,
        org_name,
        last_name,
        first_name,
        middle_name,
        name_prefix,
        name_suffix,
        provider_credential,
        address_1,
        address_2,
        city,
        state,
        SUBSTR(nppes.zip, 1, 5) AS zip,
        taxonomy_code
    FROM nppes
    """
nppes_zip5 = pd.read_sql(query,db)

db.close


<function Connection.close>

In [99]:
nppes_zip5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115486 entries, 0 to 115485
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   npi                  115486 non-null  int64  
 1   entity_type_code     115486 non-null  float64
 2   org_name             24501 non-null   object 
 3   last_name            90982 non-null   object 
 4   first_name           90985 non-null   object 
 5   middle_name          62237 non-null   object 
 6   name_prefix          34723 non-null   object 
 7   name_suffix          2915 non-null    object 
 8   provider_credential  74398 non-null   object 
 9   address_1            115486 non-null  object 
 10  address_2            26571 non-null   object 
 11  city                 115486 non-null  object 
 12  state                115486 non-null  object 
 13  zip                  115486 non-null  object 
 14  taxonomy_code        115486 non-null  object 
dtypes: float64(1), in

In [105]:
#check random samples to make sure the zipcode is formatted correctly
nppes_zip5.sample()

Unnamed: 0,npi,entity_type_code,org_name,last_name,first_name,middle_name,name_prefix,name_suffix,provider_credential,address_1,address_2,city,state,zip,taxonomy_code
91681,1790339075,1.0,,BARNES,APRIL,,,,,2250 WILMA RUDOLPH BLVD STE F259,,CLARKSVILLE,TN,37040,106S00000X


In [101]:
#merge dataframes
nppes_nash = nppes_zip5.merge(nash_zip_cbsa, on="zip")

In [102]:
nppes_nash.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38148 entries, 0 to 38147
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   npi                  38148 non-null  int64  
 1   entity_type_code     38148 non-null  float64
 2   org_name             7466 non-null   object 
 3   last_name            30681 non-null  object 
 4   first_name           30682 non-null  object 
 5   middle_name          19790 non-null  object 
 6   name_prefix          11470 non-null  object 
 7   name_suffix          895 non-null    object 
 8   provider_credential  25175 non-null  object 
 9   address_1            38148 non-null  object 
 10  address_2            9877 non-null   object 
 11  city                 38148 non-null  object 
 12  state                38148 non-null  object 
 13  zip                  38148 non-null  object 
 14  taxonomy_code        38148 non-null  object 
 15  cbsa                 38148 non-null 

In [108]:
nppes_nash.zip.nunique()

110

In [103]:
nppes_all = nppes_zip5.merge(zip_cbsa, on="zip")

In [104]:
nppes_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114907 entries, 0 to 114906
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   npi                  114907 non-null  int64  
 1   entity_type_code     114907 non-null  float64
 2   org_name             24481 non-null   object 
 3   last_name            90423 non-null   object 
 4   first_name           90426 non-null   object 
 5   middle_name          61882 non-null   object 
 6   name_prefix          34498 non-null   object 
 7   name_suffix          2904 non-null    object 
 8   provider_credential  73963 non-null   object 
 9   address_1            114907 non-null  object 
 10  address_2            26314 non-null   object 
 11  city                 114907 non-null  object 
 12  state                114907 non-null  object 
 13  zip                  114907 non-null  object 
 14  taxonomy_code        114907 non-null  object 
 15  cbsa             

In [109]:
nppes_all[nppes_all.cbsa==34980].zip.nunique()

110

In [None]:
1841293891