In [15]:
import pandas as pd
import sqlite3
import sys

In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Pull in zip_cbsa table

In [3]:
db = sqlite3.connect('../data/nppes_lite.sqlite')

query = """
SELECT * 
FROM zip_cbsa
"""
zip_cbsa = pd.read_sql(query,db)

db.close()

In [4]:
zip_cbsa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47424 entries, 0 to 47423
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   zip        47424 non-null  object 
 1   cbsa       47424 non-null  int64  
 2   res_ratio  47424 non-null  float64
 3   bus_ratio  47424 non-null  float64
 4   oth_ratio  47424 non-null  float64
 5   tot_ratio  47424 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 2.2+ MB


In [5]:
#drop unecessary columns
zip_cbsa = zip_cbsa.drop(["res_ratio", "bus_ratio", "oth_ratio"], axis=1)

In [6]:
# sort records by zip and zip ratio
zip_cbsa = zip_cbsa.sort_values(['zip', 'tot_ratio'], 
                                ascending=[False, False])

In [7]:
#where a given zip code is in multiple CBSAs
#keep only the zip CBSA records with the greatest % of that zip
zip_cbsa = zip_cbsa.drop_duplicates(subset='zip', keep='first')

In [36]:
zip_cbsa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39451 entries, 47423 to 0
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   zip        39451 non-null  object 
 1   cbsa       39451 non-null  int64  
 2   tot_ratio  39451 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.2+ MB


#### Isolate Nashville-Davidson CBSA zip codes
CBSA id is 34980

In [40]:
nash_zip_cbsa = zip_cbsa.copy()
nash_zip_cbsa = nash_zip_cbsa[nash_zip_cbsa.cbsa==34980]
nash_zip_cbsa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 135 entries, 17963 to 16986
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   zip        135 non-null    object 
 1   cbsa       135 non-null    int64  
 2   tot_ratio  135 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 4.2+ KB


In [66]:
filter_zip_list = nash_zip_cbsa.zip.tolist()

In [None]:
#df.to_sql('nppes', db, if_exists = 'append', index = False) 

### Pull in nppes table

In [46]:
#query= """
#    SELECT sql 
#    FROM sqlite_master 
#    WHERE name = 'nppes'
#    """
nppes_info.sql.unique()

array(['CREATE TABLE "nppes" (\n"npi" INTEGER,\n  "entity_type_code" REAL,\n  "org_name" TEXT,\n  "last_name" TEXT,\n  "first_name" TEXT,\n  "middle_name" TEXT,\n  "name_prefix" TEXT,\n  "name_suffix" TEXT,\n  "provider_credential" TEXT,\n  "address_1" TEXT,\n  "address_2" TEXT,\n  "city" TEXT,\n  "state" TEXT,\n  "zip" REAL,\n  "taxonomy_code" TEXT\n)'],
      dtype=object)

In [10]:
db = sqlite3.connect('../data/nppes_lite.sqlite') #reopen the connection

query = """
SELECT * 
FROM nppes
"""
nppes = pd.read_sql(query,db)

db.close()

In [11]:
nppes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115486 entries, 0 to 115485
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   npi                  115486 non-null  int64  
 1   entity_type_code     115486 non-null  float64
 2   org_name             24501 non-null   object 
 3   last_name            90982 non-null   object 
 4   first_name           90985 non-null   object 
 5   middle_name          62237 non-null   object 
 6   name_prefix          34723 non-null   object 
 7   name_suffix          2915 non-null    object 
 8   provider_credential  74398 non-null   object 
 9   address_1            115486 non-null  object 
 10  address_2            26571 non-null   object 
 11  city                 115486 non-null  object 
 12  state                115486 non-null  object 
 13  zip                  115486 non-null  float64
 14  taxonomy_code        115486 non-null  object 
dtypes: float64(2), in

In [13]:
#Random sample of 5 zip codes from table.
#Records will have either 5 or 9 digit zip codes.
nppes.zip.sample(5)

114423    371865060.0
113075    372032504.0
9497      381343895.0
34947     381053678.0
45794         37909.0
Name: zip, dtype: float64

https://stackoverflow.com/questions/20970279/how-to-do-a-left-right-and-mid-of-a-string-in-a-pandas-dataframe/20970328

https://towardsdatascience.com/the-strange-size-of-python-objects-in-memory-ce87bdfbb97f

https://www.geeksforgeeks.org/how-to-find-size-of-an-object-in-python/

https://sdsawtelle.github.io/blog/output/large-data-files-pandas-sqlite.html

In [None]:
#change column from float to string & keep only the first 5 characters
#nppes['zip'] = nppes['zip'].astype(str).str[:5]

In [48]:
nppes.memory_usage()

Index                     128
npi                    923888
entity_type_code       923888
org_name               923888
last_name              923888
first_name             923888
middle_name            923888
name_prefix            923888
name_suffix            923888
provider_credential    923888
address_1              923888
address_2              923888
city                   923888
state                  923888
zip                    923888
taxonomy_code          923888
dtype: int64

#### Check memory usage of zip codes as float/int/str

In [16]:
sys.getsizeof(nppes)

72233395

In [24]:
print(sys.getsizeof(nppes['zip'])) #zip column as float64

print(sys.getsizeof(nppes['zip'].astype(str))) #zip column as string

print(sys.getsizeof(nppes['zip'].astype(str).str[:5])) #just first 5 characters of zip col as str

924032
7805687


In [73]:
nppes.columns

Index(['npi', 'entity_type_code', 'org_name', 'last_name', 'first_name',
       'middle_name', 'name_prefix', 'name_suffix', 'provider_credential',
       'address_1', 'address_2', 'city', 'state', 'zip', 'taxonomy_code'],
      dtype='object')

### Test datatype conversion & string manipulation in SQL query to save memory

In [49]:
db = sqlite3.connect('../data/nppes_lite.sqlite') #reopen the connection

query= """
    SELECT npi, CAST(zip AS TEXT)
    FROM nppes
    LIMIT 15
    """
nppeszip = pd.read_sql(query,db)

db.close


<function Connection.close>

In [51]:
nppeszip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   npi                15 non-null     int64 
 1   CAST(zip AS TEXT)  15 non-null     object
dtypes: int64(1), object(1)
memory usage: 368.0+ bytes


In [69]:
query= """
    SELECT
        npi,
        SUBSTR(zip, 1, 5)
    FROM nppes
    WHERE SUBSTR(ZIP, 1,5) IN """ + filter_zip_list + """
    LIMIT 50
    """



TypeError: can only concatenate str (not "list") to str

In [59]:
with sqlite3.connect('../data/nppes_lite.sqlite') as db:
    test = pd.read_sql(query,db)

In [60]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   npi                50 non-null     int64 
 1   SUBSTR(zip, 1, 5)  50 non-null     object
dtypes: int64(1), object(1)
memory usage: 928.0+ bytes


In [61]:
test

Unnamed: 0,npi,"SUBSTR(zip, 1, 5)"
0,1841293891,38555
1,1659374601,37660
2,1134122187,37203
3,1003819046,37203
4,1750384780,37055
5,1396748323,37711
6,1922001957,37203
7,1104829134,37664
8,1073516001,37055
9,1144223298,38120
