In [1]:
import pandas as pd
import sqlite3
import re

In [2]:
# Lift pandas' display truncation: show every column and every row.
for _option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_option, None)

### 1 - Cleaning zipcodes in the zip_cbsa table

In [3]:
# Load the whole zip_cbsa table into a DataFrame.
db = sqlite3.connect('../data/nppes_lite.sqlite')  # open the connection

query = """
SELECT * 
FROM zip_cbsa
"""
# try/finally guarantees the connection is closed even when the query fails
# (sqlite3's own `with connection:` only manages transactions, it does not close).
try:
    zip_cbsa = pd.read_sql(query, db)
finally:
    db.close()

In [4]:
# Preview the first rows of the freshly loaded zip_cbsa table.
zip_cbsa.head()

Unnamed: 0,zip,cbsa,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,501,35620,0.0,1.0,0.0,1.0
1,601,38660,1.0,1.0,1.0,1.0
2,602,10380,1.0,1.0,1.0,1.0
3,603,10380,1.0,1.0,1.0,1.0
4,604,10380,1.0,1.0,1.0,1.0


In [5]:
# Of the ratio columns only tot_ratio is kept; discard the other three.
zip_cbsa = zip_cbsa.drop(columns=["res_ratio", "bus_ratio", "oth_ratio"])

In [6]:
# Inspect the dtype of each remaining column.
zip_cbsa.dtypes

zip           object
cbsa           int64
tot_ratio    float64
dtype: object

In [7]:
# Summary statistics of the numeric columns before de-duplication.
zip_cbsa.describe()

Unnamed: 0,cbsa,tot_ratio
count,47424.0,47424.0
mean,46799.180373,0.831878
std,31193.74286,0.341268
min,10100.0,2.7e-05
25%,24060.0,0.964681
50%,36740.0,1.0
75%,48900.0,1.0
max,99999.0,1.0


In [8]:
### check the zip values
#zip_cbsa["zip"].value_counts()

In [9]:
# Order rows by zip and then tot_ratio, both descending, so that within each
# zip the row with the largest tot_ratio comes first.
zip_cbsa = zip_cbsa.sort_values(by=['zip', 'tot_ratio'], ascending=False)

In [10]:
# Keep a single row per zip: the first one in the current row order
# (drop_duplicates retains the first occurrence by default).
zip_cbsa = zip_cbsa.drop_duplicates(subset=['zip'])

In [11]:
# Row and column count after keeping one row per zip.
zip_cbsa.shape

(39451, 3)

In [12]:
# Summary statistics of the numeric columns after de-duplication.
zip_cbsa.describe()

Unnamed: 0,cbsa,tot_ratio
count,39451.0,39451.0
mean,46439.739677,0.981053
std,30845.636665,0.068706
min,10100.0,0.337094
25%,24340.0,1.0
50%,36740.0,1.0
75%,48260.0,1.0
max,99999.0,1.0


### Cleaning zipcodes in the NPPES table

In [13]:
# Load the whole nppes table into a DataFrame.
db = sqlite3.connect('../data/nppes_lite.sqlite')  # reopen the connection

query = """
SELECT * 
FROM nppes
"""
# try/finally guarantees the connection is closed even when the query fails
# (sqlite3's own `with connection:` only manages transactions, it does not close).
try:
    nppes = pd.read_sql(query, db)
finally:
    db.close()

In [14]:
# Row and column count of the nppes table as loaded.
nppes.shape

(115486, 15)

In [15]:
# Column dtypes and non-null counts; note that zip comes back as float64.
nppes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115486 entries, 0 to 115485
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   npi                  115486 non-null  int64  
 1   entity_type_code     115486 non-null  float64
 2   org_name             24501 non-null   object 
 3   last_name            90982 non-null   object 
 4   first_name           90985 non-null   object 
 5   middle_name          62237 non-null   object 
 6   name_prefix          34723 non-null   object 
 7   name_suffix          2915 non-null    object 
 8   provider_credential  74398 non-null   object 
 9   address_1            115486 non-null  object 
 10  address_2            26571 non-null   object 
 11  city                 115486 non-null  object 
 12  state                115486 non-null  object 
 13  zip                  115486 non-null  float64
 14  taxonomy_code        115486 non-null  object 
dtypes: float64(2), in

In [16]:
# Frequency of each raw zip value; both 5-digit and 9-digit values occur
# (the 9-digit ones are presumably ZIP+4 codes — confirm against the data source).
nppes["zip"].value_counts()

379175158.0    1988
372320001.0    1935
379201511.0     619
381042127.0     517
372122637.0     515
381033438.0     426
374032147.0     366
37203.0         351
37684.0         349
37232.0         349
372031401.0     342
381630001.0     336
372173841.0     335
37604.0         335
381053678.0     324
372325100.0     297
372052013.0     279
381043415.0     279
372281805.0     272
372320004.0     272
372031448.0     234
381032807.0     232
383013906.0     226
379161809.0     222
372320005.0     218
376207430.0     212
372320011.0     202
379215718.0     202
37129.0         201
372094129.0     189
371291237.0     189
381340181.0     188
374032136.0     187
372042235.0     184
38138.0         183
37421.0         183
372031562.0     181
38120.0         179
370275780.0     177
380193630.0     177
372320014.0     176
37208.0         173
38401.0         173
381276662.0     169
374043239.0     167
384014802.0     164
37027.0         162
374211894.0     161
376046035.0     161
37404.0         160


In [17]:
# Cast the float zip column to its string representation (e.g. 37203.0 -> '37203.0').
nppes = nppes.astype({'zip': str})

In [18]:
# Strip the trailing ".0" left over from the float -> str conversion.
# regex=True is passed explicitly: from pandas 2.0 the default is regex=False,
# under which the pattern would be taken literally and never match, leaving
# '37203.0'-style strings for the later length-based cleaning to mangle.
# The raw string avoids the invalid "\." escape, and "$" anchors the match to the end.
nppes['zip'] = nppes['zip'].str.replace(r'\.0$', '', regex=True)

  


In [19]:
def zip_finder(zips):
    """Reduce a zip string to a five-character code, restoring lost leading zeros.

    The decision is based purely on the length of ``zips``:

    * 9, 8 or 7 characters: treated as a nine-character code that lost 0, 1 or 2
      leading zeros (presumably ZIP+4); the last four characters are discarded
      and the missing zeros are put back.
    * 5, 4 or 3 characters: treated as a five-character code that lost 0, 1 or 2
      leading zeros; the missing zeros are put back.
    * any other length: the sentinel string 'no real zip' is returned.

    Parameters
    ----------
    zips : str
        Zip code as a string.

    Returns
    -------
    str
        The five-character zip, or 'no real zip' when the length is unsupported.
    """
    # length of input -> (zeros to prepend, number of leading characters to keep)
    layout_by_length = {
        9: ('', 5),
        8: ('0', 4),
        7: ('00', 3),
        5: ('', 5),
        4: ('0', 4),
        3: ('00', 3),
    }
    layout = layout_by_length.get(len(zips))
    if layout is None:
        return 'no real zip'
    padding, keep = layout
    return padding + zips[:keep]

In [20]:
# Normalise every zip with zip_finder, then discard the rows it flagged
# with the 'no real zip' sentinel.
nppes["zip"] = nppes["zip"].apply(zip_finder)
is_real_zip = nppes["zip"].ne("no real zip")
nppes = nppes[is_real_zip]

In [21]:
# Preview the first rows after the zip column has been normalised.
nppes.head()

Unnamed: 0,npi,entity_type_code,org_name,last_name,first_name,middle_name,name_prefix,name_suffix,provider_credential,address_1,address_2,city,state,zip,taxonomy_code
0,1841293891,1.0,,GIBBS,ELMER,RICKEY,DR.,,M.D.,49 CLEVELAND ST 310,,CROSSVILLE,TN,38555,208600000X
1,1659374601,1.0,,OBERDICK,WENDY,TIPTON,,,MD,105 W STONE DR,STE 1F,KINGSPORT,TN,37660,207Q00000X
2,1134122187,1.0,,RUDNICKE,CHERYL,DENISE,MRS.,,CRNP,250 25TH AVE N,STE 412,NASHVILLE,TN,37203,363L00000X
3,1003819046,1.0,,NYLANDER,BARBARA,H,,,M.D.,345 23RD AVE N,SUITE 209,NASHVILLE,TN,37203,207VG0400X
4,1750384780,1.0,,PERRIGIN,JULIE,A,DR.,,MD,219 CHURCH ST,,DICKSON,TN,37055,207Q00000X


In [22]:
# Frequency of each normalised zip value.
nppes["zip"].value_counts()

37232    5395
37203    3646
37917    2615
37604    2208
37027    1935
37421    1929
38104    1916
37129    1738
38138    1654
37660    1573
37920    1547
37403    1518
37067    1489
38103    1450
37404    1414
38401    1377
37212    1367
38501    1367
38305    1315
37923    1303
37919    1267
37043    1265
38119    1262
38120    1259
37211    1250
37205    1166
38301    1159
37040    1149
37075    1053
37204    1005
38134     995
37620     963
37916     953
37601     895
37130     887
37909     849
37066     826
37167     813
37830     812
37814     786
37922     771
37388     747
38105     741
37064     727
37804     727
38018     716
37076     712
37228     706
37217     697
37921     685
37934     673
37343     663
37087     655
37209     647
37055     634
37122     634
37115     622
37208     619
37312     609
37215     608
37172     589
37745     583
38118     547
37862     539
38017     530
38117     508
38555     503
38163     499
37214     494
37311     494
38024     476
38133 

### Merge nppes and zip_cbsa

In [23]:
# zip_cbsa stores zips as un-padded strings (e.g. '501'), while the nppes zips
# have been normalised to five characters. Left-pad the lookup key so that
# zips with leading zeros can actually match.
zip_lookup = zip_cbsa.assign(zip=zip_cbsa["zip"].astype(str).str.zfill(5))

# Inner join (the pandas default, now stated explicitly): providers whose zip
# has no row in the lookup are dropped from the result.
# validate="many_to_one" raises MergeError if the lookup holds more than one
# row per zip, which would otherwise silently duplicate provider rows.
nppes_cbsa = nppes.merge(zip_lookup, on="zip", how="inner", validate="many_to_one")

In [24]:
# Preview the merged frame: the nppes columns plus cbsa and tot_ratio.
nppes_cbsa.head()

Unnamed: 0,npi,entity_type_code,org_name,last_name,first_name,middle_name,name_prefix,name_suffix,provider_credential,address_1,address_2,city,state,zip,taxonomy_code,cbsa,tot_ratio
0,1841293891,1.0,,GIBBS,ELMER,RICKEY,DR.,,M.D.,49 CLEVELAND ST 310,,CROSSVILLE,TN,38555,208600000X,18900,1.0
1,1871596403,2.0,"CUMBERLAND MEDICAL CENTER, INC.",,,,,,,421 S MAIN ST,,CROSSVILLE,TN,38555,282N00000X,18900,1.0
2,1851394241,1.0,,PATTERSON,LARRY,E.,DR.,,M.D.,15 IRIS LN,,CROSSVILLE,TN,38555,207W00000X,18900,1.0
3,1013911577,1.0,,VARCAK,RONALD,JAMES,DR.,,D.O.,133 HAYES ST,,CROSSVILLE,TN,38555,207Q00000X,18900,1.0
4,1639175573,1.0,,BERRY,PIERRE,KINDALL,DR.,,D.O.,13 BOB TOLLETT LOOP,,CROSSVILLE,TN,38555,207Q00000X,18900,1.0


In [25]:
# Row and column count of nppes (the pre-merge frame).
# NOTE(review): this inspects nppes, not the merged nppes_cbsa; nppes_cbsa.shape
# is presumably what would show whether the merge dropped any rows — confirm intent.
nppes.shape

(115486, 15)

### Taxonomy 

In [27]:
# Load the code, grouping, classification and specialization columns of the
# nucc_taxonomy table into a DataFrame.
db = sqlite3.connect('../data/nppes_lite.sqlite')  # reopen the connection

query = """
SELECT code, grouping, classification, specialization 
FROM nucc_taxonomy
"""
# try/finally guarantees the connection is closed even when the query fails
# (sqlite3's own `with connection:` only manages transactions, it does not close).
try:
    taxonomy = pd.read_sql(query, db)
finally:
    db.close()

In [28]:
# Preview the first rows of the taxonomy table.
taxonomy.head()

Unnamed: 0,code,grouping,classification,specialization
0,193200000X,Group,Multi-Specialty,
1,193400000X,Group,Single Specialty,
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology
