In [1]:
import pandas as pd
import sqlite3
import re

In [2]:
# Disable pandas' display truncation: every column and every row of a frame is rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

### 1-Cleaning zipcodes of zip_cbsa file

In [3]:
# Read the full zip_cbsa table from the SQLite database into a DataFrame.
db = sqlite3.connect('../data/nppes_lite.sqlite')

query = """
SELECT * 
FROM zip_cbsa
"""
try:
    zip_cbsa = pd.read_sql(query, db)
finally:
    # Runs even when the query raises, so the connection is never left open.
    db.close()

In [4]:
zip_cbsa.head()

Unnamed: 0,zip,cbsa,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,501,35620,0.0,1.0,0.0,1.0
1,601,38660,1.0,1.0,1.0,1.0
2,602,10380,1.0,1.0,1.0,1.0
3,603,10380,1.0,1.0,1.0,1.0
4,604,10380,1.0,1.0,1.0,1.0


In [5]:
### Drop columns
# Only tot_ratio is used below (to pick one CBSA per zip); the other ratios are not needed.
zip_cbsa = zip_cbsa.drop(columns=["res_ratio", "bus_ratio", "oth_ratio"])

# zip is an object column whose values display without leading zeros (501, 601, ...),
# while the NPPES zips are padded to 5 digits before the merge. Pad here as well so
# that leading-zero zip codes can match on the "zip" key.
# NOTE(review): assumes every zip value is a plain run of digits -- confirm.
zip_cbsa["zip"] = zip_cbsa["zip"].astype(str).str.zfill(5)

In [6]:
zip_cbsa.dtypes

zip           object
cbsa           int64
tot_ratio    float64
dtype: object

In [7]:
zip_cbsa.describe()

Unnamed: 0,cbsa,tot_ratio
count,47424.0,47424.0
mean,46799.180373,0.831878
std,31193.74286,0.341268
min,10100.0,2.7e-05
25%,24060.0,0.964681
50%,36740.0,1.0
75%,48900.0,1.0
max,99999.0,1.0


In [8]:
### check the zip values
#zip_cbsa["zip"].value_counts()

In [9]:
### Sort values
# Descending on both keys: within each zip, the row with the largest tot_ratio comes first.
zip_cbsa = zip_cbsa.sort_values(by=["zip", "tot_ratio"], ascending=False)

In [10]:
## Keep only the highest ratio
# Relies on the preceding sort: the first row seen for each zip is the one with the
# highest tot_ratio, and drop_duplicates keeps the first occurrence by default.
zip_cbsa = zip_cbsa.drop_duplicates(subset=["zip"])

In [11]:
zip_cbsa.shape

(39451, 3)

In [12]:
zip_cbsa.describe()

Unnamed: 0,cbsa,tot_ratio
count,39451.0,39451.0
mean,46439.739677,0.981053
std,30845.636665,0.068706
min,10100.0,0.337094
25%,24340.0,1.0
50%,36740.0,1.0
75%,48260.0,1.0
max,99999.0,1.0


### Cleaning zipcodes from NPPES file

In [13]:
# Read the full nppes table from the SQLite database into a DataFrame.
db = sqlite3.connect('../data/nppes_lite.sqlite')

query = """
SELECT * 
FROM nppes
"""
try:
    nppes = pd.read_sql(query, db)
finally:
    # Runs even when the query raises, so the connection is never left open.
    db.close()

In [14]:
nppes.shape

(115486, 15)

In [15]:
nppes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115486 entries, 0 to 115485
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   npi                  115486 non-null  int64  
 1   entity_type_code     115486 non-null  float64
 2   org_name             24501 non-null   object 
 3   last_name            90982 non-null   object 
 4   first_name           90985 non-null   object 
 5   middle_name          62237 non-null   object 
 6   name_prefix          34723 non-null   object 
 7   name_suffix          2915 non-null    object 
 8   provider_credential  74398 non-null   object 
 9   address_1            115486 non-null  object 
 10  address_2            26571 non-null   object 
 11  city                 115486 non-null  object 
 12  state                115486 non-null  object 
 13  zip                  115486 non-null  float64
 14  taxonomy_code        115486 non-null  object 
dtypes: float64(2), in

In [16]:
#nppes["zip"].value_counts()

In [17]:
### float to object conversion for zip column
# zip is loaded as float64 (see nppes.info() above), so its string form carries a
# trailing ".0" (e.g. "37203.0") that must be stripped before the digits can be used.
nppes['zip'] = nppes['zip'].astype(str)

In [18]:
### Remove .0
# Strip the ".0" left over from the float -> str conversion.
# - regex=True is explicit because pandas >= 2.0 treats the pattern as a literal string
#   by default, in which case '\.0' would match nothing and the suffix would survive.
# - The raw string avoids Python's invalid-escape warning for '\.'.
# - '$' anchors the match so only a trailing ".0" is removed.
nppes['zip'] = nppes['zip'].str.replace(r'\.0$', '', regex=True)

  


In [19]:
### Some zip codes have fewer than 9 (ZIP+4) or 5 digits because their leading zeros were lost; add the zeros back and keep the 5-digit zip
def zip_finder(zips):
    """Normalize a zip code string to its 5-digit form.

    Parameters
    ----------
    zips : str
        Zip code as a string of digits, possibly missing leading zeros.
        7-9 digits are treated as a 9-digit ZIP+4 that lost up to two leading
        zeros; 3-5 digits are treated as a 5-digit zip that lost up to two
        leading zeros.

    Returns
    -------
    str
        The 5-digit zip code, or the sentinel 'no real zip' when the input has
        any other length or contains anything other than digits.
    """
    # Reject values such as 'nan' or '37203.0': they would otherwise be judged
    # by length alone and padded into bogus zips like '00nan' or '00372'.
    if not zips.isdigit():
        return 'no real zip'

    n_digits = len(zips)
    if 7 <= n_digits <= 9:
        # Restore the missing leading zeros of the ZIP+4, then keep the 5-digit prefix.
        return ('0' * (9 - n_digits) + zips)[:5]
    if 3 <= n_digits <= 5:
        # Restore the missing leading zeros of the 5-digit zip.
        return '0' * (5 - n_digits) + zips
    return 'no real zip'

In [20]:
# Normalize every zip with zip_finder, then discard the rows it flagged as invalid.
nppes["zip"] = nppes["zip"].apply(zip_finder)
is_real_zip = nppes["zip"] != "no real zip"
nppes = nppes[is_real_zip]

In [21]:
nppes.head()

Unnamed: 0,npi,entity_type_code,org_name,last_name,first_name,middle_name,name_prefix,name_suffix,provider_credential,address_1,address_2,city,state,zip,taxonomy_code
0,1841293891,1.0,,GIBBS,ELMER,RICKEY,DR.,,M.D.,49 CLEVELAND ST 310,,CROSSVILLE,TN,38555,208600000X
1,1659374601,1.0,,OBERDICK,WENDY,TIPTON,,,MD,105 W STONE DR,STE 1F,KINGSPORT,TN,37660,207Q00000X
2,1134122187,1.0,,RUDNICKE,CHERYL,DENISE,MRS.,,CRNP,250 25TH AVE N,STE 412,NASHVILLE,TN,37203,363L00000X
3,1003819046,1.0,,NYLANDER,BARBARA,H,,,M.D.,345 23RD AVE N,SUITE 209,NASHVILLE,TN,37203,207VG0400X
4,1750384780,1.0,,PERRIGIN,JULIE,A,DR.,,MD,219 CHURCH ST,,DICKSON,TN,37055,207Q00000X


In [22]:
#nppes["zip"].value_counts()

### Merge nppes and zip_cbsa

In [23]:
# Inner join on zip: providers whose zip has no row in zip_cbsa are dropped.
# validate="many_to_one" makes pandas raise a MergeError if zip_cbsa ever contains
# more than one row per zip, instead of silently duplicating provider rows.
nppes_cbsa = nppes.merge(zip_cbsa, on="zip", how="inner", validate="many_to_one")

In [24]:
nppes_cbsa.head()

Unnamed: 0,npi,entity_type_code,org_name,last_name,first_name,middle_name,name_prefix,name_suffix,provider_credential,address_1,address_2,city,state,zip,taxonomy_code,cbsa,tot_ratio
0,1841293891,1.0,,GIBBS,ELMER,RICKEY,DR.,,M.D.,49 CLEVELAND ST 310,,CROSSVILLE,TN,38555,208600000X,18900,1.0
1,1871596403,2.0,"CUMBERLAND MEDICAL CENTER, INC.",,,,,,,421 S MAIN ST,,CROSSVILLE,TN,38555,282N00000X,18900,1.0
2,1851394241,1.0,,PATTERSON,LARRY,E.,DR.,,M.D.,15 IRIS LN,,CROSSVILLE,TN,38555,207W00000X,18900,1.0
3,1013911577,1.0,,VARCAK,RONALD,JAMES,DR.,,D.O.,133 HAYES ST,,CROSSVILLE,TN,38555,207Q00000X,18900,1.0
4,1639175573,1.0,,BERRY,PIERRE,KINDALL,DR.,,D.O.,13 BOB TOLLETT LOOP,,CROSSVILLE,TN,38555,207Q00000X,18900,1.0


In [25]:
# Row count after the inner merge; comparing it with nppes.shape shows how many
# providers were dropped because their zip has no match in zip_cbsa.
nppes_cbsa.shape

(114912, 17)

### Taxonomy 

In [26]:
# Read the taxonomy lookup (code plus its three descriptive levels) from the SQLite database.
db = sqlite3.connect('../data/nppes_lite.sqlite')

query = """
SELECT code, grouping, classification, specialization 
FROM nucc_taxonomy
"""
try:
    taxonomy = pd.read_sql(query, db)
finally:
    # Runs even when the query raises, so the connection is never left open.
    db.close()

In [27]:
taxonomy.head()

Unnamed: 0,code,grouping,classification,specialization
0,193200000X,Group,Multi-Specialty,
1,193400000X,Group,Single Specialty,
2,207K00000X,Allopathic & Osteopathic Physicians,Allergy & Immunology,
3,207KA0200X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Allergy
4,207KI0005X,Allopathic & Osteopathic Physicians,Allergy & Immunology,Clinical & Laboratory Immunology


### Merge Taxonomy to nppes

In [28]:
# Attach the taxonomy description to each provider by matching the provider's
# taxonomy_code against the lookup's code column (inner join).
nppes_cbsa_tax = pd.merge(
    nppes_cbsa,
    taxonomy,
    how="inner",
    left_on="taxonomy_code",
    right_on="code",
)

In [29]:
nppes_cbsa_tax.shape

(114912, 21)

In [30]:
# entity_type_code is loaded as float (1.0 / 2.0 in the preview above); store it as an integer.
nppes_cbsa_tax = nppes_cbsa_tax.astype({'entity_type_code': int})

In [31]:
nppes_cbsa_tax.head()

Unnamed: 0,npi,entity_type_code,org_name,last_name,first_name,middle_name,name_prefix,name_suffix,provider_credential,address_1,address_2,city,state,zip,taxonomy_code,cbsa,tot_ratio,code,grouping,classification,specialization
0,1841293891,1,,GIBBS,ELMER,RICKEY,DR.,,M.D.,49 CLEVELAND ST 310,,CROSSVILLE,TN,38555,208600000X,18900,1.0,208600000X,Allopathic & Osteopathic Physicians,Surgery,
1,1487647681,1,,BROCKMAN,J B,,DR.,,MD,100 LANTANA RD STE 202,,CROSSVILLE,TN,38555,208600000X,18900,1.0,208600000X,Allopathic & Osteopathic Physicians,Surgery,
2,1013973874,1,,OLAECHEA,REYNALDO,A,MR.,,MD,124 HAYES ST,,CROSSVILLE,TN,38555,208600000X,18900,1.0,208600000X,Allopathic & Osteopathic Physicians,Surgery,
3,1669424107,1,,BELL,CHRISTOPHER,M,DR.,,M.D.,124 HAYES ST,,CROSSVILLE,TN,38555,208600000X,18900,1.0,208600000X,Allopathic & Osteopathic Physicians,Surgery,
4,1073526596,1,,FOX,MARK,ALAN,MR.,,MD,100 LANTANA RD,SUITE 202,CROSSVILLE,TN,38555,208600000X,18900,1.0,208600000X,Allopathic & Osteopathic Physicians,Surgery,


### Filter for Nashville cbsa only:

In [32]:
# Keep only the providers located in the Nashville CBSA.
NASHVILLE_CBSA = 34980
in_nashville = nppes_cbsa_tax["cbsa"] == NASHVILLE_CBSA
nash_nppes = nppes_cbsa_tax[in_nashville]

In [33]:
nash_nppes.shape

(38148, 21)

In [34]:
nash_nppes.head()

Unnamed: 0,npi,entity_type_code,org_name,last_name,first_name,middle_name,name_prefix,name_suffix,provider_credential,address_1,address_2,city,state,zip,taxonomy_code,cbsa,tot_ratio,code,grouping,classification,specialization
27,1497752489,1,,COOPER,MARK,,,,M.D.,356 24TH AVE N,SUITE 400,NASHVILLE,TN,37203,208600000X,34980,1.0,208600000X,Allopathic & Osteopathic Physicians,Surgery,
28,1437156270,1,,MCDOWELL,JAMES,G,,JR.,M.D.,356 24TH AVE N,SUITE 400,NASHVILLE,TN,37203,208600000X,34980,1.0,208600000X,Allopathic & Osteopathic Physicians,Surgery,
29,1447257274,1,,LYNCH,GEORGE,BRANDON,,,M.D.,300 20TH AVE N STE 301,,NASHVILLE,TN,37203,208600000X,34980,1.0,208600000X,Allopathic & Osteopathic Physicians,Surgery,
30,1881691780,1,,GEER,RICHARD,J,,,M.D.,356 24TH AVE N,SUITE 400,NASHVILLE,TN,37203,208600000X,34980,1.0,208600000X,Allopathic & Osteopathic Physicians,Surgery,
31,1205834777,1,,ROBERTS,JOHN,ROBERT,,,M.D.,2400 PATTERSON ST,SUITE 215,NASHVILLE,TN,37203,208600000X,34980,1.0,208600000X,Allopathic & Osteopathic Physicians,Surgery,


In [35]:
### remove useless columns
# 'code' duplicates taxonomy_code after the taxonomy merge; the remaining columns
# are not carried into the saved Nashville table.
unused_columns = [
    'code',
    'tot_ratio',
    'middle_name',
    'name_prefix',
    'name_suffix',
    'address_2',
    'city',
    'state',
    'cbsa',
]
nash_nppes = nash_nppes.drop(columns=unused_columns)

In [36]:
nash_nppes.head()

Unnamed: 0,npi,entity_type_code,org_name,last_name,first_name,provider_credential,address_1,zip,taxonomy_code,grouping,classification,specialization
27,1497752489,1,,COOPER,MARK,M.D.,356 24TH AVE N,37203,208600000X,Allopathic & Osteopathic Physicians,Surgery,
28,1437156270,1,,MCDOWELL,JAMES,M.D.,356 24TH AVE N,37203,208600000X,Allopathic & Osteopathic Physicians,Surgery,
29,1447257274,1,,LYNCH,GEORGE,M.D.,300 20TH AVE N STE 301,37203,208600000X,Allopathic & Osteopathic Physicians,Surgery,
30,1881691780,1,,GEER,RICHARD,M.D.,356 24TH AVE N,37203,208600000X,Allopathic & Osteopathic Physicians,Surgery,
31,1205834777,1,,ROBERTS,JOHN,M.D.,2400 PATTERSON ST,37203,208600000X,Allopathic & Osteopathic Physicians,Surgery,


In [37]:
nash_nppes.shape

(38148, 12)

In [38]:
#save as new table in database called nash_nppes
db = sqlite3.connect('../data/nppes_lite.sqlite') #open connection

try:
    # 'replace' (not 'append') keeps this cell idempotent: re-running the notebook
    # rebuilds the table instead of stacking another copy of every row onto it.
    nash_nppes.to_sql('nash_nppes', db, if_exists='replace', index=False) #save nashville CBSA nppes
finally:
    db.close() #close connection, even if the write fails

In [None]:
# create a database or connect to an existing one
#db = sqlite3.connect('../data/nppes_lite.sqlite')
# if you need to edit the database...
#cursor = db.cursor()
# Drop the table and return a line that says that it's gone
#cursor.execute("DROP TABLE nash_nppes")
#print("Table dropped...")

### Explore provider_credential values

In [54]:
# List the distinct provider_credential values in nash_nppes that start with 'M',
# to see how many spellings of the same credential exist.
# Note: SQLite's LIKE is case-insensitive for ASCII letters by default.
db = sqlite3.connect('../data/nppes_lite.sqlite')

query = """
SELECT DISTINCT(provider_credential) 
FROM nash_nppes
WHERE provider_credential LIKE 'M%'
"""
try:
    test = pd.read_sql(query, db)
finally:
    # Runs even when the query raises, so the connection is never left open.
    db.close()

In [55]:
test.shape

(754, 1)

In [56]:
# The query already selected DISTINCT values, so unique() is used here only to
# print the whole column as a compact array.
test.provider_credential.unique()

array(['M.D.', 'MD', 'M.D., F.A.C.S.', 'MD/PHD', 'MD, MPH', 'MD, MMS',
       'MD, MBA', 'M. D.', 'M.D., R.V.T', 'M.D', 'M.D,', 'MD PHD',
       'MD, PHD', 'M.D., PH.D.', 'MD, MPH, CPH', 'M.D:', 'MD MPH',
       'M.D, MPH', 'M.S., M.D.', 'M.D. MSPH', 'M.D., MBA',
       'MEDICAL DOCTOR', 'MSN, FNP-BC', 'MD, MS', 'MBBS', 'M.D., M.P.H.',
       'M.D., MSC', 'MD, BSC', 'MD MBA', 'MD, MS, MPH',
       'M.B., B.S., PH.D.,', 'M.D., P.A.', 'MD, MPHTM, CTROPMED',
       'M.D., M.ED.', 'M.D.,  MTR.', 'MD, MSTR', 'MD, FAAP', 'M.D., MPH',
       'M..D.', 'M.D., M.P.H., M.B.A.', 'M.D., M.H.S.', 'MS',
       'M.D. PH.D.', 'MA, BS, CPLC, CHT', 'MS OTR/L', 'M.ED.', 'M.D>',
       'M.ED, BCBA', 'M.A., BCBA', 'M.ED', 'M.D., P.C.', 'MA',
       'MD, PHD, THD', 'M.A.', 'M.S.', 'M.S., BCBA', 'M.D., PHD',
       'M.D., M.S.', 'MD, MCLINEPI, FANZCA', 'M.D., M.ED', 'MD, FRCPC',
       'M.B.B.S; M.D.', 'MBBCH', 'M.D. , PH.D.', 'MPAS, PA-C',
       'MHS, PA-C', 'MED; PA-C', 'MSM, PA-C', 'MSM,PA-C',
       'M.S