# Exploratory Data Analysis, Hop Teaming Dataset

### Import libraries, connect to the database, and take a look around

In [1]:
# import libraries
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

In [2]:
# Raise pandas' display limits so wide/long frames are shown with less truncation
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 200)

In [3]:
# Open the SQLite database read-only and keep the connection in `db`.
# A plain sqlite3.connect(path) silently creates an empty database when the
# path is wrong, which only surfaces later as "no such table". Opening through
# a URI with mode=ro makes a bad path fail right here instead, and also guards
# the source data against accidental writes (this notebook only reads).
DB_PATH = '../data/hop_teaming.sqlite'
db = sqlite3.connect(f'file:{DB_PATH}?mode=ro', uri=True)

In [4]:
# Print the name of every table stored in the database, one per line
tables = db.execute("SELECT name FROM sqlite_master WHERE type='table';")
for (table_name,) in tables:
    print(table_name)

cbsa
referrals
nashville_referrals
nppes
specialty


In [5]:
# For reference: list the column names of a given table.
# PRAGMA table_info yields one row per column; position 1 of each row is the name.
col = db.execute("PRAGMA table_info(specialty)").fetchall()
print([column_info[1] for column_info in col])

['npi', 'primary_taxonomy', 'Classification']


### How many providers are in the nppes dataset?
*Note that `entity_type_code` 1 = individual providers and `entity_type_code` 2 = organizations (facilities).*

In [7]:
# Compare the number of NPI values in nppes with the number of distinct NPIs.
# The SQL text gets its own name so `all_providers` only ever holds the dataframe.
all_providers_sql = """
SELECT COUNT(npi), COUNT(DISTINCT npi)
FROM nppes
"""
all_providers = pd.read_sql(sql=all_providers_sql, con=db)

In [8]:
# Show the total NPI count next to the distinct NPI count
all_providers

Unnamed: 0,COUNT(npi),COUNT(DISTINCT npi)
0,6714038,6714038


In [9]:
# SQL: every nppes row whose 5-digit ZIP has a match in the cbsa table
# (i.e. providers with a practice location within the Nashville CBSA).
# zip_5 is cast to INTEGER so it can be compared with cbsa.ZIP.
nash_providers = """
    SELECT n.*

    FROM nppes AS n

    JOIN cbsa AS c
        ON c.ZIP = CAST(n.zip_5 AS INTEGER)
"""

# Execute the query and load the matching providers into a dataframe
nash_nppes = pd.read_sql(sql=nash_providers, con=db)

In [10]:
# Summarize the frame: row count plus the non-null count and dtype of each column
nash_nppes.info()
# There are 38,414 rows in the table

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38414 entries, 0 to 38413
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   npi                         38414 non-null  int64  
 1   entity_type_code            38414 non-null  float64
 2   provider_organization_name  7574 non-null   object 
 3   provider_last_name          30839 non-null  object 
 4   provider_first_name         30840 non-null  object 
 5   provider_middle_name        19914 non-null  object 
 6   provider_name_prefix        11560 non-null  object 
 7   provider_name_suffix        907 non-null    object 
 8   provider_credential         25323 non-null  object 
 9   address_01                  38414 non-null  object 
 10  address_02                  9957 non-null   object 
 11  city                        38414 non-null  object 
 12  state                       38414 non-null  object 
 13  zip_9                       384

In [11]:
# List each distinct provider_credential value (missing values appear as None)
nash_nppes['provider_credential'].unique().tolist()

['CRNP',
 'M.D.',
 'MD',
 'FNP',
 None,
 'M. D.',
 'DO',
 'PHARMD',
 'PA',
 'D.M.D.',
 'RN, APN',
 'DDS',
 'CRNA, MS',
 'NP',
 'APN',
 'APRN',
 'C.R.N.A., A.P.N.',
 'FNP-C',
 'D.C.',
 'RPH, PHARMD,MS',
 'O.D.',
 'P.T., D.P.T',
 'D.D.S.',
 'N.P.',
 'APRN, BC, FNP',
 'PAC',
 'ACNP',
 'M.S.S.W.',
 'GNP',
 'MD, PHD',
 'MSN APRN-BC',
 'DPM',
 'PHARM. D.',
 'PH.D.',
 'CRNA',
 'NP-C',
 'PA-C',
 'PHARM.D.',
 'APRN, BC',
 'O.D',
 'M.D., PH.D.',
 'D.D.S',
 'LCSW',
 'LCSW, ACSW',
 'L.C.S.W.',
 'DDS, MD',
 'DO, MBA',
 'DC',
 'M.D., R.V.T',
 'ARNP',
 'PT',
 'D.O. F.A.C.O.G.',
 'DMD',
 'M.S., P.A.-C.',
 'D.O.',
 'MS, PT',
 'P.A-C..',
 'CNM',
 'MSPT',
 'DO, RDMS',
 'P.T.',
 'PHD CCC-SLP',
 'DPT',
 'DP',
 'D.P.T.',
 'PTA',
 'COTA',
 'O.T.',
 'D.PH.',
 'PHD, FNP- BC',
 'DNP,FNP- BC',
 'MD MBA',
 'D.P.M.',
 'ACNP-BC',
 'LDN',
 'MSN, APRN-BC, CNN',
 'DNP',
 'OD',
 'FNP C',
 'ED.D.',
 'ANPC',
 'MSSW, LCSW',
 'MD, MS, FIPP',
 'M.D., MPH',
 'DDS,PC',
 'WHNP CNM',
 'MSN',
 'CFNP',
 'PSY.D.',
 'ANP-C',
 'P.A.

*The credentials are far from standardized: the same credential appears under several different spellings (e.g. `PharmD` vs. `Pharm.D.` vs. `PHARMD`), and a single field can hold multiple comma-separated credentials (e.g. `APN-BC, NP-C`). If we need to use this field, we will need to do some data cleaning first.*

In [12]:
# Keep only individual providers (entity_type_code 1)
is_individual = nash_nppes['entity_type_code'].eq(1)
ind_nash_nppes = nash_nppes.loc[is_individual]

# Summarize how many individual providers there are in the Nashville area
ind_nash_nppes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30840 entries, 0 to 38413
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   npi                         30840 non-null  int64  
 1   entity_type_code            30840 non-null  float64
 2   provider_organization_name  0 non-null      object 
 3   provider_last_name          30839 non-null  object 
 4   provider_first_name         30840 non-null  object 
 5   provider_middle_name        19914 non-null  object 
 6   provider_name_prefix        11560 non-null  object 
 7   provider_name_suffix        907 non-null    object 
 8   provider_credential         25323 non-null  object 
 9   address_01                  30840 non-null  object 
 10  address_02                  7556 non-null   object 
 11  city                        30840 non-null  object 
 12  state                       30840 non-null  object 
 13  zip_9                       308

In [13]:
# Who is the provider without a last name?
missing_last_name = ind_nash_nppes['provider_last_name'].isna()
ind_nash_nppes.loc[missing_last_name]

Unnamed: 0,npi,entity_type_code,provider_organization_name,provider_last_name,provider_first_name,provider_middle_name,provider_name_prefix,provider_name_suffix,provider_credential,address_01,address_02,city,state,zip_9,zip_5
10346,1215064852,1.0,,,ELIZABETH,ANN,DR.,,M.D.,1222 TROTWOOD AVE,SUITE 101,COLUMBIA,TN,384016436.0,38401


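*Hilariously, [this is a real provider](https://npiregistry.cms.hhs.gov/registry/provider-view/1215064852) whose last name is "Null" - so the value is most likely not actually missing, but was presumably interpreted as a null somewhere along the way when the data was loaded.*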

### Explore the specialties data

In [15]:
# Load the complete specialty table into a dataframe.
# The SQL text gets its own name so `specialties_raw` only ever holds the dataframe.
specialties_raw_sql = """
    SELECT *
    FROM specialty
"""
specialties_raw = pd.read_sql(sql=specialties_raw_sql, con=db)

In [16]:
# Display the specialty table (pandas elides the middle rows of a long frame)
specialties_raw

Unnamed: 0,npi,primary_taxonomy,Classification
0,1679576722,207X00000X,Orthopaedic Surgery
1,1588667638,207RC0000X,Internal Medicine
2,1497758544,251G00000X,"Hospice Care, Community Based"
3,1306849450,2085R0202X,Radiology
4,1215930367,207RH0003X,Internal Medicine
...,...,...,...
6714033,1326630724,363LF0000X,Nurse Practitioner
6714034,1912599325,183500000X,Pharmacist
6714035,1821680232,163WP0808X,Registered Nurse
6714036,1730771148,390200000X,Student in an Organized Health Care Education/...


In [None]:
# SQL: nppes rows whose ZIP matches the cbsa table, extended with each NPI's
# primary taxonomy code and classification. The LEFT JOIN keeps providers that
# have no row in the specialty table (their specialty columns come back null).
specialties = """
    SELECT n.*
        , s.primary_taxonomy
        , s.Classification

    FROM nppes AS n

    JOIN cbsa AS c
        ON c.ZIP = CAST(n.zip_5 AS INTEGER)
    
    LEFT JOIN specialty AS s
        ON s.npi = n.npi
    
    --WHERE entity_type_code = 1 -- only get individuals
"""
# Load the result into a dataframe; this rebinds nash_nppes to the
# specialty-enriched frame
nash_nppes = pd.read_sql(sql=specialties, con=db)

In [None]:
# Summarize nash_nppes: row count plus the non-null count and dtype of each column
nash_nppes.info()

In [None]:
# Individual providers (entity_type_code 1) that have no primary taxonomy
lacks_taxonomy = nash_nppes['primary_taxonomy'].isna()
is_individual = nash_nppes['entity_type_code'] == 1
nash_nppes[lacks_taxonomy & is_individual]

*There are 106 individual providers who do not have a specialty. At a quick glance, there is no clear connection between these providers - they are in different areas, have different degrees, etc.*

In [None]:
# Number of organizations (entity_type_code 2) per classification, largest first
# There are 152 unique classifications plus null values
organizations = nash_nppes[nash_nppes['entity_type_code'] == 2]
(
    organizations
    .groupby('Classification')['npi']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Count the organizations (entity_type_code 2) that have no primary specialty.
# Counting distinct NPIs yields a single number; calling .nunique() on the
# whole frame would instead return a separate distinct count for every column.
lacks_taxonomy = nash_nppes['primary_taxonomy'].isna()
is_organization = nash_nppes['entity_type_code'] == 2
nash_nppes.loc[lacks_taxonomy & is_organization, 'npi'].nunique()
# There are 92 organizations that do not have a specialty
# There are 92 organizations that do not have a specialty

### Follow just one NPI through the tables

In [None]:
# SQL: follow a single NPI (1215064852) through the provider, specialty and
# referral tables.
#
# Result: one row per referral, tagged by `direction`:
#   'received' -> the provider is the referral's to_npi
#   'sent'     -> the provider is the referral's from_npi
# followed by all columns of nashville_referrals.
#
# Why UNION ALL instead of joining both directions at once: joining inbound and
# outbound referrals to the same provider row pairs every inbound referral with
# every outbound one (inbound x outbound rows) and returns every referral
# column twice under the same name. Stacking the two directions keeps exactly
# one row per referral.
#
# The LEFT JOINs keep one all-null referral row for a direction in which the
# provider has no referrals, so the provider still appears in the output.
# The NPI filter sits inside the CTE and is qualified (n.npi) so it can never
# become ambiguous with a column of a joined table.
sample = """
WITH nashville_providers AS (
    SELECT n.npi
        , s.Classification AS specialty

    FROM nppes AS n

    JOIN cbsa AS c
        ON c.ZIP = CAST(n.zip_5 AS INTEGER)

    LEFT JOIN specialty AS s
        ON s.npi = n.npi

    WHERE n.npi = 1215064852
)

SELECT n.npi
    , n.specialty
    , 'received' AS direction
    , r.*

FROM nashville_providers AS n

LEFT JOIN nashville_referrals AS r
    ON r.to_npi = n.npi

UNION ALL

SELECT n.npi
    , n.specialty
    , 'sent' AS direction
    , r.*

FROM nashville_providers AS n

LEFT JOIN nashville_referrals AS r
    ON r.from_npi = n.npi
"""

In [None]:
# Run the single-NPI query and keep the result as a dataframe
liz_null = pd.read_sql(sql=sample, con=db)

In [None]:
# Display the rows retrieved for the sample NPI
liz_null

In [None]:
# Re-issue the table-listing query.
# NOTE(review): the returned cursor is neither iterated nor displayed in this
# cell, so it shows nothing - confirm whether this cell is still needed.
tables = db.execute("SELECT name FROM sqlite_master WHERE type='table';")

In [None]:
# SQL: count how many distinct ZIP values the cbsa table contains
query = """
SELECT COUNT(DISTINCT ZIP)
FROM CBSA
"""

In [None]:
# Run the distinct-ZIP count and keep the one-cell result as a dataframe
zips = pd.read_sql(sql=query, con=db)

In [None]:
# Display the distinct ZIP count
zips

### Take a look at the total hop-teaming data joined up

In [None]:
# SQL: one-row summary of how the Nashville-area providers connect to the
# specialty and referral tables.
#
# Columns returned:
#   tot_rows               rows after joining providers to specialty
#   unique_npis            distinct provider NPIs
#   npis_with_specialty    providers that have a row in specialty
#   npis_who_refer_out     providers that appear as from_npi in a referral
#   npis_who_get_referrals providers that appear as to_npi in a referral
#
# Notes on the construction:
# - The referral table is reduced to DISTINCT sender / receiver NPIs before
#   joining. Joining the raw referral rows in both directions would multiply
#   every provider's inbound rows by its outbound rows, inflating tot_rows and
#   the amount of work without changing any distinct count.
# - "refer out" is measured on from_npi and "get referrals" on to_npi; both
#   are matched against the provider's own NPI, so only Nashville-area
#   providers are counted.
# - COUNT(DISTINCT x) already ignores NULLs, so no CASE wrapper is needed.
query = """
WITH nashville_providers AS (
    SELECT n.*

    FROM nppes AS n

    JOIN cbsa AS c
        ON c.ZIP = CAST(n.zip_5 AS INTEGER)
)
, referral_senders AS (
    SELECT DISTINCT from_npi
    FROM nashville_referrals
)
, referral_receivers AS (
    SELECT DISTINCT to_npi
    FROM nashville_referrals
)

SELECT COUNT(*) AS tot_rows
    , COUNT(DISTINCT n.npi) AS unique_npis
    , COUNT(DISTINCT s.npi) AS npis_with_specialty
    , COUNT(DISTINCT rfrom.from_npi) AS npis_who_refer_out
    , COUNT(DISTINCT rto.to_npi) AS npis_who_get_referrals

FROM nashville_providers AS n

LEFT JOIN specialty AS s
    ON s.npi = n.npi

LEFT JOIN referral_receivers AS rto
    ON rto.to_npi = n.npi

LEFT JOIN referral_senders AS rfrom
    ON rfrom.from_npi = n.npi
"""

In [None]:
# Run the summary query and keep the one-row result as a dataframe
nashville_providers = pd.read_sql(sql=query, con=db)

In [None]:
# Display the one-row summary of counts
nashville_providers