# Exploratory Data Analysis, Hop Teaming Dataset

### Import libraries, connect to the database, and take a look around

In [1]:
# import libraries
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

In [2]:
# Widen the pandas display limits so large frames are not truncated as early
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 200)

In [3]:
# Open the hop teaming SQLite database and keep the connection in `db`.
# The file is opened read-only through a URI: a plain sqlite3.connect(path)
# silently creates an empty database when the path is wrong, which would only
# surface later as "no such table" errors. With mode=ro a missing file raises
# sqlite3.OperationalError here, and the analysis cannot modify the data.
db = sqlite3.connect('file:../data/hop_teaming.sqlite?mode=ro', uri=True)

In [4]:
# List the name of every table registered in the database schema
tables = db.execute("SELECT name FROM sqlite_master WHERE type='table';")
for (table_name,) in tables:
    print(table_name)

cbsa
referrals
nashville_referrals
nppes
specialty


In [5]:
# For reference: list the column names of a given table.
# The name is read from position 1 of each PRAGMA table_info row.
col = db.execute("PRAGMA table_info(specialty)").fetchall()
column_names = [column_info[1] for column_info in col]
print(column_names)

['npi', 'primary_taxonomy', 'Classification']


### How many providers are in the nppes dataset?
*Note that `entity_type_code` 1 = Providers and `entity_type_code` 2 = Facilities.*

In [6]:
# Count all NPI rows in nppes alongside the number of distinct NPIs
# (equal counts mean there are no duplicate NPIs in the table).
all_providers_sql = """
SELECT COUNT(npi), COUNT(DISTINCT npi)
FROM nppes
"""
all_providers = pd.read_sql_query(all_providers_sql, con=db)

In [7]:
# Show the total NPI count next to the distinct NPI count
all_providers

Unnamed: 0,COUNT(npi),COUNT(DISTINCT npi)
0,6714038,6714038


In [8]:
# Select every nppes row whose 5-digit practice ZIP matches a ZIP in the cbsa table.
# NOTE(review): this yields "Nashville CBSA" providers only if the cbsa table
# contains Nashville-area ZIP codes exclusively — the query has no CBSA filter. Confirm.
nash_providers = """
    SELECT n.*

    FROM nppes AS n

    JOIN cbsa AS c
        ON c.ZIP = CAST(n.zip_5 AS INTEGER)
"""

# Run the query and load the matching rows into a dataframe
nash_nppes = pd.read_sql_query(nash_providers, con=db)

In [9]:
# Summarise the joined frame: row count, per-column non-null counts and dtypes
nash_nppes.info()
# There are 38,414 rows in the table

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38414 entries, 0 to 38413
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   npi                         38414 non-null  int64  
 1   entity_type_code            38414 non-null  float64
 2   provider_organization_name  7574 non-null   object 
 3   provider_last_name          30839 non-null  object 
 4   provider_first_name         30840 non-null  object 
 5   provider_middle_name        19914 non-null  object 
 6   provider_name_prefix        11560 non-null  object 
 7   provider_name_suffix        907 non-null    object 
 8   provider_credential         25323 non-null  object 
 9   address_01                  38414 non-null  object 
 10  address_02                  9957 non-null   object 
 11  city                        38414 non-null  object 
 12  state                       38414 non-null  object 
 13  zip_9                       384

In [10]:
# List the distinct values found in the provider credential column
nash_nppes['provider_credential'].unique().tolist()

['CRNP',
 'M.D.',
 'MD',
 'FNP',
 None,
 'M. D.',
 'DO',
 'PHARMD',
 'PA',
 'D.M.D.',
 'RN, APN',
 'DDS',
 'CRNA, MS',
 'NP',
 'APN',
 'APRN',
 'C.R.N.A., A.P.N.',
 'FNP-C',
 'D.C.',
 'RPH, PHARMD,MS',
 'O.D.',
 'P.T., D.P.T',
 'D.D.S.',
 'N.P.',
 'APRN, BC, FNP',
 'PAC',
 'ACNP',
 'M.S.S.W.',
 'GNP',
 'MD, PHD',
 'MSN APRN-BC',
 'DPM',
 'PHARM. D.',
 'PH.D.',
 'CRNA',
 'NP-C',
 'PA-C',
 'PHARM.D.',
 'APRN, BC',
 'O.D',
 'M.D., PH.D.',
 'D.D.S',
 'LCSW',
 'LCSW, ACSW',
 'L.C.S.W.',
 'DDS, MD',
 'DO, MBA',
 'DC',
 'M.D., R.V.T',
 'ARNP',
 'PT',
 'D.O. F.A.C.O.G.',
 'DMD',
 'M.S., P.A.-C.',
 'D.O.',
 'MS, PT',
 'P.A-C..',
 'CNM',
 'MSPT',
 'DO, RDMS',
 'P.T.',
 'PHD CCC-SLP',
 'DPT',
 'DP',
 'D.P.T.',
 'PTA',
 'COTA',
 'O.T.',
 'D.PH.',
 'PHD, FNP- BC',
 'DNP,FNP- BC',
 'MD MBA',
 'D.P.M.',
 'ACNP-BC',
 'LDN',
 'MSN, APRN-BC, CNN',
 'DNP',
 'OD',
 'FNP C',
 'ED.D.',
 'ANPC',
 'MSSW, LCSW',
 'MD, MS, FIPP',
 'M.D., MPH',
 'DDS,PC',
 'WHNP CNM',
 'MSN',
 'CFNP',
 'PSY.D.',
 'ANP-C',
 'P.A.

*The credentials are far from standardized: the same credential appears under multiple spellings (e.g. `PHARMD` vs. `PHARM.D.` vs. `PHARM. D.`), and a single field can contain several comma-separated credentials (e.g. `APN-BC, NP-C`). If we need to use this field, we will need to do some data cleaning.*

In [11]:
# Keep only the rows flagged as individual providers (entity_type_code 1)
ind_nash_nppes = nash_nppes.loc[nash_nppes['entity_type_code'].eq(1)]

# Summarise the subset to see how many individual providers it contains
ind_nash_nppes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30840 entries, 0 to 38413
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   npi                         30840 non-null  int64  
 1   entity_type_code            30840 non-null  float64
 2   provider_organization_name  0 non-null      object 
 3   provider_last_name          30839 non-null  object 
 4   provider_first_name         30840 non-null  object 
 5   provider_middle_name        19914 non-null  object 
 6   provider_name_prefix        11560 non-null  object 
 7   provider_name_suffix        907 non-null    object 
 8   provider_credential         25323 non-null  object 
 9   address_01                  30840 non-null  object 
 10  address_02                  7556 non-null   object 
 11  city                        30840 non-null  object 
 12  state                       30840 non-null  object 
 13  zip_9                       308

In [12]:
# Show the individual provider row(s) where the last name is missing
ind_nash_nppes.loc[ind_nash_nppes['provider_last_name'].isna()]

Unnamed: 0,npi,entity_type_code,provider_organization_name,provider_last_name,provider_first_name,provider_middle_name,provider_name_prefix,provider_name_suffix,provider_credential,address_01,address_02,city,state,zip_9,zip_5
10346,1215064852,1.0,,,ELIZABETH,ANN,DR.,,M.D.,1222 TROTWOOD AVE,SUITE 101,COLUMBIA,TN,384016436.0,38401


*Hilariously, [this is a real provider](https://npiregistry.cms.hhs.gov/registry/provider-view/1215064852) whose last name is "Null".*

### Explore the specialties data

In [13]:
# Load the full specialty table (one taxonomy/classification row per NPI shown below)
specialties_raw_sql = """
    SELECT *
    FROM specialty
"""
specialties_raw = pd.read_sql_query(specialties_raw_sql, con=db)

In [14]:
# Preview the specialty table
specialties_raw

Unnamed: 0,npi,primary_taxonomy,Classification
0,1679576722,207X00000X,Orthopaedic Surgery
1,1588667638,207RC0000X,Internal Medicine
2,1497758544,251G00000X,"Hospice Care, Community Based"
3,1306849450,2085R0202X,Radiology
4,1215930367,207RH0003X,Internal Medicine
...,...,...,...
6714033,1326630724,363LF0000X,Nurse Practitioner
6714034,1912599325,183500000X,Pharmacist
6714035,1821680232,163WP0808X,Registered Nurse
6714036,1730771148,390200000X,Student in an Organized Health Care Education/...


In [15]:
# Same nppes/cbsa ZIP join as before, now with each NPI's primary taxonomy and
# classification attached. The LEFT JOIN keeps providers that have no specialty row.
specialties = """
    SELECT n.*
        , s.primary_taxonomy
        , s.Classification

    FROM nppes AS n

    JOIN cbsa AS c
        ON c.ZIP = CAST(n.zip_5 AS INTEGER)
    
    LEFT JOIN specialty AS s
        ON s.npi = n.npi
    
    --WHERE entity_type_code = 1 -- only get individuals
"""
# Run the query; this rebinds nash_nppes to the frame that includes the specialty columns
nash_nppes = pd.read_sql_query(specialties, con=db)

In [16]:
# Summarise the rebuilt frame: row count, per-column non-null counts and dtypes
nash_nppes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38414 entries, 0 to 38413
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   npi                         38414 non-null  int64  
 1   entity_type_code            38414 non-null  float64
 2   provider_organization_name  7574 non-null   object 
 3   provider_last_name          30839 non-null  object 
 4   provider_first_name         30840 non-null  object 
 5   provider_middle_name        19914 non-null  object 
 6   provider_name_prefix        11560 non-null  object 
 7   provider_name_suffix        907 non-null    object 
 8   provider_credential         25323 non-null  object 
 9   address_01                  38414 non-null  object 
 10  address_02                  9957 non-null   object 
 11  city                        38414 non-null  object 
 12  state                       38414 non-null  object 
 13  zip_9                       384

In [17]:
# Show the individual providers (entity_type_code 1) with no primary taxonomy.
# Append .nunique() to the final expression to count them instead of listing them.
ind_missing_taxonomy = nash_nppes['primary_taxonomy'].isna()
ind_is_individual = nash_nppes['entity_type_code'].eq(1)
nash_nppes.loc[ind_missing_taxonomy & ind_is_individual]

Unnamed: 0,npi,entity_type_code,provider_organization_name,provider_last_name,provider_first_name,provider_middle_name,provider_name_prefix,provider_name_suffix,provider_credential,address_01,address_02,city,state,zip_9,zip_5,primary_taxonomy,Classification
258,1114923737,1.0,,KOCH,JACK,L.,,JR.,M.D.,2125 BELCOURT AVE,,NASHVILLE,TN,372123503.0,37212,,
333,1801893631,1.0,,HERSH,CAROL,B,DR.,,M.D.,2409 21ST AVE S,SUITE 104,NASHVILLE,TN,372125317.0,37212,,
1411,1215918321,1.0,,MEYER,ALVIN,H.,DR.,JR.,M.D.,5651 FRIST BLVD,STE 509,HERMITAGE,TN,370762054.0,37076,,
1859,1477532448,1.0,,HARRIS,SANDRA,GAIL,DR.,,DDS,206 RIVERGATE PKWY,,GOODLETTSVILLE,TN,370722033.0,37072,,
1987,1346210457,1.0,,LIVENGOOD,JANICE,M,,,"PHD, HSP",4230 HARDING RD,SUITE 810,NASHVILLE,TN,372052013.0,37205,,
2072,1215908058,1.0,,FITZGERALD,MARY,TRABUE,,,"CCC-SLP, CCC-A",3704 SYCAMORE LN,,NASHVILLE,TN,372151914.0,37215,,
2114,1780656777,1.0,,RAMIREZ,MISTY,DAWN,,,MPT,7640 HIGHWAY 70 SOUTH,STE 210,NASHVILLE,TN,37221.0,37221,,
2133,1245202266,1.0,,JOHNSON,TRACY,ANN,,,"MSN, FNP",4230 HARDING RD,SUITE 330,NASHVILLE,TN,372052013.0,37205,,
2274,1588639561,1.0,,FETZER,WILLIAM,BRENT,MR.,,D.C.,3441 LEBANON PIKE,SUITE 117,HERMITAGE,TN,370762097.0,37076,,
2301,1558337097,1.0,,GREEN,BARBARA,RUTH,MS.,,LCSW,2105 EDWARD CURD LANE,,FRANKLIN,TN,37067.0,37067,,


*There are 104 individual providers who do not have a specialty. At a quick glance, there is no clear connection between these providers - they are in different areas, have different degrees, etc.*

In [18]:
# Count organizations (entity_type_code 2) per classification, largest group first
# There are 152 unique classifications plus null values
(
    nash_nppes
    .loc[nash_nppes['entity_type_code'].eq(2)]
    .groupby('Classification')['npi']
    .count()
    .sort_values(ascending=False)
)

Classification
Clinic/Center                                                                                     981
Pharmacy                                                                                          560
Dentist                                                                                           519
Internal Medicine                                                                                 401
Durable Medical Equipment & Medical Supplies                                                      380
Chiropractor                                                                                      268
Family Medicine                                                                                   261
Specialist                                                                                        228
In Home Supportive Care                                                                           188
Nurse Practitioner                                                 

In [19]:
# Count distinct values per column among organizations (entity_type_code 2)
# with no primary taxonomy; the `npi` entry is the number of such organizations
org_missing_taxonomy = nash_nppes['primary_taxonomy'].isna()
org_is_organization = nash_nppes['entity_type_code'].eq(2)
nash_nppes.loc[org_missing_taxonomy & org_is_organization].nunique()
# There are 85 organizations that do not have a specialty

npi                           85
entity_type_code               1
provider_organization_name    64
provider_last_name             0
provider_first_name            0
provider_middle_name           0
provider_name_prefix           0
provider_name_suffix           0
provider_credential            0
address_01                    81
address_02                    21
city                          20
state                          1
zip_9                         81
zip_5                         34
primary_taxonomy               0
Classification                 0
dtype: int64

### Follow just one NPI through the tables

In [20]:
# Trace a single NPI (1215064852) through the tables: look it up in the
# nppes/cbsa/specialty join, then collect every nashville_referrals row in which
# it is the receiving NPI (referrals_to) or the sending NPI (referrals_from).
sample = """
WITH nashville_providers AS (
    SELECT n.*
        , s.Classification AS specialty

    FROM nppes AS n

    JOIN cbsa AS c
        ON c.ZIP = CAST(n.zip_5 AS INTEGER)
    
    LEFT JOIN specialty AS s
        ON s.npi = n.npi
    
    WHERE n.npi = 1215064852 -- Limit to Elizabeth Null only
)
,

referrals_to AS (
    SELECT n.npi
    , n.specialty
    , rto.*
    
    FROM nashville_providers AS n

    JOIN nashville_referrals AS rto
        ON rto.to_npi = n.npi
)
,

referrals_from AS (
    SELECT n.npi
    , n.specialty
    , rfrom.*

    FROM nashville_providers AS n

    JOIN nashville_referrals AS rfrom
        ON rfrom.from_npi = n.npi
)
SELECT *

FROM referrals_to

UNION

SELECT *
FROM referrals_from
;
"""

# Run the query and load the combined referral rows into a dataframe
liz_null = pd.read_sql_query(sample, con=db)

In [21]:
# Display the referral rows (both directions) collected for the sample provider
liz_null

Unnamed: 0,npi,specialty,index,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1215064852,Psychiatry & Neurology,1777418,1073531042,1215064852,83,87,46.575,61.971
1,1215064852,Psychiatry & Neurology,2732631,1326130642,1215064852,47,66,41.848,50.439
2,1215064852,Psychiatry & Neurology,12565235,1467460725,1215064852,261,374,32.687,45.179
3,1215064852,Psychiatry & Neurology,15967368,1861479545,1215064852,247,378,30.696,47.267
4,1215064852,Psychiatry & Neurology,25617535,1295726032,1215064852,203,250,49.4,54.901
5,1215064852,Psychiatry & Neurology,28491271,1578591756,1215064852,50,75,39.04,48.248
6,1215064852,Psychiatry & Neurology,30072758,1215064852,1104202761,119,151,47.841,53.878
7,1215064852,Psychiatry & Neurology,30220192,1215064852,1205886264,363,637,2.036,14.145
8,1215064852,Psychiatry & Neurology,30225822,1205886264,1215064852,363,613,0.618,10.491
9,1215064852,Psychiatry & Neurology,30342405,1215064852,1295726032,174,224,44.263,50.661


### Take a look at the total hop-teaming data joined up

In [29]:
# Aggregate view of the combined referral set: total rows plus the number of
# distinct sending and receiving NPIs.
# referrals_to keeps rows whose to_npi is an organization (entity_type_code 2) in
# the nppes/cbsa join; referrals_from keeps rows whose from_npi is an individual
# (entity_type_code 1) in that join. The two branches are merged with UNION.
nashville_referrals_agg_sql = """
WITH nashville_providers AS (
    SELECT n.*
        , s.Classification AS specialty

    FROM nppes AS n

    JOIN cbsa AS c
        ON c.ZIP = CAST(n.zip_5 AS INTEGER)
    
    LEFT JOIN specialty AS s
        ON s.npi = n.npi
)
,

referrals_to AS (
    SELECT rto.*
    , s.Classification AS from_npi_specialty
    , n.specialty AS to_npi_specialty
    
    FROM nashville_providers AS n

    JOIN nashville_referrals AS rto
        ON rto.to_npi = n.npi
    
    LEFT JOIN specialty AS s
        ON s.npi = rto.from_npi
    
    WHERE n.entity_type_code = 2 -- only get referrals made *to* organizations
)
,

referrals_from AS (
    SELECT rfrom.*
    , n.specialty AS from_npi_specialty
    , s.Classification AS to_npi_specialty

    FROM nashville_providers AS n

    JOIN nashville_referrals AS rfrom
        ON rfrom.from_npi = n.npi
    
    LEFT JOIN specialty AS s
        ON s.npi = rfrom.to_npi

    WHERE n.entity_type_code = 1 -- only get referrals made *by* individual providers
)
,

tot AS (
SELECT *

FROM referrals_to

UNION

SELECT *
FROM referrals_from
)

SELECT COUNT(*) AS tot_rows
    , COUNT(DISTINCT t.from_npi) AS unique_from_npis
    , COUNT(DISTINCT t.to_npi) AS unique_to_npis
    
FROM tot AS t
;
"""
nashville_referrals_agg = pd.read_sql_query(nashville_referrals_agg_sql, con=db)

In [30]:
# Show the total row count and the distinct from/to NPI counts
nashville_referrals_agg

Unnamed: 0,tot_rows,unique_from_npis,unique_to_npis
0,174382,28804,20660


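*N.B.: The number of unique `to_npi` values (20,660) is more than 2.5 times the number of provider organizations (that is, entity type code 2) within the Nashville CBSA. This would be the case if Nashville-area providers are referring to healthcare organizations outside of the Nashville CBSA. Note also that the `referrals_from` branch of the query does not filter `to_npi` by entity type, so this count is not limited to organizations.*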

In [31]:
# Build the referral dataset as the de-duplicated union of two branches:
#   * referrals_to:   rows whose to_npi is an organization (entity_type_code 2)
#                     present in the nppes/cbsa join
#   * referrals_from: rows whose from_npi is an individual (entity_type_code 1)
#                     present in the nppes/cbsa join
# NOTE(review): neither branch checks the entity type of the NPI on the other
# side of the referral, so rows are not guaranteed to be individual -> organization.
# Each row also carries the specialty classification of both NPIs.

nashville_referrals_sql = """
WITH nashville_providers AS (
    SELECT n.*
        , s.Classification AS specialty

    FROM nppes AS n

    JOIN cbsa AS c
        ON c.ZIP = CAST(n.zip_5 AS INTEGER)
    
    LEFT JOIN specialty AS s
        ON s.npi = n.npi
)
,

referrals_to AS (
    SELECT rto.*
    , s.Classification AS from_npi_specialty
    , n.specialty AS to_npi_specialty
    
    FROM nashville_providers AS n

    JOIN nashville_referrals AS rto
        ON rto.to_npi = n.npi
    
    LEFT JOIN specialty AS s
        ON s.npi = rto.from_npi
    
    WHERE n.entity_type_code = 2 -- only get referrals made *to* organizations
)
,

referrals_from AS (
    SELECT rfrom.*
    , n.specialty AS from_npi_specialty
    , s.Classification AS to_npi_specialty

    FROM nashville_providers AS n

    JOIN nashville_referrals AS rfrom
        ON rfrom.from_npi = n.npi
    
    LEFT JOIN specialty AS s
        ON s.npi = rfrom.to_npi

    WHERE n.entity_type_code = 1 -- only get referrals made *by* individual providers
)

SELECT *

FROM referrals_to

UNION -- use this to ensure that the results are de-duplicated

SELECT *

FROM referrals_from

;
"""
nashville_referrals = pd.read_sql_query(nashville_referrals_sql, con=db)

In [32]:
# Summarise the resulting dataset: row count, per-column non-null counts and dtypes
nashville_referrals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174382 entries, 0 to 174381
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   index               174382 non-null  int64  
 1   from_npi            174382 non-null  int64  
 2   to_npi              174382 non-null  int64  
 3   patient_count       174382 non-null  int64  
 4   transaction_count   174382 non-null  int64  
 5   average_day_wait    174382 non-null  float64
 6   std_day_wait        174382 non-null  float64
 7   from_npi_specialty  173414 non-null  object 
 8   to_npi_specialty    173494 non-null  object 
dtypes: float64(2), int64(5), object(2)
memory usage: 12.0+ MB


In [33]:
# Cross-check: distinct from_npi count should equal unique_from_npis in the aggregate query
nashville_referrals['from_npi'].nunique()

28804

In [34]:
# Cross-check: distinct to_npi count should equal unique_to_npis in the aggregate query
nashville_referrals['to_npi'].nunique()

20660

In [35]:
# Preview the first 100 referral rows, including both specialty columns
nashville_referrals.head(100)

Unnamed: 0,index,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait,from_npi_specialty,to_npi_specialty
0,705,1013154723,1003028770,32,50,41.94,52.32,Durable Medical Equipment & Medical Supplies,Dermatology
1,706,1003963976,1003028770,2535,3945,0.0,0.0,Dermatology,Dermatology
2,1139,1003854258,1003052697,31,55,20.436,26.403,Family Medicine,Home Health
3,1713,1043232879,1003076233,76,78,28.244,52.86,Radiology,Internal Medicine
4,1755,1023046901,1003077728,61,67,20.448,54.516,Radiology,Thoracic Surgery (Cardiothoracic Vascular Surg...
5,1757,1003847492,1003077934,170,192,32.26,63.009,Pathology,Internal Medicine
6,2353,1033109285,1003096553,354,373,10.745,36.743,Radiology,Emergency Medicine
7,3207,1023055126,1003152208,371,759,12.601,35.963,General Acute Care Hospital,Obstetrics & Gynecology
8,3208,1003863580,1003152208,509,760,8.5,25.963,Pathology,Obstetrics & Gynecology
9,3209,1043297542,1003152208,60,85,34.071,39.537,Urology,Obstetrics & Gynecology


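*This dataset is the de-duplicated union of two sets of referrals: those whose `to_npi` is an organization located in the Nashville CBSA, and those whose `from_npi` is an individual provider located in the Nashville CBSA. The query does not check the entity type of the NPI on the other side of each referral, so the rows are not yet guaranteed to be individual-to-organization pairs. The query returns 174,382 rows. Once we validate the query (and, if needed, tighten it so that the `from_npi` is always an individual provider and the `to_npi` is always an organization), we should be able to use the resulting dataframe as a base for visualizing how different types of providers interact.*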

In [36]:
# Export the full referral dataset as a CSV (row index omitted) for loading into neo4j
nashville_referrals.to_csv(
    path_or_buf='../data/nashville_referrals.csv',
    index=False,
)