In [2]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_rows = 1000

In [4]:
# Assign sqlite database to db variable
db = sqlite3.connect('../data/hop_teaming.sqlite')

In [15]:
#The nashville_referrals_internal query will not complete on my machine,
#for whatever reason, so breaking it down here into pieces
nashville_providers = """
    SELECT n.*
        , s.Classification AS specialty

    FROM nppes AS n

    JOIN cbsa AS c
        ON c.ZIP = CAST(n.zip_5 AS INTEGER)
    
    LEFT JOIN specialty AS s
        ON s.npi = n.npi
"""
nashville_providers = pd.read_sql(nashville_providers, db)

In [16]:
nashville_providers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38414 entries, 0 to 38413
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   npi                         38414 non-null  int64  
 1   entity_type_code            38414 non-null  float64
 2   provider_organization_name  7574 non-null   object 
 3   provider_last_name          30839 non-null  object 
 4   provider_first_name         30840 non-null  object 
 5   provider_middle_name        19914 non-null  object 
 6   provider_name_prefix        11560 non-null  object 
 7   provider_name_suffix        907 non-null    object 
 8   provider_credential         25323 non-null  object 
 9   address_01                  38414 non-null  object 
 10  address_02                  9957 non-null   object 
 11  city                        38414 non-null  object 
 12  state                       38414 non-null  object 
 13  zip_9                       384

In [17]:
#Save a copy
nashville_providers.to_csv('../data/nashville_providers.csv')

In [18]:
cursor = db.cursor()
cursor.execute('drop table if exists nash_providers')
print('Table dropped...')

Table dropped...


In [19]:
nashville_providers.to_sql('nash_providers', db, if_exists = 'append',
                           index = False)

In [20]:
query = '''
select * from nash_providers
'''
test = pd.read_sql(query, db)
test

Unnamed: 0,npi,entity_type_code,provider_organization_name,provider_last_name,provider_first_name,provider_middle_name,provider_name_prefix,provider_name_suffix,provider_credential,address_01,address_02,city,state,zip_9,zip_5,specialty
0,1134122187,1.0,,RUDNICKE,CHERYL,DENISE,MRS.,,CRNP,250 25TH AVE N,STE 412,NASHVILLE,TN,372031632.0,37203,Nurse Practitioner
1,1003819046,1.0,,NYLANDER,BARBARA,H,,,M.D.,345 23RD AVE N,SUITE 209,NASHVILLE,TN,372031513.0,37203,Obstetrics & Gynecology
2,1750384780,1.0,,PERRIGIN,JULIE,A,DR.,,MD,219 CHURCH ST,,DICKSON,TN,370551303.0,37055,Family Medicine
3,1922001957,1.0,,PRESLEY,RICHARD,E,,,M.D.,2011 MURPHY AVE,STE 302,NASHVILLE,TN,372032023.0,37203,Obstetrics & Gynecology
4,1073516001,1.0,,ROSS,DAVID,L,DR.,,MD,127 CRESTVIEW PARK DR,,DICKSON,TN,370552850.0,37055,Internal Medicine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38409,1477145944,1.0,,HILL,DOMINIQUE,DANIELLE,MRS.,,"M.ED., LPC",1712 BISCAYNE DR,,FRANKLIN,TN,370671475.0,37067,Counselor
38410,1447842935,1.0,,CLEVELAND,MEREDITH,B,MRS.,,"MA, LPC-MHSP",5205 MARYLAND WAY STE 310,,BRENTWOOD,TN,370271049.0,37027,Counselor
38411,1639761190,1.0,,ARNDT,VERONICA,LEE,,,NP,30 HAMILTON STATION XING APT 104,,LEBANON,TN,370879017.0,37087,Registered Nurse
38412,1588256051,1.0,,WRAY,DAWN,MICHELLE,,,"MSN, APRN, FNP-C",3712 OLD CLARKSVILLE PIKE,,JOELTON,TN,370808818.0,37080,Nurse Practitioner


### Sub the temp table from above into the nashville_referrals_internal query

In [21]:
# Write a query to get every unique from-to NPI pair where
# The from_npi is an individual provider
# The to_npi is an organization
# Both of the NPIs are located in the Nashville CBSA

nashville_referrals_internal = """
WITH nashville_providers AS (
    SELECT * from nash_providers
)
,

referrals_to AS (
    SELECT rto.*
    , CAST(n2.zip_5 AS INTEGER) AS from_zip
    , s.Classification AS from_npi_specialty
    , CAST(n.zip_5 AS INTEGER) AS to_zip
    , n.specialty AS to_npi_specialty
    
    FROM nashville_providers AS n

    JOIN nashville_referrals AS rto
        ON rto.to_npi = n.npi
    
    JOIN nashville_providers AS n2
        ON n2.npi = rto.from_npi
    
    LEFT JOIN specialty AS s
        ON s.npi = rto.from_npi
    
    -- TONY: Comment out the filters below to prepare the data for neo4j
    WHERE n.entity_type_code = 2 -- only get referrals made *to* organizations
    AND n2.entity_type_code = 1 -- only get referrrals made *by* individuals

)
,

referrals_from AS (
    SELECT rfrom.*
    , CAST(n.zip_5 AS INTEGER) AS from_zip
    , n.specialty AS from_npi_specialty
    , CAST(n2.zip_5 AS INTEGER) AS to_zip
    , s.Classification AS to_npi_specialty

    FROM nashville_providers AS n

    JOIN nashville_referrals AS rfrom
        ON rfrom.from_npi = n.npi

    JOIN nashville_providers AS n2
        ON n2.npi = rfrom.to_npi

    LEFT JOIN specialty AS s
        ON s.npi = rfrom.to_npi
    
    -- TONY: Comment out the filters below to prepare the data for neo4j
    WHERE n.entity_type_code = 1 -- only get referrals made *by* individual providers
    AND n2.entity_type_code = 2 -- only get referrals made *to* organizations
)

SELECT rto.*

FROM referrals_to AS rto

UNION -- use this to ensure that the results are de-duplicated

SELECT rfrom.*

FROM referrals_from AS rfrom

;
"""
nashville_referrals_internal = pd.read_sql(nashville_referrals_internal, db)

In [22]:
nashville_referrals_internal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41419 entries, 0 to 41418
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               41419 non-null  int64  
 1   from_npi            41419 non-null  int64  
 2   to_npi              41419 non-null  int64  
 3   patient_count       41419 non-null  int64  
 4   transaction_count   41419 non-null  int64  
 5   average_day_wait    41419 non-null  float64
 6   std_day_wait        41419 non-null  float64
 7   from_zip            41419 non-null  int64  
 8   from_npi_specialty  41354 non-null  object 
 9   to_zip              41419 non-null  int64  
 10  to_npi_specialty    41193 non-null  object 
dtypes: float64(2), int64(7), object(2)
memory usage: 3.5+ MB


In [23]:
# Write out the complete dataset to a csv to upload into neo4j
nashville_referrals_internal.to_csv('../data/nashville_referrals_internal.csv', index = False)

In [24]:
db.close()

### Here is where the data exploration starts

In [89]:
competitor_referrals = pd.read_csv('../data/nashville_referrals_internal.csv')

In [90]:
competitor_referrals.head()

Unnamed: 0,index,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait,from_zip,from_npi_specialty,to_zip,to_npi_specialty
0,706,1003963976,1003028770,2535,3945,0.0,0.0,37174,Dermatology,37174,Dermatology
1,9754,1033246640,1003863580,58,58,45.603,56.574,37232,Family Medicine,37027,Pathology
2,9755,1033215157,1003863580,124,126,22.833,53.329,38401,Specialist,37027,Pathology
3,9760,1023223898,1003863580,1739,1872,0.169,5.185,37027,Pathology,37027,Pathology
4,9763,1023253549,1003863580,34,53,31.887,50.676,37214,Chiropractor,37027,Pathology


In [91]:
competitor_referrals.drop('index', axis = 1, inplace=True)

In [92]:
competitor_referrals = competitor_referrals[competitor_referrals['to_npi_specialty'] == 'General Acute Care Hospital']

In [93]:
competitor_referrals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6436 entries, 38 to 41215
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   from_npi            6436 non-null   int64  
 1   to_npi              6436 non-null   int64  
 2   patient_count       6436 non-null   int64  
 3   transaction_count   6436 non-null   int64  
 4   average_day_wait    6436 non-null   float64
 5   std_day_wait        6436 non-null   float64
 6   from_zip            6436 non-null   int64  
 7   from_npi_specialty  6425 non-null   object 
 8   to_zip              6436 non-null   int64  
 9   to_npi_specialty    6436 non-null   object 
dtypes: float64(2), int64(6), object(2)
memory usage: 553.1+ KB


In [8]:
db = sqlite3.connect('../data/hop_teaming.sqlite')

In [36]:
query = """
select name from sqlite_master where type = 'table'
"""

In [80]:
query = """
SELECT name FROM PRAGMA_TABLE_INFO('specialty')
"""

In [81]:
test = pd.read_sql(query,db)
test

Unnamed: 0,name
0,npi
1,primary_taxonomy
2,Classification


In [97]:
query = '''
select npi as to_npi, provider_organization_name as to_npi_orgname
from nppes
'''

In [98]:
npi_orgname = pd.read_sql(query, db)

In [99]:
npi_orgname.head()

Unnamed: 0,to_npi,to_npi_orgname
0,1679576722,
1,1588667638,
2,1497758544,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC"
3,1306849450,
4,1215930367,


In [100]:
competitor_referrals = competitor_referrals.merge(npi_orgname, how = 'left', on='to_npi')

In [101]:
competitor_referrals.head()

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait,from_zip,from_npi_specialty,to_zip,to_npi_specialty,to_npi_orgname
0,1033570015,1023055126,63,72,26.722,44.023,37203,Nurse Practitioner,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC."
1,1033532585,1023055126,152,214,40.439,59.85,37203,Nurse Practitioner,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC."
2,1043294747,1023055126,33,56,34.214,50.209,37146,Internal Medicine,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC."
3,1013179860,1023055126,95,96,32.021,37.595,37075,Internal Medicine,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC."
4,1003991167,1023055126,110,151,38.113,45.857,37203,Ophthalmology,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC."


In [102]:
competitor_referrals.to_npi_orgname.value_counts()

VANDERBILT UNIVERSITY MEDICAL CENTER       2190
HCA HEALTH SERVICES OF TENNESSEE, INC.     1198
SAINT THOMAS WEST HOSPITAL                  857
WILLIAMSON COUNTY HOSPITAL DISTRICT         332
SAINT THOMAS RUTHERFORD HOSPITAL            301
MAURY REGIONAL HOSPITAL                     292
HENDERSONVILLE HOSPITAL CORPORATION         278
HTI MEMORIAL HOSPITAL CORPORATION           270
SUMNER REGIONAL MEDICAL CENTER LLC          210
CENTRAL TENNESSEE HOSPITAL CORPORATION      172
NORTHCREST MEDICAL CENTER                   139
MACON COUNTY GENERAL HOSPITAL, INC.          59
RIVERVIEW MEDICAL CENTER LLC                 57
NASHVILLE GENERAL HOSPITAL                   43
TROUSDALE MEDICAL CENTER LLC                 24
SAINT THOMAS STONES RIVER HOSPITAL, LLC      14
Name: to_npi_orgname, dtype: int64

In [103]:
query = '''
select npi as from_npi, provider_first_name as from_fname, provider_last_name as from_lname
from nppes
'''

In [104]:
npi_orgname = pd.read_sql(query, db)

In [105]:
competitor_referrals = competitor_referrals.merge(npi_orgname, how = 'left', on='from_npi')

In [106]:
competitor_referrals.head()

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait,from_zip,from_npi_specialty,to_zip,to_npi_specialty,to_npi_orgname,from_fname,from_lname
0,1033570015,1023055126,63,72,26.722,44.023,37203,Nurse Practitioner,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",LESLEY,BALLANCE
1,1033532585,1023055126,152,214,40.439,59.85,37203,Nurse Practitioner,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",DENISE,BROWN
2,1043294747,1023055126,33,56,34.214,50.209,37146,Internal Medicine,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",FRED,NORDQUIST
3,1013179860,1023055126,95,96,32.021,37.595,37075,Internal Medicine,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",GARY,YAWN
4,1003991167,1023055126,110,151,38.113,45.857,37203,Ophthalmology,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",PETER,SONKIN


In [38]:
competitor_referrals[].unique()

array(['SPRING HILL DERMATOLOGY PLC',
       'SUNCREST HEALTHCARE OF EAST TENNESSEE, LLC', None, ...,
       'LUBBOCK HERITAGE HOSPITAL, LLC.',
       'MIDWEST EMERGENCY DEPARTMENT SPECIALISTS LTD',
       'NORTHERN VALLEY ANESTHESIOLOGY, P.A.'], dtype=object)

In [112]:
competitor_referrals[competitor_referrals.to_npi_orgname=='VANDERBILT UNIVERSITY MEDICAL CENTER'].to_npi.unique()

array([1306889597, 1396882205, 1558408633], dtype=int64)

In [113]:
refers_to_vandy = competitor_referrals[competitor_referrals.to_npi_orgname=='VANDERBILT UNIVERSITY MEDICAL CENTER']

In [116]:
refers_to_vandy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2190 entries, 60 to 6294
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   from_npi            2190 non-null   int64  
 1   to_npi              2190 non-null   int64  
 2   patient_count       2190 non-null   int64  
 3   transaction_count   2190 non-null   int64  
 4   average_day_wait    2190 non-null   float64
 5   std_day_wait        2190 non-null   float64
 6   from_zip            2190 non-null   int64  
 7   from_npi_specialty  2187 non-null   object 
 8   to_zip              2190 non-null   int64  
 9   to_npi_specialty    2190 non-null   object 
 10  to_npi_orgname      2190 non-null   object 
 11  from_fname          2190 non-null   object 
 12  from_lname          2190 non-null   object 
dtypes: float64(2), int64(6), object(5)
memory usage: 239.5+ KB


In [118]:
refers_to_vandy.from_npi.nunique()

2130

In [124]:
competitors_only = competitor_referrals.merge(refers_to_vandy['from_npi'], how = 'left', on = 'from_npi', 
                                              indicator = True).query('_merge =="left_only"')

In [125]:
competitors_only

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait,from_zip,from_npi_specialty,to_zip,to_npi_specialty,to_npi_orgname,from_fname,from_lname,_merge
0,1033570015,1023055126,63,72,26.722,44.023,37203,Nurse Practitioner,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",LESLEY,BALLANCE,left_only
1,1033532585,1023055126,152,214,40.439,59.850,37203,Nurse Practitioner,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",DENISE,BROWN,left_only
6,1003813593,1023055126,106,106,3.406,9.345,37203,Anesthesiology,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",WILLIAM,RODES,left_only
7,1003802042,1023055126,104,106,3.811,5.524,37203,"Nurse Anesthetist, Certified Registered",37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",JON,SCHWINDT,left_only
8,1003801788,1023055126,357,708,4.859,20.795,37203,Internal Medicine,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",JOHN,ANDERSON,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6648,1235172800,1164590386,172,199,2.327,13.727,37027,Internal Medicine,37129,General Acute Care Hospital,SAINT THOMAS RUTHERFORD HOSPITAL,ARCHANA,MEHTA,left_only
6649,1265602999,1164590386,647,702,2.031,5.379,37129,Emergency Medicine,37129,General Acute Care Hospital,SAINT THOMAS RUTHERFORD HOSPITAL,ARTHUR,SMOLENSKY,left_only
6650,1265502819,1164590386,67,153,2.059,11.127,37130,Surgery,37129,General Acute Care Hospital,SAINT THOMAS RUTHERFORD HOSPITAL,ROBERT,DURGIN,left_only
6651,1265469936,1164590386,401,424,2.377,13.314,37130,Emergency Medicine,37129,General Acute Care Hospital,SAINT THOMAS RUTHERFORD HOSPITAL,RUSSELL,GALLOWAY,left_only


In [130]:
competitors_only.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2862 entries, 0 to 6652
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   from_npi            2862 non-null   int64   
 1   to_npi              2862 non-null   int64   
 2   patient_count       2862 non-null   int64   
 3   transaction_count   2862 non-null   int64   
 4   average_day_wait    2862 non-null   float64 
 5   std_day_wait        2862 non-null   float64 
 6   from_zip            2862 non-null   int64   
 7   from_npi_specialty  2858 non-null   object  
 8   to_zip              2862 non-null   int64   
 9   to_npi_specialty    2862 non-null   object  
 10  to_npi_orgname      2862 non-null   object  
 11  from_fname          2862 non-null   object  
 12  from_lname          2861 non-null   object  
 13  _merge              2862 non-null   category
dtypes: category(1), float64(2), int64(6), object(5)
memory usage: 315.9+ KB
