In [2]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
pd.options.display.max_rows = 1000

# Ignore this part, which is desperation data refactoring

In [4]:
# Assign sqlite database to db variable
db = sqlite3.connect('../data/hop_teaming.sqlite')

In [15]:
#The nashville_referrals_internal query will not complete on my machine,
#for whatever reason, so breaking it down here into pieces
nashville_providers = """
    SELECT n.*
        , s.Classification AS specialty

    FROM nppes AS n

    JOIN cbsa AS c
        ON c.ZIP = CAST(n.zip_5 AS INTEGER)
    
    LEFT JOIN specialty AS s
        ON s.npi = n.npi
"""
nashville_providers = pd.read_sql(nashville_providers, db)

In [16]:
nashville_providers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38414 entries, 0 to 38413
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   npi                         38414 non-null  int64  
 1   entity_type_code            38414 non-null  float64
 2   provider_organization_name  7574 non-null   object 
 3   provider_last_name          30839 non-null  object 
 4   provider_first_name         30840 non-null  object 
 5   provider_middle_name        19914 non-null  object 
 6   provider_name_prefix        11560 non-null  object 
 7   provider_name_suffix        907 non-null    object 
 8   provider_credential         25323 non-null  object 
 9   address_01                  38414 non-null  object 
 10  address_02                  9957 non-null   object 
 11  city                        38414 non-null  object 
 12  state                       38414 non-null  object 
 13  zip_9                       384

In [17]:
#Save a copy
nashville_providers.to_csv('../data/nashville_providers.csv')

In [18]:
cursor = db.cursor()
cursor.execute('drop table if exists nash_providers')
print('Table dropped...')

Table dropped...


In [19]:
nashville_providers.to_sql('nash_providers', db, if_exists = 'append',
                           index = False)

In [20]:
query = '''
select * from nash_providers
'''
test = pd.read_sql(query, db)
test

Unnamed: 0,npi,entity_type_code,provider_organization_name,provider_last_name,provider_first_name,provider_middle_name,provider_name_prefix,provider_name_suffix,provider_credential,address_01,address_02,city,state,zip_9,zip_5,specialty
0,1134122187,1.0,,RUDNICKE,CHERYL,DENISE,MRS.,,CRNP,250 25TH AVE N,STE 412,NASHVILLE,TN,372031632.0,37203,Nurse Practitioner
1,1003819046,1.0,,NYLANDER,BARBARA,H,,,M.D.,345 23RD AVE N,SUITE 209,NASHVILLE,TN,372031513.0,37203,Obstetrics & Gynecology
2,1750384780,1.0,,PERRIGIN,JULIE,A,DR.,,MD,219 CHURCH ST,,DICKSON,TN,370551303.0,37055,Family Medicine
3,1922001957,1.0,,PRESLEY,RICHARD,E,,,M.D.,2011 MURPHY AVE,STE 302,NASHVILLE,TN,372032023.0,37203,Obstetrics & Gynecology
4,1073516001,1.0,,ROSS,DAVID,L,DR.,,MD,127 CRESTVIEW PARK DR,,DICKSON,TN,370552850.0,37055,Internal Medicine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38409,1477145944,1.0,,HILL,DOMINIQUE,DANIELLE,MRS.,,"M.ED., LPC",1712 BISCAYNE DR,,FRANKLIN,TN,370671475.0,37067,Counselor
38410,1447842935,1.0,,CLEVELAND,MEREDITH,B,MRS.,,"MA, LPC-MHSP",5205 MARYLAND WAY STE 310,,BRENTWOOD,TN,370271049.0,37027,Counselor
38411,1639761190,1.0,,ARNDT,VERONICA,LEE,,,NP,30 HAMILTON STATION XING APT 104,,LEBANON,TN,370879017.0,37087,Registered Nurse
38412,1588256051,1.0,,WRAY,DAWN,MICHELLE,,,"MSN, APRN, FNP-C",3712 OLD CLARKSVILLE PIKE,,JOELTON,TN,370808818.0,37080,Nurse Practitioner


### Also Ignore - Sub the temp table from above into the nashville_referrals_internal query

In [21]:
# Write a query to get every unique from-to NPI pair where
# The from_npi is an individual provider
# The to_npi is an organization
# Both of the NPIs are located in the Nashville CBSA

nashville_referrals_internal = """
WITH nashville_providers AS (
    SELECT * from nash_providers
)
,

referrals_to AS (
    SELECT rto.*
    , CAST(n2.zip_5 AS INTEGER) AS from_zip
    , s.Classification AS from_npi_specialty
    , CAST(n.zip_5 AS INTEGER) AS to_zip
    , n.specialty AS to_npi_specialty
    
    FROM nashville_providers AS n

    JOIN nashville_referrals AS rto
        ON rto.to_npi = n.npi
    
    JOIN nashville_providers AS n2
        ON n2.npi = rto.from_npi
    
    LEFT JOIN specialty AS s
        ON s.npi = rto.from_npi
    
    -- TONY: Comment out the filters below to prepare the data for neo4j
    WHERE n.entity_type_code = 2 -- only get referrals made *to* organizations
    AND n2.entity_type_code = 1 -- only get referrrals made *by* individuals

)
,

referrals_from AS (
    SELECT rfrom.*
    , CAST(n.zip_5 AS INTEGER) AS from_zip
    , n.specialty AS from_npi_specialty
    , CAST(n2.zip_5 AS INTEGER) AS to_zip
    , s.Classification AS to_npi_specialty

    FROM nashville_providers AS n

    JOIN nashville_referrals AS rfrom
        ON rfrom.from_npi = n.npi

    JOIN nashville_providers AS n2
        ON n2.npi = rfrom.to_npi

    LEFT JOIN specialty AS s
        ON s.npi = rfrom.to_npi
    
    -- TONY: Comment out the filters below to prepare the data for neo4j
    WHERE n.entity_type_code = 1 -- only get referrals made *by* individual providers
    AND n2.entity_type_code = 2 -- only get referrals made *to* organizations
)

SELECT rto.*

FROM referrals_to AS rto

UNION -- use this to ensure that the results are de-duplicated

SELECT rfrom.*

FROM referrals_from AS rfrom

;
"""
nashville_referrals_internal = pd.read_sql(nashville_referrals_internal, db)

In [22]:
nashville_referrals_internal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41419 entries, 0 to 41418
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               41419 non-null  int64  
 1   from_npi            41419 non-null  int64  
 2   to_npi              41419 non-null  int64  
 3   patient_count       41419 non-null  int64  
 4   transaction_count   41419 non-null  int64  
 5   average_day_wait    41419 non-null  float64
 6   std_day_wait        41419 non-null  float64
 7   from_zip            41419 non-null  int64  
 8   from_npi_specialty  41354 non-null  object 
 9   to_zip              41419 non-null  int64  
 10  to_npi_specialty    41193 non-null  object 
dtypes: float64(2), int64(7), object(2)
memory usage: 3.5+ MB


In [23]:
# Write out the complete dataset to a csv to upload into neo4j
nashville_referrals_internal.to_csv('../data/nashville_referrals_internal.csv', index = False)

In [24]:
db.close()

# Start Here - this is where the data exploration starts

In [383]:
# Visual 1: All competitors-only providers with "significant" referrals to 
# other hospitals. So, top N quantile, by patient count. Will a sunburst work
# for this?

# Visual 2: All competitors-only providers by specialty with their total patient
# counts. Limit to top 10 specialties by volume, or choose some other type of 
# visual that will convey a larger amount of data in a usable way.

# Visual 3: For each specialty, who are the top providers?

# The 3 above answer the basic questions. What else can we do that's interesting?



In [55]:
competitor_referrals = pd.read_csv('../data/nashville_referrals_internal.csv')

In [56]:
competitor_referrals.head()

Unnamed: 0,index,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait,from_zip,from_npi_specialty,to_zip,to_npi_specialty
0,706,1003963976,1003028770,2535,3945,0.0,0.0,37174,Dermatology,37174,Dermatology
1,9754,1033246640,1003863580,58,58,45.603,56.574,37232,Family Medicine,37027,Pathology
2,9755,1033215157,1003863580,124,126,22.833,53.329,38401,Specialist,37027,Pathology
3,9760,1023223898,1003863580,1739,1872,0.169,5.185,37027,Pathology,37027,Pathology
4,9763,1023253549,1003863580,34,53,31.887,50.676,37214,Chiropractor,37027,Pathology


In [57]:
competitor_referrals.drop('index', axis = 1, inplace=True)

In [58]:
competitor_referrals = competitor_referrals[competitor_referrals['to_npi_specialty'] == 'General Acute Care Hospital']


In [59]:
competitor_referrals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6436 entries, 38 to 41215
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   from_npi            6436 non-null   int64  
 1   to_npi              6436 non-null   int64  
 2   patient_count       6436 non-null   int64  
 3   transaction_count   6436 non-null   int64  
 4   average_day_wait    6436 non-null   float64
 5   std_day_wait        6436 non-null   float64
 6   from_zip            6436 non-null   int64  
 7   from_npi_specialty  6425 non-null   object 
 8   to_zip              6436 non-null   int64  
 9   to_npi_specialty    6436 non-null   object 
dtypes: float64(2), int64(6), object(2)
memory usage: 553.1+ KB


In [60]:
db = sqlite3.connect('../data/hop_teaming.sqlite')

In [61]:
query = """
select name from sqlite_master where type = 'table'
"""

In [62]:
query = """
SELECT name FROM PRAGMA_TABLE_INFO('specialty')
"""

In [63]:
test = pd.read_sql(query,db)
test

Unnamed: 0,name
0,npi
1,primary_taxonomy
2,Classification


In [64]:
query = '''
select npi as to_npi, provider_organization_name as to_npi_orgname
from nppes
'''

In [65]:
npi_orgname = pd.read_sql(query, db)

In [66]:
npi_orgname.head()

Unnamed: 0,to_npi,to_npi_orgname
0,1679576722,
1,1588667638,
2,1497758544,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC"
3,1306849450,
4,1215930367,


In [67]:
competitor_referrals = competitor_referrals.merge(npi_orgname, how = 'left', on='to_npi')

In [68]:
competitor_referrals.head()

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait,from_zip,from_npi_specialty,to_zip,to_npi_specialty,to_npi_orgname
0,1033570015,1023055126,63,72,26.722,44.023,37203,Nurse Practitioner,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC."
1,1033532585,1023055126,152,214,40.439,59.85,37203,Nurse Practitioner,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC."
2,1043294747,1023055126,33,56,34.214,50.209,37146,Internal Medicine,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC."
3,1013179860,1023055126,95,96,32.021,37.595,37075,Internal Medicine,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC."
4,1003991167,1023055126,110,151,38.113,45.857,37203,Ophthalmology,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC."


In [69]:
competitor_referrals.to_npi_orgname.value_counts()

VANDERBILT UNIVERSITY MEDICAL CENTER       2190
HCA HEALTH SERVICES OF TENNESSEE, INC.     1198
SAINT THOMAS WEST HOSPITAL                  857
WILLIAMSON COUNTY HOSPITAL DISTRICT         332
SAINT THOMAS RUTHERFORD HOSPITAL            301
MAURY REGIONAL HOSPITAL                     292
HENDERSONVILLE HOSPITAL CORPORATION         278
HTI MEMORIAL HOSPITAL CORPORATION           270
SUMNER REGIONAL MEDICAL CENTER LLC          210
CENTRAL TENNESSEE HOSPITAL CORPORATION      172
NORTHCREST MEDICAL CENTER                   139
MACON COUNTY GENERAL HOSPITAL, INC.          59
RIVERVIEW MEDICAL CENTER LLC                 57
NASHVILLE GENERAL HOSPITAL                   43
TROUSDALE MEDICAL CENTER LLC                 24
SAINT THOMAS STONES RIVER HOSPITAL, LLC      14
Name: to_npi_orgname, dtype: int64

In [73]:
competitor_referrals.groupby(['to_npi_orgname', 'to_npi']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,from_npi,patient_count,transaction_count,average_day_wait,std_day_wait,from_zip,from_npi_specialty,to_zip,to_npi_specialty
to_npi_orgname,to_npi,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CENTRAL TENNESSEE HOSPITAL CORPORATION,1871530832,172,172,172,172,172,172,171,172,172
"HCA HEALTH SERVICES OF TENNESSEE, INC.",1023055126,576,576,576,576,576,576,575,576,576
"HCA HEALTH SERVICES OF TENNESSEE, INC.",1265487193,26,26,26,26,26,26,26,26,26
"HCA HEALTH SERVICES OF TENNESSEE, INC.",1720032345,175,175,175,175,175,175,175,175,175
"HCA HEALTH SERVICES OF TENNESSEE, INC.",1932146032,1,1,1,1,1,1,1,1,1
"HCA HEALTH SERVICES OF TENNESSEE, INC.",1982650024,267,267,267,267,267,267,266,267,267
"HCA HEALTH SERVICES OF TENNESSEE, INC.",1992776405,153,153,153,153,153,153,152,153,153
HENDERSONVILLE HOSPITAL CORPORATION,1538114434,278,278,278,278,278,278,277,278,278
HTI MEMORIAL HOSPITAL CORPORATION,1295780476,270,270,270,270,270,270,270,270,270
"MACON COUNTY GENERAL HOSPITAL, INC.",1417938846,59,59,59,59,59,59,59,59,59


In [74]:
query = '''
select npi as from_npi, provider_first_name as from_fname, provider_last_name as from_lname
from nppes
'''

In [75]:
npi_orgname = pd.read_sql(query, db)

In [76]:
competitor_referrals = competitor_referrals.merge(npi_orgname, how = 'left', on='from_npi')

In [77]:
competitor_referrals.head()

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait,from_zip,from_npi_specialty,to_zip,to_npi_specialty,to_npi_orgname,from_fname,from_lname
0,1033570015,1023055126,63,72,26.722,44.023,37203,Nurse Practitioner,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",LESLEY,BALLANCE
1,1033532585,1023055126,152,214,40.439,59.85,37203,Nurse Practitioner,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",DENISE,BROWN
2,1043294747,1023055126,33,56,34.214,50.209,37146,Internal Medicine,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",FRED,NORDQUIST
3,1013179860,1023055126,95,96,32.021,37.595,37075,Internal Medicine,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",GARY,YAWN
4,1003991167,1023055126,110,151,38.113,45.857,37203,Ophthalmology,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",PETER,SONKIN


In [78]:
# need from_npi's that are not in the list of from npi's where one of the vandy npi's is the to_npi

In [79]:
vandy_npis = competitor_referrals[competitor_referrals.to_npi_orgname=='VANDERBILT UNIVERSITY MEDICAL CENTER'].to_npi.unique()

In [80]:
print('vandy_npis')
print(vandy_npis)

vandy_npis
[1306889597 1396882205 1558408633]


In [81]:
refers_tovandy_npis = competitor_referrals[competitor_referrals['to_npi'].isin(vandy_npis)]['from_npi'].unique()

In [82]:
refers_tovandy_npis

array([1023099074, 1043205008, 1043232879, ..., 1194051987, 1194040535,
       1194016667], dtype=int64)

In [83]:
competitors_only = competitor_referrals[~competitor_referrals['from_npi'].isin(refers_tovandy_npis)]

In [84]:
competitors_only.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2862 entries, 0 to 6435
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   from_npi            2862 non-null   int64  
 1   to_npi              2862 non-null   int64  
 2   patient_count       2862 non-null   int64  
 3   transaction_count   2862 non-null   int64  
 4   average_day_wait    2862 non-null   float64
 5   std_day_wait        2862 non-null   float64
 6   from_zip            2862 non-null   int64  
 7   from_npi_specialty  2858 non-null   object 
 8   to_zip              2862 non-null   int64  
 9   to_npi_specialty    2862 non-null   object 
 10  to_npi_orgname      2862 non-null   object 
 11  from_fname          2862 non-null   object 
 12  from_lname          2861 non-null   object 
dtypes: float64(2), int64(6), object(5)
memory usage: 313.0+ KB


In [85]:
competitors_only.to_npi_orgname.unique()

array(['HCA HEALTH SERVICES OF TENNESSEE, INC.',
       'SAINT THOMAS RUTHERFORD HOSPITAL',
       'WILLIAMSON COUNTY HOSPITAL DISTRICT',
       'HTI MEMORIAL HOSPITAL CORPORATION',
       'SUMNER REGIONAL MEDICAL CENTER LLC',
       'SAINT THOMAS STONES RIVER HOSPITAL, LLC',
       'TROUSDALE MEDICAL CENTER LLC',
       'HENDERSONVILLE HOSPITAL CORPORATION',
       'NASHVILLE GENERAL HOSPITAL', 'SAINT THOMAS WEST HOSPITAL',
       'NORTHCREST MEDICAL CENTER', 'MAURY REGIONAL HOSPITAL',
       'CENTRAL TENNESSEE HOSPITAL CORPORATION',
       'RIVERVIEW MEDICAL CENTER LLC',
       'MACON COUNTY GENERAL HOSPITAL, INC.'], dtype=object)

In [87]:
competitors_only.sort_values(['patient_count'], ascending = False)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait,from_zip,from_npi_specialty,to_zip,to_npi_specialty,to_npi_orgname,from_fname,from_lname
1385,1417131715,1023055126,4685,6245,2.890,12.170,37203,Internal Medicine,37203,General Acute Care Hospital,"HCA HEALTH SERVICES OF TENNESSEE, INC.",JOHN,RIDDICK
1924,1710932017,1265445506,3004,3718,1.013,7.964,37072,Radiology,37067,General Acute Care Hospital,WILLIAMSON COUNTY HOSPITAL DISTRICT,ELLIOT,HIMMELFARB
3964,1598751810,1861479545,2899,3276,11.055,39.218,38401,Radiology,38401,General Acute Care Hospital,MAURY REGIONAL HOSPITAL,GARY,PODGORSKI
671,1316949191,1861479545,2842,3224,11.346,38.649,38401,Radiology,38401,General Acute Care Hospital,MAURY REGIONAL HOSPITAL,JAMES,BUTLER
5697,1538216403,1861479545,2826,3201,9.195,35.364,38401,Radiology,38401,General Acute Care Hospital,MAURY REGIONAL HOSPITAL,NATHAN,BERKLEY
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1524,1073518734,1568551455,20,64,13.828,13.798,37208,Internal Medicine,37208,General Acute Care Hospital,NASHVILLE GENERAL HOSPITAL,MARQUETTA,FAULKNER
6122,1194019034,1265445506,19,54,12.852,18.724,37122,Psychologist,37067,General Acute Care Hospital,WILLIAMSON COUNTY HOSPITAL DISTRICT,STEVE,STRIDE
4879,1245325661,1265445506,17,74,14.676,42.423,37203,Internal Medicine,37067,General Acute Care Hospital,WILLIAMSON COUNTY HOSPITAL DISTRICT,PAUL,MYERS
2404,1417382151,1669567897,16,51,8.980,31.199,37036,Social Worker,37172,General Acute Care Hospital,NORTHCREST MEDICAL CENTER,ROBYN,LOVELL


In [368]:
df = competitors_only.sort_values(['from_npi_specialty','patient_count'], 
                                  ascending = (True, False))[['from_npi_specialty','from_npi', 'from_fname','from_lname',
                                                              'to_npi_orgname', 'patient_count','average_day_wait']]
df.reset_index()
df.head()

Unnamed: 0,from_npi_specialty,from_npi,from_fname,from_lname,to_npi_orgname,patient_count,average_day_wait
2188,Allergy & Immunology,1780731661,ROBERT,VALET,"HCA HEALTH SERVICES OF TENNESSEE, INC.",345,2.851
807,Anesthesiologist Assistant,1497709752,ERIKA,GILBERT,SAINT THOMAS RUTHERFORD HOSPITAL,84,4.770
4314,Anesthesiology,1790958593,BRETT,CAMPBELL,SAINT THOMAS WEST HOSPITAL,691,4.796
382,Anesthesiology,1285950006,BRADLEY,RODGERS,MAURY REGIONAL HOSPITAL,604,2.045
3432,Anesthesiology,1114121712,SARAH,BROWN,MAURY REGIONAL HOSPITAL,598,2.672
...,...,...,...,...,...,...,...
2191,Urology,1780662221,JEFF,WHITFIELD,"HCA HEALTH SERVICES OF TENNESSEE, INC.",49,48.814
335,,1275674590,DEBORAH,SHERMAN,SAINT THOMAS WEST HOSPITAL,268,8.098
603,,1316955040,BRANDON,ALLEN,HENDERSONVILLE HOSPITAL CORPORATION,46,31.368
592,,1316955040,BRANDON,ALLEN,SUMNER REGIONAL MEDICAL CENTER LLC,40,43.446


In [369]:
excl = ['Radiology','Emergency Medicine','Nurse Anesthetist, Certified Registered',
                                         'Anesthesiology','Hospitalist','Pathology']

In [370]:
dfexcl = df[~df['from_npi_specialty'].isin(excl)]
dfexcl

Unnamed: 0,from_npi_specialty,from_npi,from_fname,from_lname,to_npi_orgname,patient_count,average_day_wait
2188,Allergy & Immunology,1780731661,ROBERT,VALET,"HCA HEALTH SERVICES OF TENNESSEE, INC.",345,2.851
807,Anesthesiologist Assistant,1497709752,ERIKA,GILBERT,SAINT THOMAS RUTHERFORD HOSPITAL,84,4.770
585,Chiropractor,1336233683,MARK,GROFF,SUMNER REGIONAL MEDICAL CENTER LLC,83,35.180
599,Chiropractor,1336233683,MARK,GROFF,HENDERSONVILLE HOSPITAL CORPORATION,50,31.597
4300,Chiropractor,1770644973,JAMES,BROWN,HENDERSONVILLE HOSPITAL CORPORATION,50,39.938
...,...,...,...,...,...,...,...
2191,Urology,1780662221,JEFF,WHITFIELD,"HCA HEALTH SERVICES OF TENNESSEE, INC.",49,48.814
335,,1275674590,DEBORAH,SHERMAN,SAINT THOMAS WEST HOSPITAL,268,8.098
603,,1316955040,BRANDON,ALLEN,HENDERSONVILLE HOSPITAL CORPORATION,46,31.368
592,,1316955040,BRANDON,ALLEN,SUMNER REGIONAL MEDICAL CENTER LLC,40,43.446


In [371]:
dftopspec = dfexcl[['from_npi_specialty','patient_count']].groupby('from_npi_specialty').sum('patient_count')
dftopspec = dftopspec.sort_values(['patient_count'], ascending = False).reset_index().head(10)
#dftopspec['from_npi_specialty'].tolist()
dftopspec

Unnamed: 0,from_npi_specialty,patient_count
0,Internal Medicine,178804
1,Nurse Practitioner,40909
2,Family Medicine,17512
3,Physician Assistant,16860
4,Surgery,15228
5,Orthopaedic Surgery,14846
6,Specialist,14583
7,Psychiatry & Neurology,10072
8,Urology,8853
9,Obstetrics & Gynecology,4213


In [374]:
df1 = dfexcl[dfexcl['from_npi_specialty'].isin(pd.array(dftopspec['from_npi_specialty']))]
df1 = df1[['to_npi_orgname', 'from_npi_specialty','patient_count']].groupby([
    'from_npi_specialty','to_npi_orgname']).sum('patient_count')
df1 = df1.sort_values(['to_npi_orgname','patient_count'], ascending = (True, False))
df1 = df1.reset_index()
df1 = df1[~df1.from_npi_specialty.isin(['Radiology','Emergency Medicine','Nurse Anesthetist, Certified Registered',
                                         'Anesthesiology','Hospitalist','Pathology'])]
df1 = df1.reset_index()
df1 

Unnamed: 0,index,from_npi_specialty,to_npi_orgname,patient_count
0,0,Internal Medicine,CENTRAL TENNESSEE HOSPITAL CORPORATION,3890
1,1,Nurse Practitioner,CENTRAL TENNESSEE HOSPITAL CORPORATION,1793
2,2,Physician Assistant,CENTRAL TENNESSEE HOSPITAL CORPORATION,1617
3,3,Surgery,CENTRAL TENNESSEE HOSPITAL CORPORATION,576
4,4,Orthopaedic Surgery,CENTRAL TENNESSEE HOSPITAL CORPORATION,484
5,5,Psychiatry & Neurology,CENTRAL TENNESSEE HOSPITAL CORPORATION,428
6,6,Urology,CENTRAL TENNESSEE HOSPITAL CORPORATION,333
7,7,Family Medicine,CENTRAL TENNESSEE HOSPITAL CORPORATION,151
8,8,Internal Medicine,"HCA HEALTH SERVICES OF TENNESSEE, INC.",54461
9,9,Nurse Practitioner,"HCA HEALTH SERVICES OF TENNESSEE, INC.",9536


In [375]:
hosp_labels = dfexcl['to_npi_orgname'].unique().tolist()
hosp_labels

['HCA HEALTH SERVICES OF TENNESSEE, INC.',
 'SAINT THOMAS RUTHERFORD HOSPITAL',
 'SUMNER REGIONAL MEDICAL CENTER LLC',
 'HENDERSONVILLE HOSPITAL CORPORATION',
 'MAURY REGIONAL HOSPITAL',
 'MACON COUNTY GENERAL HOSPITAL, INC.',
 'RIVERVIEW MEDICAL CENTER LLC',
 'WILLIAMSON COUNTY HOSPITAL DISTRICT',
 'HTI MEMORIAL HOSPITAL CORPORATION',
 'SAINT THOMAS WEST HOSPITAL',
 'NORTHCREST MEDICAL CENTER',
 'CENTRAL TENNESSEE HOSPITAL CORPORATION',
 'SAINT THOMAS STONES RIVER HOSPITAL, LLC',
 'NASHVILLE GENERAL HOSPITAL',
 'TROUSDALE MEDICAL CENTER LLC']

In [376]:
hosp_abbr = ['HCA','Rutherford','Sumner','Hendersonville','Maury','Macon','Riverview','Williamson','HTI','St Thomas',
            'Northcrest','Central Tenn','Stones River','Nash Gen','Trousdale']

In [377]:
abbr = pd.DataFrame(zip(hosp_labels, hosp_abbr), columns=('to_npi_orgname','to_orgname'))
abbr

Unnamed: 0,to_npi_orgname,to_orgname
0,"HCA HEALTH SERVICES OF TENNESSEE, INC.",HCA
1,SAINT THOMAS RUTHERFORD HOSPITAL,Rutherford
2,SUMNER REGIONAL MEDICAL CENTER LLC,Sumner
3,HENDERSONVILLE HOSPITAL CORPORATION,Hendersonville
4,MAURY REGIONAL HOSPITAL,Maury
5,"MACON COUNTY GENERAL HOSPITAL, INC.",Macon
6,RIVERVIEW MEDICAL CENTER LLC,Riverview
7,WILLIAMSON COUNTY HOSPITAL DISTRICT,Williamson
8,HTI MEMORIAL HOSPITAL CORPORATION,HTI
9,SAINT THOMAS WEST HOSPITAL,St Thomas


In [378]:
df1 = df1.merge(abbr)
df1

Unnamed: 0,index,from_npi_specialty,to_npi_orgname,patient_count,to_orgname
0,0,Internal Medicine,CENTRAL TENNESSEE HOSPITAL CORPORATION,3890,Central Tenn
1,1,Nurse Practitioner,CENTRAL TENNESSEE HOSPITAL CORPORATION,1793,Central Tenn
2,2,Physician Assistant,CENTRAL TENNESSEE HOSPITAL CORPORATION,1617,Central Tenn
3,3,Surgery,CENTRAL TENNESSEE HOSPITAL CORPORATION,576,Central Tenn
4,4,Orthopaedic Surgery,CENTRAL TENNESSEE HOSPITAL CORPORATION,484,Central Tenn
5,5,Psychiatry & Neurology,CENTRAL TENNESSEE HOSPITAL CORPORATION,428,Central Tenn
6,6,Urology,CENTRAL TENNESSEE HOSPITAL CORPORATION,333,Central Tenn
7,7,Family Medicine,CENTRAL TENNESSEE HOSPITAL CORPORATION,151,Central Tenn
8,8,Internal Medicine,"HCA HEALTH SERVICES OF TENNESSEE, INC.",54461,HCA
9,9,Nurse Practitioner,"HCA HEALTH SERVICES OF TENNESSEE, INC.",9536,HCA


In [379]:
fig = px.bar(df1, y='to_orgname', x='patient_count', color = 'from_npi_specialty', 
             title = 'Providers with no referrals to Vanderbilt<br>(selected specialties excluded)',
             #orientation = 'h',
             labels = {'to_orgname':'Referred to', 'patient_count':'Referral count','from_npi_specialty': 'Referred by (top 10)'})
#fig.update_layout(legend=dict(orientation = 'h', yanchor = 'top', y=1.02,xanchor = 'right', x=1))
fig.show()

In [380]:
df2 = dfexcl[dfexcl['from_npi_specialty'].isin(pd.array(dftopspec['from_npi_specialty']))]
df2 = df2.groupby(['from_npi_specialty']).head(10)
df2['from_npi'] = df2['from_npi'].astype(str)
df2

Unnamed: 0,from_npi_specialty,from_npi,from_fname,from_lname,to_npi_orgname,patient_count,average_day_wait
2759,Family Medicine,1922394840,TRAVIS,GRAVES,HTI MEMORIAL HOSPITAL CORPORATION,745,1.922
2962,Family Medicine,1932188554,DAVID,TURNER,MAURY REGIONAL HOSPITAL,500,30.898
4809,Family Medicine,1679832661,TEJASVI,KOMMULA,HENDERSONVILLE HOSPITAL CORPORATION,388,0.689
4011,Family Medicine,1649293390,DAVID,SELLERS,SAINT THOMAS RUTHERFORD HOSPITAL,350,3.366
1319,Family Medicine,1386698058,YEKOLLA,REDDY,MAURY REGIONAL HOSPITAL,335,21.271
4804,Family Medicine,1700823713,JEFF,TODD,"SAINT THOMAS STONES RIVER HOSPITAL, LLC",332,10.633
6369,Family Medicine,1225039290,ROBERT,RICHTER,RIVERVIEW MEDICAL CENTER LLC,290,1.497
1233,Family Medicine,1376544767,JAMES,SPURLOCK,"SAINT THOMAS STONES RIVER HOSPITAL, LLC",285,7.863
720,Family Medicine,1467447664,MATTHEW,BRUST,"HCA HEALTH SERVICES OF TENNESSEE, INC.",278,1.377
4847,Family Medicine,1689647489,ILABEN,PATEL,"HCA HEALTH SERVICES OF TENNESSEE, INC.",276,4.44


In [381]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 2759 to 1942
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   from_npi_specialty  100 non-null    object 
 1   from_npi            100 non-null    object 
 2   from_fname          100 non-null    object 
 3   from_lname          100 non-null    object 
 4   to_npi_orgname      100 non-null    object 
 5   patient_count       100 non-null    int64  
 6   average_day_wait    100 non-null    float64
dtypes: float64(1), int64(1), object(5)
memory usage: 6.2+ KB


In [382]:
fig = px.bar(df2[df2['from_npi_specialty'].isin(['Internal Medicine','Family Medicine','Nurse Practitioner',
                                                'Physician Assistant'])],
                 y='from_lname', x='patient_count',
             color = 'from_npi_specialty', 
             facet_col = 'from_npi_specialty',
             facet_col_wrap=2,
             title = 'Providers with no referrals to Vanderbilt',
             #height = 1200,
             log_x=True
            )
fig.update_yaxes(matches=None, title = None)


fig.show()

In [384]:
dfexcl.head()

Unnamed: 0,from_npi_specialty,from_npi,from_fname,from_lname,to_npi_orgname,patient_count,average_day_wait
2188,Allergy & Immunology,1780731661,ROBERT,VALET,"HCA HEALTH SERVICES OF TENNESSEE, INC.",345,2.851
807,Anesthesiologist Assistant,1497709752,ERIKA,GILBERT,SAINT THOMAS RUTHERFORD HOSPITAL,84,4.77
585,Chiropractor,1336233683,MARK,GROFF,SUMNER REGIONAL MEDICAL CENTER LLC,83,35.18
599,Chiropractor,1336233683,MARK,GROFF,HENDERSONVILLE HOSPITAL CORPORATION,50,31.597
4300,Chiropractor,1770644973,JAMES,BROWN,HENDERSONVILLE HOSPITAL CORPORATION,50,39.938


In [385]:
dfexcl['rept_name'] = dfexcl['from_fname'].str[0] + '_' + (dfexcl['from_lname'].str.lower()).str.capitalize()
dfexcl



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,from_npi_specialty,from_npi,from_fname,from_lname,to_npi_orgname,patient_count,average_day_wait,rept_name
2188,Allergy & Immunology,1780731661,ROBERT,VALET,"HCA HEALTH SERVICES OF TENNESSEE, INC.",345,2.851,R_Valet
807,Anesthesiologist Assistant,1497709752,ERIKA,GILBERT,SAINT THOMAS RUTHERFORD HOSPITAL,84,4.770,E_Gilbert
585,Chiropractor,1336233683,MARK,GROFF,SUMNER REGIONAL MEDICAL CENTER LLC,83,35.180,M_Groff
599,Chiropractor,1336233683,MARK,GROFF,HENDERSONVILLE HOSPITAL CORPORATION,50,31.597,M_Groff
4300,Chiropractor,1770644973,JAMES,BROWN,HENDERSONVILLE HOSPITAL CORPORATION,50,39.938,J_Brown
...,...,...,...,...,...,...,...,...
2191,Urology,1780662221,JEFF,WHITFIELD,"HCA HEALTH SERVICES OF TENNESSEE, INC.",49,48.814,J_Whitfield
335,,1275674590,DEBORAH,SHERMAN,SAINT THOMAS WEST HOSPITAL,268,8.098,D_Sherman
603,,1316955040,BRANDON,ALLEN,HENDERSONVILLE HOSPITAL CORPORATION,46,31.368,B_Allen
592,,1316955040,BRANDON,ALLEN,SUMNER REGIONAL MEDICAL CENTER LLC,40,43.446,B_Allen


In [386]:
dfexcl['ref_ctgry'] = pd.qcut(dfexcl['patient_count'], q=10,
                             labels = ['Q1','Q2','Q3','Q4','Q5','Q6','Q7', 'Q8','Q9','Q0'])
dfexcl.sort_values('patient_count', ascending = False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,from_npi_specialty,from_npi,from_fname,from_lname,to_npi_orgname,patient_count,average_day_wait,rept_name,ref_ctgry
1385,Internal Medicine,1417131715,JOHN,RIDDICK,"HCA HEALTH SERVICES OF TENNESSEE, INC.",4685,2.890,J_Riddick,Q0
5730,Internal Medicine,1558461970,ROBERT,WHEATLEY,"HCA HEALTH SERVICES OF TENNESSEE, INC.",2446,4.180,R_Wheatley,Q0
5463,Internal Medicine,1942290333,REGINALD,DICKERSON,SUMNER REGIONAL MEDICAL CENTER LLC,1975,7.214,R_Dickerson,Q0
1407,Internal Medicine,1053337717,SUNIL,KAZA,HTI MEMORIAL HOSPITAL CORPORATION,1824,5.974,S_Kaza,Q0
5354,Internal Medicine,1972658060,TERRY,KETCH,HTI MEMORIAL HOSPITAL CORPORATION,1770,5.535,T_Ketch,Q0
...,...,...,...,...,...,...,...,...,...
5483,Internal Medicine,1962652487,ROBIN,JACOB,NASHVILLE GENERAL HOSPITAL,20,0.074,R_Jacob,Q1
6122,Psychologist,1194019034,STEVE,STRIDE,WILLIAMSON COUNTY HOSPITAL DISTRICT,19,12.852,S_Stride,Q1
4879,Internal Medicine,1245325661,PAUL,MYERS,WILLIAMSON COUNTY HOSPITAL DISTRICT,17,14.676,P_Myers,Q1
2404,Social Worker,1417382151,ROBYN,LOVELL,NORTHCREST MEDICAL CENTER,16,8.980,R_Lovell,Q1


In [387]:
fig4 = px.sunburst(dfexcl[dfexcl['ref_ctgry']=='Q0'].dropna(),
                  path = ['from_npi_specialty', 'rept_name'], values = 'patient_count',
                  title = 'Some title')
fig4.show()