# Looking At Nashville Referrals Summarized

**- Build a profile of providers referring patients to the major hospitals in Nashville**<br>
**- Are certain specialties more likely to refer to a particular hospital over the others?**

In [1]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm
import plotly.graph_objects as go

In [2]:
# Let pandas render up to 500 columns and 500 rows before truncating a displayed frame
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

**Reading Data**

In [3]:
# Load the extracts tagged 6436 (an earlier run used the same file names tagged 5643)
nashville_referrals = pd.read_csv(
    "../data/nashville-referrals-summarized-6436.csv"
)
normalized = pd.read_csv(
    "../data/nashville_referrals_normalised_only_hospitals_6436.csv"
)

**Cleaning Data**

In [4]:
# Normalized hospital lookup:
# store both NPIs as text, then keep only the two NPI columns and the two
# facility-label columns (casts for from_zip / from_entity_type_code remain disabled).
normalized = normalized.astype({"from_npi": str, "to_npi": str})[
    ["from_npi", "to_npi", "to_facility_group", "to_facility_name_normalised"]
]

In [5]:
# nashville_referrals, in one pass:
#   1. store NPIs and ZIP codes as text
#   2. give the taxonomy columns "referrer." / "hospital." prefixed names
#   3. discard hospital.first_name
nashville_referrals = (
    nashville_referrals
    .astype(dict.fromkeys(
        ["referrer.npi", "referrer.zip", "hospital.npi", "hospital.zip"], str
    ))
    .rename(columns={
        "r_cl.classification_name": "referrer.classification",
        "r_gp.grouping_name": "referrer.grouping",
        "r_sp.specialization_name": "referrer.specialization",
        "h_cl.classification_name": "hospital.classification",
        "h_gp.grouping_name": "hospital.grouping",
        "h_sp.specialization_name": "hospital.specialization",
    })
    .drop(columns=["hospital.first_name"])
)

In [6]:
# Attach to_facility_group and to_facility_name_normalised to every
# (referrer NPI, hospital NPI) pair. Inner join: pairs with no match in
# `normalized` are dropped.
nashville_referrals = pd.merge(
    nashville_referrals,
    normalized,
    how="inner",
    left_on=["referrer.npi", "hospital.npi"],
    right_on=["from_npi", "to_npi"],
)

In [7]:
# Final cleanup: move the facility labels under the "hospital." prefix and
# discard the join keys, which repeat referrer.npi / hospital.npi after the inner join.
nashville_referrals = (
    nashville_referrals
    .rename(columns={
        "to_facility_group": "hospital.facility_group",
        "to_facility_name_normalised": "hospital.name_normalized",
    })
    .drop(columns=["from_npi", "to_npi"])
)

In [8]:
# Column names, non-null counts and dtypes of the cleaned, joined frame
nashville_referrals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6436 entries, 0 to 6435
Data columns (total 31 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   referrer.npi                    6436 non-null   object 
 1   referrer.credential             6292 non-null   object 
 2   referrer.name_prefix            2748 non-null   object 
 3   referrer.first_name             6436 non-null   object 
 4   referrer.last_name              6435 non-null   object 
 5   referrer.business_address_1     6436 non-null   object 
 6   referrer.business_address_2     2844 non-null   object 
 7   referrer.city                   6436 non-null   object 
 8   referrer.state                  6436 non-null   object 
 9   referrer.zip                    6436 non-null   object 
 10  referrer.primary_taxonomy_code  6425 non-null   object 
 11  referrer.classification         6425 non-null   object 
 12  referrer.grouping               64

## Analyzing

**Note:**
- `from_npi` - The provider seen first in sequence, coded by NPI
- `to_npi` - The provider seen second in sequence, coded by NPI
- `patient_count` - The total number of patients shared between the two providers over the entire time period (the time period is typically one year)
- `transaction_count` - The count of times that a patient switched between the two providers, in the from-to direction.
- `average_day_wait` - The average number of days it took for a “HOP” to occur, i.e. the time, in days, it took for a patient to switch to the second provider after having seen the first provider.
- `std_day_wait` - The standard deviation of days it took for a HOP to occur.

**Which are the unique hospitals in this dataset? How many referrals do they get overall?**

In [9]:
# Aggregated view: one row per hospital.facility_group with the number of referral
# rows and the summed transaction / patient counts. Largest count first; equal
# counts are ordered alphabetically by group name.
# NOTE(review): count_referrer counts rows, so a referrer linked to several hospitals
# of one group is counted once per hospital -- confirm that is intended.
(
    nashville_referrals
    .groupby(["hospital.facility_group"])
    .agg(
        count_referrer=("referrer.npi", "count"),
        sum_transaction_count=("referral.transaction_count", "sum"),
        sum_patient_count=("referral.patient_count", "sum"),
    )
    .sort_values(
        by=["count_referrer", "hospital.facility_group"],
        ascending=[False, True],
    )
    .reset_index()
)

Unnamed: 0,hospital.facility_group,count_referrer,sum_transaction_count,sum_patient_count
0,Vanderbilt University Medical Center,2190,654125,441417
1,HCA,1918,559365,427083
2,Ascension Saint Thomas,1172,345368,284948
3,Williamson Medical Center,332,98355,71272
4,Maury Regional Medical Center,292,132087,96836
5,Sumner Regional Medical Center,210,67912,46664
6,NorthCrest Medical Center,139,41807,27041
7,Macon County General Hospital,59,13892,8083
8,Riverview Regional Medical Center,57,13719,6689
9,Nashville General Hosptial,43,6061,3735


In [10]:
# Individual view: one row per hospital (NPI, normalized name, facility group,
# street address) with the number of referral rows and the summed
# transaction / patient counts, largest count first.
(
    nashville_referrals
    .groupby([
        "hospital.npi",
        "hospital.name_normalized",
        "hospital.facility_group",
        "hospital.business_address_1",
    ])
    .agg(
        count_referrer=("referrer.npi", "count"),
        sum_transaction_count=("referral.transaction_count", "sum"),
        sum_patient_count=("referral.patient_count", "sum"),
    )
    .sort_values(by=["count_referrer"], ascending=False)
    .reset_index()
)

Unnamed: 0,hospital.npi,hospital.name_normalized,hospital.facility_group,hospital.business_address_1,count_referrer,sum_transaction_count,sum_patient_count
0,1396882205,Vanderbilt University Medical Center,Vanderbilt University Medical Center,1211 MEDICAL CENTER DRIVE,2008,603385,405124
1,1023055126,Centennial Medical Center HCA,HCA,2300 PATTERSON ST,576,198922,146311
2,1629025648,Saint Thomas West Hospital,Ascension Saint Thomas,4220 HARDING RD,436,157359,130030
3,1780778969,Saint Thomas Midtown Hospital,Ascension Saint Thomas,2000 CHURCH ST,421,90255,75331
4,1265445506,Williamson County Hospital,Williamson Medical Center,4321 CAROTHERS PARKWAY,332,98355,71272
5,1164590386,Saint Thomas Rutherford Hospital,Ascension Saint Thomas,1700 MEDICAL CENTER PKWY,301,94125,77124
6,1861479545,Maury Regional Medical Center,Maury Regional Medical Center,1224 TROTWOOD AVE,292,132087,96836
7,1538114434,Hendersonville Medical Center HCA,HCA,355 NEW SHACKLE ISLAND RD,278,63052,48660
8,1295780476,TriStar Skyline Medical Center HCA,HCA,3441 DICKERSON PIKE,270,86255,69926
9,1982650024,Summit Medical Center HCA,HCA,5655 FRIST BLVD,267,84278,66381


**Who refers to whom?**

In [11]:
# Preview the first three referrer -> hospital rows with all columns
nashville_referrals.head(3)

Unnamed: 0,referrer.npi,referrer.credential,referrer.name_prefix,referrer.first_name,referrer.last_name,referrer.business_address_1,referrer.business_address_2,referrer.city,referrer.state,referrer.zip,referrer.primary_taxonomy_code,referrer.classification,referrer.grouping,referrer.specialization,hospital.npi,hospital.org_name,hospital.business_address_1,hospital.business_address_2,hospital.city,hospital.state,hospital.zip,hospital.primary_taxonomy_code,hospital.classification,hospital.grouping,hospital.specialization,referral.average_day_wait,referral.transaction_count,referral.std_day_wait,referral.patient_count,hospital.facility_group,hospital.name_normalized
0,1184619124,M.D.,DR.,RICHARD,RUTHERFORD,133 HOSPITAL DR,SUITE 500,CARTHAGE,TN,37030,207Q00000X,Family Medicine,Allopathic & Osteopathic Physicians,,1417938846,"MACON COUNTY GENERAL HOSPITAL, INC.",204 MEDICAL DRIVE,,LAFAYETTE,TN,37083,282NC0060X,General Acute Care Hospital,Hospitals,Critical Access,34.507,67,41.94,44,Macon County General Hospital,Macon County General Hospital
1,1225027014,DO,DR.,PAUL,KUDELKO,353 NEW SHACKLE ISLAND RD,SUITE 300C,HENDERSONVILLE,TN,37075,207RC0000X,Internal Medicine,Allopathic & Osteopathic Physicians,Cardiovascular Disease,1417938846,"MACON COUNTY GENERAL HOSPITAL, INC.",204 MEDICAL DRIVE,,LAFAYETTE,TN,37083,282NC0060X,General Acute Care Hospital,Hospitals,Critical Access,49.829,199,58.804,172,Macon County General Hospital,Macon County General Hospital
2,1184887655,M.D.,DR.,MEGAN,MASON,300 STEAM PLANT RD,SUITE 300,GALLATIN,TN,37066,2084N0400X,Psychiatry & Neurology,Allopathic & Osteopathic Physicians,Neurology,1417938846,"MACON COUNTY GENERAL HOSPITAL, INC.",204 MEDICAL DRIVE,,LAFAYETTE,TN,37083,282NC0060X,General Acute Care Hospital,Hospitals,Critical Access,39.649,74,51.735,53,Macon County General Hospital,Macon County General Hospital


**Focusing on referrer.classification**

In [12]:
# Referral volume for every (referrer classification, hospital) combination,
# ordered by number of referral rows. The transaction total deliberately keeps
# its source column name; only the hospital name and the other two aggregates
# are renamed.
nash_ref_by_classification = (
    nashville_referrals
    .groupby([
        "referrer.classification",
        "hospital.name_normalized",
        "hospital.facility_group",
    ])
    .agg(**{
        "count_referrer": ("referrer.npi", "count"),
        "referral.transaction_count": ("referral.transaction_count", "sum"),
        "sum_patient_count": ("referral.patient_count", "sum"),
    })
    .reset_index()
    .rename(columns={"hospital.name_normalized": "hospital.name"})
    .sort_values(by=["count_referrer"], ascending=False)
    .reset_index(drop=True)
)

display(nash_ref_by_classification.shape)
display(nash_ref_by_classification.head())

(483, 6)

Unnamed: 0,referrer.classification,hospital.name,hospital.facility_group,count_referrer,referral.transaction_count,sum_patient_count
0,Internal Medicine,Vanderbilt University Medical Center,Vanderbilt University Medical Center,600,242121,138232
1,Nurse Practitioner,Vanderbilt University Medical Center,Vanderbilt University Medical Center,300,49934,33901
2,Internal Medicine,Centennial Medical Center HCA,HCA,165,77587,50122
3,Internal Medicine,Saint Thomas West Hospital,Ascension Saint Thomas,161,62814,46091
4,Internal Medicine,Saint Thomas Midtown Hospital,Ascension Saint Thomas,155,32525,25285


In [13]:
# Show the aggregated table (how many rows render is bounded by display.max_rows)
nash_ref_by_classification

Unnamed: 0,referrer.classification,hospital.name,hospital.facility_group,count_referrer,referral.transaction_count,sum_patient_count
0,Internal Medicine,Vanderbilt University Medical Center,Vanderbilt University Medical Center,600,242121,138232
1,Nurse Practitioner,Vanderbilt University Medical Center,Vanderbilt University Medical Center,300,49934,33901
2,Internal Medicine,Centennial Medical Center HCA,HCA,165,77587,50122
3,Internal Medicine,Saint Thomas West Hospital,Ascension Saint Thomas,161,62814,46091
4,Internal Medicine,Saint Thomas Midtown Hospital,Ascension Saint Thomas,155,32525,25285
5,Family Medicine,Vanderbilt University Medical Center,Vanderbilt University Medical Center,114,18958,11842
6,Internal Medicine,Williamson County Hospital,Williamson Medical Center,104,30898,20348
7,Anesthesiology,Vanderbilt University Medical Center,Vanderbilt University Medical Center,103,20350,19242
8,Psychiatry & Neurology,Vanderbilt University Medical Center,Vanderbilt University Medical Center,103,24302,15429
9,Internal Medicine,Saint Thomas Rutherford Hospital,Ascension Saint Thomas,96,31736,23919


**Sankey Diagram**

In [14]:
# Sankey node labels: every distinct referrer classification followed by every
# distinct hospital name, in order of first appearance. A label's position in
# this list is the integer id used by the link source/target arrays.
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
all_nodes = list(
    pd.concat([
        nash_ref_by_classification["referrer.classification"],
        nash_ref_by_classification["hospital.name"],
    ]).unique()
)
len(all_nodes)

70

In [15]:
# Link sources: the referrer classification of every aggregated row, in row order.
# Labels repeat (one entry per row); that is expected, since each row is one link.

source_data = nash_ref_by_classification["referrer.classification"]
len(source_data)

483

In [16]:
# Translate every link's source label into its node id (position in all_nodes).
# A dict lookup replaces the nested linear scan. A label that is missing from
# all_nodes now raises KeyError instead of being skipped silently, which would
# have shifted all later links out of step with the target/value lists.
node_position = {}
for position, label in enumerate(all_nodes):
    # first occurrence wins, matching the scan-and-break this replaces
    node_position.setdefault(label, position)

source_data_idx = [node_position[label] for label in source_data]

len(source_data_idx)

483

In [17]:
# Link targets: the hospital name of every aggregated row, in row order.
# Labels repeat (one entry per row); that is expected, since each row is one link.

target_data = nash_ref_by_classification["hospital.name"]
len(target_data)

483

In [18]:
# Translate every link's target label into its node id (position in all_nodes).
# A dict lookup replaces the nested linear scan. A label that is missing from
# all_nodes now raises KeyError instead of being skipped silently, which would
# have shifted all later links out of step with the source/value lists.
node_position = {}
for position, label in enumerate(all_nodes):
    # first occurrence wins, matching the scan-and-break this replaces
    node_position.setdefault(label, position)

target_data_idx = [node_position[label] for label in target_data]

len(target_data_idx)

483

In [19]:
# Re-display the aggregated table that the link source/target/value lists are read from
nash_ref_by_classification

Unnamed: 0,referrer.classification,hospital.name,hospital.facility_group,count_referrer,referral.transaction_count,sum_patient_count
0,Internal Medicine,Vanderbilt University Medical Center,Vanderbilt University Medical Center,600,242121,138232
1,Nurse Practitioner,Vanderbilt University Medical Center,Vanderbilt University Medical Center,300,49934,33901
2,Internal Medicine,Centennial Medical Center HCA,HCA,165,77587,50122
3,Internal Medicine,Saint Thomas West Hospital,Ascension Saint Thomas,161,62814,46091
4,Internal Medicine,Saint Thomas Midtown Hospital,Ascension Saint Thomas,155,32525,25285
5,Family Medicine,Vanderbilt University Medical Center,Vanderbilt University Medical Center,114,18958,11842
6,Internal Medicine,Williamson County Hospital,Williamson Medical Center,104,30898,20348
7,Anesthesiology,Vanderbilt University Medical Center,Vanderbilt University Medical Center,103,20350,19242
8,Psychiatry & Neurology,Vanderbilt University Medical Center,Vanderbilt University Medical Center,103,24302,15429
9,Internal Medicine,Saint Thomas Rutherford Hospital,Ascension Saint Thomas,96,31736,23919


In [20]:
# Link values: the summed patient count of every aggregated row, taken in the
# same row order as source_data / target_data so the three lists line up.

total_patient_count = list(nash_ref_by_classification["sum_patient_count"])
len(total_patient_count)

483

**Now building the diagram**

In [21]:
# Totals per referrer classification across all hospitals, ranked by summed
# patient count (used to decide which classifications to keep as their own node).
(
    nashville_referrals
    .groupby(["referrer.classification"])
    .agg(
        sum_transaction_count=("referral.transaction_count", "sum"),
        sum_patient_count=("referral.patient_count", "sum"),
    )
    .reset_index()
    .sort_values(by=["sum_patient_count"], ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,referrer.classification,sum_transaction_count,sum_patient_count
0,Internal Medicine,678099,435234
1,Radiology,397137,361987
2,Nurse Practitioner,121343,84351
3,Emergency Medicine,81652,74142
4,Family Medicine,102980,57359
5,"Nurse Anesthetist, Certified Registered",49618,48659
6,Pathology,62149,47834
7,Anesthesiology,50313,47380
8,Orthopaedic Surgery,56882,33886
9,Psychiatry & Neurology,42267,28756


In [22]:
# Classifications ranked 11th and lower by summed patient count; the 10 largest
# keep their own label.
to_others_list = list(
    nashville_referrals
    .groupby(["referrer.classification"])
    .agg({"referral.patient_count": "sum"})
    .sort_values(by=["referral.patient_count"], ascending=False)
    .index[10:]
)

# Relabel them as "Others" in nash_ref_by_classification with one vectorised
# assignment (replaces a per-row loop that repeated the same .loc update).
# Rows are only relabelled, not re-aggregated, so several "Others" rows can
# remain for the same hospital.
is_minor = nash_ref_by_classification["referrer.classification"].isin(to_others_list)
nash_ref_by_classification.loc[is_minor, "referrer.classification"] = "Others"

nash_ref_by_classification

Unnamed: 0,referrer.classification,hospital.name,hospital.facility_group,count_referrer,referral.transaction_count,sum_patient_count
0,Internal Medicine,Vanderbilt University Medical Center,Vanderbilt University Medical Center,600,242121,138232
1,Nurse Practitioner,Vanderbilt University Medical Center,Vanderbilt University Medical Center,300,49934,33901
2,Internal Medicine,Centennial Medical Center HCA,HCA,165,77587,50122
3,Internal Medicine,Saint Thomas West Hospital,Ascension Saint Thomas,161,62814,46091
4,Internal Medicine,Saint Thomas Midtown Hospital,Ascension Saint Thomas,155,32525,25285
5,Family Medicine,Vanderbilt University Medical Center,Vanderbilt University Medical Center,114,18958,11842
6,Internal Medicine,Williamson County Hospital,Williamson Medical Center,104,30898,20348
7,Anesthesiology,Vanderbilt University Medical Center,Vanderbilt University Medical Center,103,20350,19242
8,Psychiatry & Neurology,Vanderbilt University Medical Center,Vanderbilt University Medical Center,103,24302,15429
9,Internal Medicine,Saint Thomas Rutherford Hospital,Ascension Saint Thomas,96,31736,23919


## Rebuilding the diagram

**Using `hospital.facility_group` instead of `hospital.name`**

In [23]:
def _lump_beyond_top_n(frame, column, top_n, other_label,
                       value_column="referral.patient_count"):
    """Relabel, in place, every `column` category outside the `top_n` largest.

    Categories are ranked by the sum of `value_column`; those ranked below
    `top_n` have their `column` value replaced with `other_label` in `frame`.
    A bucket already named `other_label` is left out of the ranking, so running
    the cell a second time does not push one more real category into the bucket.

    Returns the list of category labels that were lumped together.
    """
    ranked = (
        frame
        .groupby([column])
        .agg({value_column: "sum"})
        .sort_values(by=[value_column], ascending=False)
        .drop(index=other_label, errors="ignore")
    )
    lumped = list(ranked.index[top_n:])
    frame.loc[frame[column].isin(lumped), column] = other_label
    return lumped


# Referrer classifications: keep the top 7 by patient count, lump the rest.
# nashville_referrals itself is modified, as before.
to_others_list = _lump_beyond_top_n(
    nashville_referrals, "referrer.classification", 7, "Other Specialities"
)

# Facility groups: keep the top 7 by patient count, lump the rest.
to_others_list = _lump_beyond_top_n(
    nashville_referrals, "hospital.facility_group", 7, "Others"
)

# Summed patient count for every (classification, facility group) pair,
# largest flow first; each row becomes one Sankey link.
nash_ref_by_classification = (
    nashville_referrals
    .groupby(["referrer.classification", "hospital.facility_group"])
    .agg(sum_patient_count=("referral.patient_count", "sum"))
    .reset_index()
    .sort_values(by=["sum_patient_count"], ascending=False)
    .reset_index(drop=True)
)

nash_ref_by_classification

Unnamed: 0,referrer.classification,hospital.facility_group,sum_patient_count
0,Internal Medicine,Vanderbilt University Medical Center,146878
1,Radiology,HCA,140938
2,Other Specialities,Vanderbilt University Medical Center,119250
3,Internal Medicine,HCA,116009
4,Internal Medicine,Ascension Saint Thomas,95531
5,Radiology,Vanderbilt University Medical Center,80109
6,Radiology,Ascension Saint Thomas,78156
7,Other Specialities,HCA,73838
8,Other Specialities,Ascension Saint Thomas,51724
9,Nurse Practitioner,Vanderbilt University Medical Center,35824


In [24]:
# Sankey node labels: distinct referrer classifications followed by distinct
# facility groups, in order of first appearance. pd.concat is used because
# Series.append was deprecated in pandas 1.4 and removed in pandas 2.0.
all_nodes = list(
    pd.concat([
        nash_ref_by_classification["referrer.classification"],
        nash_ref_by_classification["hospital.facility_group"],
    ]).unique()
)
len(all_nodes)

16

In [25]:
# Link sources: the referrer classification of every aggregated row, in row order
source_data = nash_ref_by_classification["referrer.classification"]
len(source_data)

61

In [26]:
# Translate every link's source label into its node id (position in all_nodes).
# A dict lookup replaces the nested linear scan. A label that is missing from
# all_nodes now raises KeyError instead of being skipped silently, which would
# have shifted all later links out of step with the target/value lists.
node_position = {}
for position, label in enumerate(all_nodes):
    # first occurrence wins, matching the scan-and-break this replaces
    node_position.setdefault(label, position)

source_data_idx = [node_position[label] for label in source_data]

len(source_data_idx)

61

In [27]:
# Link targets: the facility group of every aggregated row, in row order
target_data = nash_ref_by_classification["hospital.facility_group"]
len(target_data)

61

In [28]:
# Translate every link's target label into its node id (position in all_nodes).
# A dict lookup replaces the nested linear scan. A label that is missing from
# all_nodes now raises KeyError instead of being skipped silently, which would
# have shifted all later links out of step with the source/value lists.
node_position = {}
for position, label in enumerate(all_nodes):
    # first occurrence wins, matching the scan-and-break this replaces
    node_position.setdefault(label, position)

target_data_idx = [node_position[label] for label in target_data]

len(target_data_idx)

61

In [29]:
# Link values: summed patient count of every aggregated row, in the same row order
# as source_data / target_data
total_patient_count = list(nash_ref_by_classification["sum_patient_count"])
len(total_patient_count)

61

In [31]:
# Sankey diagram: referrer classifications on one side, facility groups on the
# other, link width given by the summed patient count.
fig = go.Figure(
    data=[
        go.Sankey(
            # Nodes: labels are the classification / facility-group names in all_nodes
            node={
                "pad": 30,          # spacing between neighbouring nodes
                "thickness": 50,    # thickness of each node
                "line": {"color": "black", "width": 0.5},  # node outline
                "label": all_nodes,
            },
            # Links: source/target are positions in the node label list;
            # the three lists are aligned entry by entry
            link={
                "source": source_data_idx,
                "target": target_data_idx,
                "value": total_patient_count,
            },
        )
    ]
)

# Styling
fig.update_layout(
    title_text="Total Patient Count Referrals: From Referrer Specialities to Facility Groups",
    font_size=14,
)

# Display figure
fig.show()