In [1]:
# Import libraries and set max row display to large so that data can be viewed.
import pandas as pd
from fuzzywuzzy import process
from itertools import chain
pd.set_option('display.max_rows', 1000)

In [2]:
# Read csvs from files.
facilities = pd.read_csv("data/revised_datasets/hop_teaming_facilities.csv")
providers = pd.read_csv("data/revised_datasets/hop_teaming_providers.csv")
p2p_referrals = pd.read_csv("data/revised_datasets/hop_teaming_provider_to_provider.csv")
p2f_referrals = pd.read_csv("data/revised_datasets/hop_teaming_provider_to_facility.csv")

In [3]:
# Exclude certain classifications from consideration as deemed outside the scope 
exclude_these = ['Pharmacy',
                 'Ambulance',
                 'Durable Medical Equipment & Medical Supplies',
                 'Radiology',
                 'Public Health or Welfare',
                 'Preferred Provider Organization',
                 'Prosthetic/Orthotic Supplier',
                 'Eyewear Supplier',
                 'Legal Medicine',
                 'Portable X-ray and/or Other Portable Diagnostic Imaging Supplier',
                 'Exclusive Provider Organization',
                 'Student in an Organized Health Care Education/Training Program',
                 'Nuclear Medicine',
                 'Dietitian, Registered',
                 'Chiropractor']

facilities = facilities.loc[~facilities['classification'].isin(exclude_these)]
providers = providers.loc[~providers['classification'].isin(exclude_these)]

In [4]:
# Affiliations do not currently represent various ambulatory surgery center groups or other specialty groups.
vandy_org_str = 'VANDERBILT|MAURY REGIONAL|WILLIAMSON MEDICAL'
st_thomas_org_str = 'SAINT|THOMAS|ASCENSION|BAPTIST'
hca_org_str = 'HCA|TRISTAR|FRIST|STONECREST|NORTHCREST|SKYLINE|SOUTHERN HILLS|PHYSICIANS PAVILION|SUMMIT|CENTENNIAL'
lifepoint_org_str = 'HIGHPOINT|SUMNER|RIVERVIEW|TROUSDALE|SOUTHERN TENNESSEE|STARR REGIONAL'
chs_org_str = 'TENNOVA'
asc_str = 'SURGERY'

In [5]:
# Assign affiliation based on organization name words/phrases as a first step.
facilities['affiliation'] = 'NO MAJOR AFFILIATION/OTHER'
facilities['affiliation'].loc[(facilities['organization_name'].str.contains(vandy_org_str)) | 
               (facilities['organization_name_other'].str.contains(vandy_org_str))] = 'VANDERBILT'
facilities['affiliation'].loc[(facilities['organization_name'].str.contains(st_thomas_org_str)) | 
               (facilities['organization_name_other'].str.contains(st_thomas_org_str))] = 'SAINT THOMAS'
facilities['affiliation'].loc[(facilities['organization_name'].str.contains(hca_org_str)) | 
               (facilities['organization_name_other'].str.contains(hca_org_str))] = 'HCA'
facilities['affiliation'].loc[(facilities['organization_name'].str.contains(lifepoint_org_str)) | 
               (facilities['organization_name_other'].str.contains(lifepoint_org_str))] = 'LIFEPOINT'
facilities['affiliation'].loc[(facilities['organization_name'].str.contains(chs_org_str)) | 
               (facilities['organization_name_other'].str.contains(chs_org_str))] = 'CHS'
facilities['affiliation'].loc[(facilities['organization_name'].str.contains(asc_str)) | 
               (facilities['organization_name_other'].str.contains(asc_str))] = 'SURGERY CENTER'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facilities['affiliation'].loc[(facilities['organization_name'].str.contains(vandy_org_str)) |
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facilities['affiliation'].loc[(facilities['organization_name'].str.contains(st_thomas_org_str)) |
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  facilities['affiliation'].loc[(facilities['organization_name'].str.contains(hca_org_str)) |
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the do

In [6]:
# Combine all address fields into one.
facilities['address'] = (
    facilities[['address_first_line', 
                'address_second_line',
                'city', 
                'state',
                'zipcode']]\
    .astype(str)\
    .agg(' '.join, axis = 1)\
    .str.replace("nan ","")
)

# Create dictionary to map affiliated entities to their addresses.
affiliated_address_dict = (
    dict(facilities[['address','affiliation']]\
         .loc[facilities['affiliation'] != 'NO MAJOR AFFILIATION/OTHER']\
         .values)
)

# Create function to fuzzy match addresses of supposedly "unaffiliated" entities with those of "affiliated" entities.
# Threshold is set rather high to try to avoid false positives.
def find_affiliation_address(addr):
    results = pd.DataFrame(process.extract(query = addr, choices = affiliated_address_dict.keys()))
    if results[1].max() >= 95:
        return results[0].loc[results[1] == results[1].max()][0]
    else:
        return addr

# Create a new column to store "matching" addresses, then map those addresses' affiliations to the originals.
facilities['affiliation_address'] = facilities['address'].apply(find_affiliation_address)
facilities['affiliation'] = (
    facilities['affiliation_address']\
    .map(affiliated_address_dict)\
    .fillna(value = 'NO MAJOR AFFILIATION/OTHER')
)

In [7]:
# Create strings to pull affiliations for providers from.
vandy_provider_str = 'VANDERBILT|MAURY|WILLIAMSON'
st_thomas_provider_str = 'SAINT|THOMAS|ASCENSION|BAPTIST'
hca_provider_str = 'HCA|TRISTAR|FRIST|STONECREST|NORTHCREST|SKYLINE|HILLS|PAVILION|SUMMIT|CENTENNIAL'
lifepoint_provider_str = 'HIGHPOINT|SUMNER|RIVERVIEW|TROUSDALE|STARR'
chs_provider_str = 'TENNOVA'
asc_provider_str = 'SURGERY'

# Filter providers dataframe to Nashville CBSA for affiliation assignment.
nashville_providers = providers[providers['cbsa'] == 34980]

# Combine all hospital affiliations into a list column.
nashville_providers['all_possible_affiliations'] = (
    nashville_providers[nashville_providers\
                        .columns[pd.Series(nashville_providers.columns)\
                                 .str.startswith('cms_hosp')]]\
    .astype(str)\
    .values.tolist()
)

# Create function to split each affiliation name in the list coluumn into words.
def split_into_words(ls):
    return [line.split() for line in ls if line != 'nan']

# Create function to score each unnested word list by words in the affiliation strings above.
def obtain_affiliation_by_score(ls):
    
    score_dict = {
        'VANDERBILT': 0,
        'SAINT THOMAS': 0,
        'HCA': 0,
        'LIFEPOINT': 0,
        'CHS': 0,
        'SURGERY CENTER': 0
    }
    
    for i in ls:
        if i in vandy_provider_str:
            score_dict['VANDERBILT'] += 1
        elif i in st_thomas_provider_str:
            score_dict['SAINT THOMAS'] += 1
        elif i in hca_provider_str:
            score_dict['HCA'] += 1
        elif i in lifepoint_provider_str:
            score_dict['LIFEPOINT'] += 1
        elif i in chs_provider_str:
            score_dict['CHS'] += 1
        elif i in asc_provider_str:
            score_dict['SURGERY CENTER'] += 1
    
    if max(score_dict.values()) == 0:
        return 'NO MAJOR AFFILIATION/OTHER'
    else:
        return max(score_dict, key = score_dict.get)

# Apply word split function to affiliations column. It comes out as a nested list, so have to use itertools to unnest.
nashville_providers['all_possible_affiliations'] = (
    nashville_providers['all_possible_affiliations']\
    .apply(lambda x: list(chain(*split_into_words(x))))
)

# Determine affiliation by score.
nashville_providers['affiliation'] = nashville_providers['all_possible_affiliations'].apply(obtain_affiliation_by_score)

# Additional clean-up based on organization name for a provider.
nashville_providers['affiliation'].loc[(nashville_providers['affiliation'] == 'NO MAJOR AFFILIATION/OTHER') &
                                      (nashville_providers['cms_org_nm'].str.contains(vandy_provider_str))] = 'VANDERBILT'
nashville_providers['affiliation'].loc[(nashville_providers['affiliation'] == 'NO MAJOR AFFILIATION/OTHER') &
                                      (nashville_providers['cms_org_nm'].str.contains(st_thomas_provider_str))] = 'SAINT THOMAS'
nashville_providers['affiliation'].loc[(nashville_providers['affiliation'] == 'NO MAJOR AFFILIATION/OTHER') &
                                      (nashville_providers['cms_org_nm'].str.contains(hca_provider_str))] = 'HCA'
nashville_providers['affiliation'].loc[(nashville_providers['affiliation'] == 'NO MAJOR AFFILIATION/OTHER') &
                                      (nashville_providers['cms_org_nm'].str.contains(lifepoint_provider_str))] = 'LIFEPOINT'
nashville_providers['affiliation'].loc[(nashville_providers['affiliation'] == 'NO MAJOR AFFILIATION/OTHER') &
                                      (nashville_providers['cms_org_nm'].str.contains(chs_provider_str))] = 'CHS'
nashville_providers['affiliation'].loc[(nashville_providers['affiliation'] == 'NO MAJOR AFFILIATION/OTHER') &
                                      (nashville_providers['cms_org_nm'].str.contains(asc_provider_str))] = 'SURGERY CENTER'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nashville_providers['all_possible_affiliations'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nashville_providers['all_possible_affiliations'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nashville_providers['affiliation'] = nashville_providers['all_possible_affiliations'].apply(obtain_

In [8]:
# Combine names and subset columns for merge.
nashville_providers['name'] = (
    nashville_providers[['first_name',
                         'middle_name',
                         'last_name',
                         'suffix',
                         'credentials']]\
    .astype(str)\
    .agg(' '.join, axis = 1)\
    .str.replace("nan ","")
)

nashville_providers = nashville_providers[['npi',
                                           'entity_type_code',
                                           'name',
                                           'address_first_line',
                                           'address_second_line',
                                           'city',
                                           'state',
                                           'zipcode',
                                           'grouping',
                                           'classification',
                                           'specialization',
                                           'affiliation']]

facilities['name'] = (
    facilities[['organization_name',
                'organization_name_other']]\
    .astype(str)\
    .agg('; '.join, axis = 1)\
    .str.replace("; nan","")
)

facilities = facilities[['npi',
                         'entity_type_code',
                         'name',
                         'address_first_line',
                         'address_second_line',
                         'city',
                         'state',
                         'zipcode',
                         'grouping',
                         'classification',
                         'specialization',
                         'affiliation']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nashville_providers['name'] = (


In [9]:
# Merge the provider information and facility information together with the facility referral table and 
# the provider referral table. Concatenate those merged tables together and write to a csv for use later.
nashville_provider_referrals = (
    pd.merge(p2p_referrals,
             nashville_providers,
             left_on = 'from_npi',
             right_on = 'npi')\
    .merge(nashville_providers,
           left_on = 'to_npi',
           right_on = 'npi',
           suffixes = ('_from','_to'))
)

facility_referrals = (
    pd.merge(p2f_referrals,
             nashville_providers,
             left_on = 'from_npi',
             right_on = 'npi')\
    .merge(facilities,
           left_on = 'to_npi',
           right_on = 'npi',
           suffixes = ('_from','_to'))
)

nashville_referrals = pd.concat([nashville_provider_referrals, facility_referrals]).drop(columns = ['from_npi','to_npi'])
nashville_referrals.to_csv('data/nashville_referrals.csv', index = False)