# Data download and cleanup

In [1]:
import os
import glob
import pandas as pd

## Download data 

Source: Data were downloaded from the [Medicare public data website](https://data.cms.gov/provider-summary-by-type-of-service/medicare-physician-other-practitioners/medicare-physician-other-practitioners-by-provider) for years 2018, 2019, 2020.

In [51]:
os.getcwd()

'/Users/oana/Documents/Stanford/Spring2023/CS230/final_project/cs230_project/code/preprocessing'

In [53]:
# assuming you're in code/preprocessing 
data_path = '../../data/raw/'
all_data_files = glob.glob(os.path.join(data_path, 'medicare_*.csv'))

data_list = []

for f in all_data_files:
    df = pd.read_csv(f, header = 0, encoding = 'latin-1')
    df['year'] = str.split(str.split(str.split(f, '/')[-1], "_")[-1], ".csv")[0]
    data_list.append(df)

data_df = pd.concat(data_list, axis=0, ignore_index = True)

  df = pd.read_csv(f, header = 0, encoding = 'latin-1')
  df = pd.read_csv(f, header = 0, encoding = 'latin-1')


In [55]:
all_data_files #just to check

['../../data/raw/medicare_raw_2019.csv',
 '../../data/raw/medicare_raw_2018.csv']

In [56]:
data_df.shape

(2277332, 74)

## Remove irrelevant columns

In [58]:
# Keep individuals only, denoted by 'I' 
data_df = data_df.loc[data_df['Rndrng_Prvdr_Ent_Cd'] == 'I']
data_df.shape

(2156627, 74)

In [59]:
# US data only
data_df = data_df.loc[data_df['Rndrng_Prvdr_Cntry'] == 'US']
data_df.shape

(2156287, 74)

In [60]:
# Only providers participating in Medicare
data_df = data_df.loc[data_df['Rndrng_Prvdr_Mdcr_Prtcptg_Ind'] == 'Y']
data_df.shape

(2154394, 74)

In [61]:
# Remove data that was supressed already (Drug-related)
data_df = data_df.loc[data_df['Drug_Sprsn_Ind'].isnull()]
data_df.shape

(1915045, 74)

In [62]:
# Remove data that was supressed already (Medical-related)
data_df = data_df.loc[data_df['Med_Sprsn_Ind'].isnull()]
data_df.shape

(1915045, 74)

In [63]:
# columns to remove are largely redundant ID-related information. Determined from the data dictionary:
# here: https://data.cms.gov/resources/medicare-physician-other-practitioners-by-provider-data-dictionary

cols_to_remove = ['Rndrng_Prvdr_Last_Org_Name', # don't need name b/c have unique ID
                 'Rndrng_Prvdr_First_Name',
                 'Rndrng_Prvdr_MI',
                 'Rndrng_Prvdr_Ent_Cd', # will be all I's
                 'Rndrng_Prvdr_St1', # only keeping zip code
                 'Rndrng_Prvdr_St2',
                 'Rndrng_Prvdr_City',
                 'Rndrng_Prvdr_State_Abrvtn',
                 'Rndrng_Prvdr_State_FIPS',
                 'Rndrng_Prvdr_RUCA', # Determined by zip code 
                 'Rndrng_Prvdr_RUCA_Desc',
                 'Rndrng_Prvdr_Cntry',
                 'Rndrng_Prvdr_Mdcr_Prtcptg_Ind',
                 'Rndrng_Prvdr_Crdntls', 
                 'Tot_Mdcr_Pymt_Amt', # Tot_Mdcr_Stdzd_Amt more specific to provider behavior
                 'Drug_Sprsn_Ind', # indicator for supressed data (which was removed)
                 'Med_Sprsn_Ind', # indicator for supressed data (which was removed)
                 'Med_Mdcr_Pymt_Amt', # Med_Mdcr_Stdzd_Amt more specific to provider
                 ]
data_df = data_df.drop(cols_to_remove, axis=1)
data_df.shape

(1915045, 56)

In [64]:
# Check representation across years
data_df.groupby(['year']).size()

year
2018    941602
2019    973443
dtype: int64

In [65]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1915045 entries, 0 to 2277331
Data columns (total 56 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Rndrng_NPI            int64  
 1   Rndrng_Prvdr_Gndr     object 
 2   Rndrng_Prvdr_Zip5     object 
 3   Rndrng_Prvdr_Type     object 
 4   Tot_HCPCS_Cds         int64  
 5   Tot_Benes             int64  
 6   Tot_Srvcs             float64
 7   Tot_Sbmtd_Chrg        float64
 8   Tot_Mdcr_Alowd_Amt    float64
 9   Tot_Mdcr_Stdzd_Amt    float64
 10  Drug_Tot_HCPCS_Cds    float64
 11  Drug_Tot_Benes        float64
 12  Drug_Tot_Srvcs        float64
 13  Drug_Sbmtd_Chrg       float64
 14  Drug_Mdcr_Alowd_Amt   float64
 15  Drug_Mdcr_Pymt_Amt    float64
 16  Drug_Mdcr_Stdzd_Amt   float64
 17  Med_Tot_HCPCS_Cds     float64
 18  Med_Tot_Benes         float64
 19  Med_Tot_Srvcs         float64
 20  Med_Sbmtd_Chrg        float64
 21  Med_Mdcr_Alowd_Amt    float64
 22  Med_Mdcr_Stdzd_Amt    float64
 23  Bene_Avg_Age

## Fix dtypes

In [66]:
#data_df = data_df.convert_dtypes()

In [67]:
# Set year to int
data_df['year'] = pd.to_numeric(data_df['year'])

In [68]:
# weird incomplete single zip code
data_df.loc[data_df['Rndrng_Prvdr_Zip5'] == '8742-', 'Rndrng_Prvdr_Zip5'] = pd.NA
data_df['Rndrng_Prvdr_Zip5'] = pd.to_numeric(data_df['Rndrng_Prvdr_Zip5'])

In [69]:
# Turn gender into a binary column 
data_df['Rndrng_Prvdr_Gndr'] = data_df['Rndrng_Prvdr_Gndr'].astype('category').cat.codes

In [70]:
# one hot encode categorical columns
data_df = pd.get_dummies(data_df, columns = ['Rndrng_Prvdr_Type'], dtype='int')

In [71]:
data_df.shape

(1915045, 146)

In [72]:
data_df.head()

Unnamed: 0,Rndrng_NPI,Rndrng_Prvdr_Gndr,Rndrng_Prvdr_Zip5,Tot_HCPCS_Cds,Tot_Benes,Tot_Srvcs,Tot_Sbmtd_Chrg,Tot_Mdcr_Alowd_Amt,Tot_Mdcr_Stdzd_Amt,Drug_Tot_HCPCS_Cds,...,Rndrng_Prvdr_Type_Slide Preparation Facility,Rndrng_Prvdr_Type_Speech Language Pathologist,Rndrng_Prvdr_Type_Sports Medicine,Rndrng_Prvdr_Type_Surgical Oncology,Rndrng_Prvdr_Type_Thoracic Surgery,Rndrng_Prvdr_Type_Undefined Physician type,Rndrng_Prvdr_Type_Undersea and Hyperbaric Medicine,Rndrng_Prvdr_Type_Unknown Supplier/Provider Specialty,Rndrng_Prvdr_Type_Urology,Rndrng_Prvdr_Type_Vascular Surgery
0,1003000126,1,21502.0,18,610,1392.0,519136.0,156626.32,125266.1,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1003000134,1,60201.0,20,3614,7835.0,1213264.0,298905.09,218268.63,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1003000480,1,80045.0,30,104,159.0,210964.25,40069.88,30002.44,0.0,...,0,0,0,0,0,0,0,0,0,0
5,1003000522,1,32725.0,46,374,1319.0,349343.44,112849.81,87218.2,8.0,...,0,0,0,0,0,0,0,0,0,0
6,1003000530,0,18951.0,33,347,1485.0,234676.0,145869.56,106806.12,7.0,...,0,0,0,0,0,0,0,0,0,0


## Add labels 

The LEIE database was downloaded from [here](https://oig.hhs.gov/exclusions/exclusions_list.asp) (5/10/2023 update). This is updated montly and indicates providers who are excluded from federally funded HC programs for various reasons. In line with prior work, use indicators most likely to be associated with Medicare fraud [source](https://journalofbigdata.springeropen.com/articles/10.1186/s40537-018-0138-3).

In [73]:
df_leie = pd.read_csv('../../data/raw/leie_5_10_2023.csv', low_memory=False)

In [74]:
# Drop irrelevant columns 
df_leie = df_leie.loc[:,['NPI','EXCLTYPE','EXCLDATE', 'WAIVERDATE']]

#drop NPI = 0
df_leie = df_leie[df_leie["NPI"]>0]

In [75]:
# Remove whitespace and then only keep relevant exclusion codes 
df_leie['EXCLTYPE'] = df_leie['EXCLTYPE'].str.strip()
df_leie = df_leie[df_leie['EXCLTYPE'].isin(['1128a1','1128a2','1128a3','1128b4','1128b7'])]

In [76]:
df_leie.shape

(5461, 4)

In [77]:
df_leie['exclusion_year'] = [x[0:4] for x in df_leie['EXCLDATE'].astype(str)]
df_leie['exclusion_year'] = df_leie['exclusion_year'].astype(int)

In [78]:
df_leie.head()

Unnamed: 0,NPI,EXCLTYPE,EXCLDATE,WAIVERDATE,exclusion_year
6,1922348218,1128a1,20180419,0,2018
30,1275600959,1128a1,20130320,0,2013
43,1265830335,1128a1,20220818,0,2022
63,1851631543,1128b7,20190326,0,2019
64,1902198435,1128a1,20160120,0,2016


In [79]:
len(df_leie['NPI'].unique())

5389

In [80]:
len(df_leie['NPI'][df_leie['exclusion_year'] >= 2020].unique())

1468

In [82]:
df_leie_after_2019 = df_leie.loc[df_leie['exclusion_year'] >= 2020]

In [86]:
len(df_leie_after_2019['NPI'].unique())

1468

In [90]:
df_leie_after_2019['excluded'] = [1] * df_leie_after_2019.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_leie_after_2019['excluded'] = [1] * df_leie_after_2019.shape[0]


In [91]:
df_leie_after_2019.head()

Unnamed: 0,NPI,EXCLTYPE,EXCLDATE,WAIVERDATE,exclusion_year,excluded
43,1265830335,1128a1,20220818,0,2022,1
65,1073916631,1128b7,20210816,0,2021,1
80,1437510278,1128a1,20230420,0,2023,1
119,1801231436,1128a1,20211029,0,2021,1
139,1750442018,1128a1,20201020,0,2020,1


## Combine

In [104]:
combined_df = pd.merge(data_df, 
                       df_leie_after_2019.loc[:,['NPI', 'excluded']], 
                       left_on='Rndrng_NPI', 
                       right_on='NPI', 
                       how='left')
combined_df = combined_df.drop(['NPI'], axis=1)

In [105]:
combined_df.shape

(1915047, 147)

In [106]:
data_df.shape

(1915045, 146)

In [107]:
# Proportion of positive cases
num_fraudulent_providers = len(combined_df['Rndrng_NPI'][combined_df['excluded'] == 1].unique())
num_fraudulent_providers/len(combined_df['Rndrng_NPI'].unique())

0.0003047735504729085

In [108]:
num_fraudulent_providers

329

In [109]:
num_providers = len(combined_df['Rndrng_NPI'].unique())
num_providers

1079490

In [120]:
combined_df.columns

Index(['Rndrng_NPI', 'Rndrng_Prvdr_Gndr', 'Rndrng_Prvdr_Zip5', 'Tot_HCPCS_Cds',
       'Tot_Benes', 'Tot_Srvcs', 'Tot_Sbmtd_Chrg', 'Tot_Mdcr_Alowd_Amt',
       'Tot_Mdcr_Stdzd_Amt', 'Drug_Tot_HCPCS_Cds',
       ...
       'Rndrng_Prvdr_Type_Speech Language Pathologist',
       'Rndrng_Prvdr_Type_Sports Medicine',
       'Rndrng_Prvdr_Type_Surgical Oncology',
       'Rndrng_Prvdr_Type_Thoracic Surgery',
       'Rndrng_Prvdr_Type_Undefined Physician type',
       'Rndrng_Prvdr_Type_Undersea and Hyperbaric Medicine',
       'Rndrng_Prvdr_Type_Unknown Supplier/Provider Specialty',
       'Rndrng_Prvdr_Type_Urology', 'Rndrng_Prvdr_Type_Vascular Surgery',
       'excluded'],
      dtype='object', length=147)

In [121]:
combined_df['excluded'] = combined_df.excluded.fillna(0)

In [122]:
combined_df.head()

Unnamed: 0,Rndrng_NPI,Rndrng_Prvdr_Gndr,Rndrng_Prvdr_Zip5,Tot_HCPCS_Cds,Tot_Benes,Tot_Srvcs,Tot_Sbmtd_Chrg,Tot_Mdcr_Alowd_Amt,Tot_Mdcr_Stdzd_Amt,Drug_Tot_HCPCS_Cds,...,Rndrng_Prvdr_Type_Speech Language Pathologist,Rndrng_Prvdr_Type_Sports Medicine,Rndrng_Prvdr_Type_Surgical Oncology,Rndrng_Prvdr_Type_Thoracic Surgery,Rndrng_Prvdr_Type_Undefined Physician type,Rndrng_Prvdr_Type_Undersea and Hyperbaric Medicine,Rndrng_Prvdr_Type_Unknown Supplier/Provider Specialty,Rndrng_Prvdr_Type_Urology,Rndrng_Prvdr_Type_Vascular Surgery,excluded
0,1003000126,1,21502.0,18,610,1392.0,519136.0,156626.32,125266.1,0.0,...,0,0,0,0,0,0,0,0,0,0.0
1,1003000134,1,60201.0,20,3614,7835.0,1213264.0,298905.09,218268.63,0.0,...,0,0,0,0,0,0,0,0,0,0.0
2,1003000480,1,80045.0,30,104,159.0,210964.25,40069.88,30002.44,0.0,...,0,0,0,0,0,0,0,0,0,0.0
3,1003000522,1,32725.0,46,374,1319.0,349343.44,112849.81,87218.2,8.0,...,0,0,0,0,0,0,0,0,0,0.0
4,1003000530,0,18951.0,33,347,1485.0,234676.0,145869.56,106806.12,7.0,...,0,0,0,0,0,0,0,0,0,0.0


In [125]:
combined_df.groupby(['excluded']).size()

excluded
0.0    1914523
1.0        524
dtype: int64

In [126]:
combined_df.to_csv('../../data/processed/combined_processed_data.csv', 
                   index=False)