In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
import regex as re
from matplotlib import pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', None)

font = {'family' : 'normal',
        'weight' : 'normal',
        'size'   : 16}

plt.rc('font', **font)

### Data Preprocessing/Feature Engineering

In [3]:
#load data
data = pd.read_csv('Merged_Data.csv', index_col = 0)
prov = pd.read_csv('Data/Train-Potential Fraud.csv')
prov.head()

Unnamed: 0,Provider,PotentialFraud
0,PRV51001,No
1,PRV51003,Yes
2,PRV51004,No
3,PRV51005,Yes
4,PRV51007,No


In [4]:
prov['PotentialFraud'].value_counts()

No     4904
Yes     506
Name: PotentialFraud, dtype: int64

In [5]:
prov['Fraud'] = np.where(prov['PotentialFraud'] == 'No', 0, 1)
prov = prov.drop('PotentialFraud', axis = 1)

In [6]:
def numeric_col_mean_feature(data_source, by, col, name, target_df):
    df = data_source.groupby(by)[col].mean()
    df = df.rename(name)
    return target_df.join(df, on = by, how = 'left')

In [7]:
prov = numeric_col_mean_feature(data, 'Provider', 'InscClaimAmtReimbursed', 'reimburse', prov)
prov = numeric_col_mean_feature(data, 'Provider', 'DeductibleAmtPaid', 'deductible', prov)
prov = numeric_col_mean_feature(data, 'Provider', 'DaysAdmitted', 'days_admitted', prov)
prov = numeric_col_mean_feature(data, 'Provider', 'NumDiagnoses', 'num_diagnoses', prov)
prov = numeric_col_mean_feature(data, 'Provider', 'NumProcedures', 'num_procedures', prov)

In [8]:
prov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5410 entries, 0 to 5409
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Provider        5410 non-null   object 
 1   Fraud           5410 non-null   int32  
 2   reimburse       5410 non-null   float64
 3   deductible      5409 non-null   float64
 4   days_admitted   5410 non-null   float64
 5   num_diagnoses   5410 non-null   float64
 6   num_procedures  5410 non-null   float64
dtypes: float64(5), int32(1), object(1)
memory usage: 274.9+ KB


In [9]:
#1 deductible is NA, we will fill it with 0
prov = prov.fillna(0)
prov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5410 entries, 0 to 5409
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Provider        5410 non-null   object 
 1   Fraud           5410 non-null   int32  
 2   reimburse       5410 non-null   float64
 3   deductible      5410 non-null   float64
 4   days_admitted   5410 non-null   float64
 5   num_diagnoses   5410 non-null   float64
 6   num_procedures  5410 non-null   float64
dtypes: float64(5), int32(1), object(1)
memory usage: 274.9+ KB


In [10]:
#rather than DOB, let's create an age column
data_date = dt.datetime(2009,1,1,0,0,0,0)
data['Age'] = round(((data_date - pd.to_datetime(data['DOB'])).dt.days)/365)
prov = numeric_col_mean_feature(data, 'Provider', 'Age', 'age', prov)

In [11]:
prov

Unnamed: 0,Provider,Fraud,reimburse,deductible,days_admitted,num_diagnoses,num_procedures,age
0,PRV51001,0,4185.600000,213.600000,2.440000,3.200000,0.120000,77.880000
1,PRV51003,1,4588.409091,502.166667,4.674242,5.250000,0.363636,69.083333
2,PRV51004,0,350.134228,2.080537,2.429530,2.583893,0.000000,71.248322
3,PRV51005,1,241.124464,3.175966,2.088412,2.588841,0.000000,69.545923
4,PRV51007,0,468.194444,45.333333,1.958333,2.986111,0.013889,68.430556
...,...,...,...,...,...,...,...,...
5405,PRV57759,0,380.000000,4.642857,3.142857,2.071429,0.000000,73.035714
5406,PRV57760,0,216.818182,0.000000,1.318182,2.363636,0.000000,60.272727
5407,PRV57761,0,225.243902,4.512195,2.390244,2.670732,0.000000,70.987805
5408,PRV57762,0,1900.000000,0.000000,1.000000,1.000000,0.000000,67.000000


In [12]:
#let's convert patientType column into a numeric column called outpatient where 1 indicates it was an outpatient visit
data.loc[:, 'outpatient'] = np.where(data.loc[:, 'patientType'] == 'outpatient', 0, 1)

#then add proportion of outpatients per provider to provider df
prov = numeric_col_mean_feature(data, 'Provider', 'outpatient', 'outpatient_proportion', prov)
prov.head()

Unnamed: 0,Provider,Fraud,reimburse,deductible,days_admitted,num_diagnoses,num_procedures,age,outpatient_proportion
0,PRV51001,0,4185.6,213.6,2.44,3.2,0.12,77.88,0.2
1,PRV51003,1,4588.409091,502.166667,4.674242,5.25,0.363636,69.083333,0.469697
2,PRV51004,0,350.134228,2.080537,2.42953,2.583893,0.0,71.248322,0.0
3,PRV51005,1,241.124464,3.175966,2.088412,2.588841,0.0,69.545923,0.0
4,PRV51007,0,468.194444,45.333333,1.958333,2.986111,0.013889,68.430556,0.041667


In [13]:
#chronic conditions should be 0 = no, 1 = yes, so we'll change 2 to 0
chronic_cols = [col for col in data.columns if 'Chronic' in col]

#let's see if any chronic cols have NA values
data[chronic_cols].isna().sum()

ChronicCond_Alzheimer              0
ChronicCond_Heartfailure           0
ChronicCond_KidneyDisease          0
ChronicCond_Cancer                 0
ChronicCond_ObstrPulmonary         0
ChronicCond_Depression             0
ChronicCond_Diabetes               0
ChronicCond_IschemicHeart          0
ChronicCond_Osteoporasis           0
ChronicCond_rheumatoidarthritis    0
ChronicCond_stroke                 0
dtype: int64

In [14]:
#since we have no NA values, we can replace all 2's with a 0, and then everything else will be a 1
for i in chronic_cols:
    data.loc[:, i] = np.where(data.loc[:, i] == 2, 0, 1)
data[chronic_cols].head()

Unnamed: 0,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke
0,1,0,1,0,0,1,1,1,0,1,1
1,1,0,1,0,0,1,1,1,0,1,1
2,1,0,1,0,0,1,1,1,0,1,1
3,0,1,1,0,0,1,1,0,0,1,1
4,0,1,1,0,1,1,0,1,0,0,0


In [15]:
chronic_cols

['ChronicCond_Alzheimer',
 'ChronicCond_Heartfailure',
 'ChronicCond_KidneyDisease',
 'ChronicCond_Cancer',
 'ChronicCond_ObstrPulmonary',
 'ChronicCond_Depression',
 'ChronicCond_Diabetes',
 'ChronicCond_IschemicHeart',
 'ChronicCond_Osteoporasis',
 'ChronicCond_rheumatoidarthritis',
 'ChronicCond_stroke']

In [16]:
#add proportion of patients with each chronic condition for a provider
for i in chronic_cols:
    prov = numeric_col_mean_feature(data, 'Provider', i, str.lower(i), prov)

In [17]:
prov.describe()

Unnamed: 0,Fraud,reimburse,deductible,days_admitted,num_diagnoses,num_procedures,age,outpatient_proportion,chroniccond_alzheimer,chroniccond_heartfailure,chroniccond_kidneydisease,chroniccond_cancer,chroniccond_obstrpulmonary,chroniccond_depression,chroniccond_diabetes,chroniccond_ischemicheart,chroniccond_osteoporasis,chroniccond_rheumatoidarthritis,chroniccond_stroke
count,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0
mean,0.09353,1740.679369,155.614405,3.013987,3.407356,0.108011,72.815235,0.144568,0.404218,0.594383,0.420224,0.15326,0.322807,0.436831,0.707307,0.765842,0.320718,0.309094,0.104631
std,0.291201,3484.473124,306.468426,2.057721,1.727429,0.246305,4.712976,0.288362,0.18229,0.183746,0.190397,0.133056,0.176796,0.181678,0.16895,0.153917,0.171529,0.168559,0.113676
min,0.0,0.0,0.0,1.0,0.0,0.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,232.394593,0.311286,1.875,2.5,0.0,70.852941,0.0,0.333333,0.518519,0.333333,0.083333,0.241856,0.363636,0.651475,0.708333,0.25,0.230769,0.032551
50%,0.0,356.085106,4.285714,2.586207,2.81455,0.0,72.939383,0.0,0.4,0.598485,0.413793,0.142857,0.315165,0.437037,0.714286,0.770492,0.317073,0.307692,0.090909
75%,0.0,1490.154301,137.363953,3.544613,3.567646,0.083333,74.849389,0.128205,0.484848,0.666667,0.5,0.197452,0.392494,0.5,0.785714,0.839304,0.385388,0.375,0.137931
max,1.0,57000.0,1068.0,36.0,10.0,3.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
#we should also include proportion of patients with renal disease that a provider has seen
data['RenalDiseaseIndicator'].value_counts()

0    448363
Y    109848
Name: RenalDiseaseIndicator, dtype: int64

In [19]:
data.loc[:, 'RenalDiseaseIndicator'] = np.where(data['RenalDiseaseIndicator'] == 'Y', 1, 0)
prov = numeric_col_mean_feature(data, 'Provider', 'RenalDiseaseIndicator', 'renal_disease', prov)

In [20]:
prov.describe()

Unnamed: 0,Fraud,reimburse,deductible,days_admitted,num_diagnoses,num_procedures,age,outpatient_proportion,chroniccond_alzheimer,chroniccond_heartfailure,chroniccond_kidneydisease,chroniccond_cancer,chroniccond_obstrpulmonary,chroniccond_depression,chroniccond_diabetes,chroniccond_ischemicheart,chroniccond_osteoporasis,chroniccond_rheumatoidarthritis,chroniccond_stroke,renal_disease
count,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0
mean,0.09353,1740.679369,155.614405,3.013987,3.407356,0.108011,72.815235,0.144568,0.404218,0.594383,0.420224,0.15326,0.322807,0.436831,0.707307,0.765842,0.320718,0.309094,0.104631,0.196768
std,0.291201,3484.473124,306.468426,2.057721,1.727429,0.246305,4.712976,0.288362,0.18229,0.183746,0.190397,0.133056,0.176796,0.181678,0.16895,0.153917,0.171529,0.168559,0.113676,0.147006
min,0.0,0.0,0.0,1.0,0.0,0.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,232.394593,0.311286,1.875,2.5,0.0,70.852941,0.0,0.333333,0.518519,0.333333,0.083333,0.241856,0.363636,0.651475,0.708333,0.25,0.230769,0.032551,0.125
50%,0.0,356.085106,4.285714,2.586207,2.81455,0.0,72.939383,0.0,0.4,0.598485,0.413793,0.142857,0.315165,0.437037,0.714286,0.770492,0.317073,0.307692,0.090909,0.188265
75%,0.0,1490.154301,137.363953,3.544613,3.567646,0.083333,74.849389,0.128205,0.484848,0.666667,0.5,0.197452,0.392494,0.5,0.785714,0.839304,0.385388,0.375,0.137931,0.25
max,1.0,57000.0,1068.0,36.0,10.0,3.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
data['Gender'].value_counts()

2    323114
1    235097
Name: Gender, dtype: int64

In [22]:
#let's change gender 2 to gender 0 and then convert gender into proportion of patients seen of gender 1
data.loc[:, 'Gender'] = np.where(data.loc[:, 'Gender'] == 2, 0, 1)
prov = numeric_col_mean_feature(data, 'Provider', 'Gender', 'gender', prov)
prov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5410 entries, 0 to 5409
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Provider                         5410 non-null   object 
 1   Fraud                            5410 non-null   int32  
 2   reimburse                        5410 non-null   float64
 3   deductible                       5410 non-null   float64
 4   days_admitted                    5410 non-null   float64
 5   num_diagnoses                    5410 non-null   float64
 6   num_procedures                   5410 non-null   float64
 7   age                              5410 non-null   float64
 8   outpatient_proportion            5410 non-null   float64
 9   chroniccond_alzheimer            5410 non-null   float64
 10  chroniccond_heartfailure         5410 non-null   float64
 11  chroniccond_kidneydisease        5410 non-null   float64
 12  chroniccond_cancer  

In [23]:
data.Race.value_counts()

1    471036
2     55640
3     19715
5     11820
Name: Race, dtype: int64

In [24]:
#rename category 5 as 4
data.loc[:, 'Race'] = np.where(data.loc[:, 'Race'] == 5, 4, data.loc[:, 'Race'])

In [25]:
data.Race.value_counts()

1    471036
2     55640
3     19715
4     11820
Name: Race, dtype: int64

In [27]:
#initialize race columns as 0
prov['race_1'] = 0
prov['race_2'] = 0
prov['race_3'] = 0
prov['race_4'] = 0

#calculate proportion of patients by race for each provider
race_proportions = data.groupby('Provider')['Race'].value_counts() / data.groupby('Provider')['Race'].count()

#set index to provider so df is easier to update
prov = prov.set_index('Provider')

#fill the race columns by provider and race
for i in race_proportions.index:
    col = 'race_' + str(i[1])
    prov.loc[i[0], col] = race_proportions[i]

In [28]:
#check to make sure each provider's race proportions sum up to 1, and with 5410 providers this should be 5410
sum(prov[['race_1', 'race_2', 'race_3', 'race_4']].sum(axis = 1))

5410.0

In [29]:
prov.head()

Unnamed: 0_level_0,Fraud,reimburse,deductible,days_admitted,num_diagnoses,num_procedures,age,outpatient_proportion,chroniccond_alzheimer,chroniccond_heartfailure,chroniccond_kidneydisease,chroniccond_cancer,chroniccond_obstrpulmonary,chroniccond_depression,chroniccond_diabetes,chroniccond_ischemicheart,chroniccond_osteoporasis,chroniccond_rheumatoidarthritis,chroniccond_stroke,renal_disease,gender,race_1,race_2,race_3,race_4
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
PRV51001,0,4185.6,213.6,2.44,3.2,0.12,77.88,0.2,0.6,0.76,0.68,0.2,0.4,0.36,0.84,0.92,0.24,0.32,0.24,0.32,0.36,0.84,0.16,0.0,0.0
PRV51003,1,4588.409091,502.166667,4.674242,5.25,0.363636,69.083333,0.469697,0.424242,0.606061,0.484848,0.075758,0.310606,0.409091,0.757576,0.848485,0.25,0.287879,0.090909,0.219697,0.409091,0.810606,0.181818,0.0,0.007576
PRV51004,0,350.134228,2.080537,2.42953,2.583893,0.0,71.248322,0.0,0.42953,0.590604,0.33557,0.107383,0.275168,0.422819,0.704698,0.724832,0.328859,0.308725,0.114094,0.154362,0.308725,0.805369,0.161074,0.033557,0.0
PRV51005,1,241.124464,3.175966,2.088412,2.588841,0.0,69.545923,0.0,0.365665,0.583691,0.435193,0.141631,0.253219,0.416309,0.685837,0.76824,0.295279,0.28412,0.106438,0.222318,0.438627,0.766524,0.224893,0.008584,0.0
PRV51007,0,468.194444,45.333333,1.958333,2.986111,0.013889,68.430556,0.041667,0.361111,0.555556,0.305556,0.166667,0.222222,0.402778,0.680556,0.708333,0.291667,0.305556,0.166667,0.152778,0.472222,0.805556,0.194444,0.0,0.0


In [30]:
prov.to_csv('provider_data.csv')