In [2]:
pip install regex

Collecting regex
  Downloading regex-2022.3.2-cp38-cp38-macosx_11_0_arm64.whl (281 kB)
[K     |████████████████████████████████| 281 kB 3.9 MB/s eta 0:00:01
[?25hInstalling collected packages: regex
Successfully installed regex-2022.3.2
Note: you may need to restart the kernel to use updated packages.


In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
import regex as re
from matplotlib import pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', None)

font = {'family' : 'normal',
        'weight' : 'normal',
        'size'   : 16}

plt.rc('font', **font)

### Data Preprocessing/Feature Engineering

In [20]:
#load data
data = pd.read_csv('Merged_Data.csv', index_col = 0)
prov = pd.read_csv('Train-Potential Fraud.csv')
prov.head()

Unnamed: 0,Provider,PotentialFraud
0,PRV51001,No
1,PRV51003,Yes
2,PRV51004,No
3,PRV51005,Yes
4,PRV51007,No


In [31]:
data.columns.values

array(['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
       'InscClaimAmtReimbursed', 'AttendingPhysician',
       'DeductibleAmtPaid', 'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
       'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5',
       'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8',
       'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10', 'ClmProcedureCode_1',
       'ClmProcedureCode_2', 'ClmProcedureCode_3', 'ClmProcedureCode_4',
       'ClmProcedureCode_5', 'ClmProcedureCode_6', 'patientType', 'DOB',
       'Gender', 'Race', 'RenalDiseaseIndicator', 'State', 'County',
       'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
    

In [21]:
prov['PotentialFraud'].value_counts()

No     4904
Yes     506
Name: PotentialFraud, dtype: int64

In [22]:
prov['Fraud'] = np.where(prov['PotentialFraud'] == 'No', 0, 1)
prov = prov.drop('PotentialFraud', axis = 1)

In [23]:
def numeric_col_mean_feature(data_source, by, col, name, target_df):
    df = data_source.groupby(by)[col].mean()
    df = df.rename(name)
    return target_df.join(df, on = by, how = 'left')

def numeric_col_median_feature(data_source, by, col, name, target_df):
    df = data_source.groupby(by)[col].median()
    df = df.rename(name)
    return target_df.join(df, on = by, how = 'left')

def numeric_col_max_feature(data_source, by, col, name, target_df):
    df = data_source.groupby(by)[col].max()
    df = df.rename(name)
    return target_df.join(df, on = by, how = 'left')

In [24]:
prov = numeric_col_mean_feature(data, 'Provider', 'InscClaimAmtReimbursed', 'reimburse_mean', prov)
prov = numeric_col_mean_feature(data, 'Provider', 'DeductibleAmtPaid', 'deductible_mean', prov)
prov = numeric_col_mean_feature(data, 'Provider', 'DaysAdmitted', 'days_admitted_mean', prov)
prov = numeric_col_mean_feature(data, 'Provider', 'NumDiagnoses', 'num_diagnoses_mean', prov)
prov = numeric_col_mean_feature(data, 'Provider', 'NumProcedures', 'num_procedures_mean', prov)

In [25]:
prov = numeric_col_median_feature(data, 'Provider', 'InscClaimAmtReimbursed', 'reimburse_median', prov)
prov = numeric_col_median_feature(data, 'Provider', 'DeductibleAmtPaid', 'deductible_median', prov)
prov = numeric_col_median_feature(data, 'Provider', 'DaysAdmitted', 'days_admitted_median', prov)
prov = numeric_col_median_feature(data, 'Provider', 'NumDiagnoses', 'num_diagnoses_median', prov)
prov = numeric_col_median_feature(data, 'Provider', 'NumProcedures', 'num_procedures_median', prov)

In [26]:
prov = numeric_col_max_feature(data, 'Provider', 'InscClaimAmtReimbursed', 'reimburse_max', prov)
prov = numeric_col_max_feature(data, 'Provider', 'DeductibleAmtPaid', 'deductible_max', prov)
prov = numeric_col_max_feature(data, 'Provider', 'DaysAdmitted', 'days_admitted_max', prov)
prov = numeric_col_max_feature(data, 'Provider', 'NumDiagnoses', 'num_diagnoses_meax', prov)
prov = numeric_col_max_feature(data, 'Provider', 'NumProcedures', 'num_procedures_max', prov)

In [27]:
prov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5410 entries, 0 to 5409
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Provider               5410 non-null   object 
 1   Fraud                  5410 non-null   int64  
 2   reimburse_mean         5410 non-null   float64
 3   deductible_mean        5409 non-null   float64
 4   days_admitted_mean     5410 non-null   float64
 5   num_diagnoses_mean     5410 non-null   float64
 6   num_procedures_mean    5410 non-null   float64
 7   reimburse_median       5410 non-null   float64
 8   deductible_median      5409 non-null   float64
 9   days_admitted_median   5410 non-null   float64
 10  num_diagnoses_median   5410 non-null   float64
 11  num_procedures_median  5410 non-null   float64
 12  reimburse_max          5410 non-null   int64  
 13  deductible_max         5409 non-null   float64
 14  days_admitted_max      5410 non-null   int64  
 15  num_

In [28]:
#1 deductible is NA, we will fill it with 0
prov = prov.fillna(0)
prov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5410 entries, 0 to 5409
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Provider               5410 non-null   object 
 1   Fraud                  5410 non-null   int64  
 2   reimburse_mean         5410 non-null   float64
 3   deductible_mean        5410 non-null   float64
 4   days_admitted_mean     5410 non-null   float64
 5   num_diagnoses_mean     5410 non-null   float64
 6   num_procedures_mean    5410 non-null   float64
 7   reimburse_median       5410 non-null   float64
 8   deductible_median      5410 non-null   float64
 9   days_admitted_median   5410 non-null   float64
 10  num_diagnoses_median   5410 non-null   float64
 11  num_procedures_median  5410 non-null   float64
 12  reimburse_max          5410 non-null   int64  
 13  deductible_max         5410 non-null   float64
 14  days_admitted_max      5410 non-null   int64  
 15  num_

In [32]:
#rather than DOB, let's create an age column
data_date = dt.datetime(2009,1,1,0,0,0,0)
data['Age'] = round(((data_date - pd.to_datetime(data['DOB'])).dt.days)/365)
prov = numeric_col_mean_feature(data, 'Provider', 'Age', 'age_mean', prov)
prov = numeric_col_median_feature(data, 'Provider', 'Age', 'age_median', prov)

In [33]:
prov

Unnamed: 0,Provider,Fraud,reimburse_mean,deductible_mean,days_admitted_mean,num_diagnoses_mean,num_procedures_mean,reimburse_median,deductible_median,days_admitted_median,num_diagnoses_median,num_procedures_median,reimburse_max,deductible_max,days_admitted_max,num_diagnoses_meax,num_procedures_max,age,age_mean,age_median
0,PRV51001,0,4185.600000,213.600000,2.440000,3.200000,0.120000,400.0,0.0,1.0,2.0,0.0,42000,1068.0,15,9,2,77.880000,77.880000,79.0
1,PRV51003,1,4588.409091,502.166667,4.674242,5.250000,0.363636,1750.0,0.0,2.5,4.5,0.0,57000,1068.0,28,10,2,69.083333,69.083333,71.0
2,PRV51004,0,350.134228,2.080537,2.429530,2.583893,0.000000,70.0,0.0,1.0,2.0,0.0,3300,100.0,21,9,0,71.248322,71.248322,72.0
3,PRV51005,1,241.124464,3.175966,2.088412,2.588841,0.000000,70.0,0.0,1.0,2.0,0.0,4080,200.0,21,10,0,69.545923,69.545923,70.0
4,PRV51007,0,468.194444,45.333333,1.958333,2.986111,0.013889,80.0,0.0,1.0,2.0,0.0,10000,1068.0,21,10,1,68.430556,68.430556,69.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5405,PRV57759,0,380.000000,4.642857,3.142857,2.071429,0.000000,65.0,0.0,1.0,2.0,0.0,3300,90.0,21,5,0,73.035714,73.035714,72.5
5406,PRV57760,0,216.818182,0.000000,1.318182,2.363636,0.000000,85.0,0.0,1.0,1.5,0.0,1100,0.0,8,7,0,60.272727,60.272727,58.0
5407,PRV57761,0,225.243902,4.512195,2.390244,2.670732,0.000000,70.0,0.0,1.0,2.0,0.0,2200,90.0,21,9,0,70.987805,70.987805,73.0
5408,PRV57762,0,1900.000000,0.000000,1.000000,1.000000,0.000000,1900.0,0.0,1.0,1.0,0.0,1900,0.0,1,1,0,67.000000,67.000000,67.0


In [34]:
#let's convert patientType column into a numeric column called outpatient where 1 indicates it was an outpatient visit
data.loc[:, 'outpatient'] = np.where(data.loc[:, 'patientType'] == 'outpatient', 0, 1)

#then add proportion of outpatients per provider to provider df
prov = numeric_col_mean_feature(data, 'Provider', 'outpatient', 'outpatient_proportion', prov)
prov.head()

Unnamed: 0,Provider,Fraud,reimburse_mean,deductible_mean,days_admitted_mean,num_diagnoses_mean,num_procedures_mean,reimburse_median,deductible_median,days_admitted_median,num_diagnoses_median,num_procedures_median,reimburse_max,deductible_max,days_admitted_max,num_diagnoses_meax,num_procedures_max,age,age_mean,age_median,outpatient_proportion
0,PRV51001,0,4185.6,213.6,2.44,3.2,0.12,400.0,0.0,1.0,2.0,0.0,42000,1068.0,15,9,2,77.88,77.88,79.0,0.2
1,PRV51003,1,4588.409091,502.166667,4.674242,5.25,0.363636,1750.0,0.0,2.5,4.5,0.0,57000,1068.0,28,10,2,69.083333,69.083333,71.0,0.469697
2,PRV51004,0,350.134228,2.080537,2.42953,2.583893,0.0,70.0,0.0,1.0,2.0,0.0,3300,100.0,21,9,0,71.248322,71.248322,72.0,0.0
3,PRV51005,1,241.124464,3.175966,2.088412,2.588841,0.0,70.0,0.0,1.0,2.0,0.0,4080,200.0,21,10,0,69.545923,69.545923,70.0,0.0
4,PRV51007,0,468.194444,45.333333,1.958333,2.986111,0.013889,80.0,0.0,1.0,2.0,0.0,10000,1068.0,21,10,1,68.430556,68.430556,69.5,0.041667


In [35]:
#chronic conditions should be 0 = no, 1 = yes, so we'll change 2 to 0
chronic_cols = [col for col in data.columns if 'Chronic' in col]

#let's see if any chronic cols have NA values
data[chronic_cols].isna().sum()

ChronicCond_Alzheimer              0
ChronicCond_Heartfailure           0
ChronicCond_KidneyDisease          0
ChronicCond_Cancer                 0
ChronicCond_ObstrPulmonary         0
ChronicCond_Depression             0
ChronicCond_Diabetes               0
ChronicCond_IschemicHeart          0
ChronicCond_Osteoporasis           0
ChronicCond_rheumatoidarthritis    0
ChronicCond_stroke                 0
dtype: int64

In [36]:
#since we have no NA values, we can replace all 2's with a 0, and then everything else will be a 1
for i in chronic_cols:
    data.loc[:, i] = np.where(data.loc[:, i] == 2, 0, 1)
data[chronic_cols].head()

Unnamed: 0,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke
0,1,0,1,0,0,1,1,1,0,1,1
1,1,0,1,0,0,1,1,1,0,1,1
2,1,0,1,0,0,1,1,1,0,1,1
3,0,1,1,0,0,1,1,0,0,1,1
4,0,1,1,0,1,1,0,1,0,0,0


In [37]:
chronic_cols

['ChronicCond_Alzheimer',
 'ChronicCond_Heartfailure',
 'ChronicCond_KidneyDisease',
 'ChronicCond_Cancer',
 'ChronicCond_ObstrPulmonary',
 'ChronicCond_Depression',
 'ChronicCond_Diabetes',
 'ChronicCond_IschemicHeart',
 'ChronicCond_Osteoporasis',
 'ChronicCond_rheumatoidarthritis',
 'ChronicCond_stroke']

In [38]:
#add proportion of patients with each chronic condition for a provider
for i in chronic_cols:
    prov = numeric_col_mean_feature(data, 'Provider', i, str.lower(i), prov)

In [39]:
prov.describe()

Unnamed: 0,Fraud,reimburse_mean,deductible_mean,days_admitted_mean,num_diagnoses_mean,num_procedures_mean,reimburse_median,deductible_median,days_admitted_median,num_diagnoses_median,num_procedures_median,reimburse_max,deductible_max,days_admitted_max,num_diagnoses_meax,num_procedures_max,age,age_mean,age_median,outpatient_proportion,chroniccond_alzheimer,chroniccond_heartfailure,chroniccond_kidneydisease,chroniccond_cancer,chroniccond_obstrpulmonary,chroniccond_depression,chroniccond_diabetes,chroniccond_ischemicheart,chroniccond_osteoporasis,chroniccond_rheumatoidarthritis,chroniccond_stroke
count,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0
mean,0.09353,1740.679369,155.614405,3.013987,3.407356,0.108011,928.602588,117.811645,1.56451,2.90915,0.062477,13014.913124,446.817745,17.185767,8.150277,0.750277,72.815235,72.815235,73.702773,0.144568,0.404218,0.594383,0.420224,0.15326,0.322807,0.436831,0.707307,0.765842,0.320718,0.309094,0.104631
std,0.291201,3484.473124,306.468426,2.057721,1.727429,0.246305,2977.988601,332.035542,1.908737,2.102591,0.251225,18995.317219,493.873879,9.227715,2.322526,1.110266,4.712976,4.712976,4.933886,0.288362,0.18229,0.183746,0.190397,0.133056,0.176796,0.181678,0.16895,0.153917,0.171529,0.168559,0.113676
min,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,33.0,33.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,232.394593,0.311286,1.875,2.5,0.0,70.0,0.0,1.0,2.0,0.0,1700.0,10.0,10.25,8.0,0.0,70.852941,70.852941,72.0,0.0,0.333333,0.518519,0.333333,0.083333,0.241856,0.363636,0.651475,0.708333,0.25,0.230769,0.032551
50%,0.0,356.085106,4.285714,2.586207,2.81455,0.0,85.0,0.0,1.0,2.0,0.0,3300.0,100.0,21.0,9.0,0.0,72.939383,72.939383,74.0,0.0,0.4,0.598485,0.413793,0.142857,0.315165,0.437037,0.714286,0.770492,0.317073,0.307692,0.090909
75%,0.0,1490.154301,137.363953,3.544613,3.567646,0.083333,100.0,0.0,1.0,3.0,0.0,17000.0,1068.0,21.0,10.0,2.0,74.849389,74.849389,76.0,0.128205,0.484848,0.666667,0.5,0.197452,0.392494,0.5,0.785714,0.839304,0.385388,0.375,0.137931
max,1.0,57000.0,1068.0,36.0,10.0,3.0,57000.0,1068.0,36.0,10.0,3.0,125000.0,1068.0,37.0,10.0,5.0,100.0,100.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [40]:
#we should also include proportion of patients with renal disease that a provider has seen
data['RenalDiseaseIndicator'].value_counts()

0    448363
Y    109848
Name: RenalDiseaseIndicator, dtype: int64

In [41]:
data.loc[:, 'RenalDiseaseIndicator'] = np.where(data['RenalDiseaseIndicator'] == 'Y', 1, 0)
prov = numeric_col_mean_feature(data, 'Provider', 'RenalDiseaseIndicator', 'renal_disease', prov)

In [42]:
prov.describe()

Unnamed: 0,Fraud,reimburse_mean,deductible_mean,days_admitted_mean,num_diagnoses_mean,num_procedures_mean,reimburse_median,deductible_median,days_admitted_median,num_diagnoses_median,num_procedures_median,reimburse_max,deductible_max,days_admitted_max,num_diagnoses_meax,num_procedures_max,age,age_mean,age_median,outpatient_proportion,chroniccond_alzheimer,chroniccond_heartfailure,chroniccond_kidneydisease,chroniccond_cancer,chroniccond_obstrpulmonary,chroniccond_depression,chroniccond_diabetes,chroniccond_ischemicheart,chroniccond_osteoporasis,chroniccond_rheumatoidarthritis,chroniccond_stroke,renal_disease
count,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0,5410.0
mean,0.09353,1740.679369,155.614405,3.013987,3.407356,0.108011,928.602588,117.811645,1.56451,2.90915,0.062477,13014.913124,446.817745,17.185767,8.150277,0.750277,72.815235,72.815235,73.702773,0.144568,0.404218,0.594383,0.420224,0.15326,0.322807,0.436831,0.707307,0.765842,0.320718,0.309094,0.104631,0.196768
std,0.291201,3484.473124,306.468426,2.057721,1.727429,0.246305,2977.988601,332.035542,1.908737,2.102591,0.251225,18995.317219,493.873879,9.227715,2.322526,1.110266,4.712976,4.712976,4.933886,0.288362,0.18229,0.183746,0.190397,0.133056,0.176796,0.181678,0.16895,0.153917,0.171529,0.168559,0.113676,0.147006
min,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,33.0,33.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,232.394593,0.311286,1.875,2.5,0.0,70.0,0.0,1.0,2.0,0.0,1700.0,10.0,10.25,8.0,0.0,70.852941,70.852941,72.0,0.0,0.333333,0.518519,0.333333,0.083333,0.241856,0.363636,0.651475,0.708333,0.25,0.230769,0.032551,0.125
50%,0.0,356.085106,4.285714,2.586207,2.81455,0.0,85.0,0.0,1.0,2.0,0.0,3300.0,100.0,21.0,9.0,0.0,72.939383,72.939383,74.0,0.0,0.4,0.598485,0.413793,0.142857,0.315165,0.437037,0.714286,0.770492,0.317073,0.307692,0.090909,0.188265
75%,0.0,1490.154301,137.363953,3.544613,3.567646,0.083333,100.0,0.0,1.0,3.0,0.0,17000.0,1068.0,21.0,10.0,2.0,74.849389,74.849389,76.0,0.128205,0.484848,0.666667,0.5,0.197452,0.392494,0.5,0.785714,0.839304,0.385388,0.375,0.137931,0.25
max,1.0,57000.0,1068.0,36.0,10.0,3.0,57000.0,1068.0,36.0,10.0,3.0,125000.0,1068.0,37.0,10.0,5.0,100.0,100.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [43]:
data['Gender'].value_counts()

2    323114
1    235097
Name: Gender, dtype: int64

In [44]:
#let's change gender 2 to gender 0 and then convert gender into proportion of patients seen of gender 1
data.loc[:, 'Gender'] = np.where(data.loc[:, 'Gender'] == 2, 0, 1)
prov = numeric_col_mean_feature(data, 'Provider', 'Gender', 'gender', prov)
prov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5410 entries, 0 to 5409
Data columns (total 34 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Provider                         5410 non-null   object 
 1   Fraud                            5410 non-null   int64  
 2   reimburse_mean                   5410 non-null   float64
 3   deductible_mean                  5410 non-null   float64
 4   days_admitted_mean               5410 non-null   float64
 5   num_diagnoses_mean               5410 non-null   float64
 6   num_procedures_mean              5410 non-null   float64
 7   reimburse_median                 5410 non-null   float64
 8   deductible_median                5410 non-null   float64
 9   days_admitted_median             5410 non-null   float64
 10  num_diagnoses_median             5410 non-null   float64
 11  num_procedures_median            5410 non-null   float64
 12  reimburse_max       

In [45]:
data.Race.value_counts()

1    471036
2     55640
3     19715
5     11820
Name: Race, dtype: int64

In [46]:
#rename category 5 as 4
data.loc[:, 'Race'] = np.where(data.loc[:, 'Race'] == 5, 4, data.loc[:, 'Race'])

In [47]:
data.Race.value_counts()

1    471036
2     55640
3     19715
4     11820
Name: Race, dtype: int64

In [48]:
#initialize race columns as 0
prov['race_1'] = 0
prov['race_2'] = 0
prov['race_3'] = 0
prov['race_4'] = 0

#calculate proportion of patients by race for each provider
race_proportions = data.groupby('Provider')['Race'].value_counts() / data.groupby('Provider')['Race'].count()

#set index to provider so df is easier to update
prov = prov.set_index('Provider')

#fill the race columns by provider and race
for i in race_proportions.index:
    col = 'race_' + str(i[1])
    prov.loc[i[0], col] = race_proportions[i]

In [49]:
#check to make sure each provider's race proportions sum up to 1, and with 5410 providers this should be 5410
sum(prov[['race_1', 'race_2', 'race_3', 'race_4']].sum(axis = 1))

5410.0

In [50]:
prov.head()

Unnamed: 0_level_0,Fraud,reimburse_mean,deductible_mean,days_admitted_mean,num_diagnoses_mean,num_procedures_mean,reimburse_median,deductible_median,days_admitted_median,num_diagnoses_median,num_procedures_median,reimburse_max,deductible_max,days_admitted_max,num_diagnoses_meax,num_procedures_max,age,age_mean,age_median,outpatient_proportion,chroniccond_alzheimer,chroniccond_heartfailure,chroniccond_kidneydisease,chroniccond_cancer,chroniccond_obstrpulmonary,chroniccond_depression,chroniccond_diabetes,chroniccond_ischemicheart,chroniccond_osteoporasis,chroniccond_rheumatoidarthritis,chroniccond_stroke,renal_disease,gender,race_1,race_2,race_3,race_4
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
PRV51001,0,4185.6,213.6,2.44,3.2,0.12,400.0,0.0,1.0,2.0,0.0,42000,1068.0,15,9,2,77.88,77.88,79.0,0.2,0.6,0.76,0.68,0.2,0.4,0.36,0.84,0.92,0.24,0.32,0.24,0.32,0.36,0.84,0.16,0.0,0.0
PRV51003,1,4588.409091,502.166667,4.674242,5.25,0.363636,1750.0,0.0,2.5,4.5,0.0,57000,1068.0,28,10,2,69.083333,69.083333,71.0,0.469697,0.424242,0.606061,0.484848,0.075758,0.310606,0.409091,0.757576,0.848485,0.25,0.287879,0.090909,0.219697,0.409091,0.810606,0.181818,0.0,0.007576
PRV51004,0,350.134228,2.080537,2.42953,2.583893,0.0,70.0,0.0,1.0,2.0,0.0,3300,100.0,21,9,0,71.248322,71.248322,72.0,0.0,0.42953,0.590604,0.33557,0.107383,0.275168,0.422819,0.704698,0.724832,0.328859,0.308725,0.114094,0.154362,0.308725,0.805369,0.161074,0.033557,0.0
PRV51005,1,241.124464,3.175966,2.088412,2.588841,0.0,70.0,0.0,1.0,2.0,0.0,4080,200.0,21,10,0,69.545923,69.545923,70.0,0.0,0.365665,0.583691,0.435193,0.141631,0.253219,0.416309,0.685837,0.76824,0.295279,0.28412,0.106438,0.222318,0.438627,0.766524,0.224893,0.008584,0.0
PRV51007,0,468.194444,45.333333,1.958333,2.986111,0.013889,80.0,0.0,1.0,2.0,0.0,10000,1068.0,21,10,1,68.430556,68.430556,69.5,0.041667,0.361111,0.555556,0.305556,0.166667,0.222222,0.402778,0.680556,0.708333,0.291667,0.305556,0.166667,0.152778,0.472222,0.805556,0.194444,0.0,0.0


In [51]:
prov.to_csv('provider_data.csv')

In [53]:
cor = prov.corr()
cor

Unnamed: 0,Fraud,reimburse_mean,deductible_mean,days_admitted_mean,num_diagnoses_mean,num_procedures_mean,reimburse_median,deductible_median,days_admitted_median,num_diagnoses_median,num_procedures_median,reimburse_max,deductible_max,days_admitted_max,num_diagnoses_meax,num_procedures_max,age,age_mean,age_median,outpatient_proportion,chroniccond_alzheimer,chroniccond_heartfailure,chroniccond_kidneydisease,chroniccond_cancer,chroniccond_obstrpulmonary,chroniccond_depression,chroniccond_diabetes,chroniccond_ischemicheart,chroniccond_osteoporasis,chroniccond_rheumatoidarthritis,chroniccond_stroke,renal_disease,gender,race_1,race_2,race_3,race_4
Fraud,1.0,0.193803,0.20464,0.149802,0.189909,0.188194,0.073349,0.150575,0.053504,0.170442,0.048993,0.514671,0.326441,0.37531,0.235075,0.45309,0.002464,0.002464,0.007579,0.205683,0.041509,0.064358,0.089859,0.014112,0.068987,0.036988,0.049574,0.046158,0.016154,0.039084,0.042907,0.056905,0.006384,-0.006634,0.00139,0.004409,0.008092
reimburse_mean,0.193803,1.0,0.843378,0.652609,0.781438,0.818606,0.924803,0.781945,0.674637,0.770057,0.693719,0.516703,0.524714,0.125262,0.237891,0.476123,0.03051,0.03051,0.026185,0.843718,0.139482,0.200173,0.294828,0.044439,0.239312,0.074674,0.162364,0.188203,0.015523,0.092201,0.171167,0.165536,0.019497,-0.024657,0.032364,-0.005407,0.002412
deductible_mean,0.20464,0.843378,1.0,0.625601,0.91738,0.875857,0.755442,0.927418,0.654529,0.910327,0.722109,0.476762,0.627795,0.108239,0.285576,0.541421,0.033647,0.033647,0.033872,0.998572,0.193035,0.217318,0.317853,0.081919,0.27638,0.112841,0.18505,0.18783,0.031948,0.098146,0.19654,0.157557,0.025973,-0.025613,0.026509,-0.003658,0.014981
days_admitted_mean,0.149802,0.652609,0.625601,1.0,0.602819,0.592456,0.601807,0.58066,0.821679,0.587878,0.486955,0.333908,0.393375,0.414083,0.210909,0.356812,0.034807,0.034807,0.026398,0.629629,0.14955,0.212197,0.280248,0.019679,0.224855,0.094114,0.134652,0.169947,0.01253,0.076385,0.187905,0.198034,0.011573,-0.036981,0.024908,0.02714,0.004763
num_diagnoses_mean,0.189909,0.781438,0.91738,0.602819,1.0,0.810839,0.702734,0.852147,0.624684,0.957263,0.66969,0.436281,0.572723,0.109025,0.403117,0.498123,0.025811,0.025811,0.02377,0.9191,0.202143,0.239388,0.328341,0.081022,0.280984,0.138831,0.214715,0.208866,0.016074,0.093165,0.19299,0.170349,0.018799,-0.0319,0.022724,0.006055,0.025808
num_procedures_mean,0.188194,0.818606,0.875857,0.592456,0.810839,1.0,0.738184,0.809751,0.610975,0.800346,0.863084,0.444195,0.547534,0.106004,0.249797,0.557258,0.029678,0.029678,0.028525,0.878035,0.136528,0.184133,0.290696,0.049467,0.241509,0.074829,0.168404,0.170806,0.022747,0.07449,0.164493,0.161704,0.008047,-0.044459,0.05471,0.002868,-0.00555
reimburse_median,0.073349,0.924803,0.755442,0.601807,0.702734,0.738184,1.0,0.763989,0.709676,0.71756,0.707372,0.282491,0.345448,0.001509,0.129531,0.275256,0.037697,0.037697,0.025353,0.754869,0.114993,0.174227,0.261826,0.028848,0.215896,0.06137,0.141106,0.168833,0.00676,0.08296,0.157155,0.159473,0.015756,-0.018405,0.039642,-0.018208,-0.011473
deductible_median,0.150575,0.781945,0.927418,0.58066,0.852147,0.809751,0.763989,1.0,0.668975,0.901612,0.707073,0.343575,0.445104,0.04295,0.196778,0.381521,0.035115,0.035115,0.03378,0.925356,0.177871,0.19805,0.289473,0.07365,0.257184,0.10282,0.164094,0.173503,0.033758,0.087606,0.179144,0.144601,0.027516,-0.020272,0.025109,-0.007146,0.00898
days_admitted_median,0.053504,0.674637,0.654529,0.821679,0.624684,0.610975,0.709676,0.668975,1.0,0.647262,0.570296,0.195613,0.276825,0.044454,0.049067,0.218765,0.046822,0.046822,0.02684,0.658104,0.129205,0.190824,0.234329,0.023591,0.224355,0.079703,0.119235,0.154382,0.018304,0.072611,0.185696,0.146455,0.003153,-0.021755,0.021927,0.012772,-0.008292
num_diagnoses_median,0.170442,0.770057,0.910327,0.587878,0.957263,0.800346,0.71756,0.901612,0.647262,1.0,0.677932,0.373772,0.474647,0.048647,0.26071,0.423598,0.029249,0.029249,0.024494,0.911297,0.193311,0.225186,0.308744,0.07482,0.274017,0.129176,0.202626,0.203862,0.018468,0.092257,0.185437,0.157482,0.019481,-0.024421,0.017405,0.003481,0.021357
