# Data Preprocessing

In [2]:
# Standard Imports
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

#Sklearn stuff
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from imblearn import over_sampling
# Plot styling: seaborn "whitegrid" theme, "icefire" palette, 9x6 default figure size
sns.set(style="whitegrid")
sns.set_palette("icefire")
plt.rcParams.update({'figure.figsize': (9, 6)})
# Show up to 500 columns when rendering wide frames
pd.options.display.max_columns = 500
# Silence every warning for the rest of the session
warnings.filterwarnings("ignore")

In [3]:
# Load the claims table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
claims_path = './data/claims.pkl'
data = pd.read_pickle(claims_path)

In [4]:
# Preview the first five rows of the loaded frame
data.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DischargeDt,DiagnosisGroupCode,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6,IsOutpatient,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,PotentialFraud,ClaimDuration,NoPhy,AllPhy,SameAttOper,AdmisDuration,AgeAtClm,TotalRev,ClmYear,ClmMonth,ClmWeek,InsCovRatio,RevPerDay,Chronic_Sum,Bene_Mult
0,BENE11001,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,7866,1068.0,2009-04-18,201,1970,4019,5853,7843.0,2768,71590.0,2724.0,19889.0,5849.0,,,,,,,,0,1943-01-01,NaT,1,1,0,39,230,12,12,1,0,1,0,0,1,1,1,0,1,1,36000,3204,60,70,Yes,6,False,False,False,6.0,66,27068.0,2009,4,15,0.960544,3866.857143,7,1.0
1,BENE11001,CLM66048,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,6186,1068.0,2009-09-02,750,6186,2948,56400,,,,,,,,7092.0,,,,,,0,1943-01-01,NaT,1,1,0,39,230,12,12,1,0,1,0,0,1,1,1,0,1,1,36000,3204,60,70,No,2,False,True,True,2.0,67,6068.0,2009,8,36,0.823995,2022.666667,7,1.0
2,BENE11001,CLM68358,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,29590,1068.0,2009-09-20,883,29623,30390,71690,34590.0,V1581,32723.0,,,,,,,,,,,0,1943-01-01,NaT,1,1,0,39,230,12,12,1,0,1,0,0,1,1,1,0,1,1,36000,3204,60,70,No,3,False,False,False,3.0,67,6068.0,2009,9,38,0.823995,1517.0,7,1.0
3,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,431,1068.0,2009-02-22,67,43491,2762,7843,32723.0,V1041,4254.0,25062.0,40390.0,4019.0,,331.0,,,,,,0,1914-03-01,NaT,0,2,0,1,360,12,12,0,1,1,0,0,1,1,0,0,1,1,5000,1068,250,320,No,8,False,True,False,8.0,95,6068.0,2009,2,7,0.823995,674.222222,6,1.0
4,BENE11014,CLM63689,2009-08-13,2009-08-30,PRV56614,10000,PHY379376,PHY398258,,2009-08-13,78321,1068.0,2009-08-30,975,42,3051,34400,5856.0,42732,486.0,5119.0,29620.0,20300.0,,3893.0,,,,,,0,1938-04-01,NaT,0,1,1,45,780,12,12,0,1,1,0,1,1,0,1,0,0,0,21260,2136,120,100,No,17,False,True,False,17.0,71,11068.0,2009,8,33,0.903506,614.888889,5,1.0


In [5]:
# Number of rows and columns in the loaded frame
data.shape

(558211, 70)

In [6]:
# Column dtypes, to see which columns are object/datetime rather than numeric
data.dtypes

BeneID                  object
ClaimID                 object
ClaimStartDt    datetime64[ns]
ClaimEndDt      datetime64[ns]
Provider                object
                     ...      
ClmWeek                  int64
InsCovRatio            float64
RevPerDay              float64
Chronic_Sum              int64
Bene_Mult              float64
Length: 70, dtype: object

In [7]:
# Encode the fraud label as an integer: 'No' -> 0, 'Yes' -> 1.
# The explicit cast keeps the column numeric on pandas versions where replace() no longer
# downcasts object columns; without it the label could stay object dtype and be removed by
# the object-excluding select_dtypes() step. It also fails loudly on any unexpected label.
data['PotentialFraud'] = data['PotentialFraud'].replace({'No': 0, 'Yes': 1}).astype('int64')

In [8]:
# Drop every object and datetime64 column, keeping all remaining columns
excluded_dtypes = ['object', 'datetime64']
data = data.select_dtypes(exclude=excluded_dtypes)

In [9]:
# Cast the boolean flag columns to 1/0 integers
for flag_col in ('NoPhy', 'AllPhy', 'SameAttOper'):
    data[flag_col] = data[flag_col].astype(int)

In [10]:
# Check the dtypes again after dropping object/datetime columns and casting the flag columns
data.dtypes

InscClaimAmtReimbursed               int64
DeductibleAmtPaid                  float64
IsOutpatient                         int64
Gender                               int64
NoOfMonths_PartACov                  int64
NoOfMonths_PartBCov                  int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
IPAnnualReimbursementAmt             int64
IPAnnualDeductibleAmt                int64
OPAnnualReimbursementAmt             int64
OPAnnualDeductibleAmt                int64
PotentialFraud                       int64
ClaimDuration                        int64
NoPhy      

In [11]:
# Replace missing values in each column with that column's mean (vectorised, no per-column lambda).
# NOTE(review): the means are computed on the full dataset before the train/test split,
# so test rows influence the imputed values - consider imputing after the split instead.
data = data.fillna(data.mean())

In [12]:
# Check for missing values (note: .head() shows the counts for the first five columns only)
data.isna().sum().head()

InscClaimAmtReimbursed    0
DeductibleAmtPaid         0
IsOutpatient              0
Gender                    0
NoOfMonths_PartACov       0
dtype: int64

### Build Functions for Classifier

In [13]:
def classify(est, x, y, X_test, y_test):
    """Fit a classifier, print hold-out metrics, and print cross-validation scores.

    Parameters
    ----------
    est : estimator
        Scikit-learn style classifier; it is fitted in place on (x, y).
    x, y : array-like
        Training features and labels. The metrics used (positive-class score,
        default f1_score) assume a binary target.
    X_test, y_test : array-like
        Held-out features and labels used for the printed metrics.

    Returns
    -------
    tuple
        (est, y_pred, y_score): the fitted estimator, the predicted labels for
        X_test, and a 1-D positive-class score for X_test.
    """
    est.fit(x, y)

    y_pred = est.predict(X_test)
    # Not every classifier exposes predict_proba (e.g. SVC with default settings);
    # fall back to decision_function, which roc_auc_score accepts as a ranking score.
    if hasattr(est, "predict_proba"):
        y_score = est.predict_proba(X_test)[:, 1]
    else:
        y_score = est.decision_function(X_test)

    print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
    print("Area under the ROC curve: ", metrics.roc_auc_score(y_test, y_score))

    # Calculate the different metrics
    print("F-metric: ", metrics.f1_score(y_test, y_pred))
    print(" ")
    print("Classification report:")
    print(metrics.classification_report(y_test, y_pred))
    print(" ")
    print("Evaluations by cross-validation:")
    # Uses cross_val_score's default splitting and scoring on the training data
    print(cross_val_score(est, x, y))

    return est, y_pred, y_score

In [14]:
def feat_importance(estimator, feature_names=None):
    """Pair each feature name with its importance and sort by importance, ascending.

    Parameters
    ----------
    estimator : fitted estimator
        Must expose ``feature_importances_``.
    feature_names : sequence of str, optional
        Names matching the order of ``feature_importances_``. When omitted, the
        estimator's ``feature_names_in_`` is used if present; otherwise the
        notebook-level ``df_LC.columns`` is used, as in the original code.

    Returns
    -------
    list of (str, float)
        (name, importance) pairs, least important first.

    Raises
    ------
    ValueError
        If the number of names differs from the number of importances.
    """
    if feature_names is None:
        feature_names = getattr(estimator, "feature_names_in_", None)
    if feature_names is None:
        # NOTE(review): df_LC is not defined in the visible part of this notebook -
        # confirm it exists, or pass feature_names explicitly.
        feature_names = df_LC.columns

    names = list(feature_names)
    importances = list(estimator.feature_importances_)
    if len(names) != len(importances):
        raise ValueError(
            "Got %d feature names for %d importances" % (len(names), len(importances))
        )

    pairs = [(name, float(score)) for name, score in zip(names, importances)]
    # Sort on the importance value (second element of each pair)
    return sorted(pairs, key=lambda pair: pair[1])

In [15]:
# Separate the target column from the feature matrix
target_col = 'PotentialFraud'
y = data[target_col]
X = data.drop(columns=[target_col])

In [16]:
# Hold out 30% of the rows for testing, stratified on y, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [17]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,InscClaimAmtReimbursed,DeductibleAmtPaid,IsOutpatient,Gender,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,ClaimDuration,NoPhy,AllPhy,SameAttOper,AdmisDuration,AgeAtClm,TotalRev,ClmYear,ClmMonth,ClmWeek,InsCovRatio,RevPerDay,Chronic_Sum,Bene_Mult
210572,70,0.0,1,0,12,12,1,1,1,0,0,0,0,1,1,1,0,0,0,460,350,0,0,0,0,5.665168,80,70.0,2009,12,53,1.0,70.0,6,1.0
159813,200,0.0,1,0,12,12,0,1,1,0,0,1,1,1,0,0,0,0,0,13010,2270,0,0,0,0,5.665168,76,200.0,2009,1,2,1.0,200.0,5,1.0
424892,10,0.0,1,0,12,12,1,0,1,1,0,0,1,1,1,1,1,18000,3204,550,280,0,0,0,0,5.665168,70,10.0,2009,1,3,1.0,10.0,8,1.0
138178,80,0.0,1,0,12,12,1,1,1,0,1,1,1,1,0,0,0,0,0,1730,500,0,0,0,0,5.665168,72,80.0,2009,3,9,1.0,80.0,7,1.0
313249,400,0.0,1,0,12,12,0,1,0,1,0,0,1,1,1,0,0,42280,1068,1750,90,1,0,1,0,5.665168,84,400.0,2009,6,23,1.0,200.0,5,1.0


In [18]:
# Preview the first training labels
y_train.head()

210572    0
159813    0
424892    1
138178    1
313249    1
Name: PotentialFraud, dtype: int64

In [19]:
# Size of the held-out feature matrix
X_test.shape

(167464, 35)

In [20]:
# Size of the held-out label vector
y_test.shape

(167464,)

In [21]:
# Scale the values of X
# Fit the scaler on the training split only, so no test-set statistics leak into the
# transformation, then apply the same fitted transform to both splits.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [22]:
# Inspect the first two scaled test rows, rounded to three decimals
X_test_scaled[:2].round(3)

array([[-2.520e-01, -2.860e-01,  2.790e-01, -8.530e-01,  7.700e-02,
         7.700e-02,  1.221e+00, -1.202e+00, -8.360e-01, -4.230e-01,
        -6.740e-01, -8.750e-01,  6.460e-01, -1.776e+00, -6.830e-01,
        -6.720e-01, -3.370e-01, -4.430e-01, -4.750e-01,  5.600e-02,
         1.128e+00, -3.510e-01, -5.200e-02, -5.060e-01, -3.510e-01,
         2.000e-03, -2.093e+00, -2.590e-01,  7.000e-02,  1.890e-01,
         1.900e-01,  2.580e-01, -3.490e-01, -1.072e+00,  4.110e-01],
       [-2.490e-01, -2.860e-01,  2.790e-01, -8.530e-01,  7.700e-02,
         7.700e-02,  1.221e+00,  8.320e-01, -8.360e-01, -4.230e-01,
         1.485e+00, -8.750e-01,  6.460e-01,  5.630e-01,  1.465e+00,
        -6.720e-01, -3.370e-01, -4.430e-01, -4.750e-01, -4.570e-01,
        -5.160e-01, -3.510e-01, -5.200e-02, -5.060e-01, -3.510e-01,
         2.000e-03,  3.600e-01, -2.570e-01,  7.000e-02,  1.069e+00,
         1.063e+00,  2.580e-01, -3.380e-01,  6.450e-01, -2.433e+00]])

### Multiple Models for Spot Checking

In [None]:
# Prepare models
# NOTE(review): X / y are rebound to the scaled *test* split here, so the cross-validation
# below is run on held-out data - confirm this is intended rather than the training split.
X = X_test_scaled
y = y_test
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LogRegCV', LogisticRegressionCV()),
    ('KNN', KNeighborsClassifier()),
    ('RF', RandomForestClassifier()),
    ('NB', GaussianNB()),
    # NOTE(review): the recorded output stops after NB - presumably the run was
    # interrupted while fitting SVC on this many rows; TODO confirm.
    ('SVM', SVC()),
    ('GBC', GradientBoostingClassifier()),
]

# evaluate each model in turn, using the same shuffled 10-fold splitter for every model
results = []
names = []
scoring = 'accuracy'
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison
# (plt.fig, fig.add_subplots and plt.boxplots do not exist; use the explicit fig/ax interface)
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

LR: 0.630261 (0.003681)
LogRegCV: 0.630279 (0.003693)
KNN: 0.587213 (0.003681)
RF: 0.634883 (0.003709)
NB: 0.621602 (0.004325)
