# Second Modelling Iteration

In [1]:
# Standard Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings

#Sklearn stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, confusion_matrix, recall_score
from collections import Counter
from sklearn.utils.class_weight import compute_sample_weight
from imblearn import over_sampling
from sklearn.utils.class_weight import compute_sample_weight
# Plot defaults: white-grid seaborn style, "icefire" palette, 9x6 figures
sns.set(style="whitegrid")
sns.set_palette("icefire")
plt.rcParams.update({'figure.figsize': (9, 6)})
# Let wide frames display up to 500 columns
pd.options.display.max_columns = 500
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation and convergence warnings.
warnings.simplefilter("ignore")

### Load Data

In [2]:
# Load the claims table from the pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
providers = pd.read_pickle('./data/claims.pkl')

In [3]:
# Preview the first rows of the claims table
providers.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DischargeDt,DiagnosisGroupCode,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6,IsOutpatient,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,PotentialFraud,ClaimDuration,NoPhy,AllPhy,SameAttOper,AdmisDuration,AgeAtClm,TotalRev,ClmYear,ClmMonth,ClmWeek,InsCovRatio,RevPerDay,Chronic_Sum,Bene_Mult
0,BENE11001,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,7866,1068.0,2009-04-18,201,1970,4019,5853,7843.0,2768,71590.0,2724.0,19889.0,5849.0,,,,,,,,0,1943-01-01,NaT,1,1,0,39,230,12,12,1,0,1,0,0,1,1,1,0,1,1,36000,3204,60,70,Yes,6,False,False,False,6.0,66,27068.0,2009,4,15,0.960544,3866.857143,7,1.0
1,BENE11001,CLM66048,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,6186,1068.0,2009-09-02,750,6186,2948,56400,,,,,,,,7092.0,,,,,,0,1943-01-01,NaT,1,1,0,39,230,12,12,1,0,1,0,0,1,1,1,0,1,1,36000,3204,60,70,No,2,False,True,True,2.0,67,6068.0,2009,8,36,0.823995,2022.666667,7,1.0
2,BENE11001,CLM68358,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,29590,1068.0,2009-09-20,883,29623,30390,71690,34590.0,V1581,32723.0,,,,,,,,,,,0,1943-01-01,NaT,1,1,0,39,230,12,12,1,0,1,0,0,1,1,1,0,1,1,36000,3204,60,70,No,3,False,False,False,3.0,67,6068.0,2009,9,38,0.823995,1517.0,7,1.0
3,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,431,1068.0,2009-02-22,67,43491,2762,7843,32723.0,V1041,4254.0,25062.0,40390.0,4019.0,,331.0,,,,,,0,1914-03-01,NaT,0,2,0,1,360,12,12,0,1,1,0,0,1,1,0,0,1,1,5000,1068,250,320,No,8,False,True,False,8.0,95,6068.0,2009,2,7,0.823995,674.222222,6,1.0
4,BENE11014,CLM63689,2009-08-13,2009-08-30,PRV56614,10000,PHY379376,PHY398258,,2009-08-13,78321,1068.0,2009-08-30,975,42,3051,34400,5856.0,42732,486.0,5119.0,29620.0,20300.0,,3893.0,,,,,,0,1938-04-01,NaT,0,1,1,45,780,12,12,0,1,1,0,1,1,0,1,0,0,0,21260,2136,120,100,No,17,False,True,False,17.0,71,11068.0,2009,8,33,0.903506,614.888889,5,1.0


In [4]:
# (rows, columns) of the loaded claims table
providers.shape

(558211, 70)

In [5]:
# Column dtypes as loaded, before any preprocessing
providers.dtypes

BeneID                  object
ClaimID                 object
ClaimStartDt    datetime64[ns]
ClaimEndDt      datetime64[ns]
Provider                object
                     ...      
ClmWeek                  int64
InsCovRatio            float64
RevPerDay              float64
Chronic_Sum              int64
Bene_Mult              float64
Length: 70, dtype: object

In [6]:
# Encode the target and keep only model-ready numeric columns, in one chain:
# - 'No'/'Yes' in PotentialFraud become 0/1 and are cast to int64 explicitly.
#   Without the cast the recoded column can remain `object` (pandas versions that do
#   not downcast after `replace`), and select_dtypes would then silently drop the
#   target. The cast also fails loudly if a label other than No/Yes or a missing
#   value is present.
# - object and datetime columns (e.g. BeneID, ClaimStartDt) are dropped.
# - the boolean flags NoPhy / AllPhy / SameAttOper are converted to 1/0.
# Chaining avoids assigning columns onto the frame returned by select_dtypes, and
# the cell gives the same result if it is run again.
providers = (
    providers
    .assign(PotentialFraud=lambda df: df['PotentialFraud']
            .replace({'No': 0, 'Yes': 1})
            .astype('int64'))
    .select_dtypes(exclude=['object', 'datetime64'])
    .astype({'NoPhy': int, 'AllPhy': int, 'SameAttOper': int})
)

In [7]:
# Replace missing values with the mean of the column they belong to.
# NOTE(review): the means are computed on the full table, i.e. before the train/test
# split further down, so held-out rows influence the imputed values. Consider
# fitting the imputation on the training split only.
providers = providers.fillna(providers.mean())

In [8]:
# Re-check dtypes after dropping object/datetime columns and filling missing values
providers.dtypes

InscClaimAmtReimbursed               int64
DeductibleAmtPaid                  float64
IsOutpatient                         int64
Gender                               int64
NoOfMonths_PartACov                  int64
NoOfMonths_PartBCov                  int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
IPAnnualReimbursementAmt             int64
IPAnnualDeductibleAmt                int64
OPAnnualReimbursementAmt             int64
OPAnnualDeductibleAmt                int64
PotentialFraud                       int64
ClaimDuration                        int64
NoPhy      

### Train test split

In [9]:
# Target vector (y) is the fraud flag; every other column is an input feature (X)
y = providers['PotentialFraud']
X = providers.drop(columns=['PotentialFraud'])

In [10]:
# Hold out 30% of the rows for testing. The split is stratified on y and seeded
# so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0
)

### Classification Modeling

In [11]:
# Shuffled, seeded 5-fold stratified splitter reused by every hyper-parameter search
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [12]:
# Standardize the features: the scaler is fitted on the training split only and
# then applied unchanged to the test split.
# The original row labels are passed through (index=...) so the scaled frames stay
# aligned with y_train / y_test. Without it they get a fresh 0..n-1 index and any
# pandas-level operation combining them with the targets would misalign rows.
scaler = StandardScaler()
X_train_stan = pd.DataFrame(scaler.fit_transform(X_train),
                            columns=X_train.columns,
                            index=X_train.index)
X_test_stan = pd.DataFrame(scaler.transform(X_test),
                           columns=X_test.columns,
                           index=X_test.index)

In [13]:
# Model evaluation metric: Recall score
def eval_model(model, best_model, X_train, X_test, y_train, y_test):
    """Print recall and confusion matrices of a fitted classifier on both splits.

    Parameters
    ----------
    model : str
        Display name printed in the report header.
    best_model : estimator
        Fitted object exposing ``predict(X)``.
    X_train, X_test : array-like
        Feature matrices handed to ``best_model.predict``.
    y_train, y_test : array-like
        True labels corresponding to ``X_train`` / ``X_test``.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # Predict once per split and reuse the result for both the recall score and
    # the confusion matrix (prediction can be expensive for large ensembles).
    train_pred = best_model.predict(X_train)
    test_pred = best_model.predict(X_test)

    print(model,'score')
    print('-'*30)
    print('Train: recall score:', round(recall_score(y_train, train_pred),5))
    print('Test: recall score:', round(recall_score(y_test, test_pred),5))
    print(' ')
    print("Train Test Confusion Matrix")
    print('-'*30)
    print('Train Set')
    print(confusion_matrix(y_train, train_pred))
    print('Test Set')
    print(confusion_matrix(y_test, test_pred))

### Logistic Regression

In [14]:
# L1-penalized logistic regression (liblinear solver) with balanced class weights
logistic = LogisticRegression(penalty='l1',
                              solver='liblinear',
                              class_weight='balanced',
                              random_state=0)

In [15]:
# Grid search parameters: 100 values of C, log-spaced between 1e-5 and 1e1
param_grid = [{'C': np.logspace(-5,1,100)}]
# 100 candidates x 5 folds = 500 fits. Run them in parallel (n_jobs=-1, as the
# random-forest search does) and log progress; the saved output of the serial
# version of this cell is a KeyboardInterrupt.
logisticcv = GridSearchCV(logistic, param_grid, scoring = 'recall', cv = skf,
                          n_jobs = -1, verbose = 1)
para_search = logisticcv.fit(X_train_stan, y_train)
# Estimator refitted on the whole training set with the best-scoring C
best_logistic = para_search.best_estimator_
print('Best estimator:', best_logistic)

KeyboardInterrupt: 

In [None]:
# Report train/test recall and confusion matrices for the tuned logistic model
eval_model(model='Logistic Regression',
           best_model=best_logistic,
           X_train=X_train_stan,
           X_test=X_test_stan,
           y_train=y_train,
           y_test=y_test)

In [None]:
# Bar chart of the non-zero logistic-regression coefficients, largest magnitude first.
# One row per feature, single "Coef." column, then ordered by |coefficient|.
logistic_coef = pd.DataFrame(best_logistic.coef_.T, index=X.columns, columns=["Coef."])
by_magnitude = logistic_coef["Coef."].abs().sort_values(ascending=False).index
# reset_index moves the feature names into an 'index' column
logistic_coef = logistic_coef.loc[by_magnitude].reset_index()

fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(data=logistic_coef[logistic_coef['Coef.'].abs().gt(0)],
            x='Coef.', y='index', ax=ax)
ax.set_title('Coefficients of Penalized Logistic Regression Model')
ax.set_ylabel('');

In [None]:
### Ridge Classifier

In [None]:
# Features whose L1-penalized coefficient is exactly zero.
# `logistic_coef` already holds the feature names in its 'index' column (its index
# was reset when the coefficient table was built). Resetting it again would only
# add a stray 'level_0' column and make this cell raise when it is re-run.
notimportant = logistic_coef.loc[logistic_coef['Coef.'].abs() == 0, 'index'].tolist()

In [None]:
# Keep only the standardized features that are not listed in `notimportant`
X_train_reduced = X_train_stan[[col for col in X_train_stan.columns if col not in notimportant]]
X_test_reduced = X_test_stan[[col for col in X_test_stan.columns if col not in notimportant]]

In [None]:
# Ridge classifier with balanced class weights and a fixed seed
ridge = RidgeClassifier(class_weight='balanced', random_state=0)

In [None]:
# Search 100 log-spaced alpha values (1e-5 .. 1e2), scored by recall on the shared CV splitter
param_grid = [{'alpha': np.logspace(-5, 2, 100)}]
ridgecv = GridSearchCV(estimator=ridge,
                       param_grid=param_grid,
                       cv=skf,
                       scoring='recall')
# fit() returns the search object itself
ridgecv.fit(X_train_reduced, y_train)
para_search = ridgecv
best_ridge = para_search.best_estimator_
print('Best estimator:', best_ridge)

In [None]:
# Report train/test recall and confusion matrices for the tuned ridge classifier
eval_model(model='Ridge Classifier',
           best_model=best_ridge,
           X_train=X_train_reduced,
           X_test=X_test_reduced,
           y_train=y_train,
           y_test=y_test)

In [None]:
# Bar chart of the ridge coefficients, largest magnitude first.
# One row per retained feature, single "Coef." column, then ordered by |coefficient|.
ridge_coef = pd.DataFrame(best_ridge.coef_.T, index=X_train_reduced.columns, columns=["Coef."])
by_magnitude = ridge_coef["Coef."].abs().sort_values(ascending=False).index
# reset_index moves the feature names into an 'index' column
ridge_coef = ridge_coef.loc[by_magnitude].reset_index()

fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(data=ridge_coef, x='Coef.', y='index', ax=ax)
ax.set_title('Coefficients of Ridge Classifier Model')
ax.set_ylabel('');

In [None]:
### Random Forest

In [None]:
# Random forest with per-bootstrap class re-weighting.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it raises an invalid-parameter error.
randomForest = RandomForestClassifier(random_state = 0, max_features = 'sqrt', class_weight = 'balanced_subsample')
# Candidate values sampled by the randomized search
grid_para_forest = {'n_estimators': [100,500,1000,2500,5000],
                    'max_depth': [3,5,7,8,10],
                    'min_samples_split': [2,4,8,12],
                    'min_samples_leaf' : [2,4,8,12]}

In [None]:
# Randomized search over the forest hyper-parameters, scored by recall.
# random_state fixes which candidates are sampled; without it every run of this
# cell tries a different set and can report a different "best" model.
# n_iter=10 is the library default, spelled out so the search budget is visible.
grid_search_forest = RandomizedSearchCV(randomForest,
                                        grid_para_forest,
                                        n_iter = 10,
                                        cv=skf, 
                                        n_jobs = -1, 
                                        verbose = 1,
                                        return_train_score = True,
                                        scoring = 'recall',
                                        random_state = 0)
# Fit Random Forest (on the unscaled features)
para_search = grid_search_forest.fit(X_train, y_train)

# Save the best estimator
best_rf = para_search.best_estimator_
print('Best parameters:', para_search.best_params_)

In [None]:
# Report train/test recall and confusion matrices for the tuned random forest
eval_model(model='Random Forest',
           best_model=best_rf,
           X_train=X_train,
           X_test=X_test,
           y_train=y_train,
           y_test=y_test)