In [1]:
import os

import numpy as np
import pandas as pd

### Load the data for training

In [2]:
# Load the training data
train_beneficiary_df = pd.read_csv('Medicare-Fraud-Detection-DS/Train_Beneficiarydata-1542865627584.csv')
train_inpatientdata_df = pd.read_csv('Medicare-Fraud-Detection-DS/Train_Inpatientdata-1542865627584.csv')
train_outpatientdata_df = pd.read_csv('Medicare-Fraud-Detection-DS/Train_Outpatientdata-1542865627584.csv')
train_provider_df = pd.read_csv('Medicare-Fraud-Detection-DS/Train-1542865627584.csv')

# Load the test data
test_beneficiary_df = pd.read_csv('Medicare-Fraud-Detection-DS/Test_Beneficiarydata-1542969243754.csv')
test_inpatientdata_df = pd.read_csv('Medicare-Fraud-Detection-DS/Test_Inpatientdata-1542969243754.csv')
test_outpatientdata_df = pd.read_csv('Medicare-Fraud-Detection-DS/Test_Outpatientdata-1542969243754.csv')
test_provider_df = pd.read_csv('Medicare-Fraud-Detection-DS/Test-1542969243754.csv') 

In [3]:
# common features in train_beneficiary_df and train_inpatientdata_df
common_features_beneficiary_inpatient = list(set(train_beneficiary_df.columns).intersection(set(train_inpatientdata_df.columns)))
common_features_beneficiary_inpatient

['BeneID']

### Check whether all provider details of **All-Providers** are present in the **Test-Beneficiary**, **Test-Inpatient**, and **Test-Outpatient** or not

In [4]:
# check how many providers are there in test_provider_df but not in test_beneficiary_df, test_inpatientdata_df, test_outpatientdata_df
provider_test_not_in_beneficiary = test_provider_df[~test_provider_df['Provider'].isin(test_beneficiary_df['BeneID'])]
provider_test_not_in_inpatient = test_provider_df[~test_provider_df['Provider'].isin(test_inpatientdata_df['Provider'])]
provider_test_not_in_outpatient = test_provider_df[~test_provider_df['Provider'].isin(test_outpatientdata_df['Provider'])]

print(provider_test_not_in_beneficiary.shape)
print(provider_test_not_in_inpatient.shape)
print(provider_test_not_in_outpatient.shape)

(1353, 1)
(833, 1)
(94, 1)


###

In [5]:
a = pd.merge(train_beneficiary_df, train_inpatientdata_df, on='BeneID', how='left')

# common features between a and train_outpatientdata_df
common_features_a_outpatient = list(set(a.columns).intersection(set(train_outpatientdata_df.columns)))
common_features_a_outpatient

['ClaimEndDt',
 'ClmDiagnosisCode_4',
 'ClmProcedureCode_5',
 'ClmDiagnosisCode_1',
 'OperatingPhysician',
 'ClaimStartDt',
 'ClmProcedureCode_2',
 'ClmDiagnosisCode_6',
 'BeneID',
 'ClmDiagnosisCode_7',
 'ClmDiagnosisCode_3',
 'ClmProcedureCode_6',
 'ClmDiagnosisCode_5',
 'OtherPhysician',
 'ClmDiagnosisCode_8',
 'DeductibleAmtPaid',
 'ClmDiagnosisCode_9',
 'ClmDiagnosisCode_10',
 'AttendingPhysician',
 'ClmProcedureCode_3',
 'ClmDiagnosisCode_2',
 'InscClaimAmtReimbursed',
 'ClmAdmitDiagnosisCode',
 'ClaimID',
 'ClmProcedureCode_1',
 'ClmProcedureCode_4',
 'Provider']

In [6]:
b = pd.merge(a, train_outpatientdata_df, on=['ClaimID', 'BeneID', 'Provider'], how='left')

# common features between b and train_provider_df
common_features_b_provider = list(set(b.columns).intersection(set(train_provider_df.columns)))
common_features_b_provider

['Provider']

In [7]:
# Create the aggregated data
df_train = pd.merge(train_beneficiary_df, train_inpatientdata_df, on='BeneID', how='left')
df_train = pd.merge(df_train, train_outpatientdata_df, on=['ClaimID', 'BeneID', 'Provider'], how='left')
df_train = pd.merge(df_train, train_provider_df, on='Provider', how='left')

df_train.shape

(147741, 79)

In [8]:
df_test = pd.merge(test_beneficiary_df, test_inpatientdata_df, on='BeneID', how='left')
df_test = pd.merge(df_test, test_outpatientdata_df, on=['ClaimID', 'BeneID', 'Provider'], how='left')
df_test = pd.merge(df_test, test_provider_df, on='Provider', how='left')

df_test.shape

(65168, 78)

In [9]:
df_train.head()

Unnamed: 0,BeneID,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,...,ClmDiagnosisCode_10_y,ClmProcedureCode_1_y,ClmProcedureCode_2_y,ClmProcedureCode_3_y,ClmProcedureCode_4_y,ClmProcedureCode_5_y,ClmProcedureCode_6_y,DeductibleAmtPaid_y,ClmAdmitDiagnosisCode_y,PotentialFraud
0,BENE11001,1943-01-01,,1,1,0,39,230,12,12,...,,,,,,,,,,Yes
1,BENE11001,1943-01-01,,1,1,0,39,230,12,12,...,,,,,,,,,,No
2,BENE11001,1943-01-01,,1,1,0,39,230,12,12,...,,,,,,,,,,No
3,BENE11002,1936-09-01,,2,1,0,39,280,12,12,...,,,,,,,,,,
4,BENE11003,1936-08-01,,1,1,0,52,590,12,12,...,,,,,,,,,,


In [10]:
df_test.head()

Unnamed: 0,BeneID,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,...,ClmDiagnosisCode_9_y,ClmDiagnosisCode_10_y,ClmProcedureCode_1_y,ClmProcedureCode_2_y,ClmProcedureCode_3_y,ClmProcedureCode_4_y,ClmProcedureCode_5_y,ClmProcedureCode_6_y,DeductibleAmtPaid_y,ClmAdmitDiagnosisCode_y
0,BENE11001,1943-01-01,,1,1,0,39,230,12,12,...,,,,,,,,,,
1,BENE11007,1940-09-01,2009-12-01,1,2,0,45,610,12,12,...,,,,,,,,,,
2,BENE11010,1936-07-01,,2,1,0,41,30,12,12,...,,,,,,,,,,
3,BENE11011,1914-03-01,,2,2,0,1,360,12,12,...,,,,,,,,,,
4,BENE11014,1938-04-01,,2,1,Y,45,780,12,12,...,,,,,,,,,,


#### Add the **number_of_claims** features

In [11]:
df_train.columns

Index(['BeneID', 'DOB', 'DOD', 'Gender', 'Race', 'RenalDiseaseIndicator',
       'State', 'County', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'ClaimID', 'ClaimStartDt_x', 'ClaimEndDt_x',
       'Provider', 'InscClaimAmtReimbursed_x', 'AttendingPhysician_x',
       'OperatingPhysician_x', 'OtherPhysician_x', 'AdmissionDt',
       'ClmAdmitDiagnosisCode_x', 'DeductibleAmtPaid_x', 'DischargeDt',
       'DiagnosisGroupCode', 'ClmDiagnosisCode_1_x', 'ClmDiagnosisCode_2_x',
       'ClmDiagnosisCode_3_x', 'ClmDiagnosisCode_4_x', 'ClmDiag

#### Process **ClmDiagnosisCode_{i}_x** feature

In [12]:
columns_to_process = ['ClmDiagnosisCode_{}_x'.format(i) for i in range(1, 11)]
print(f'Features to be processed: \n{columns_to_process}')

df_train[columns_to_process] = df_train[columns_to_process].notna().astype('int64')

# Sum number of claims
df_train['num_claim_diagnose_in'] = df_train[columns_to_process].sum(axis=1)

# Drop columns of claim code.
df_train.drop(columns=columns_to_process, axis=1, inplace=True)

Features to be processed: 
['ClmDiagnosisCode_1_x', 'ClmDiagnosisCode_2_x', 'ClmDiagnosisCode_3_x', 'ClmDiagnosisCode_4_x', 'ClmDiagnosisCode_5_x', 'ClmDiagnosisCode_6_x', 'ClmDiagnosisCode_7_x', 'ClmDiagnosisCode_8_x', 'ClmDiagnosisCode_9_x', 'ClmDiagnosisCode_10_x']


In [13]:
df_train.shape

(147741, 70)

In [14]:
columns_to_process = ['ClmDiagnosisCode_{}_x'.format(i) for i in range(1, 11)]
print(f'Features to be processed: \n{columns_to_process}')

df_test[columns_to_process] = df_test[columns_to_process].notna().astype('int64')

# Sum number of claims
df_test['num_claim_diagnose_in'] = df_test[columns_to_process].sum(axis=1)

# Drop columns of claim code
df_test.drop(columns=columns_to_process, axis=1, inplace=True)

Features to be processed: 
['ClmDiagnosisCode_1_x', 'ClmDiagnosisCode_2_x', 'ClmDiagnosisCode_3_x', 'ClmDiagnosisCode_4_x', 'ClmDiagnosisCode_5_x', 'ClmDiagnosisCode_6_x', 'ClmDiagnosisCode_7_x', 'ClmDiagnosisCode_8_x', 'ClmDiagnosisCode_9_x', 'ClmDiagnosisCode_10_x']


In [15]:
df_test.shape

(65168, 69)

#### Process **ClmDiagnosisCode_{i}_y** feature

In [16]:
columns_to_process = ['ClmDiagnosisCode_{}_y'.format(i) for i in range(1, 11)]
print(f'Features to be processed: \n{columns_to_process}')

df_train[columns_to_process] = df_train[columns_to_process].notna().astype('int64')

# Sum number of claims
df_train['num_claim_diagnose_out'] = df_train[columns_to_process].sum(axis=1)

# Drop columns of claim codes
df_train.drop(columns=columns_to_process, inplace=True)

Features to be processed: 
['ClmDiagnosisCode_1_y', 'ClmDiagnosisCode_2_y', 'ClmDiagnosisCode_3_y', 'ClmDiagnosisCode_4_y', 'ClmDiagnosisCode_5_y', 'ClmDiagnosisCode_6_y', 'ClmDiagnosisCode_7_y', 'ClmDiagnosisCode_8_y', 'ClmDiagnosisCode_9_y', 'ClmDiagnosisCode_10_y']


In [17]:
df_train.shape

(147741, 61)

In [18]:
columns_to_process = ['ClmDiagnosisCode_{}_y'.format(i) for i in range(1, 11)]
print(f'Features to be processed: \n{columns_to_process}')

df_test[columns_to_process] = df_test[columns_to_process].notna().astype('int64')

# Sum number of claims
df_test['num_claim_diagnose_out'] = df_test[columns_to_process].sum(axis=1)

# Drop columns of claim codes
df_test.drop(columns=columns_to_process, inplace=True)

Features to be processed: 
['ClmDiagnosisCode_1_y', 'ClmDiagnosisCode_2_y', 'ClmDiagnosisCode_3_y', 'ClmDiagnosisCode_4_y', 'ClmDiagnosisCode_5_y', 'ClmDiagnosisCode_6_y', 'ClmDiagnosisCode_7_y', 'ClmDiagnosisCode_8_y', 'ClmDiagnosisCode_9_y', 'ClmDiagnosisCode_10_y']


In [19]:
df_test.shape 

(65168, 60)

#### Process **ClmProcedureCode_{i}_x** feature

In [20]:
columns_to_process = ['ClmProcedureCode_{}_x'.format(i) for i in range(1, 7)]
print(f'Features to be processed: \n{columns_to_process}')

df_train[columns_to_process] = df_train[columns_to_process].notna().astype('int64')

# Sum number of claims
df_train['num_claim_procedure_in'] = df_train[columns_to_process].sum(axis=1)

# Drop columns of claim codes
df_train.drop(columns=columns_to_process, inplace=True)

Features to be processed: 
['ClmProcedureCode_1_x', 'ClmProcedureCode_2_x', 'ClmProcedureCode_3_x', 'ClmProcedureCode_4_x', 'ClmProcedureCode_5_x', 'ClmProcedureCode_6_x']


In [21]:
df_train.shape

(147741, 56)

In [22]:
columns_to_process = ['ClmProcedureCode_{}_x'.format(i) for i in range(1, 7)]
print(f'Features to be processed: \n{columns_to_process}')

df_test[columns_to_process] = df_test[columns_to_process].notna().astype('int64')

# Sum number of claims
df_test['num_claim_procedure_in'] = df_test[columns_to_process].sum(axis=1)

# Drop columns of claim codes
df_test.drop(columns=columns_to_process, inplace=True)

Features to be processed: 
['ClmProcedureCode_1_x', 'ClmProcedureCode_2_x', 'ClmProcedureCode_3_x', 'ClmProcedureCode_4_x', 'ClmProcedureCode_5_x', 'ClmProcedureCode_6_x']


In [23]:
df_test.shape 

(65168, 55)

#### Process **ClmProcedureCode_{i}_y** feature

In [24]:
columns_to_process = ['ClmProcedureCode_{}_y'.format(i) for i in range(1, 7)]
print(f'Features to be processed: \n{columns_to_process}')

df_train[columns_to_process] = df_train[columns_to_process].notna().astype('int64')

# Sum number of claims
df_train['num_claim_procedure_out'] = df_train[columns_to_process].sum(axis=1)

# Drop columns of claim codes
df_train.drop(columns=columns_to_process, inplace=True)

Features to be processed: 
['ClmProcedureCode_1_y', 'ClmProcedureCode_2_y', 'ClmProcedureCode_3_y', 'ClmProcedureCode_4_y', 'ClmProcedureCode_5_y', 'ClmProcedureCode_6_y']


In [25]:
df_train.shape

(147741, 51)

In [26]:
columns_to_process = ['ClmProcedureCode_{}_y'.format(i) for i in range(1, 7)]
print(f'Features to be processed: \n{columns_to_process}')

df_test[columns_to_process] = df_test[columns_to_process].notna().astype('int64')

# Sum number of claims
df_test['num_claim_procedure_out'] = df_test[columns_to_process].sum(axis=1)

# Drop columns of claim codes
df_test.drop(columns=columns_to_process, inplace=True)

Features to be processed: 
['ClmProcedureCode_1_y', 'ClmProcedureCode_2_y', 'ClmProcedureCode_3_y', 'ClmProcedureCode_4_y', 'ClmProcedureCode_5_y', 'ClmProcedureCode_6_y']


In [27]:
df_test.shape 

(65168, 50)

#### Process **Attending_Physician**, **Operating_Physician** and **Other_Physician**

In [28]:
columns_to_process = ['AttendingPhysician_x', 'OperatingPhysician_x', 'OtherPhysician_x', 'AttendingPhysician_y', 'OperatingPhysician_y', 'OtherPhysician_y']

print(f'Features to be processed: \n{columns_to_process}')

df_train[columns_to_process] = df_train[columns_to_process].notna().astype('int64')

Features to be processed: 
['AttendingPhysician_x', 'OperatingPhysician_x', 'OtherPhysician_x', 'AttendingPhysician_y', 'OperatingPhysician_y', 'OtherPhysician_y']


In [29]:
df_train['RenalDiseaseIndicator'].unique()

array(['0', 'Y'], dtype=object)

In [30]:
columns_to_process = ['AttendingPhysician_x', 'OperatingPhysician_x', 'OtherPhysician_x', 'AttendingPhysician_y', 'OperatingPhysician_y', 'OtherPhysician_y']

print(f'Features to be processed: \n{columns_to_process}')

df_test[columns_to_process] = df_test[columns_to_process].notna().astype('int64')

Features to be processed: 
['AttendingPhysician_x', 'OperatingPhysician_x', 'OtherPhysician_x', 'AttendingPhysician_y', 'OperatingPhysician_y', 'OtherPhysician_y']


In [31]:
df_test['RenalDiseaseIndicator'].unique()

array(['0', 'Y'], dtype=object)

In [32]:
df_train.dropna(subset=['RenalDiseaseIndicator'], inplace=True)
df_test.dropna(subset=['RenalDiseaseIndicator'], inplace=True)

#### Convert **Chronic_condition** classes from **(1, 2)** to **(1, 0)**

In [33]:
# For all the Chronic condition, we have two classes: 1 and 2. Here, we will replace 2 by 0.
df_train = df_train.replace({'ChronicCond_Alzheimer': 2, 'ChronicCond_Heartfailure': 2, 
                            'ChronicCond_KidneyDisease': 2, 'ChronicCond_Cancer': 2, 
                            'ChronicCond_ObstrPulmonary': 2, 'ChronicCond_Depression': 2, 
                            'ChronicCond_Diabetes': 2, 'ChronicCond_IschemicHeart': 2, 
                            'ChronicCond_Osteoporasis': 2, 'ChronicCond_rheumatoidarthritis': 2, 
                            'ChronicCond_stroke': 2}, 0)

df_train = df_train.replace({'RenalDiseaseIndicator': 'Y'}, 1)
df_train['RenalDiseaseIndicator'] = df_train['RenalDiseaseIndicator'].astype('int64')

In [34]:
df_train.shape

(147741, 51)

In [35]:
df_test = df_test.replace({'ChronicCond_Alzheimer': 2, 'ChronicCond_Heartfailure': 2,
                            'ChronicCond_KidneyDisease': 2, 'ChronicCond_Cancer': 2,
                            'ChronicCond_ObstrPulmonary': 2, 'ChronicCond_Depression': 2,
                            'ChronicCond_Diabetes': 2, 'ChronicCond_IschemicHeart': 2,
                            'ChronicCond_Osteoporasis': 2, 'ChronicCond_rheumatoidarthritis': 2,
                            'ChronicCond_stroke': 2}, 0)

df_test = df_test.replace({'RenalDiseaseIndicator': 'Y'}, 1)
df_test['RenalDiseaseIndicator'] = df_test['RenalDiseaseIndicator'].astype('int64')

In [36]:
df_test.shape

(65168, 50)

In [37]:
df_train.dtypes

BeneID                              object
DOB                                 object
DOD                                 object
Gender                               int64
Race                                 int64
RenalDiseaseIndicator                int64
State                                int64
County                               int64
NoOfMonths_PartACov                  int64
NoOfMonths_PartBCov                  int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
IPAnnualReimbursementAmt             int64
IPAnnualDeductibleAmt                int64
OPAnnualRei

In [38]:
df_test.dtypes

BeneID                              object
DOB                                 object
DOD                                 object
Gender                               int64
Race                                 int64
RenalDiseaseIndicator                int64
State                                int64
County                               int64
NoOfMonths_PartACov                  int64
NoOfMonths_PartBCov                  int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
IPAnnualReimbursementAmt             int64
IPAnnualDeductibleAmt                int64
OPAnnualRei

In [39]:
df_train.shape, df_test.shape

((147741, 51), (65168, 50))

In [40]:
a = df_test.groupby(['Provider'],as_index=False).agg('sum')
a.shape

(520, 50)

In [41]:
df_test["Provider"].nunique()

520

In [42]:
df_train = df_train.groupby(['Provider', 'PotentialFraud'],as_index=False).agg('sum')
df_train.drop(columns = ['Gender', 'Race', 'State', 'County'], inplace = True, axis = 1)
df_train.shape

(2092, 47)

In [43]:
df_test = df_test.groupby(['Provider'],as_index=False).agg('sum')
df_test.drop(columns = ['Gender', 'Race', 'State', 'County'], inplace = True, axis = 1)
df_test.shape

(520, 46)

In [44]:
df_train.columns

Index(['Provider', 'PotentialFraud', 'BeneID', 'DOB', 'DOD',
       'RenalDiseaseIndicator', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'ClaimID', 'ClaimStartDt_x', 'ClaimEndDt_x',
       'InscClaimAmtReimbursed_x', 'AttendingPhysician_x',
       'OperatingPhysician_x', 'OtherPhysician_x', 'AdmissionDt',
       'ClmAdmitDiagnosisCode_x', 'DeductibleAmtPaid_x', 'DischargeDt',
       'DiagnosisGroupCode', 'ClaimStartDt_y', 'ClaimEndDt_y',
       'InscClaimAmtReimbursed_y', 'AttendingPhysician_y',
       'OperatingPhysician_y', 'Other

#### Rename Columns ending with **x**

In [45]:
# Define a function to rename columns ending with '_x'


def rename_columns(column_name):
    if column_name.endswith('_x'):
        return column_name[:-2] + '_In'
    else:
        return column_name

In [46]:
# Use list comprehension to generate new column names
new_columns = [rename_columns(column) for column in df_train.columns]

# Rename columns using the 'rename()' method
df_train.rename(columns=dict(zip(df_train.columns, new_columns)), inplace=True)

In [47]:
# Use list comprehension to generate new column names
new_columns = [rename_columns(column) for column in df_test.columns]

df_test.rename(columns=dict(zip(df_test.columns, new_columns)), inplace=True)

In [48]:
# Define a function to rename columns ending with '_y'


def rename_columns(column_name):
    if column_name.endswith('_y'):
        return column_name[:-2] + '_Out'
    else:
        return column_name

In [49]:
# Use list comprehension to generate new column names
new_columns = [rename_columns(column) for column in df_train.columns]

# Rename columns using the 'rename()' method
df_train.rename(columns=dict(zip(df_train.columns, new_columns)), inplace=True)

In [50]:
# Use list comprehension to generate new column names
new_columns = [rename_columns(column) for column in df_test.columns]

df_test.rename(columns=dict(zip(df_test.columns, new_columns)), inplace=True)

In [51]:
df_train.head()

Unnamed: 0,Provider,PotentialFraud,BeneID,DOB,DOD,RenalDiseaseIndicator,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,...,InscClaimAmtReimbursed_Out,AttendingPhysician_Out,OperatingPhysician_Out,OtherPhysician_Out,DeductibleAmtPaid_Out,ClmAdmitDiagnosisCode_Out,num_claim_diagnose_in,num_claim_diagnose_out,num_claim_procedure_in,num_claim_procedure_out
0,PRV51001,No,BENE36012BENE38773BENE98831BENE102690BENE152088,1940-07-011938-05-011934-11-011929-11-011913-0...,0,2,60,60,2,4,...,0.0,0,0,0,0.0,0,36,0,3,0
1,PRV51003,Yes,BENE18650BENE21377BENE24383BENE25685BENE30002B...,1951-09-011928-12-011951-03-011938-04-011962-0...,2009-09-01,17,732,732,32,36,...,0.0,0,0,0,0.0,0,503,0,48,0
2,PRV51007,No,BENE16277BENE98446BENE99552,1926-06-011936-04-011930-11-01,0,1,36,36,2,3,...,0.0,0,0,0,0.0,0,22,0,1,0
3,PRV51008,No,BENE123836BENE139063,1936-03-011980-08-01,0,0,24,24,1,0,...,0.0,0,0,0,0.0,0,15,0,2,0
4,PRV51011,No,BENE39712,1912-03-01,0,1,12,12,1,0,...,0.0,0,0,0,0.0,0,8,0,0,0


In [52]:
df_test.head()

Unnamed: 0,Provider,BeneID,DOB,DOD,RenalDiseaseIndicator,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,...,InscClaimAmtReimbursed_Out,AttendingPhysician_Out,OperatingPhysician_Out,OtherPhysician_Out,DeductibleAmtPaid_Out,ClmAdmitDiagnosisCode_Out,num_claim_diagnose_in,num_claim_diagnose_out,num_claim_procedure_in,num_claim_procedure_out
0,PRV51009,BENE46602BENE86272,1935-02-011939-09-01,0,1,24,24,1,1,1,...,0.0,0,0,0,0.0,0,15,0,3,0
1,PRV51010,BENE88689BENE102305BENE127431BENE127431BENE128...,1921-01-011927-05-011934-07-011934-07-011925-0...,0,1,72,72,4,6,4,...,0.0,0,0,0,0.0,0,42,0,3,0
2,PRV51020,BENE79183BENE107177,1938-03-011943-04-01,0,0,24,12,1,2,0,...,0.0,0,0,0,0.0,0,19,0,1,0
3,PRV51022,BENE12211BENE13915BENE35388BENE50308BENE97770B...,1941-06-011921-02-011950-08-011919-01-011919-0...,0,1,72,84,2,6,6,...,0.0,0,0,0,0.0,0,50,0,7,0
4,PRV51033,BENE87872,1925-06-01,0,0,12,12,0,0,0,...,0.0,0,0,0,0.0,0,5,0,0,0


#### Deletes column that have only one value


In [53]:
# Get columns with only one unique value
singleton_columns = df_train.columns[df_train.nunique() == 1]

# Remove columns with only one unique value
df_train = df_train.drop(columns=singleton_columns)

# Store the names of deleted columns for reference
deleted_columns_df_train = list(singleton_columns)

In [54]:
deleted_columns_df_train

['ClaimStartDt_Out',
 'ClaimEndDt_Out',
 'InscClaimAmtReimbursed_Out',
 'AttendingPhysician_Out',
 'OperatingPhysician_Out',
 'OtherPhysician_Out',
 'DeductibleAmtPaid_Out',
 'ClmAdmitDiagnosisCode_Out',
 'num_claim_diagnose_out',
 'num_claim_procedure_out']

In [55]:
#check null columns
df_train.isnull().sum()

Provider                           0
PotentialFraud                     0
BeneID                             0
DOB                                0
DOD                                0
RenalDiseaseIndicator              0
NoOfMonths_PartACov                0
NoOfMonths_PartBCov                0
ChronicCond_Alzheimer              0
ChronicCond_Heartfailure           0
ChronicCond_KidneyDisease          0
ChronicCond_Cancer                 0
ChronicCond_ObstrPulmonary         0
ChronicCond_Depression             0
ChronicCond_Diabetes               0
ChronicCond_IschemicHeart          0
ChronicCond_Osteoporasis           0
ChronicCond_rheumatoidarthritis    0
ChronicCond_stroke                 0
IPAnnualReimbursementAmt           0
IPAnnualDeductibleAmt              0
OPAnnualReimbursementAmt           0
OPAnnualDeductibleAmt              0
ClaimID                            0
ClaimStartDt_In                    0
ClaimEndDt_In                      0
InscClaimAmtReimbursed_In          0
A

In [56]:
df_train.dtypes

Provider                            object
PotentialFraud                      object
BeneID                              object
DOB                                 object
DOD                                 object
RenalDiseaseIndicator                int64
NoOfMonths_PartACov                  int64
NoOfMonths_PartBCov                  int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
IPAnnualReimbursementAmt             int64
IPAnnualDeductibleAmt                int64
OPAnnualReimbursementAmt             int64
OPAnnualDeductibleAmt                int64
ClaimID    

In [57]:
df_train.shape, df_test.shape

((2092, 37), (520, 46))

#### Advance Feature Selection

In [58]:
# Fill the NaN values with zero for numerical columns
df_copy = df_train.copy()
num_cols = df_copy.select_dtypes(include=['float64', 'int64']).columns.tolist()
df_copy[num_cols] = df_copy[num_cols].fillna(0)

In [59]:
len(num_cols)

25

In [60]:
num_cols

['RenalDiseaseIndicator',
 'NoOfMonths_PartACov',
 'NoOfMonths_PartBCov',
 'ChronicCond_Alzheimer',
 'ChronicCond_Heartfailure',
 'ChronicCond_KidneyDisease',
 'ChronicCond_Cancer',
 'ChronicCond_ObstrPulmonary',
 'ChronicCond_Depression',
 'ChronicCond_Diabetes',
 'ChronicCond_IschemicHeart',
 'ChronicCond_Osteoporasis',
 'ChronicCond_rheumatoidarthritis',
 'ChronicCond_stroke',
 'IPAnnualReimbursementAmt',
 'IPAnnualDeductibleAmt',
 'OPAnnualReimbursementAmt',
 'OPAnnualDeductibleAmt',
 'InscClaimAmtReimbursed_In',
 'AttendingPhysician_In',
 'OperatingPhysician_In',
 'OtherPhysician_In',
 'DeductibleAmtPaid_In',
 'num_claim_diagnose_in',
 'num_claim_procedure_in']

In [61]:
df_copy.head()

Unnamed: 0,Provider,PotentialFraud,BeneID,DOB,DOD,RenalDiseaseIndicator,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,...,AttendingPhysician_In,OperatingPhysician_In,OtherPhysician_In,AdmissionDt,ClmAdmitDiagnosisCode_In,DeductibleAmtPaid_In,DischargeDt,DiagnosisGroupCode,num_claim_diagnose_in,num_claim_procedure_in
0,PRV51001,No,BENE36012BENE38773BENE98831BENE102690BENE152088,1940-07-011938-05-011934-11-011929-11-011913-0...,0,2,60,60,2,4,...,5,2,1,2009-07-042009-05-232009-02-032009-05-042009-0...,29623038953624140180121,5340.0,2009-07-082009-05-252009-02-082009-05-182009-0...,882864353245062,36,3
1,PRV51003,Yes,BENE18650BENE21377BENE24383BENE25685BENE30002B...,1951-09-011928-12-011951-03-011938-04-011962-0...,2009-09-01,17,732,732,32,36,...,62,40,0,2009-05-132009-02-262009-04-102009-11-112009-0...,5856185428050707807933204280V66042789786057802...,66216.0,2009-05-192009-03-052009-04-162009-11-182009-0...,2627232982039494594069512351962626383922385993...,503,48
2,PRV51007,No,BENE16277BENE98446BENE99552,1926-06-011936-04-011930-11-01,0,1,36,36,2,3,...,3,1,0,2009-05-282009-08-192009-03-15,780972957029633,3204.0,2009-06-012009-08-242009-03-22,085886887,22,1
3,PRV51008,No,BENE123836BENE139063,1936-03-011980-08-01,0,0,24,24,1,0,...,2,2,0,2009-04-192009-06-17,7837920,2136.0,2009-04-242009-06-20,623095,15,2
4,PRV51011,No,BENE39712,1912-03-01,0,1,12,12,1,0,...,1,0,0,2009-10-22,78906,1068.0,2009-10-27,414,8,0


In [62]:
df_copy.isnull().sum()

Provider                           0
PotentialFraud                     0
BeneID                             0
DOB                                0
DOD                                0
RenalDiseaseIndicator              0
NoOfMonths_PartACov                0
NoOfMonths_PartBCov                0
ChronicCond_Alzheimer              0
ChronicCond_Heartfailure           0
ChronicCond_KidneyDisease          0
ChronicCond_Cancer                 0
ChronicCond_ObstrPulmonary         0
ChronicCond_Depression             0
ChronicCond_Diabetes               0
ChronicCond_IschemicHeart          0
ChronicCond_Osteoporasis           0
ChronicCond_rheumatoidarthritis    0
ChronicCond_stroke                 0
IPAnnualReimbursementAmt           0
IPAnnualDeductibleAmt              0
OPAnnualReimbursementAmt           0
OPAnnualDeductibleAmt              0
ClaimID                            0
ClaimStartDt_In                    0
ClaimEndDt_In                      0
InscClaimAmtReimbursed_In          0
A

In [63]:
num_cols

['RenalDiseaseIndicator',
 'NoOfMonths_PartACov',
 'NoOfMonths_PartBCov',
 'ChronicCond_Alzheimer',
 'ChronicCond_Heartfailure',
 'ChronicCond_KidneyDisease',
 'ChronicCond_Cancer',
 'ChronicCond_ObstrPulmonary',
 'ChronicCond_Depression',
 'ChronicCond_Diabetes',
 'ChronicCond_IschemicHeart',
 'ChronicCond_Osteoporasis',
 'ChronicCond_rheumatoidarthritis',
 'ChronicCond_stroke',
 'IPAnnualReimbursementAmt',
 'IPAnnualDeductibleAmt',
 'OPAnnualReimbursementAmt',
 'OPAnnualDeductibleAmt',
 'InscClaimAmtReimbursed_In',
 'AttendingPhysician_In',
 'OperatingPhysician_In',
 'OtherPhysician_In',
 'DeductibleAmtPaid_In',
 'num_claim_diagnose_in',
 'num_claim_procedure_in']

In [64]:
df_train.columns

Index(['Provider', 'PotentialFraud', 'BeneID', 'DOB', 'DOD',
       'RenalDiseaseIndicator', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'ClaimID', 'ClaimStartDt_In', 'ClaimEndDt_In',
       'InscClaimAmtReimbursed_In', 'AttendingPhysician_In',
       'OperatingPhysician_In', 'OtherPhysician_In', 'AdmissionDt',
       'ClmAdmitDiagnosisCode_In', 'DeductibleAmtPaid_In', 'DischargeDt',
       'DiagnosisGroupCode', 'num_claim_diagnose_in',
       'num_claim_procedure_in'],
      dtype='object')

In [65]:
df_copy.shape

(2092, 37)

In [66]:
len(num_cols)

25

In [67]:
num_cols

['RenalDiseaseIndicator',
 'NoOfMonths_PartACov',
 'NoOfMonths_PartBCov',
 'ChronicCond_Alzheimer',
 'ChronicCond_Heartfailure',
 'ChronicCond_KidneyDisease',
 'ChronicCond_Cancer',
 'ChronicCond_ObstrPulmonary',
 'ChronicCond_Depression',
 'ChronicCond_Diabetes',
 'ChronicCond_IschemicHeart',
 'ChronicCond_Osteoporasis',
 'ChronicCond_rheumatoidarthritis',
 'ChronicCond_stroke',
 'IPAnnualReimbursementAmt',
 'IPAnnualDeductibleAmt',
 'OPAnnualReimbursementAmt',
 'OPAnnualDeductibleAmt',
 'InscClaimAmtReimbursed_In',
 'AttendingPhysician_In',
 'OperatingPhysician_In',
 'OtherPhysician_In',
 'DeductibleAmtPaid_In',
 'num_claim_diagnose_in',
 'num_claim_procedure_in']

In [68]:
df_copy.dtypes

Provider                            object
PotentialFraud                      object
BeneID                              object
DOB                                 object
DOD                                 object
RenalDiseaseIndicator                int64
NoOfMonths_PartACov                  int64
NoOfMonths_PartBCov                  int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
IPAnnualReimbursementAmt             int64
IPAnnualDeductibleAmt                int64
OPAnnualReimbursementAmt             int64
OPAnnualDeductibleAmt                int64
ClaimID    

In [69]:
type(num_cols)

list

In [70]:
# select columns using correlation between categorical and numerical data
df_copy['PotentialFraud'] = df_copy['PotentialFraud'].map({'Yes': 1, 'No': 0})
num_cols = num_cols + ['PotentialFraud']

len(num_cols)

26

In [71]:
df_copy.dtypes

Provider                            object
PotentialFraud                       int64
BeneID                              object
DOB                                 object
DOD                                 object
RenalDiseaseIndicator                int64
NoOfMonths_PartACov                  int64
NoOfMonths_PartBCov                  int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
IPAnnualReimbursementAmt             int64
IPAnnualDeductibleAmt                int64
OPAnnualReimbursementAmt             int64
OPAnnualDeductibleAmt                int64
ClaimID    

In [72]:
# calculate the correlation matrix
corr = df_copy[num_cols].corr(method='kendall')['PotentialFraud'].reset_index().rename(columns={'index': 'feature'})

selected_cols = []

for i in corr['feature'].unique():
    if abs(corr.loc[corr['feature'] == i, 'PotentialFraud'].values[0]) >= 0.01:
        selected_cols.append(i)

selected_cols

['RenalDiseaseIndicator',
 'NoOfMonths_PartACov',
 'NoOfMonths_PartBCov',
 'ChronicCond_Alzheimer',
 'ChronicCond_Heartfailure',
 'ChronicCond_KidneyDisease',
 'ChronicCond_Cancer',
 'ChronicCond_ObstrPulmonary',
 'ChronicCond_Depression',
 'ChronicCond_Diabetes',
 'ChronicCond_IschemicHeart',
 'ChronicCond_Osteoporasis',
 'ChronicCond_rheumatoidarthritis',
 'ChronicCond_stroke',
 'IPAnnualReimbursementAmt',
 'IPAnnualDeductibleAmt',
 'OPAnnualReimbursementAmt',
 'OPAnnualDeductibleAmt',
 'InscClaimAmtReimbursed_In',
 'AttendingPhysician_In',
 'OperatingPhysician_In',
 'OtherPhysician_In',
 'DeductibleAmtPaid_In',
 'num_claim_diagnose_in',
 'num_claim_procedure_in',
 'PotentialFraud']

In [73]:
len(selected_cols)

26

In [74]:
## Now, use these selected-columns features to train the model

df_temp = df_train[selected_cols]

# remove duplicate columns
df_temp = df_temp.loc[:,~df_temp.columns.duplicated()]

df_temp.shape

(2092, 26)

In [75]:
df_temp.dtypes

RenalDiseaseIndicator                int64
NoOfMonths_PartACov                  int64
NoOfMonths_PartBCov                  int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
IPAnnualReimbursementAmt             int64
IPAnnualDeductibleAmt                int64
OPAnnualReimbursementAmt             int64
OPAnnualDeductibleAmt                int64
InscClaimAmtReimbursed_In          float64
AttendingPhysician_In                int64
OperatingPhysician_In                int64
OtherPhysician_In                    int64
DeductibleAmtPaid_In               float64
num_claim_d

In [76]:
df_temp.head()

Unnamed: 0,RenalDiseaseIndicator,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,...,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,InscClaimAmtReimbursed_In,AttendingPhysician_In,OperatingPhysician_In,OtherPhysician_In,DeductibleAmtPaid_In,num_claim_diagnose_in,num_claim_procedure_in,PotentialFraud
0,2,60,60,2,4,4,1,2,4,4,...,6750,1180,97000.0,5,2,1,5340.0,36,3,No
1,17,732,732,32,36,39,7,23,25,49,...,137500,44940,573000.0,62,40,0,66216.0,503,48,Yes
2,1,36,36,2,3,1,0,0,2,3,...,7240,1410,19000.0,3,1,0,3204.0,22,1,No
3,0,24,24,1,0,1,1,1,0,1,...,640,330,25000.0,2,2,0,2136.0,15,2,No
4,1,12,12,1,0,1,0,0,1,1,...,590,90,5000.0,1,0,0,1068.0,8,0,No


In [77]:
df_temp['PotentialFraud'] = df_train['PotentialFraud'].map({'Yes': 1, 'No': 0}) 

X_train_val = df_temp.drop(columns='PotentialFraud')
y_train_val = df_temp['PotentialFraud']

In [78]:
X_train_val.dtypes

RenalDiseaseIndicator                int64
NoOfMonths_PartACov                  int64
NoOfMonths_PartBCov                  int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
IPAnnualReimbursementAmt             int64
IPAnnualDeductibleAmt                int64
OPAnnualReimbursementAmt             int64
OPAnnualDeductibleAmt                int64
InscClaimAmtReimbursed_In          float64
AttendingPhysician_In                int64
OperatingPhysician_In                int64
OtherPhysician_In                    int64
DeductibleAmtPaid_In               float64
num_claim_d

In [79]:
df_test.columns

Index(['Provider', 'BeneID', 'DOB', 'DOD', 'RenalDiseaseIndicator',
       'NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
       'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
       'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'ClaimID',
       'ClaimStartDt_In', 'ClaimEndDt_In', 'InscClaimAmtReimbursed_In',
       'AttendingPhysician_In', 'OperatingPhysician_In', 'OtherPhysician_In',
       'AdmissionDt', 'ClmAdmitDiagnosisCode_In', 'DeductibleAmtPaid_In',
       'DischargeDt', 'DiagnosisGroupCode', 'ClaimStartDt_Out',
       'ClaimEndDt_Out', 'InscClaimAmtReimbursed_Out',
       'AttendingPhysician_Out', 'OperatingPhysician_Out',
       'Other

In [80]:
columns_to_ignore = ['PotentialFraud']

In [81]:
# use columns to be features of test data
selected_test = [i for i in selected_cols if i not in columns_to_ignore] + ['Provider']
selected_test

['RenalDiseaseIndicator',
 'NoOfMonths_PartACov',
 'NoOfMonths_PartBCov',
 'ChronicCond_Alzheimer',
 'ChronicCond_Heartfailure',
 'ChronicCond_KidneyDisease',
 'ChronicCond_Cancer',
 'ChronicCond_ObstrPulmonary',
 'ChronicCond_Depression',
 'ChronicCond_Diabetes',
 'ChronicCond_IschemicHeart',
 'ChronicCond_Osteoporasis',
 'ChronicCond_rheumatoidarthritis',
 'ChronicCond_stroke',
 'IPAnnualReimbursementAmt',
 'IPAnnualDeductibleAmt',
 'OPAnnualReimbursementAmt',
 'OPAnnualDeductibleAmt',
 'InscClaimAmtReimbursed_In',
 'AttendingPhysician_In',
 'OperatingPhysician_In',
 'OtherPhysician_In',
 'DeductibleAmtPaid_In',
 'num_claim_diagnose_in',
 'num_claim_procedure_in',
 'Provider']

In [82]:
# use columns to be features of test data
df_test_copy = df_test[selected_test]
df_test_copy = df_test_copy.loc[:,~df_test_copy.columns.duplicated()]

df_test_copy.shape

(520, 26)

In [83]:
df_test_copy.dtypes

RenalDiseaseIndicator                int64
NoOfMonths_PartACov                  int64
NoOfMonths_PartBCov                  int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
IPAnnualReimbursementAmt             int64
IPAnnualDeductibleAmt                int64
OPAnnualReimbursementAmt             int64
OPAnnualDeductibleAmt                int64
InscClaimAmtReimbursed_In          float64
AttendingPhysician_In                int64
OperatingPhysician_In                int64
OtherPhysician_In                    int64
DeductibleAmtPaid_In               float64
num_claim_d

In [84]:
test_id = df_test_copy[['Provider']]
test_id.shape

(520, 1)

In [85]:
X_test = df_test_copy.drop(columns='Provider', axis=1)
X_test.shape

(520, 25)

In [86]:
X_test.shape, X_train_val.shape, y_train_val.shape

((520, 25), (2092, 25), (2092,))

In [87]:
X_train_val.columns

Index(['RenalDiseaseIndicator', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'InscClaimAmtReimbursed_In',
       'AttendingPhysician_In', 'OperatingPhysician_In', 'OtherPhysician_In',
       'DeductibleAmtPaid_In', 'num_claim_diagnose_in',
       'num_claim_procedure_in'],
      dtype='object')

In [88]:
X_test.columns

Index(['RenalDiseaseIndicator', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'InscClaimAmtReimbursed_In',
       'AttendingPhysician_In', 'OperatingPhysician_In', 'OtherPhysician_In',
       'DeductibleAmtPaid_In', 'num_claim_diagnose_in',
       'num_claim_procedure_in'],
      dtype='object')

In [89]:
X_train_val.dtypes

RenalDiseaseIndicator                int64
NoOfMonths_PartACov                  int64
NoOfMonths_PartBCov                  int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
IPAnnualReimbursementAmt             int64
IPAnnualDeductibleAmt                int64
OPAnnualReimbursementAmt             int64
OPAnnualDeductibleAmt                int64
InscClaimAmtReimbursed_In          float64
AttendingPhysician_In                int64
OperatingPhysician_In                int64
OtherPhysician_In                    int64
DeductibleAmtPaid_In               float64
num_claim_d

In [90]:
y_train_val.dtypes

dtype('int64')

In [91]:
y_train_val.value_counts()

PotentialFraud
0    1652
1     440
Name: count, dtype: int64

In [92]:
X_test.shape

(520, 25)

In [93]:
X_train_val.shape, y_train_val.shape

((2092, 25), (2092,))

## Using PyTorch library, train the models

In [94]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score , roc_auc_score

In [95]:
# Define k-fold cross-validation
kf = KFold(n_splits=5, random_state=42, shuffle=True)

In [96]:
# Function to calculate various risk metrics


def calculate_risks(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    return accuracy, precision, recall, f1, roc_auc

##### 1. Logistic Regression Model

In [97]:
# Define logistic regression model


class LogisticRegression(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        return torch.sigmoid(self.linear(x))

In [98]:
# Define hyperparameters
learning_rate = 0.01
num_epochs = 100
input_dim = X_train_val.shape[1]

In [99]:
# Convert pandas dataframe to PyTorch tensors
X_train_val_tensor = torch.tensor(X_train_val.values.astype(np.float32))
y_train_val_tensor = torch.tensor(y_train_val.values.reshape(-1, 1).astype(np.float32))

In [100]:
# Define lists to store performance metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []
roc_auc_scores = []

# Perform k-fold cross-validation
for train_index, val_index in kf.split(X_train_val_tensor):
    X_train, X_val = X_train_val_tensor[train_index], X_train_val_tensor[val_index]
    y_train, y_val = y_train_val_tensor[train_index], y_train_val_tensor[val_index]

    # Initialize model, loss function, and optimizer
    model = LogisticRegression(input_dim)
    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # Train model
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()

    # Evaluate model on validation set
    model.eval()
    with torch.no_grad():
        y_pred = (model(X_val) > 0.5).float()
        accuracy, precision, recall, f1, roc_auc = calculate_risks(y_val.numpy(), y_pred.numpy())
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        roc_auc_scores.append(roc_auc)


print(f'Accuracies: {accuracies}')
print(f'Precisions: {precisions}')
print(f'Recalls: {recalls}')
print(f'F1 Scores: {f1_scores}')
print(f'ROC AUC Scores: {roc_auc_scores}')
print('----' * 10)

# Calculate average performance metrics
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1 = np.mean(f1_scores)
avg_roc_auc = np.mean(roc_auc_scores)


# Print average performance metrics
print(f'Average Accuracy: {avg_accuracy}')
print(f'Average Precision: {avg_precision}')
print(f'Average Recall: {avg_recall}')
print(f'Average F1 Score: {avg_f1}')
print(f'Average ROC AUC: {avg_roc_auc}')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracies: [0.7947494033412887, 0.2100238663484487, 0.7727272727272727, 0.7918660287081339, 0.7990430622009569]
Precisions: [0.0, 0.2100238663484487, 0.0, 0.0, 0.0]
Recalls: [0.0, 1.0, 0.0, 0.0, 0.0]
F1 Scores: [0.0, 0.34714003944773175, 0.0, 0.0, 0.0]
ROC AUC Scores: [0.5, 0.5, 0.5, 0.5, 0.5]
----------------------------------------
Average Accuracy: 0.6736819266652202
Average Precision: 0.04200477326968974
Average Recall: 0.2
Average F1 Score: 0.06942800788954635
Average ROC AUC: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [101]:
## calculate mean and standard deviation of the performance metrics
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

mean_precision = np.mean(precisions)
std_precision = np.std(precisions)

mean_recall = np.mean(recalls)
std_recall = np.std(recalls)

mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

mean_roc_auc = np.mean(roc_auc_scores)
std_roc_auc = np.std(roc_auc_scores)

# Print average performance metrics
print(f'Average Accuracy: {mean_accuracy: .2f} +/- {std_accuracy: .2f}')
print(f'Average Precision: {mean_precision: .2f} +/- {std_precision: .2f}')
print(f'Average Recall: {mean_recall: .2f} +/- {std_recall: .2f}')
print(f'Average F1 Score: {mean_f1: .2f} +/- {std_f1: .2f}')
print(f'Average ROC AUC: {mean_roc_auc: .2f} +/- {std_roc_auc: .2f}')

Average Accuracy:  0.67 +/-  0.23
Average Precision:  0.04 +/-  0.08
Average Recall:  0.20 +/-  0.40
Average F1 Score:  0.07 +/-  0.14
Average ROC AUC:  0.50 +/-  0.00


##### 2. Random Forest

In [102]:
from sklearn.ensemble import RandomForestClassifier

In [103]:
# Convert pandas dataframe to numpy arrays
X_train_val_np = X_train_val.values
y_train_val_np = y_train_val.values.reshape(-1)

# Define hyperparameters
num_estimators = 100
max_depth = 5  # Adjust as needed
random_state = 42

# Define lists to store performance metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []
roc_auc_scores = []

# Perform k-fold cross-validation
for train_index, val_index in kf.split(X_train_val_np):
    X_train, X_val = X_train_val_np[train_index], X_train_val_np[val_index]
    y_train, y_val = y_train_val_np[train_index], y_train_val_np[val_index]

    # Initialize and train Random Forest classifier
    rf_model = RandomForestClassifier(n_estimators=num_estimators, max_depth=max_depth, random_state=random_state)
    rf_model.fit(X_train, y_train)

    # Evaluate model on validation set
    y_pred = rf_model.predict(X_val)
    accuracy, precision, recall, f1, roc_auc = calculate_risks(y_val, y_pred)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    roc_auc_scores.append(roc_auc)


print(f'Accuracies: {accuracies}')
print(f'Precisions: {precisions}')
print(f'Recalls: {recalls}')
print(f'F1 Scores: {f1_scores}')
print(f'ROC AUC Scores: {roc_auc_scores}')
print('----' * 10)

# Calculate average performance metrics
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1 = np.mean(f1_scores)
avg_roc_auc = np.mean(roc_auc_scores)


# Print average performance metrics
print(f'Average Accuracy: {avg_accuracy}')
print(f'Average Precision: {avg_precision}')
print(f'Average Recall: {avg_recall}')
print(f'Average F1 Score: {avg_f1}')
print(f'Average ROC AUC: {avg_roc_auc}')

Accuracies: [0.8162291169451074, 0.8186157517899761, 0.854066985645933, 0.854066985645933, 0.8732057416267942]
Precisions: [0.5882352941176471, 0.6304347826086957, 0.75, 0.7166666666666667, 0.8604651162790697]
Recalls: [0.3488372093023256, 0.32954545454545453, 0.5368421052631579, 0.4942528735632184, 0.44047619047619047]
F1 Scores: [0.43795620437956206, 0.43283582089552236, 0.6257668711656442, 0.5850340136054422, 0.5826771653543307]
ROC AUC Scores: [0.6428870731196312, 0.6390929689645702, 0.7421052631578947, 0.721446678473452, 0.7112560593099515]
----------------------------------------
Average Accuracy: 0.8432369163307488
Average Precision: 0.7091603719344158
Average Recall: 0.4299907666300694
Average F1 Score: 0.5328540150801002
Average ROC AUC: 0.6913576086050999


In [104]:
## calculate mean and standard deviation of the performance metrics
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

mean_precision = np.mean(precisions)
std_precision = np.std(precisions)

mean_recall = np.mean(recalls)
std_recall = np.std(recalls)

mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

mean_roc_auc = np.mean(roc_auc_scores)
std_roc_auc = np.std(roc_auc_scores)

# Print average performance metrics
print(f'Average Accuracy: {mean_accuracy: .2f} +/- {std_accuracy: .2f}')
print(f'Average Precision: {mean_precision: .2f} +/- {std_precision: .2f}')
print(f'Average Recall: {mean_recall: .2f} +/- {std_recall: .2f}')
print(f'Average F1 Score: {mean_f1: .2f} +/- {std_f1: .2f}')
print(f'Average ROC AUC: {mean_roc_auc: .2f} +/- {std_roc_auc: .2f}')

Average Accuracy:  0.84 +/-  0.02
Average Precision:  0.71 +/-  0.10
Average Recall:  0.43 +/-  0.08
Average F1 Score:  0.53 +/-  0.08
Average ROC AUC:  0.69 +/-  0.04


##### 3. Decision Tree

#

In [105]:
from sklearn.tree import DecisionTreeClassifier

In [106]:
# Convert pandas dataframe to numpy arrays
X_train_val_np = X_train_val.values
y_train_val_np = y_train_val.values.reshape(-1)

# Define hyperparameters
num_estimators = 100
max_depth = 5  # Adjust as needed
random_state = 42

# Define lists to store performance metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []
roc_auc_scores = []

# Perform k-fold cross-validation
for train_index, val_index in kf.split(X_train_val_np):
    X_train, X_val = X_train_val_np[train_index], X_train_val_np[val_index]
    y_train, y_val = y_train_val_np[train_index], y_train_val_np[val_index]

    # Initialize and train Decision Tree classifier
    dt_model = DecisionTreeClassifier()
    dt_model.fit(X_train, y_train)

    # Evaluate model on validation set
    y_pred = rf_model.predict(X_val)
    accuracy, precision, recall, f1, roc_auc = calculate_risks(y_val, y_pred)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    roc_auc_scores.append(roc_auc)


print(f'Accuracies: {accuracies}')
print(f'Precisions: {precisions}')
print(f'Recalls: {recalls}')
print(f'F1 Scores: {f1_scores}')
print(f'ROC AUC Scores: {roc_auc_scores}')
print('----' * 10)

# Calculate average performance metrics
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1 = np.mean(f1_scores)
avg_roc_auc = np.mean(roc_auc_scores)


# Print average performance metrics
print(f'Average Accuracy: {avg_accuracy}')
print(f'Average Precision: {avg_precision}')
print(f'Average Recall: {avg_recall}')
print(f'Average F1 Score: {avg_f1}')
print(f'Average ROC AUC: {avg_roc_auc}')

Accuracies: [0.8926014319809069, 0.8758949880668258, 0.8995215311004785, 0.8947368421052632, 0.8732057416267942]
Precisions: [0.9019607843137255, 0.9090909090909091, 0.9206349206349206, 0.9056603773584906, 0.8604651162790697]
Recalls: [0.5348837209302325, 0.45454545454545453, 0.6105263157894737, 0.5517241379310345, 0.44047619047619047]
F1 Scores: [0.6715328467153284, 0.6060606060606061, 0.7341772151898734, 0.6857142857142857, 0.5826771653543307]
ROC AUC Scores: [0.7599343529576088, 0.7212304312002197, 0.7975232198142416, 0.7683091988748828, 0.7112560593099515]
----------------------------------------
Average Accuracy: 0.8871921069760538
Average Precision: 0.899562421535423
Average Recall: 0.5184311639344772
Average F1 Score: 0.6560324238068848
Average ROC AUC: 0.7516506524313809


In [107]:
## calculate mean and standard deviation of the performance metrics
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

mean_precision = np.mean(precisions)
std_precision = np.std(precisions)

mean_recall = np.mean(recalls)
std_recall = np.std(recalls)

mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

mean_roc_auc = np.mean(roc_auc_scores)
std_roc_auc = np.std(roc_auc_scores)

# Print average performance metrics
print(f'Average Accuracy: {mean_accuracy: .2f} +/- {std_accuracy: .2f}')
print(f'Average Precision: {mean_precision: .2f} +/- {std_precision: .2f}')
print(f'Average Recall: {mean_recall: .2f} +/- {std_recall: .2f}')
print(f'Average F1 Score: {mean_f1: .2f} +/- {std_f1: .2f}')
print(f'Average ROC AUC: {mean_roc_auc: .2f} +/- {std_roc_auc: .2f}')

Average Accuracy:  0.89 +/-  0.01
Average Precision:  0.90 +/-  0.02
Average Recall:  0.52 +/-  0.06
Average F1 Score:  0.66 +/-  0.05
Average ROC AUC:  0.75 +/-  0.03


##### 4. SVM 

In [108]:
from sklearn.svm import SVC

In [109]:
# Convert pandas dataframe to numpy arrays
X_train_val_np = X_train_val.values
y_train_val_np = y_train_val.values.reshape(-1)

# Define hyperparameters
num_estimators = 100
max_depth = 5  # Adjust as needed
random_state = 42

# Define lists to store performance metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []
roc_auc_scores = []

# Perform k-fold cross-validation
for train_index, val_index in kf.split(X_train_val_np):
    X_train, X_val = X_train_val_np[train_index], X_train_val_np[val_index]
    y_train, y_val = y_train_val_np[train_index], y_train_val_np[val_index]

    # Initialize and train SVM classifier
    svm_model = SVC()
    svm_model.fit(X_train, y_train)

    # Evaluate model on validation set
    y_pred = rf_model.predict(X_val)
    accuracy, precision, recall, f1, roc_auc = calculate_risks(y_val, y_pred)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    roc_auc_scores.append(roc_auc)


print(f'Accuracies: {accuracies}')
print(f'Precisions: {precisions}')
print(f'Recalls: {recalls}')
print(f'F1 Scores: {f1_scores}')
print(f'ROC AUC Scores: {roc_auc_scores}')
print('----' * 10)

# Calculate average performance metrics
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1 = np.mean(f1_scores)
avg_roc_auc = np.mean(roc_auc_scores)


# Print average performance metrics
print(f'Average Accuracy: {avg_accuracy}')
print(f'Average Precision: {avg_precision}')
print(f'Average Recall: {avg_recall}')
print(f'Average F1 Score: {avg_f1}')
print(f'Average ROC AUC: {avg_roc_auc}')

Accuracies: [0.8926014319809069, 0.8758949880668258, 0.8995215311004785, 0.8947368421052632, 0.8732057416267942]
Precisions: [0.9019607843137255, 0.9090909090909091, 0.9206349206349206, 0.9056603773584906, 0.8604651162790697]
Recalls: [0.5348837209302325, 0.45454545454545453, 0.6105263157894737, 0.5517241379310345, 0.44047619047619047]
F1 Scores: [0.6715328467153284, 0.6060606060606061, 0.7341772151898734, 0.6857142857142857, 0.5826771653543307]
ROC AUC Scores: [0.7599343529576088, 0.7212304312002197, 0.7975232198142416, 0.7683091988748828, 0.7112560593099515]
----------------------------------------
Average Accuracy: 0.8871921069760538
Average Precision: 0.899562421535423
Average Recall: 0.5184311639344772
Average F1 Score: 0.6560324238068848
Average ROC AUC: 0.7516506524313809


In [110]:
## calculate mean and standard deviation of the performance metrics
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

mean_precision = np.mean(precisions)
std_precision = np.std(precisions)

mean_recall = np.mean(recalls)
std_recall = np.std(recalls)

mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

mean_roc_auc = np.mean(roc_auc_scores)
std_roc_auc = np.std(roc_auc_scores)

# Print average performance metrics
print(f'Average Accuracy: {mean_accuracy: .2f} +/- {std_accuracy: .2f}')
print(f'Average Precision: {mean_precision: .2f} +/- {std_precision: .2f}')
print(f'Average Recall: {mean_recall: .2f} +/- {std_recall: .2f}')
print(f'Average F1 Score: {mean_f1: .2f} +/- {std_f1: .2f}')
print(f'Average ROC AUC: {mean_roc_auc: .2f} +/- {std_roc_auc: .2f}')

Average Accuracy:  0.89 +/-  0.01
Average Precision:  0.90 +/-  0.02
Average Recall:  0.52 +/-  0.06
Average F1 Score:  0.66 +/-  0.05
Average ROC AUC:  0.75 +/-  0.03


##### 5. ANN (Artificial Neural Network)

In [111]:
from tqdm import tqdm

In [112]:
class ANN(nn.Module):
    def __init__(self, input_dim):
        super(ANN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)  # 64 hidden units
        self.bn1 = nn.BatchNorm1d(64)  # Batch normalization for the first hidden layer
        self.fc2 = nn.Linear(64, 32)  # 32 hidden units
        self.bn2 = nn.BatchNorm1d(32)  # Batch normalization for the second hidden layer
        self.fc3 = nn.Linear(32, 1)  # 1 output unit (binary classification)
        self.dropout = nn.Dropout(0.5)  # Dropout with 50% probability

    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)  # Apply dropout after activation
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)  # Apply dropout after activation
        x = torch.sigmoid(self.fc3(x))  # Sigmoid activation for binary classification
        return x

In [113]:
# Convert pandas dataframe to PyTorch tensors
X_train_val_tensor = torch.tensor(X_train_val.values.astype(np.float32))
y_train_val_tensor = torch.tensor(y_train_val.values.reshape(-1, 1).astype(np.float32))

# Define hyperparameters
learning_rate = 0.001
num_epochs = 100
input_dim = X_train_val.shape[1]

In [114]:
# Define lists to store performance metrics
accuracies = []
precisions = []
recalls = []
f1_scores = []
roc_auc_scores = []

# Perform k-fold cross-validation
for train_index, val_index in kf.split(X_train_val_np):
    X_train, X_val = X_train_val_tensor[train_index], X_train_val_tensor[val_index]
    y_train, y_val = y_train_val_tensor[train_index], y_train_val_tensor[val_index]

    # Initialize model, loss function, and optimizer
    model = ANN(input_dim)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create DataLoader for training data
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

    # Train model
    for epoch in range(num_epochs):
        model.train()
        for inputs, targets in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

    # Evaluate model on validation set
    model.eval()
    with torch.no_grad():
        y_pred = (model(X_val) > 0.5).float()
        accuracy, precision, recall, f1, roc_auc = calculate_risks(y_val, y_pred)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        roc_auc_scores.append(roc_auc)


print(f'Accuracies: {accuracies}')
print(f'Precisions: {precisions}')
print(f'Recalls: {recalls}')
print(f'F1 Scores: {f1_scores}')
print(f'ROC AUC Scores: {roc_auc_scores}')
print('----' * 10)

# Calculate average performance metrics
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1 = np.mean(f1_scores)
avg_roc_auc = np.mean(roc_auc_scores)


# Print average performance metrics
print(f'Average Accuracy: {avg_accuracy}')
print(f'Average Precision: {avg_precision}')
print(f'Average Recall: {avg_recall}')
print(f'Average F1 Score: {avg_f1}')
print(f'Average ROC AUC: {avg_roc_auc}')

                                                               

Accuracies: [0.8281622911694511, 0.8305489260143198, 0.8492822966507177, 0.8708133971291866, 0.8564593301435407]
Precisions: [0.6590909090909091, 0.7428571428571429, 0.82, 0.7796610169491526, 1.0]
Recalls: [0.3372093023255814, 0.29545454545454547, 0.43157894736842106, 0.5287356321839081, 0.2857142857142857]
F1 Scores: [0.4461538461538462, 0.42276422764227645, 0.5655172413793104, 0.6301369863013698, 0.4444444444444444]
ROC AUC Scores: [0.6460821286402681, 0.6341321065641308, 0.7018575851393188, 0.7447303538563045, 0.6428571428571428]
----------------------------------------
Average Accuracy: 0.8470532482214432
Average Precision: 0.8003218137794409
Average Recall: 0.37573854260934836
Average F1 Score: 0.5018033491842495
Average ROC AUC: 0.673931863411433


In [115]:
## calculate mean and standard deviation of the performance metrics
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

mean_precision = np.mean(precisions)
std_precision = np.std(precisions)

mean_recall = np.mean(recalls)
std_recall = np.std(recalls)

mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

mean_roc_auc = np.mean(roc_auc_scores)
std_roc_auc = np.std(roc_auc_scores)

# Print average performance metrics
print(f'Average Accuracy: {mean_accuracy: .2f} +/- {std_accuracy: .2f}')
print(f'Average Precision: {mean_precision: .2f} +/- {std_precision: .2f}')
print(f'Average Recall: {mean_recall: .2f} +/- {std_recall: .2f}')
print(f'Average F1 Score: {mean_f1: .2f} +/- {std_f1: .2f}')
print(f'Average ROC AUC: {mean_roc_auc: .2f} +/- {std_roc_auc: .2f}')

Average Accuracy:  0.85 +/-  0.02
Average Precision:  0.80 +/-  0.11
Average Recall:  0.38 +/-  0.09
Average F1 Score:  0.50 +/-  0.08
Average ROC AUC:  0.67 +/-  0.04


#### Make the Prediction using the Best Model (either Decision Tree or SVM)
</br>
</br>
Here, we are using **Decision Tree** 

In [116]:
# Make predictions on X_test data
predictions = dt_model.predict(X_test.values)

In [117]:
predictions.shape

(520,)

In [118]:
# Map the predicted values back to 'Yes' and 'No'
predicted_labels = ['Yes' if pred == 1 else 'No' for pred in predictions]

In [119]:
X_test.columns

Index(['RenalDiseaseIndicator', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'InscClaimAmtReimbursed_In',
       'AttendingPhysician_In', 'OperatingPhysician_In', 'OtherPhysician_In',
       'DeductibleAmtPaid_In', 'num_claim_diagnose_in',
       'num_claim_procedure_in'],
      dtype='object')

In [120]:
X_test.shape, df_test.shape

((520, 25), (520, 46))

In [121]:
df_test.columns

Index(['Provider', 'BeneID', 'DOB', 'DOD', 'RenalDiseaseIndicator',
       'NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
       'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
       'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'ClaimID',
       'ClaimStartDt_In', 'ClaimEndDt_In', 'InscClaimAmtReimbursed_In',
       'AttendingPhysician_In', 'OperatingPhysician_In', 'OtherPhysician_In',
       'AdmissionDt', 'ClmAdmitDiagnosisCode_In', 'DeductibleAmtPaid_In',
       'DischargeDt', 'DiagnosisGroupCode', 'ClaimStartDt_Out',
       'ClaimEndDt_Out', 'InscClaimAmtReimbursed_Out',
       'AttendingPhysician_Out', 'OperatingPhysician_Out',
       'Other

In [122]:
# Create DataFrame with 'Provider' and 'PotentialFraud' columns
output_df = pd.DataFrame({'Provider': df_test['Provider'], 'PotentialFraud': predicted_labels})
output_df

Unnamed: 0,Provider,PotentialFraud
0,PRV51009,No
1,PRV51010,No
2,PRV51020,No
3,PRV51022,No
4,PRV51033,No
...,...,...
515,PRV57608,No
516,PRV57644,No
517,PRV57649,No
518,PRV57666,No


In [123]:
output_df['PotentialFraud'].value_counts()

PotentialFraud
No     416
Yes    104
Name: count, dtype: int64

In [124]:
# Create the output dir and confirm that it was created
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# Save the output to a CSV file
output_file = os.path.join(output_dir, 'predictions.csv')
output_df.to_csv(output_file, index=False)