In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import datetime as dt
from matplotlib import pyplot as plt
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [2]:
# Load the four raw training tables (beneficiaries, inpatient claims,
# outpatient claims, provider-level fraud labels) in that order.
benef, inpat, outpat, fraud = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/Train_Beneficiary.csv',
        'Data/Train_Inpatient.csv',
        'Data/Train_Outpatient.csv',
        'Data/Train-Potential Fraud.csv',
    )
)

In [3]:
# Tag each claim with the table it came from so the two claim types can
# still be told apart once they are stacked together.
inpat['patientType'] = np.full(len(inpat), 'inpatient')
outpat['patientType'] = np.full(len(outpat), 'outpatient')

In [4]:
# Stack inpatient and outpatient claims row-wise into a single claims table.
patient = pd.concat(objs=[inpat, outpat])

In [5]:
# Enrich every claim with its beneficiary's attributes; a left join keeps
# all claim rows even when no beneficiary record matches.
full_data = patient.merge(benef, how='left', on='BeneID')

In [6]:
# Create the DaysAdmitted feature: the inclusive number of days a claim
# covers, i.e. (ClaimEndDt - ClaimStartDt) in whole days plus 1, so a claim
# that starts and ends on the same day counts as one day.
full_data['ClaimStartDt'] = pd.to_datetime(full_data['ClaimStartDt'])
full_data['ClaimEndDt'] = pd.to_datetime(full_data['ClaimEndDt'])

# Read the whole-day component directly from the timedelta with `.dt.days`
# instead of converting to text and stripping "days"/"00:00:00...": the string
# route depends on how the timedelta happens to be formatted, and the result
# here is already an integer column.
full_data['DaysAdmitted'] = (
    full_data['ClaimEndDt'] - full_data['ClaimStartDt']
).dt.days + 1


In [7]:
# The ten diagnosis-code slots and six procedure-code slots of a claim.
diagnoses_cols = [
    'ClmDiagnosisCode_1',
    'ClmDiagnosisCode_2',
    'ClmDiagnosisCode_3',
    'ClmDiagnosisCode_4',
    'ClmDiagnosisCode_5',
    'ClmDiagnosisCode_6',
    'ClmDiagnosisCode_7',
    'ClmDiagnosisCode_8',
    'ClmDiagnosisCode_9',
    'ClmDiagnosisCode_10',
]

procedure_cols = [
    'ClmProcedureCode_1',
    'ClmProcedureCode_2',
    'ClmProcedureCode_3',
    'ClmProcedureCode_4',
    'ClmProcedureCode_5',
    'ClmProcedureCode_6',
]

# Per claim, count how many code slots are filled in. Counting the non-null
# slots is the same as (number of slots - number of null slots).
full_data["NumDiagnoses"] = full_data[diagnoses_cols].notna().sum(axis=1)
full_data["NumProcedures"] = full_data[procedure_cols].notna().sum(axis=1)

In [8]:
# List every column name of the merged table as an array.
full_data.columns.to_numpy()

array(['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
       'InscClaimAmtReimbursed', 'AttendingPhysician',
       'OperatingPhysician', 'OtherPhysician', 'AdmissionDt',
       'ClmAdmitDiagnosisCode', 'DeductibleAmtPaid', 'DischargeDt',
       'DiagnosisGroupCode', 'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
       'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5',
       'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8',
       'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10', 'ClmProcedureCode_1',
       'ClmProcedureCode_2', 'ClmProcedureCode_3', 'ClmProcedureCode_4',
       'ClmProcedureCode_5', 'ClmProcedureCode_6', 'patientType', 'DOB',
       'DOD', 'Gender', 'Race', 'RenalDiseaseIndicator', 'State',
       'County', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depressio

In [9]:
# Number of claim rows in the merged table.
full_data.shape[0]

558211

In [10]:
# Count the missing values in each column.
full_data.isnull().sum()

BeneID                                  0
ClaimID                                 0
ClaimStartDt                            0
ClaimEndDt                              0
Provider                                0
InscClaimAmtReimbursed                  0
AttendingPhysician                   1508
OperatingPhysician                 443764
OtherPhysician                     358475
AdmissionDt                        517737
ClmAdmitDiagnosisCode              412312
DeductibleAmtPaid                     899
DischargeDt                        517737
DiagnosisGroupCode                 517737
ClmDiagnosisCode_1                  10453
ClmDiagnosisCode_2                 195606
ClmDiagnosisCode_3                 315156
ClmDiagnosisCode_4                 393675
ClmDiagnosisCode_5                 446287
ClmDiagnosisCode_6                 473819
ClmDiagnosisCode_7                 492034
ClmDiagnosisCode_8                 504767
ClmDiagnosisCode_9                 516396
ClmDiagnosisCode_10               

In [11]:
# Remove the listed columns from the merged table; the frame is modified
# in place, so re-running this cell without rebuilding full_data will fail
# because the columns are already gone.
full_data.drop(
    columns=[
        "DischargeDt",
        "DiagnosisGroupCode",
        "OperatingPhysician",
        "OtherPhysician",
        "AdmissionDt",
        "ClmAdmitDiagnosisCode",
        "DOD",
    ],
    inplace=True,
)

In [12]:
# Re-check the per-column missing-value counts for the remaining columns.
full_data.isnull().sum()

BeneID                                  0
ClaimID                                 0
ClaimStartDt                            0
ClaimEndDt                              0
Provider                                0
InscClaimAmtReimbursed                  0
AttendingPhysician                   1508
DeductibleAmtPaid                     899
ClmDiagnosisCode_1                  10453
ClmDiagnosisCode_2                 195606
ClmDiagnosisCode_3                 315156
ClmDiagnosisCode_4                 393675
ClmDiagnosisCode_5                 446287
ClmDiagnosisCode_6                 473819
ClmDiagnosisCode_7                 492034
ClmDiagnosisCode_8                 504767
ClmDiagnosisCode_9                 516396
ClmDiagnosisCode_10                553201
ClmProcedureCode_1                 534901
ClmProcedureCode_2                 552721
ClmProcedureCode_3                 557242
ClmProcedureCode_4                 558093
ClmProcedureCode_5                 558202
ClmProcedureCode_6                

In [14]:
# Attach the provider-level fraud label to every claim; the left join keeps
# all claim rows.
full_data = pd.merge(full_data, fraud, on='Provider', how='left')

# Encode the label as 1 for 'Yes' and 0 for anything else (this includes
# claims whose provider had no row in `fraud`, which come through as missing).
# Assign the column directly rather than through `.loc[:, ...]`: a full-column
# `.loc` write can set the values into the existing object-dtype column, which
# would leave the 0/1 flags stored as objects instead of integers.
full_data['PotentialFraud'] = np.where(full_data['PotentialFraud'] == 'Yes', 1, 0)

In [15]:
# Persist the merged, labelled claims table. The row index is written too
# (the to_csv default), so it appears as the first, unnamed column of the file.
full_data.to_csv("Merged_Data.csv", index=True)