# 
# Feature Engineering



### Table of contents
- Admitted Date Mean
- Deductible Amount Paid Average
- Insurance Claim Amount Reimbursed Average
- Number of Patients
- Number of Doctors (attending physicians) per provider
- Number of counties per provider
- Number of claims
- Average number of chronic conditions per provider
- Average Age 
- Gender of Patients per Providers

In [1]:
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
plt.rcParams['figure.figsize'] = (12,8)
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the four training CSV files from the local Data/ directory, one
# DataFrame per file.
Beneficiary, Inpatient, Outpatient, Label = map(
    pd.read_csv,
    (
        'Data/Train_Beneficiary.csv',
        'Data/Train_Inpatient.csv',
        'Data/Train_Outpatient.csv',
        'Data/Train_Label.csv',
    ),
)

In [3]:
# Tag every row with the table it came from so the two sources remain
# distinguishable after they are combined into a single frame.
for _frame, _kind in ((Inpatient, 'inpatient'), (Outpatient, 'outpatient')):
    _frame['Patient_type'] = _kind

In [4]:
# Stack the inpatient and outpatient tables row-wise (concat's default axis);
# a column present in only one of them is filled with NaN for the other.
Patient = pd.concat((Inpatient, Outpatient))

In [5]:
# Build the working table: attach beneficiary attributes by BeneID, then the
# label table by Provider. Both are left joins, so every row of Patient is
# kept even when the right-hand table has no match.
fraud = (
    Patient
    .merge(Beneficiary, how='left', on='BeneID')
    .merge(Label, how='left', on='Provider')
)

### Let's compute the average number of days admitted per provider

In [6]:
# Create the DaysAdmitted feature by subtracting the claim start date from
# the claim end date. Both columns are parsed to datetimes first, so the
# difference is a timedelta column.
for _date_col in ('ClaimStartDt', 'ClaimEndDt'):
    fraud[_date_col] = pd.to_datetime(fraud[_date_col])
fraud['DaysAdmitted'] = fraud['ClaimEndDt'].sub(fraud['ClaimStartDt'])

In [7]:
# Turn the timedelta into a whole number of days with the .dt accessor rather
# than formatting it as text and parsing the digits back out; the text route
# raises as soon as a row is NaT. The +1 makes the span inclusive of both the
# start and the end date (a same-day claim counts as 1 day).
# With missing dates the result is NaN (float) for those rows instead of an error.
fraud['DaysAdmitted'] = fraud['DaysAdmitted'].dt.days + 1

In [8]:
# Mean DaysAdmitted over all rows of each provider, returned as a two-column
# frame (Provider, DaysAdmitted) ready to be merged on Provider.
DaysAdmitted_mean = fraud.groupby('Provider')['DaysAdmitted'].mean().reset_index()

In [9]:
# Work on a copy of the label table so Label itself is never modified, and
# start the per-provider feature table by joining in the mean admitted days.
# The default (inner) join keeps only providers present in both tables.
Label_Fraud_1 = Label.copy()
Label_Fraud_2 = Label_Fraud_1.merge(DaysAdmitted_mean, on='Provider')

### DeductibleAmountPaid Mean

In [10]:
# Per-provider mean of DeductibleAmtPaid. The frame stays indexed by Provider;
# the merge below matches that index level by name against the Provider
# column of the feature table.
DeductibleAmtPaid_mean = (
    fraud.groupby('Provider')['DeductibleAmtPaid'].mean().to_frame()
)
Label_Fraud_2 = Label_Fraud_2.merge(DeductibleAmtPaid_mean, on='Provider')

### Insurance Claim Amount Reimbursed Mean

In [11]:
# Per-provider mean of InscClaimAmtReimbursed. The frame stays indexed by
# Provider; the merge below matches that index level by name against the
# Provider column of the feature table.
InscClaimAmtReimbursed_mean = (
    fraud.groupby('Provider')['InscClaimAmtReimbursed'].mean().to_frame()
)
Label_Fraud_2 = Label_Fraud_2.merge(InscClaimAmtReimbursed_mean, on='Provider')

### Add number of patients and doctors

In [12]:
# NumOfDoctors: number of distinct attending physicians seen on each
# provider's rows. nunique() counts the distinct non-null values directly.
# A provider whose rows list no attending physician gets 0, so it is not
# silently lost in the inner merge below.
doctors = (
    fraud.groupby('Provider')['AttendingPhysician']
    .nunique()
    .reset_index(name='NumOfDoctors')
)

Label_Fraud_2 = pd.merge(Label_Fraud_2, doctors, on='Provider')


In [13]:
# NumOfPatients: number of distinct beneficiaries (BeneID) appearing on each
# provider's rows. nunique() counts the distinct non-null IDs directly
# instead of grouping twice.
patient = (
    fraud.groupby('Provider')['BeneID']
    .nunique()
    .reset_index(name='NumOfPatients')
)

Label_Fraud_2 = pd.merge(Label_Fraud_2, patient, on='Provider')


### Number of claims

In [14]:
# NumOfClms: number of distinct ClaimID values per provider. Counting with
# nunique() avoids materialising one intermediate group per claim.
claims = (
    fraud.groupby('Provider')['ClaimID']
    .nunique()
    .reset_index(name='NumOfClms')
)

Label_Fraud_2 = pd.merge(Label_Fraud_2, claims, on='Provider')

### Number of counties per provider

In [15]:
# numCounties: number of distinct County values appearing on each provider's
# rows, counted directly with nunique().
numCounties = (
    fraud.groupby('Provider')['County']
    .nunique()
    .reset_index(name='numCounties')
)
Label_Fraud_2 = pd.merge(Label_Fraud_2, numCounties, on='Provider')

### Add average number of chronic conditions per provider

In [16]:
# Per-row count of chronic-condition flags: select every column whose name
# contains 'Chronic', recode the value 2 as 0 (presumably 2 means "no" --
# confirm against the data dictionary), and add up across the row.
chronic_flags = fraud.filter(regex='Chronic').replace(to_replace=2, value=0)
patientChronic_mean = chronic_flags.sum(axis=1).reset_index(name='NumChronicCond')
# Put each row's count next to the provider and claim it belongs to.
chronic = pd.concat([patientChronic_mean, fraud[['Provider', 'ClaimID']]], axis=1)

In [17]:
# AvgChronic: mean NumChronicCond over all of a provider's rows. Averaging
# the raw rows weights each claim equally; averaging only the distinct
# NumChronicCond levels would let a level seen once count as much as a level
# seen a thousand times.
avg_chronic = (
    chronic.groupby('Provider')['NumChronicCond']
    .mean()
    .reset_index(name='AvgChronic')
)

# AvgClaim: mean number of claims per distinct NumChronicCond level observed
# for the provider (claims are first counted per level, then averaged).
claims_per_level = (
    chronic.groupby(['Provider', 'NumChronicCond'])['ClaimID']
    .count()
    .reset_index(name='NumClaims')
)
avg_claim = (
    claims_per_level.groupby('Provider')['NumClaims']
    .mean()
    .reset_index(name='AvgClaim')
)

# Columns: Provider, AvgChronic, AvgClaim.
patientChronic_mean = pd.merge(avg_chronic, avg_claim, on='Provider')

Label_Fraud_2 = pd.merge(Label_Fraud_2, patientChronic_mean, on='Provider')

### Average age

In [18]:
# Parse the dates without a hard-coded format, the same way the claim dates
# are parsed earlier in this notebook. An explicit '%Y/%m/%d' is enforced
# strictly by pandas >= 2.0 and raises if the CSV stores dates with another
# separator (NOTE(review): confirm the actual DOB format in the data).
# ClaimStartDt is already a datetime column at this point; ClaimStartDt2 is
# kept as a copy in case later cells rely on it.
fraud['ClaimStartDt2'] = pd.to_datetime(fraud['ClaimStartDt'])
fraud['DOB'] = pd.to_datetime(fraud['DOB'])
# Age at the time of the claim, as a timedelta (claim start minus birth date).
fraud['Age'] = fraud['ClaimStartDt'] - fraud['DOB']

In [19]:
# Express the age timedelta in units of 365 days and truncate it to an
# integer number of years.
fraud['Age'] = fraud['Age'].div(pd.Timedelta(days=365)).astype('int64')

In [20]:
# Two-stage average: first one mean age per (Provider, BeneID) pair, then the
# mean of those per-patient values for each provider, so every patient
# carries the same weight regardless of how many rows they have.
age_per_patient = (
    fraud.groupby(['Provider', 'BeneID'])['Age']
    .mean()
    .reset_index(name="Age")
    .dropna()
)
age_mean = age_per_patient.groupby('Provider')['Age'].mean().reset_index()

In [21]:
# Left join: every provider already in the feature table is kept, with NaN
# in Age when no mean age is available for it.
Label_Fraud_2 = Label_Fraud_2.merge(age_mean, how='left', on='Provider')

### Gender of patients per provider

In [22]:
# Step 1: collapse the rows to one per distinct (Provider, BeneID, Gender)
# combination, so a patient with many rows is counted only once per provider.
# The per-combination row count (GenderCount) is not used further.
gender = (
    fraud.groupby(['Provider', 'BeneID', 'Gender'])['Gender']
    .count()
    .reset_index(name='GenderCount')
)
# Step 2: per provider, count how many of those distinct patients carry each
# Gender code. Result columns: Provider, Gender, GenderCount.
gender = gender.groupby('Provider')['Gender'].value_counts().reset_index(name='GenderCount')

In [23]:
# Split GenderCount into two columns by Gender code: rows with Gender == 1
# contribute to 'male', rows with Gender == 2 to 'female', and every other
# cell is 0. Whole-column assignment is used instead of a per-row loop with
# chained indexing (gender['male'][index] += ...), which is slow and is not
# guaranteed to write back to the frame (it is a no-op under pandas
# Copy-on-Write).
gender['male'] = gender['GenderCount'].where(gender['Gender'] == 1, 0)
gender['female'] = gender['GenderCount'].where(gender['Gender'] == 2, 0)

In [24]:
# The code/count helper columns are no longer needed once the male/female
# columns exist; drop both in a single call.
gender = gender.drop(columns=['Gender', 'GenderCount'])

In [25]:
# Collapse to one row per provider by summing the male and female columns.
genderCount = gender.groupby('Provider')[['male', 'female']].sum().reset_index()

In [26]:
# Left join: providers without gender counts stay in the feature table with
# NaN in the male/female columns.
Label_Fraud_2 = Label_Fraud_2.merge(genderCount, how='left', on='Provider')