##Feature Engineering Notebook

**Goal: create new features for demographics, coverage periods, reimbursement/beneficiary and other potential derived features

**Input ../data/interim/beneficiary_cleaned.csv

**Output ../data/processed/beneficiary_features.csv

In [39]:
import os 
import pandas as pd 
import numpy as np

INPUT_PATH = '../data/interim/beneficiary_cleaned.csv'
OUTPUT_PATH = '../data/processed/beneficiary_features'
AS_OF_DATE = '2020-12-31'

In [40]:
#Read In data and observe

df = pd.read_csv(INPUT_PATH, parse_dates=['Birth_date', 'Date_of_Death'], low_memory=False )
print(df.shape)

df.head()

df.dtypes.head(15)

(349064, 36)


Beneficiary Code                             object
Birth_date                           datetime64[ns]
Date_of_Death                        datetime64[ns]
Gender                                        int64
Race                                          int64
End_Stage_Renal_Disease_Indicator            object
State_code                                    int64
County_code                                   int64
Number_of_months_covered_a                    int64
Numver_of_months_covered_b                    int64
Number_of_months_HMO_coverage                 int64
Number_of_months_covered_d                    int64
Alzhiemers_or_senile                          int64
Heart_Failure                                 int64
Chronic_Kidney                                int64
dtype: object

In [41]:
#Creating helper functions 

def safe_divide(numer, denom, eps=1.0):
    return numer / (denom + eps)

def to_datetime(s):
    return pd.to_datetime(s, errors='coerce')

#defining chronic conditions columns

chronic_columns = ['Alzhiemers_or_senile',
    'Heart_Failure',
    'Chronic_Kidney',
    'Cancer',
    'COPD',
    'Depression',
    'Diabetes',
    'ischemic_Heart_Disease',
    'Osteoporosis',
    'Rheumatoid_Arthritis',
    'Stroke']

In [42]:
#Creating and Mapping Demographic Features

fe = df.copy()

#Age

if 'Birth_date' in fe:
    birth = pd.to_datetime(fe["Birth_date"], errors="coerce")    

# Ensure as_of is a Timestamp, not a string
as_of = pd.Timestamp(AS_OF_DATE)

fe['AGE'] = ((as_of - birth).dt.days // 365).astype("Int64")

#Is Dead

if "Date_of_Death" in fe:
    death = to_datetime(fe['Date_of_Death'])
    fe['Date_of_Death'] = death.notna().astype(int)

#Mapping Gender and Race for Human comprehension

sex_map = {1:"Male", 2: "Female"}

race_map = {1:"White", 2:"Black", 3:"Other",4:"Asian", 5: "Hispanic"}

if "Gender" in fe:
    fe["Gender"] = fe["Gender"].map(sex_map).astype("string")

if "Race" in fe:
    fe["Race"] = fe["Race"].map(race_map).astype("string")

fe[["AGE","Date_of_Death","Gender","Race"]].head(3)

Unnamed: 0,AGE,Date_of_Death,Gender,Race
0,51,0,Female,Hispanic
1,51,0,Female,White
2,51,0,Male,White


In [43]:
#Creating Coverage & Utilization features

pa = pd.to_numeric(fe.get("Number_of_months_covered_a", 0), errors="coerce").fillna(0)
pb = pd.to_numeric(fe.get("Numver_of_months_covered_b", 0), errors="coerce").fillna(0)
hmo = pd.to_numeric(fe.get("Number_of_months_HMO_coverage", 0), errors="coerce").fillna(0)
pd_m = pd.to_numeric(fe.get("Number_of_months_covered_d", 0), errors="coerce").fillna(0)

fe['total_coverage_months'] = pa + pb + hmo + pd_m

#Chronic Burden

if chronic_columns:
    fe['chronic_count'] = (
        fe[chronic_columns].apply(pd.to_numeric, errors="coerce").fillna(0).astype(int).sum(axis=1)
    )

#Reimbursement totals
ip = pd.to_numeric(fe.get('IP_reimbursement',0), errors='coerce').fillna(0)
op = pd.to_numeric(fe.get('OP_medicare_reimbursement',0), errors='coerce').fillna(0)
car = pd.to_numeric(fe.get('Carrier_medicare_reimb',0), errors='coerce').fillna(0)

total_reimb = ip + op + car

#Ratio features

fe["avg_reimb"] = safe_divide(total_reimb, fe['total_coverage_months']).astype(float)
fe["op_ratio"] = safe_divide(op, ip).astype(float)
fe["car_ratio"] = safe_divide(car, (op + ip)).astype(float)

fe[["total_coverage_months","chronic_count","avg_reimb","op_ratio","car_ratio"]].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_coverage_months,349064.0,31.031874,11.900086,0.0,24.0,36.0,36.0,48.0
chronic_count,349064.0,19.775694,2.451995,11.0,18.0,21.0,22.0,22.0
avg_reimb,349064.0,138.894367,675.673983,-60.0,0.0,30.0,99.2,115800.0
op_ratio,349064.0,417.883978,1375.417411,-100.0,0.0,0.004999,280.0,56930.0
car_ratio,349064.0,176.413754,549.163544,-301.052632,0.0,0.882598,14.705882,12800.0


In [44]:
#Creating Flags 

#High Chronic

if "chronic_count" in fe:
    fe['high_chronic_flag'] = (fe['chronic_count'] >= 5).astype(int)

#High reimbursement

q75 = fe['avg_reimb'].quantile(0.75) if "avg_reimb" in fe else np.nan
if np.isfinite(q75):
    fe['high_reimb_flag'] = (fe['avg_reimb']>q75).astype(int)

#HMO Coverage
fe['hmo_covered_flag'] = (pd.to_numeric(fe.get('Number_of_months_HMO_coverage',0),errors="coerce").fillna(0) > 0).astype(int)

#A + B coverage present

fe["dual_elig_flag"] = (
    (pd.to_numeric(fe.get("Number_of_months_covered_a", 0), errors="coerce").fillna(0) > 0) &
    (pd.to_numeric(fe.get("Numver_of_months_covered_b", 0), errors="coerce").fillna(0) > 0)
).astype(int)

fe[["high_chronic_flag","high_reimb_flag","hmo_covered_flag","dual_elig_flag"]].mean().round(3)

high_chronic_flag    1.000
high_reimb_flag      0.250
hmo_covered_flag     0.231
dual_elig_flag       0.898
dtype: float64

In [45]:
os.makedirs(os.path.dirname(OUTPUT_PATH),exist_ok=True)
fe.to_csv(OUTPUT_PATH, index=False)
OUTPUT_PATH, fe.shape 

('../data/processed/beneficiary_features', (349064, 45))