# Labeling (Rule-based + Isolation Forest)

**Goal** Create y labels for high-cost detecton:
-y_rule (transparent, programatic rules)
-y_iforest (unsupervised labeling approach)
-y_final (consensus options)

**Input** ../data/preprocessed/beneficiary_features.csv
**Output** ../data/preprocessed/beneficiary_labeled.csv

In [8]:
import pandas as pd
import numpy as np

FEATURE_PATH = '../data/processed/beneficiary_features'
df = pd.read_csv(FEATURE_PATH)

print(df.shape)
df.head(3)
df.columns

(349064, 45)


Index(['Beneficiary Code', 'Birth_date', 'Date_of_Death', 'Gender', 'Race',
       'End_Stage_Renal_Disease_Indicator', 'State_code', 'County_code',
       'Number_of_months_covered_a', 'Numver_of_months_covered_b',
       'Number_of_months_HMO_coverage', 'Number_of_months_covered_d',
       'Alzhiemers_or_senile', 'Heart_Failure', 'Chronic_Kidney', 'Cancer',
       'COPD', 'Depression', 'Diabetes', 'ischemic_Heart_Disease',
       'Osteoporosis', 'Rheumatoid_Arthritis', 'Stroke', 'IP_reimbursement',
       'IP_Bene_Amount', 'IP_primary_payer_reimbursement',
       'OP_medicare_reimbursement', 'OP_bene_resp_amount', 'OP_primary',
       'Carrier_medicare_reimb', 'Carrier_bene_amount',
       'Carrier_annual_payer_reimb', 'AGE', 'total_reimbursement',
       'total_beneficiary_amount', 'total_primary_payment',
       'total_coverage_months', 'chronic_count', 'avg_reimb', 'op_ratio',
       'car_ratio', 'high_chronic_flag', 'high_reimb_flag', 'hmo_covered_flag',
       'dual_elig_flag'],

In [9]:
#Columns needed for labeling

needed = [
    'AGE','Date_of_Death', 'Gender', 'Race', 'total_coverage_months', 'chronic_count',
    'avg_reimb', 'op_ratio','car_ratio', 'Number_of_months_covered_a', 'Numver_of_months_covered_b',
       'Number_of_months_HMO_coverage', 'Number_of_months_covered_d',
]

present = [c for c in needed if c in df.columns]
w = df[present].copy()

#Clean ratios/numerics
for c in ["avg_reimb", "op_ratio", "car_ratio"]:
    if c in w:
        w[c] = pd.to_numeric(w[c], errors="coerce").replace([np.inf, -np.inf], np.nan)

for c in w.select_dtypes(include=[np.number]).columns:
    w[c] = w[c].fillna(w[c].median())

w.head(3)

Unnamed: 0,AGE,Date_of_Death,Gender,Race,total_coverage_months,chronic_count,avg_reimb,op_ratio,car_ratio,Number_of_months_covered_a,Numver_of_months_covered_b,Number_of_months_HMO_coverage,Number_of_months_covered_d
0,51,0,Female,Hispanic,24,13,3551.2,0.018765,0.075859,12,12,0,0
1,51,0,Female,White,36,15,509.459459,0.13272,0.254976,12,12,0,12
2,51,0,Male,White,48,13,798.77551,0.002667,0.040956,12,12,12,12


In [10]:
#Rules Based lableing

w = w.copy()

q = w.quantile([0.75, 0.90, 0.95])
thr_avg_cost_hi  = q.loc[0.90, "avg_reimb"] if "avg_reimb" in w else np.inf
thr_out_ratio_hi = q.loc[0.95, "op_ratio"]   if "op_ratio"   in w else np.inf
thr_car_ratio_hi = q.loc[0.95, "car_ratio"]      if "car_ratio" in w else np.inf
thr_avg_cost_q75 = q.loc[0.75, "avg_reimb"]  if "avg_reimb"  in w else np.inf

rules = pd.DataFrame(index=w.index)
rules["R_high_cost"]        = (w["avg_reimb"] > thr_avg_cost_hi).astype(int)      if "avg_reimb" in w else 0
rules["R_high_out_ratio"]   = (w["op_ratio"]  > thr_out_ratio_hi).astype(int)      if "op_ratio"  in w else 0
rules["R_high_car_ratio"]   = (w["car_ratio"]     > thr_car_ratio_hi).astype(int)      if "car_ratio"     in w else 0
rules["R_high_chronic"]     = (w["chronic_count"]     >= 5).astype(int)                    if "chronic_count"     in w else 0
rules["R_edge_age"]         = ((w["AGE"] <= 5) | (w["AGE"] >= 90)).astype(int)             if "AGE"               in w else 0
rules["R_low_cov_high_cost"]= ((w["total_coverage_months"] <= 3) & (w["avg_reimb"] > thr_avg_cost_q75)).astype(int) \
                               if set(["total_coverage_months","avg_reimb"]).issubset(w.columns) else 0

y_rule = (rules.sum(axis=1) > 0).astype(int)

rules_summary = pd.DataFrame({
    "trigger_rate_%": (rules.mean()*100).round(2),
    "count": rules.sum()
}).sort_values("trigger_rate_%", ascending=False)

y_rule_rate = (y_rule.mean()*100).round(2)
rules_summary, y_rule_rate

TypeError: unsupported operand type(s) for -: 'str' and 'str'