In [1]:
import os
import sys

In [2]:
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/Insurance_claims_data.csv')

print(df.shape)
df.head()

test_df = df.sample(20000, random_state=23)

# Create train/test split with stratification since data is imbalanced
X = test_df.drop('claim_status', axis=1)
y = test_df['claim_status']

# Use stratify parameter to maintain class distribution in both splits
# Use test_size=0.2 for 80/20 split which is common practice
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y,
    test_size=0.2,
    random_state=42, # For reproducibility
    stratify=y # Maintain class distribution
)

# Print shapes to verify split
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print("\nClass distribution in splits:")
print("Training set:", pd.Series(y_train).value_counts(normalize=True))
print("Test set:", pd.Series(y_test).value_counts(normalize=True))


(58592, 41)
Training set shape: (16000, 40)
Test set shape: (4000, 40)

Class distribution in splits:
Training set: claim_status
0    0.935063
1    0.064937
Name: proportion, dtype: float64
Test set: claim_status
0    0.935
1    0.065
Name: proportion, dtype: float64


In [4]:
import nest_asyncio
nest_asyncio.apply()

In [5]:
from xgboost import XGBClassifier

from src.modules.fe import CAAFETransformer

model = XGBClassifier(
        seed=42,
        objective="binary:logistic",
        eval_metric="auc",
        n_jobs=-1,
        use_label_encoder=False,
        verbosity=0,
        enable_categorical=True,
    )

fe = CAAFETransformer(
    llm_model='gpt-4.1-mini',
    target_name="claim_status",
    dataset_description="Insurance claim data.",
    max_features=5,
    iterations=3,
    n_splits=5,
    n_repeats=2,
    random_state=123,
    base_classifier=model,
)

In [6]:
fe.fit(X, y, show_prompts=True)

[2025-06-02 17:12:54] INFO:src.modules.fe.CAAFETransformer: Starting CAAFETransformer.fit(): running iterative feature engineering.
[2025-06-02 17:12:54] INFO:src.modules.fe.CAAFETransformer: CAAFE transformer initialization completed:
  Target: claim_status
  Dataset shape: (20000, 41)
  Original features: 40
  Max features per iteration: 5
  Max iterations: 3
  Optimization metric: accuracy
  LLM model: gpt-4.1-mini
  CV splits: 5
  CV repeats: 2
[2025-06-02 17:12:54] INFO:src.modules.fe.CAAFETransformer: Starting iterative feature engineering process...
[2025-06-02 17:12:54] INFO:src.modules.fe.CAAFETransformer: 

→ Evaluating baseline performance (no added features)...

[2025-06-02 17:13:03] INFO:src.modules.fe.CAAFETransformer: 
Baseline ROC AUC: 0.536 (±0.018)
[2025-06-02 17:13:03] INFO:src.modules.fe.CAAFETransformer: 
Baseline Accuracy: 0.934 (±0.004)
[2025-06-02 17:13:03] INFO:src.modules.fe.CAAFETransformer: 

--- Iteration 1/3 ---

[2025-06-02 17:13:29] INFO:src.modules.fe.C

In [7]:
fe.usages

[Usage(requests=6, request_tokens=28728, response_tokens=1561, total_tokens=30289, details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0, 'cached_tokens': 18432}),
 Usage(requests=5, request_tokens=30489, response_tokens=947, total_tokens=31436, details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0, 'cached_tokens': 24576}),
 Usage(requests=2, request_tokens=11544, response_tokens=732, total_tokens=12276, details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0, 'cached_tokens': 6016})]

In [15]:
train_transformed = fe.transform(test_df)

In [9]:
fe.save_code("features_day1.md")

[2025-06-02 17:17:13] INFO:src.modules.fe.CAAFETransformer: Feature-generation code saved (as Markdown) to features_day1.md


In [10]:
print(fe.get_formatted_agent_notepad(n=-1))

Iteration 1
Features created: max_torque_value, max_power_value, age_to_vehicle_ratio, safety_feature_count, torque_power_ratio
Features dropped: length, width, displacement, turning_radius, gross_weight
Performance before adding features ROC 0.5365, ACC 0.9342.
Performance after adding features ROC 0.5422, ACC 0.9341.
Improvement ROC +0.005756, ACC -0.0001.
Note: Code was ACCEPTED and applied to the dataset. Columns were successfully added/dropped.

Iteration 2
Features created: customer_vehicle_age_interaction, power_safety_index, segment_ordinal, subscription_safety_interaction, old_vehicle_low_ncap
Features dropped: max_power_value
Performance before adding features ROC 0.5422, ACC 0.9341.
Performance after adding features ROC 0.5508, ACC 0.9343.
Improvement ROC +0.008559, ACC +0.0002.
Note: Code was ACCEPTED and applied to the dataset. Columns were successfully added/dropped.

Iteration 3
Features created: vehicle_risk_age_index, log_region_density, safety_feature_per_age, high_to

In [16]:
train_transformed

Unnamed: 0,policy_id,subscription_length,vehicle_age,customer_age,region_code,region_density,segment,model,fuel_type,max_torque,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,claim_status
40725,POL003122,11.1,1.2,48,C8,8794,B2,M5,Diesel,200Nm@3000rpm,...,No,Yes,Yes,Yes,No,No,Yes,Yes,5,0
51580,POL015772,1.9,0.0,47,C19,27742,Utility,M10,CNG,85Nm@3000rpm,...,No,No,No,No,No,No,No,Yes,0,0
17526,POL024362,3.2,0.0,36,C9,17804,A,M1,CNG,60Nm@3500rpm,...,No,No,No,Yes,No,No,No,Yes,0,0
46321,POL013699,11.9,2.0,35,C8,8794,B2,M6,Petrol,113Nm@4400rpm,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0
11059,POL034627,11.4,1.2,52,C8,8794,C2,M4,Diesel,250Nm@2750rpm,...,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45419,POL037132,1.0,1.6,44,C8,8794,B2,M6,Petrol,113Nm@4400rpm,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0
4331,POL032698,11.8,1.0,44,C2,27003,B2,M6,Petrol,113Nm@4400rpm,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,1
18996,POL053253,8.7,0.2,40,C5,34738,B2,M6,Petrol,113Nm@4400rpm,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0
30170,POL022214,12.5,0.4,35,C18,35036,C1,M9,Diesel,200Nm@1750rpm,...,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,4,0


In [17]:
from sklearn.calibration import LabelEncoder

engineered_df = train_transformed


categorical_columns = engineered_df.select_dtypes(include=['object', 'category']).columns

for col in categorical_columns:
    le = LabelEncoder()
    
    # Handle pandas Categorical columns specially
    if pd.api.types.is_categorical_dtype(engineered_df[col]):
        # For categorical columns, add 'missing' to categories first if needed
        if 'missing' not in engineered_df[col].cat.categories:
            engineered_df[col] = engineered_df[col].cat.add_categories(['missing'])
        engineered_df[col] = engineered_df[col].fillna('missing')
    else:
        # For object columns, fillna directly
        engineered_df[col] = engineered_df[col].fillna('missing')
    
    # Fit encoder on training data
    le.fit(engineered_df[col])
    engineered_df[col] = le.transform(engineered_df[col])

In [18]:
from src.modules.xgb_tune import XGBoostTuner


llm_tuner = XGBoostTuner(engineered_df, 'claim_status')

In [None]:
import nest_asyncio
nest_asyncio.apply()

llm_tuner.tune()

Metric Explanation: Your insurance claims dataset is a binary, highly imbalanced classification problem with only 6.5% positive class. You wish to optimize for PPV (Positive Predictive Value), which is equivalent to precision in scikit-learn metrics. Among built-in classifier metrics in scikit-learn, 'precision' directly aligns with PPV and is understood by AutoML and XGBoost. Therefore, for hyperparameter optimization, 'precision' is the most suitable built-in metric, both reflecting your business objective and working with existing AutoML/XGBoost infrastructure.
Metric: precision
Initial Search Space: Given the dataset (20,000 rows, 41 features, severe class imbalance: ~6.5% positive), an initial search space must balance overfitting risk, model capacity, and effectively handle class imbalance. Typical XGBoost practices, binary imbalanced task requirements, and dataset size guide the following choices:

- n_estimators: Wide range, since large numbers may be needed for small learning 

In [7]:
# Get just the best configuration
best_config = llm_tuner.get_best_config()
print("Best hyperparameters:", best_config)

# Get comprehensive summary
summary = llm_tuner.get_tuning_summary()
print(f"Best score: {summary['best_score']}")
print(f"Best config: {summary['best_config']}")
print(f"Completed {summary['total_iterations']} iterations")
print(f"Score improvement: {summary['improvement_over_baseline']}")

Best hyperparameters: {'colsample_bylevel': np.float64(0.8678895162042739), 'colsample_bytree': np.float64(0.6683147011933056), 'gamma': np.float64(0.9514155170441483), 'learning_rate': np.float64(0.012031598579099684), 'max_depth': 10, 'min_child_weight': np.float64(2.1410086500482213), 'n_estimators': 525, 'reg_alpha': np.float64(0.0006467995505792484), 'reg_lambda': np.float64(0.0008126714621986295), 'scale_pos_weight': np.float64(3.4535261779853093), 'subsample': np.float64(0.7192068248303468)}
Best score: 1.0
Best config: {'colsample_bylevel': np.float64(0.8678895162042739), 'colsample_bytree': np.float64(0.6683147011933056), 'gamma': np.float64(0.9514155170441483), 'learning_rate': np.float64(0.012031598579099684), 'max_depth': 10, 'min_child_weight': np.float64(2.1410086500482213), 'n_estimators': 525, 'reg_alpha': np.float64(0.0006467995505792484), 'reg_lambda': np.float64(0.0008126714621986295), 'scale_pos_weight': np.float64(3.4535261779853093), 'subsample': np.float64(0.7192

In [None]:
if best_config:
    final_model = XGBClassifier(**best_config)
    final_model.fit(X_train, y_train)

In [None]:
y_pred = final_model.predict(X_test)
y_pred_proba = final_model.predict_proba(X_test)[:, 1]

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import (
    precision_score,
    recall_score,
    accuracy_score,
    roc_auc_score,
    f1_score,
    matthews_corrcoef
)


def metrics_display(y_test, y_pred, y_pred_proba):
    
    # Obtain confusion matrix
    cm = confusion_matrix(y_test, y_pred)
   
    # Output classification metrics
    tn, fp, fn, tp = cm.ravel()
   
    print(f'ROC_AUC score: {roc_auc_score(y_test, y_pred_proba):.3f}')
    print(f'f1 score: {f1_score(y_test, y_pred):.3f}')
    print(f'Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%')
    print(f'Precision: {precision_score(y_test, y_pred)*100:.2f}%')
    print(f'Detection rate: {recall_score(y_test, y_pred)*100:.2f}%')
    print(f'False alarm rate: {fp / (tn+fp)*100}%')
    print(f'MCC: {matthews_corrcoef(y_test, y_pred):.2f}')
   
    # Display confusion matrix
    # ConfusionMatrixDisplay.from_predictions(y_test, y_pred, values_format='.5g', colorbar=False)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()

In [11]:
from sklearn import metrics