In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from pygam import LinearGAM, s, f
import logging

# Setup logging for debugging
logging.basicConfig(level=logging.INFO)
def log_progress(message):
    logging.info(message)

# Load dataset
log_progress("Loading dataset...")
df = pd.read_csv('synthetic_medical_cost_data.csv')

# Step 1: Check for missing values
log_progress("Checking for missing values...")
print("Missing values:")
print(df.isnull().sum())

# Fill missing values (if any)
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col].fillna(df[col].mean(), inplace=True)
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Step 2: Convert categorical features
log_progress("Encoding categorical features...")
df['Stress Level'] = df['Stress Level'].map({'Low': 1, 'Mild': 2, 'High': 3})

# Define feature categories
categorical_cols = ['Gender', 'Region', 'Diet Type', 'Employment Type', 'Policy Type', 'Age Group']
binary_cols = ['Smoking Status', 'Cancer', 'Diabetes', 'Cardiovascular Disease', 'Stroke', 'COPD', 'TB',
               'HIV/AIDS', 'Liver Disease', 'Kidney Disease', 'Alcohol Consumption', 'Has Diabetes',
               'Has Hypertension', 'Has Heart Disease', 'Has Cancer History', 'Policy Renewal Status']
numerical_cols = ['Age', 'BMI', 'Medical History Score', 'Annual Income', 'Credit Score',
                  'Savings Amount', 'Health Risk Score', 'Expense Ratio', 'Medication Costs Per Year', 'Hospital Visits Per Year']
discrete_cols = ['Number of Children', 'Exercise Frequency', 'Number of Dependents', 'Previous Insurance Claims']

# One-hot encode categorical variables
encoder = OneHotEncoder(drop='first', sparse_output=False)
categorical_encoded = encoder.fit_transform(df[categorical_cols])
categorical_df = pd.DataFrame(categorical_encoded, columns=encoder.get_feature_names_out())

# Convert 'Smoking Status' to numerical values
df['Smoking Status'] = df['Smoking Status'].map({'Non-Smoker': 0, 'Smoker': 1})

# Standardize numerical variables
scaler = StandardScaler()
numerical_scaled = scaler.fit_transform(df[numerical_cols])
numerical_df = pd.DataFrame(numerical_scaled, columns=numerical_cols)

# Merge all preprocessed features
X = pd.concat([numerical_df, categorical_df, df[binary_cols], df[discrete_cols]], axis=1)
Y = df['Insurance Cost']

# Split data
log_progress("Splitting dataset into training and testing sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Handle NaN/Inf values in X_train before model training
log_progress("Checking and handling NaN/Inf values in training data...")
if X_train.isnull().sum().sum() > 0:
    X_train.fillna(X_train.median(), inplace=True)
if not np.isfinite(X_train.select_dtypes(include=[np.number])).all().all():
    X_train.replace([np.inf, -np.inf], X_train.median(), inplace=True)

# Ensure data is clean
assert X_train.isnull().sum().sum() == 0, "X_train still contains NaN values!"
assert np.isfinite(X_train).all().all(), "X_train still contains infinite values!"

# Step 3: Train Random Forest Model
log_progress("Training Random Forest Model...")
rf_model = RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42)
rf_model.fit(X_train, Y_train)

# Step 4: Train GAM Model
log_progress("Training Generalized Additive Model (GAM)...")
gam_model = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + f(6) + f(7) + f(8)).fit(X_train, Y_train)

# Step 5: Predictions
log_progress("Making predictions...")
rf_preds = rf_model.predict(X_test)
gam_preds = gam_model.predict(X_test)

# Step 6: Evaluate Models
def evaluate_model(y_true, y_pred, model_name):
    log_progress(f"Evaluating {model_name}...")
    print(f"\n{model_name} Performance:")
    print(f"MAE: {mean_absolute_error(y_true, y_pred)}")
    print(f"MSE: {mean_squared_error(y_true, y_pred)}")
    print(f"R²: {r2_score(y_true, y_pred)}")

evaluate_model(Y_test, rf_preds, "Random Forest")
evaluate_model(Y_test, gam_preds, "GAM")

log_progress("Model training and evaluation completed.")

INFO:root:Loading dataset...
INFO:root:Checking for missing values...
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
INFO:root:Encoding categorical features...
INFO:root:Splitting datase

Missing values:
Age                          0
Gender                       0
BMI                          0
Number of Children           0
Smoking Status               0
Region                       0
Cancer                       0
Diabetes                     0
Cardiovascular Disease       0
Stroke                       0
COPD                         0
TB                           0
HIV/AIDS                     0
Liver Disease                0
Kidney Disease               0
Alcohol Consumption          0
Exercise Frequency           0
Diet Type                    0
Stress Level                 0
Medical History Score        0
Has Diabetes                 0
Has Hypertension             0
Has Heart Disease            0
Has Cancer History           0
Annual Income                0
Employment Type              0
Credit Score                 0
Savings Amount               0
Number of Dependents         0
Previous Insurance Claims    0
Policy Type                  0
Policy Renewal Status  

INFO:root:Training Generalized Additive Model (GAM)...
  np.fill_diagonal(Dinv, d**-1)  # invert the singular values
  np.fill_diagonal(Dinv, d**-1)  # invert the singular values


KeyboardInterrupt: 