In [2]:
%pip install pygam

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from pygam import LinearGAM, s, f

# Read the (already preprocessed) insurance dataset from a CSV in the working directory
df = pd.read_csv('insurance_data.csv')

# Column groups, named by how each group is treated when the feature matrix is built.

# Columns that get one-hot encoded
categorical_cols = [
    'Gender',
    'Region',
    'Diet Type',
    'Employment Type',
    'Policy Type',
    'Age Group',
]

# Columns that are passed into the feature matrix unchanged
binary_cols = [
    'Smoking Status',
    'Cancer',
    'Diabetes',
    'Cardiovascular Disease',
    'Stroke',
    'COPD',
    'TB',
    'HIV/AIDS',
    'Liver Disease',
    'Kidney Disease',
    'Alcohol Consumption',
    'Has Diabetes',
    'Has Hypertension',
    'Has Heart Disease',
    'Has Cancer History',
    'Policy Renewal Status',
]

# Columns that get standardized
numerical_cols = [
    'Age',
    'BMI',
    'Stress Level',
    'Medical History Score',
    'Annual Income',
    'Credit Score',
    'Savings Amount',
    'Health Risk Score',
    'Expense Ratio',
    'Medication Costs Per Year',
    'Hospital Visits Per Year',
]

# Count-like columns, also passed into the feature matrix unchanged
discrete_cols = [
    'Number of Children',
    'Exercise Frequency',
    'Number of Dependents',
    'Previous Insurance Claims',
]

# Split the raw rows FIRST, then fit the encoder and scaler on the training
# portion only. Fitting them on the full dataset lets held-out information
# (category set, means, variances) leak into the test-set evaluation.
feature_cols = numerical_cols + categorical_cols + binary_cols + discrete_cols
Y = df['Insurance Cost']

# Split data (same test_size / random_state, so the same rows land in each part)
raw_train, raw_test, Y_train, Y_test = train_test_split(
    df[feature_cols], Y, test_size=0.2, random_state=42
)

# One-hot encode categorical variables, fitted on the training rows only.
# handle_unknown='ignore' stops transform() from raising when a category occurs
# only in the held-out rows; such a row gets all-zero indicator columns.
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoder.fit(raw_train[categorical_cols])

# Standardize numerical variables, with mean/std estimated from the training rows only
scaler = StandardScaler()
scaler.fit(raw_train[numerical_cols])


def build_feature_frame(raw):
    """Return the model-ready feature frame for the raw rows in ``raw``.

    Applies the already-fitted module-level ``scaler`` and ``encoder`` and
    lays the columns out as: standardized numerical, one-hot categorical,
    binary, discrete.

    Args:
        raw: DataFrame containing at least the columns named in
            ``numerical_cols``, ``categorical_cols``, ``binary_cols`` and
            ``discrete_cols``.

    Returns:
        DataFrame with the same index as ``raw``.
    """
    # index=raw.index is required: after the shuffle the rows no longer carry
    # a 0..n-1 index, and concat(axis=1) aligns on index labels.
    numerical_df = pd.DataFrame(
        scaler.transform(raw[numerical_cols]),
        columns=numerical_cols,
        index=raw.index,
    )
    categorical_df = pd.DataFrame(
        encoder.transform(raw[categorical_cols]),
        columns=encoder.get_feature_names_out(),
        index=raw.index,
    )
    return pd.concat(
        [numerical_df, categorical_df, raw[binary_cols], raw[discrete_cols]],
        axis=1,
    )


X_train = build_feature_frame(raw_train)
X_test = build_feature_frame(raw_test)

# Full processed feature matrix, restored to the dataset's row order
X = pd.concat([X_train, X_test]).loc[df.index]

# Train Random Forest Model
rf_model = RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42)
rf_model.fit(X_train, Y_train)

# Train GAM Model
# GAM terms address features by column POSITION in X_train, so the term type
# is derived from each column's kind rather than from hard-coded indices:
#   - standardized numerical and count-like columns -> spline term s()
#   - one-hot indicator and binary columns          -> factor term f()
# A factor term on a continuous column would create one level per distinct
# value, so f() must only ever point at integer-coded columns.
# NOTE(review): assumes the binary columns are numerically coded (the random
# forest above needs that as well) -- confirm against the dataset.
smooth_cols = set(numerical_cols) | set(discrete_cols)
gam_terms = None
for col_idx, col_name in enumerate(X_train.columns):
    term = s(col_idx) if col_name in smooth_cols else f(col_idx)
    gam_terms = term if gam_terms is None else gam_terms + term
gam_model = LinearGAM(gam_terms).fit(X_train, Y_train)

# Predictions
rf_preds = rf_model.predict(X_test)
gam_preds = gam_model.predict(X_test)

# Evaluate Models
def evaluate_model(y_true, y_pred, model_name):
    """Print MAE, MSE and R² for one model's predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, in the same order as ``y_true``.
        model_name: Label used in the printed header line.

    Returns:
        None. The scores are written to standard output only.
    """
    print(f"\n{model_name} Performance:")
    # Each score is computed right before it is printed, one line per metric.
    scorers = (
        ("MAE", mean_absolute_error),
        ("MSE", mean_squared_error),
        ("R²", r2_score),
    )
    for label, scorer in scorers:
        print(f"{label}: {scorer(y_true, y_pred)}")

# Report held-out performance for each fitted model, Random Forest first
for display_name, predictions in (("Random Forest", rf_preds), ("GAM", gam_preds)):
    evaluate_model(Y_test, predictions, display_name)


^C
Note: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'pygam'