In [17]:
!pip install pandas numpy scikit-learn xgboost pygam
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from pygam import LinearGAM

# Read the raw insurance records from the CSV file in the working directory.
df = pd.read_csv('insurance.csv')

# The 'Name' column is removed before modelling; it is not used as a feature.
df = df.drop(columns=['Name'])

# Report how many missing values each remaining column contains.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Impute missing values.
#
# Numeric columns are filled with their own median; `numeric_only=True`
# restricts the median computation to numeric columns.
# NOTE(review): this covers every numeric column, so a numeric target column
# would be imputed as well -- confirm that is intended.
df = df.fillna(df.median(numeric_only=True))

# Object (text) columns are filled with their most frequent value.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the chained in-place form operates on
# an intermediate object, raises a FutureWarning in current pandas and will
# no longer modify `df` once Copy-on-Write becomes the default (pandas 3.0).
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Remove columns regarded as redundant. `errors='ignore'` makes the drop a
# no-op for any listed column that is absent from the frame.
drop_cols = ['BMI Smoker', 'Income Dependents']
df = df.drop(columns=drop_cols, errors='ignore')

# One-hot encode the categorical variables and replace the original columns
# with the encoded ones.
categorical_features = ['Gender', 'Smoking Status', 'Region', 'Diet Type', 'Employment Type', 'Policy Type']
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit_transform(df[categorical_features])

# Reuse df's own index for the encoded frame. `pd.concat(axis=1)` aligns rows
# by index label, so a fresh 0..n-1 index would misalign rows (and introduce
# NaNs) whenever df's index is not the default RangeIndex, e.g. after rows
# have been filtered out.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_features),
    index=df.index,
)
df = pd.concat([df.drop(columns=categorical_features), encoded_df], axis=1)

# Ensure target variable exists before separating it from the features.
if 'Insurance Cost' not in df.columns:
    raise ValueError("Column 'Insurance Cost' not found in dataset.")

# Split dataset into training and testing sets (80% / 20%, fixed seed).
X = df.drop(columns=['Insurance Cost'])
y = df['Insurance Cost']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below modify independent
# frames rather than slices of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize numerical features.
# The scaler is fitted on the training rows only and then applied to the test
# rows. Fitting it on the full dataset before splitting would let test-set
# statistics (mean / standard deviation) leak into preprocessing.
# Note: `df` and `X` keep the original, unscaled values; only X_train and
# X_test are standardized.
numerical_features = ['Age', 'BMI', 'Annual Income', 'Credit Score', 'Savings Amount']
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Candidate regressors, keyed by the label used in the printed report.
models = {
    "Random Forest": RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        random_state=42,
    ),
    "Decision Tree": DecisionTreeRegressor(
        max_depth=10,
        random_state=42,
    ),
    "XGBoost": XGBRegressor(
        n_estimators=200,
        max_depth=10,
        learning_rate=0.1,
        random_state=42,
    ),
    "GAM": LinearGAM(n_splines=10),
}


def _regression_metrics(actual, predicted):
    """Return MAE, RMSE and R² of `predicted` against `actual` as a dict."""
    return {
        "MAE": mean_absolute_error(actual, predicted),
        "RMSE": np.sqrt(mean_squared_error(actual, predicted)),
        "R²": r2_score(actual, predicted),
    }


# Fit every model on the training split, score it on the held-out split,
# and collect the scores per model label.
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    scores = _regression_metrics(y_test, y_pred)
    mae, rmse, r2 = scores["MAE"], scores["RMSE"], scores["R²"]

    results[name] = scores
    print(f"{name} - MAE: {mae}, RMSE: {rmse}, R²: {r2}\n")

# Tabulate the fitted Random Forest's feature_importances_ against the feature
# column names, ordered from most to least important.
rf_model = models["Random Forest"]
rf_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rf_model.feature_importances_}
).sort_values(by='Importance', ascending=False)
print("\nFeature Importance (Random Forest):\n", rf_importance)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Age                          0
Gender                       0
BMI                          0
Smoking Status               0
Region                       0
Diabetes                     0
Hypertension                 0
Heart Disease                0
Cancer History               0
Stroke                       0
Liver Disease                0
Kidney Disease               0
COPD                         0
TB                           0
HIV/AIDS                     0
Alcohol Consumption          0
Exercise Frequency           0
Diet Type                    0
Stress Level                 0
Medical History Score        0
Annual Income                0
Employment Type              0
Credit Score                 0
Savings Amount         

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)  # Mode for categorical


Random Forest - MAE: 133.60854166182298, RMSE: 271.63061870617827, R²: 0.9906397805971923

Training Decision Tree...
Decision Tree - MAE: 180.78444199256754, RMSE: 335.88179079360947, R²: 0.9856879612631307

Training XGBoost...
XGBoost - MAE: 38.7338828125, RMSE: 196.03802180946778, R²: 0.9951246117544627

Training GAM...
GAM - MAE: 2.5885833474603715e-07, RMSE: 6.114029995905742e-07, R²: 1.0


Feature Importance (Random Forest):
                           Feature    Importance
25          Smoking Status_Smoker  5.293140e-01
3                    Hypertension  1.532099e-01
2                        Diabetes  1.059024e-01
0                             Age  5.498631e-02
4                   Heart Disease  4.579814e-02
8                  Kidney Disease  3.537377e-02
7                   Liver Disease  3.058533e-02
6                          Stroke  2.182769e-02
5                  Cancer History  2.092566e-02
17                   Credit Score  3.225303e-04
1                             BMI  2.