In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Load the insurance dataset from `insurance_cleaned.csv`; the path is relative
# to the directory the notebook is run from.
df = pd.read_csv(filepath_or_buffer="../data/insurance_cleaned.csv")

In [13]:
# Separate the `charges` column (what the models predict) from the predictors.
y = df['charges']                   # target
X = df.drop(columns=['charges'])    # features: every remaining column

In [14]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# log1p-transformed versions of both targets, used by the models fitted on the
# log scale in later cells.
y_train_log, y_test_log = (np.log1p(target) for target in (y_train, y_test))

In [15]:
def evaluate_model(y_true, y_pred):
    """Score a set of predictions against the observed values.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, on the same scale as ``y_true``.

    Returns
    -------
    tuple
        ``(r2, rmse)``: the R^2 score and the square root of the mean
        squared error, in that order.
    """
    mse = mean_squared_error(y_true, y_pred)
    return r2_score(y_true, y_pred), np.sqrt(mse)

In [16]:
# Linear regression: raw target vs. log-transformed target

In [17]:
# Baseline: ordinary least squares fitted on the untransformed target.
lr = LinearRegression().fit(X_train, y_train)

# Score the held-out split directly on the original scale of the target.
y_pred = lr.predict(X_test)
lr_r2, lr_rmse = evaluate_model(y_true=y_test, y_pred=y_pred)


In [18]:
# Fit the log-target variant on its own estimator instead of re-fitting `lr`.
# Calling fit() on `lr` again would silently replace the raw-target model from
# the previous cell, so `lr` would mean different things depending on which
# cell ran last.
lr_log = LinearRegression()
lr_log.fit(X_train, y_train_log)

# Predictions come out on the log1p scale; expm1 maps them back to the scale of
# `charges` so the metrics are comparable with the raw-target model.
y_pred_log = lr_log.predict(X_test)
y_pred_actual = np.expm1(y_pred_log)

lr_log_r2, lr_log_rmse = evaluate_model(y_test, y_pred_actual)


In [19]:
# Ridge regression on standardized features (raw, log, and tuned variants)

In [20]:
# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [21]:
# Ridge regression (alpha=1.0) on the standardized features, raw target.
ridge = Ridge(alpha=1.0).fit(X_train_scaled, y_train)

y_pred = ridge.predict(X_test_scaled)
ridge_r2, ridge_rmse = evaluate_model(y_true=y_test, y_pred=y_pred)


In [22]:
# Use a dedicated estimator for the log-target variant rather than re-fitting
# `ridge`. Re-fitting would overwrite the raw-target model from the previous
# cell and leave `ridge` holding whichever fit happened to run last.
# Same alpha as the raw-target Ridge so the two variants differ only in target.
ridge_log = Ridge(alpha=1.0)
ridge_log.fit(X_train_scaled, y_train_log)

# Predictions are on the log1p scale; expm1 returns them to the scale of
# `charges` before scoring against the untransformed test target.
y_pred_log = ridge_log.predict(X_test_scaled)
y_pred_actual = np.expm1(y_pred_log)

ridge_log_r2, ridge_log_rmse = evaluate_model(y_test, y_pred_actual)


In [23]:
# Candidate regularization strengths for the Ridge search.
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# 5-fold cross-validated search, scored by R^2 on the log-transformed target.
# NOTE(review): the features were standardized on the whole training split
# before this search, so each fold's validation rows contributed to the scaler
# statistics. Consider wrapping scaler + Ridge in a Pipeline if that matters.
grid_ridge = GridSearchCV(
    estimator=Ridge(),
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)
grid_ridge.fit(X_train_scaled, y_train_log)
best_ridge = grid_ridge.best_estimator_

# Predict on the log1p scale, then map back to the scale of `charges` so the
# test metrics are comparable with the other models.
y_pred_log = best_ridge.predict(X_test_scaled)
y_pred_actual = np.expm1(y_pred_log)

ridge_tuned_r2, ridge_tuned_rmse = evaluate_model(y_true=y_test, y_pred=y_pred_actual)


In [24]:
# Random forest (raw, log, and tuned variants)

In [25]:
# Random forest with 200 trees on the unscaled features and raw target; the
# fixed seed keeps the fitted forest repeatable.
rf = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

y_pred = rf.predict(X_test)
rf_r2, rf_rmse = evaluate_model(y_true=y_test, y_pred=y_pred)


In [26]:
# Train the log-target forest as a separate estimator instead of re-fitting
# `rf`. Re-fitting would discard the raw-target forest from the previous cell,
# and `rf` would then refer to whichever variant was fitted last.
# Hyperparameters and seed match the raw-target forest so only the target differs.
rf_log = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)
rf_log.fit(X_train, y_train_log)

# Predictions are on the log1p scale; expm1 converts them back to the scale of
# `charges` before computing the test metrics.
y_pred_log = rf_log.predict(X_test)
y_pred_actual = np.expm1(y_pred_log)

rf_log_r2, rf_log_rmse = evaluate_model(y_test, y_pred_actual)


In [27]:
# Search space for the forest: 2 x 3 x 2 x 2 = 24 hyperparameter combinations.
param_grid = {
    'n_estimators': [200, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# 5-fold cross-validated search on the log-transformed target, scored by R^2.
# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train_log)
best_rf = grid_rf.best_estimator_

# Predict on the log1p scale, then return to the scale of `charges` for scoring.
y_pred_log = best_rf.predict(X_test)
y_pred_actual = np.expm1(y_pred_log)

rf_tuned_r2, rf_tuned_rmse = evaluate_model(y_true=y_test, y_pred=y_pred_actual)


In [28]:
# Model comparison: test-set R2 and RMSE for every variant

In [29]:
# One (label, R2, RMSE) row per fitted model, so each model's metrics sit side
# by side instead of being spread across three parallel lists.
results = pd.DataFrame(
    [
        ('Linear', lr_r2, lr_rmse),
        ('Linear (Log)', lr_log_r2, lr_log_rmse),
        ('Ridge', ridge_r2, ridge_rmse),
        ('Ridge (Log)', ridge_log_r2, ridge_log_rmse),
        ('Ridge (Tuned + Log)', ridge_tuned_r2, ridge_tuned_rmse),
        ('RF', rf_r2, rf_rmse),
        ('RF (Log)', rf_log_r2, rf_log_rmse),
        ('RF (Tuned + Log)', rf_tuned_r2, rf_tuned_rmse),
    ],
    columns=['Model', 'R2', 'RMSE'],
)

# Display the comparison with the highest R2 first.
results.sort_values('R2', ascending=False)


Unnamed: 0,Model,R2,RMSE
7,RF (Tuned + Log),0.879605,4323.320953
6,RF (Log),0.876955,4370.647674
5,RF,0.864476,4586.935636
0,Linear,0.783593,5796.284659
2,Ridge,0.783541,5796.979691
3,Ridge (Log),0.6082,7799.129377
4,Ridge (Tuned + Log),0.6082,7799.129377
1,Linear (Log),0.606698,7814.064026
