In [125]:
#importing the Libraies
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn import tree
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

In [126]:
dataset=pd.read_csv("insurance_pre.csv")

In [127]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [128]:
# One-hot encode categorical columns
dataset = pd.get_dummies(dataset, columns=['sex', 'smoker'], drop_first=True)

In [129]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [130]:
# Step 2: Split into features (X) and target (y)
independent=dataset[['age', 'bmi', 'children', 'sex_male','smoker_yes']]

In [131]:
dependent=dataset[["charges"]]

In [132]:
# Step 3: Train-test split
X_train,X_test,y_train,y_test=train_test_split(independent, dependent, test_size=0.30,random_state=0)

In [133]:
# ----------------------------------
# ▶️ Multiple Linear Regression (MLR)
# ----------------------------------
model=LinearRegression()
model.fit(X_train,y_train)

In [134]:
y_pred=model.predict(X_test)

In [135]:
r2_mlr= r2_score(y_test, y_pred)

In [136]:
# -------------------------
# ▶️ Support Vector Regression
# -------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svr = SVR(kernel='rbf')
svr.fit(X_train_scaled, y_train)
y_pred_svr = svr.predict(X_test_scaled)
r2_svr = r2_score(y_test, y_pred_svr)

  y = column_or_1d(y, warn=True)


In [137]:
# -------------------------
# ▶️ Decision Tree with Hyperparameter Tuning
# -------------------------
param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error','poisson'],
    'splitter': ['best', 'random'],
    'max_features': [None, 'sqrt', 'log2', 0.8, 5]  # mix of types to try
}


In [138]:
dtr = DecisionTreeRegressor(random_state=0)
grid_search = GridSearchCV(dtr, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best estimator prediction
best_dtr = grid_search.best_estimator_
y_pred_dtr = best_dtr.predict(X_test)
r2_dtr = r2_score(y_test, y_pred_dtr)

# Random Forest parameter grid
rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2', None],  # removed 'auto'
    'bootstrap': [True]
}
rfr = RandomForestRegressor(random_state=42)
rf_grid = GridSearchCV(rfr, rf_params, cv=5, scoring='r2', n_jobs=-1)
rf_grid.fit(X_train, y_train.values.ravel())

best_rfr = rf_grid.best_estimator_
y_pred_rfr = best_rfr.predict(X_test)
r2_rfr = r2_score(y_test, y_pred_rfr)

In [139]:
# Print summary of all grid search results sorted by mean test score
results = pd.DataFrame(grid_search.cv_results_)
results = results.sort_values(by='mean_test_score', ascending=False)

In [140]:
# -------------------------
# ✅ Print Results Summary
# -------------------------
print("\nModel Evaluation Summary:")
print("----------------------------")
print(f"Multiple Linear Regression R² Score : {r2_mlr:.4f}")
print(f"Support Vector Regression R² Score  : {r2_svr:.4f}")
# Print all hyperparameter results
print("\nDecision Tree Hyperparameter Tuning Results:")
print("----------------------------------------------------")
results = grid_search.cv_results_
for mean_score, params in zip(results['mean_test_score'], results['params']):
    print(f"R2 Score: {mean_score:.4f} | Parameters: {params}")
print(f"Best Decision Tree R² Score from CV      : {grid_search.best_score_:.4f}")
print(f"Best Decision Tree Parameters            : {grid_search.best_params_}")
print(f"Decision Tree R² Score on Test Set       : {r2_dtr:.4f}")

print("Random Forest Best Params:", rf_grid.best_params_)
print(f"Random Forest Test R2 Score: {r2_rfr:.4f}\n")

# Print all Random Forest CV results in requested format
print("All Random Forest CV Results:")
results_df = pd.DataFrame(rf_grid.cv_results_)
results_df = results_df[['mean_test_score', 'params']].sort_values(by='mean_test_score', ascending=False)

for _, row in results_df.iterrows():
    print(f"R2 Score: {row['mean_test_score']:.4f} | Parameters: {row['params']}")


Model Evaluation Summary:
----------------------------
Multiple Linear Regression R² Score : 0.7895
Support Vector Regression R² Score  : -0.0834

Decision Tree Hyperparameter Tuning Results:
----------------------------------------------------
R2 Score: 0.6673 | Parameters: {'criterion': 'squared_error', 'max_features': None, 'splitter': 'best'}
R2 Score: 0.6510 | Parameters: {'criterion': 'squared_error', 'max_features': None, 'splitter': 'random'}
R2 Score: 0.6513 | Parameters: {'criterion': 'squared_error', 'max_features': 'sqrt', 'splitter': 'best'}
R2 Score: 0.6009 | Parameters: {'criterion': 'squared_error', 'max_features': 'sqrt', 'splitter': 'random'}
R2 Score: 0.6513 | Parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'splitter': 'best'}
R2 Score: 0.6009 | Parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'splitter': 'random'}
R2 Score: 0.6545 | Parameters: {'criterion': 'squared_error', 'max_features': 0.8, 'splitter': 'best'}
R2 Score: 0.6