In [10]:
#importing the Libraies
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [11]:
dataset=pd.read_csv("insurance_pre.csv")

In [12]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [13]:
# One-hot encode categorical columns
dataset = pd.get_dummies(dataset, columns=['sex', 'smoker'], drop_first=True)

In [14]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [15]:
# Split into features (X) and target (y)
independent=dataset[['age', 'bmi', 'children', 'sex_male','smoker_yes']]

In [16]:
dependent=dataset[["charges"]]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size=1/3, random_state=0)

In [18]:
# Initialize the StandardScaler
# This scaler will standardize features by removing the mean and scaling to unit variance
sc = StandardScaler()

# Fit the scaler on the training data and transform it
# This computes the mean and standard deviation from X_train and applies scaling
X_train = sc.fit_transform(X_train)

# Transform the test data using the same parameters learned from X_train
# IMPORTANT: We only use transform (not fit_transform) to avoid data leakage
X_test = sc.transform(X_test)

In [19]:
# Define a grid of hyperparameters to search
# Define the grid of hyperparameters to search during model tuning
param_grid = {
    # Criterion to measure the quality of a split
    # 'mse' = mean squared error, 'mae' = mean absolute error
    'criterion': ['squared_error', 'absolute_error'],
    
    # Number of features to consider when looking for the best split
    # 'auto' = all features, 'sqrt' = square root of features, 'log2' = log base 2 of features
     'max_features': ['sqrt', 'log2'],
    
    # Number of trees in the ensemble (only applicable for ensemble methods like RandomForest)
    'n_estimators': [10, 100]
}

grid = GridSearchCV(
    RandomForestRegressor(),   # The model to tune: Random Forest Regressor (ensemble of decision trees)
    param_grid,               # Dictionary of hyperparameters to try during grid search
    refit=True,               # Refit the model with the best found parameters on the whole training set
    verbose=3,                # Verbosity level: 3 means detailed logs during training
    n_jobs=-1                 # Use all CPU cores to parallelize the grid search
)

# Fitting the model to the training data using grid search
# This will train the DecisionTreeRegressor on all combinations of hyperparameters in param_grid
# and select the best one based on cross-validation performance
grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


  return fit_method(estimator, *args, **kwargs)


In [20]:
# Print the best combination of parameters found by GridSearchCV
# print(grid.best_params_)

# Access full cross-validation results
re = grid.cv_results_
# print(re)  # Uncomment this line to see detailed CV results for all parameter combinations

# Use the best estimator from grid search to make predictions on the test set
grid_predictions = grid.predict(X_test)

# Evaluate the performance of the best model using R² score
from sklearn.metrics import r2_score
r_score = r2_score(y_test, grid_predictions)

# Print the R² score along with the best parameters
print("The R² score for the best parameters {}:".format(grid.best_params_), r_score)

The R² score for the best parameters {'criterion': 'squared_error', 'max_features': 'log2', 'n_estimators': 100}: 0.8728152638059138


In [21]:
# Convert the GridSearchCV results dictionary (cv_results_) into a pandas DataFrame
# This makes it easier to view, filter, and sort all the hyperparameter combinations and their performance
table=pd.DataFrame.from_dict(re)

In [22]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.055164,0.007709,0.006036,0.001908,squared_error,sqrt,10,"{'criterion': 'squared_error', 'max_features':...",0.798278,0.754292,0.804651,0.820186,0.747199,0.784921,0.028887,6
1,0.316194,0.016718,0.013048,0.000432,squared_error,sqrt,100,"{'criterion': 'squared_error', 'max_features':...",0.806333,0.769869,0.834854,0.834372,0.761091,0.801304,0.031144,2
2,0.032899,0.002393,0.004209,0.000927,squared_error,log2,10,"{'criterion': 'squared_error', 'max_features':...",0.782788,0.745273,0.834465,0.802868,0.76206,0.785491,0.031222,5
3,0.342061,0.035452,0.018867,0.001964,squared_error,log2,100,"{'criterion': 'squared_error', 'max_features':...",0.809629,0.766413,0.837306,0.8336,0.76287,0.801963,0.03194,1
4,0.095254,0.003995,0.005215,0.002022,absolute_error,sqrt,10,"{'criterion': 'absolute_error', 'max_features'...",0.775861,0.753991,0.816712,0.764253,0.770478,0.776259,0.021494,8
5,0.957846,0.014683,0.016843,0.001283,absolute_error,sqrt,100,"{'criterion': 'absolute_error', 'max_features'...",0.804077,0.771041,0.838121,0.82439,0.764983,0.800522,0.028734,3
6,0.104363,0.003712,0.005004,0.001571,absolute_error,log2,10,"{'criterion': 'absolute_error', 'max_features'...",0.769277,0.774631,0.820025,0.79974,0.74059,0.780853,0.027148,7
7,0.889686,0.077676,0.011965,0.000792,absolute_error,log2,100,"{'criterion': 'absolute_error', 'max_features'...",0.807072,0.768748,0.839176,0.821286,0.766281,0.800513,0.02881,4


In [None]:
4
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))

BMI: 4
Children: 5
Sex Male 0 or 1: 0


In [None]:
# Make a future prediction with the best estimator using custom input values
# Input: [age_input, bmi_input, children_input, sex_male_input, smoker_yes_input]
# These must match the feature order and count used in training
Future_Prediction = grid.predict([[age_input, bmi_input, children_input, sex_male_input, smoker_yes_input]])

# Print the predicted result
print("Future Prediction = {}".format(Future_Prediction[0]))