In [1]:
#importing the Libraies
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [2]:
dataset=pd.read_csv("insurance_pre.csv")

In [3]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [4]:
# One-hot encode categorical columns
dataset = pd.get_dummies(dataset, columns=['sex', 'smoker'], drop_first=True)

In [5]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [6]:
# Split into features (X) and target (y)
independent=dataset[['age', 'bmi', 'children', 'sex_male','smoker_yes']]

In [7]:
dependent=dataset[["charges"]]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size=1/3, random_state=0)

In [9]:
# Initialize the StandardScaler
# This scaler will standardize features by removing the mean and scaling to unit variance
sc = StandardScaler()

# Fit the scaler on the training data and transform it
# This computes the mean and standard deviation from X_train and applies scaling
X_train = sc.fit_transform(X_train)

# Transform the test data using the same parameters learned from X_train
# IMPORTANT: We only use transform (not fit_transform) to avoid data leakage
X_test = sc.transform(X_test)

In [10]:
# Define a grid of hyperparameters to search
# Define the grid of hyperparameters to search during model tuning
param_grid = {
    # 'criterion' defines the function to measure the quality of a split
    # 'mse' (Mean Squared Error), 'mae' (Mean Absolute Error), 'friedman_mse' (used in Gradient Boosting)
    'criterion': ['mse', 'mae', 'friedman_mse'],
    
    # 'max_features' determines the number of features to consider when looking for the best split
    # 'auto' = all features, 'sqrt' = square root of the number of features, 'log2' = log base 2
    'max_features': ['auto', 'sqrt', 'log2'],
    
    # 'splitter' defines the strategy used to choose the split at each node
    # 'best' chooses the best split, 'random' chooses the best random split
    'splitter': ['best', 'random']
}

# Perform hyperparameter tuning using GridSearchCV on DecisionTreeRegressor
grid = GridSearchCV(
    estimator=DecisionTreeRegressor(),   # The model to optimize (regression tree)
    param_grid=param_grid,               # Dictionary of hyperparameters to try
    refit=True,                          # Once the best params are found, refit the model using them on the entire training set
    verbose=3,                           # Controls the verbosity: 3 = detailed messages for each fit
    n_jobs=-1                            # Use all available CPU cores to speed up the grid search
)

# Fitting the model to the training data using grid search
# This will train the DecisionTreeRegressor on all combinations of hyperparameters in param_grid
# and select the best one based on cross-validation performance
grid.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


70 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\mukil\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\mukil\anaconda3\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\mukil\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\mukil\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

In [11]:
# Print the best combination of parameters found by GridSearchCV
# print(grid.best_params_)

# Access full cross-validation results
re = grid.cv_results_
# print(re)  # Uncomment this line to see detailed CV results for all parameter combinations

# Use the best estimator from grid search to make predictions on the test set
grid_predictions = grid.predict(X_test)

# Evaluate the performance of the best model using R² score
from sklearn.metrics import r2_score
r_score = r2_score(y_test, grid_predictions)

# Print the R² score along with the best parameters
print("The R² score for the best parameters {}:".format(grid.best_params_), r_score)

The R² score for the best parameters {'criterion': 'friedman_mse', 'max_features': 'log2', 'splitter': 'best'}: 0.7695398256547402


In [12]:
# Convert the GridSearchCV results dictionary (cv_results_) into a pandas DataFrame
# This makes it easier to view, filter, and sort all the hyperparameter combinations and their performance
table=pd.DataFrame.from_dict(re)

In [13]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002027,0.000311,0.0,0.0,mse,auto,best,"{'criterion': 'mse', 'max_features': 'auto', '...",,,,,,,,5
1,0.00251,0.001346,0.0,0.0,mse,auto,random,"{'criterion': 'mse', 'max_features': 'auto', '...",,,,,,,,5
2,0.001913,0.000927,0.0,0.0,mse,sqrt,best,"{'criterion': 'mse', 'max_features': 'sqrt', '...",,,,,,,,5
3,0.001999,0.001551,0.0,0.0,mse,sqrt,random,"{'criterion': 'mse', 'max_features': 'sqrt', '...",,,,,,,,5
4,0.0016,0.001019,0.0,0.0,mse,log2,best,"{'criterion': 'mse', 'max_features': 'log2', '...",,,,,,,,5
5,0.002806,0.00117,0.0,0.0,mse,log2,random,"{'criterion': 'mse', 'max_features': 'log2', '...",,,,,,,,5
6,0.001511,0.000778,0.0,0.0,mae,auto,best,"{'criterion': 'mae', 'max_features': 'auto', '...",,,,,,,,5
7,0.002004,0.001413,0.0,0.0,mae,auto,random,"{'criterion': 'mae', 'max_features': 'auto', '...",,,,,,,,5
8,0.000662,0.000427,0.0,0.0,mae,sqrt,best,"{'criterion': 'mae', 'max_features': 'sqrt', '...",,,,,,,,5
9,0.001401,0.00102,0.0,0.0,mae,sqrt,random,"{'criterion': 'mae', 'max_features': 'sqrt', '...",,,,,,,,5


In [None]:
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))

In [None]:
# Make a future prediction with the best estimator using custom input values
# Input: [age_input, bmi_input, children_input, sex_male_input, smoker_yes_input]
# These must match the feature order and count used in training
Future_Prediction = grid.predict([[age_input, bmi_input, children_input, sex_male_input, smoker_yes_input]])

# Print the predicted result
print("Future Prediction = {}".format(Future_Prediction[0]))