In [38]:
#importing the Libraies
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [39]:
dataset=pd.read_csv("insurance_pre.csv")

In [40]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [41]:
# One-hot encode categorical columns
dataset = pd.get_dummies(dataset, columns=['sex', 'smoker'], drop_first=True)

In [42]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [43]:
# Split into features (X) and target (y)
independent=dataset[['age', 'bmi', 'children', 'sex_male','smoker_yes']]

In [44]:
dependent=dataset[["charges"]]

In [45]:
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size=1/3, random_state=0)

In [46]:
# Initialize the StandardScaler
# This scaler will standardize features by removing the mean and scaling to unit variance
sc = StandardScaler()

# Fit the scaler on the training data and transform it
# This computes the mean and standard deviation from X_train and applies scaling
X_train = sc.fit_transform(X_train)

# Transform the test data using the same parameters learned from X_train
# IMPORTANT: We only use transform (not fit_transform) to avoid data leakage
X_test = sc.transform(X_test)

In [47]:
# Define a grid of hyperparameters to search
param_grid = {
    'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],   # Different kernel functions to try
    'C': [10, 100, 1000, 2000, 3000],                  # Regularization parameter (higher = less regularization)
    'gamma': ['auto', 'scale']                        # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
}

# Set up GridSearchCV with SVR model
grid = GridSearchCV(
    estimator=SVR(),              # Model to optimize
    param_grid=param_grid,        # Dictionary of hyperparameters
    refit=True,                   # Automatically refit the best model on the full training set
    verbose=3,                    # Print progress during fitting
    n_jobs=-1                     # Use all CPU cores to speed up the search
)

# Fit the model using Grid Search
# It will train and evaluate SVR for every combination of parameters in param_grid
grid.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


  y = column_or_1d(y, warn=True)


In [48]:
# Get the cross-validation results as a dictionary
re = grid.cv_results_

# Print the best parameters found by GridSearchCV
print("The R² score value for the best parameters {}:".format(grid.best_params_))

# Print the best R² score achieved during cross-validation
print("Best R² score from Grid Search: {:.4f}".format(grid.best_score_))

The R² score value for the best parameters {'C': 3000, 'gamma': 'scale', 'kernel': 'poly'}:
Best R² score from Grid Search: 0.7998


In [49]:
# Convert the GridSearchCV results dictionary (cv_results_) into a pandas DataFrame
# This makes it easier to view, filter, and sort all the hyperparameter combinations and their performance
table = pd.DataFrame.from_dict(re)

In [50]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.050302,0.006442,0.022601,0.004254,10,auto,rbf,"{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}",-0.004176,0.022594,-0.118956,-0.082926,-0.103473,-0.057387,0.056205,35
1,0.034364,0.001432,0.007612,0.00074,10,auto,poly,"{'C': 10, 'gamma': 'auto', 'kernel': 'poly'}",0.04742,0.077536,-0.060527,-0.009476,-0.050823,0.000826,0.054025,32
2,0.049656,0.001363,0.010528,0.00138,10,auto,sigmoid,"{'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'}",0.044787,0.081689,-0.072355,-0.027541,-0.05147,-0.004978,0.058648,34
3,0.041682,0.00529,0.010328,0.003665,10,auto,linear,"{'C': 10, 'gamma': 'auto', 'kernel': 'linear'}",0.387624,0.461268,0.288301,0.34054,0.297825,0.355112,0.063693,25
4,0.057431,0.006257,0.022644,0.004325,10,scale,rbf,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",-0.003956,0.022453,-0.119035,-0.082925,-0.10351,-0.057395,0.05623,36
5,0.034165,0.002243,0.006484,0.000937,10,scale,poly,"{'C': 10, 'gamma': 'scale', 'kernel': 'poly'}",0.043648,0.07978,-0.059229,-0.009498,-0.050317,0.000877,0.053658,31
6,0.054164,0.003711,0.013876,0.00164,10,scale,sigmoid,"{'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}",0.043946,0.08223,-0.072132,-0.027546,-0.051337,-0.004968,0.058595,33
7,0.041942,0.002532,0.008447,0.001037,10,scale,linear,"{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}",0.387624,0.461268,0.288301,0.34054,0.297825,0.355112,0.063693,25
8,0.046843,0.001966,0.020525,0.000732,100,auto,rbf,"{'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}",0.303414,0.319385,0.155546,0.208414,0.161756,0.229703,0.069348,29
9,0.039493,0.003047,0.00789,0.000848,100,auto,poly,"{'C': 100, 'gamma': 'auto', 'kernel': 'poly'}",0.542212,0.566743,0.471172,0.537557,0.413719,0.506281,0.056081,22


In [53]:
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))


Age: 32
BMI: 43
Children: 2
Sex Male 0 or 1: 0
Smoker Yes 0 or 1: 1


In [54]:
# Prepare the input in the format expected by the model (2D array)
input_features = [[age_input, bmi_input, children_input, sex_male_input, smoker_yes_input]]

# Use the best estimator from GridSearchCV to predict the target value
Future_Prediction = grid.predict(input_features)

# Print the prediction result
print("Future Prediction = {}".format(Future_Prediction[0]))

Future Prediction = 3316415.7200434203
