In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Data

## Data Preprocessing
**Purpose:**
- Read and clean the data.
- Encode categorical columns (smoker, sex, region) into numerical values so that they can be used by machine learning models.

In [3]:
labelencoder = preprocessing.LabelEncoder()
data = pd.read_csv("insurance.csv")
# Remove duplicate rows in the data
data.drop_duplicates(inplace=True)
# Transforming categorical columns to numeric columns using LabelEncoder
data["smoker"] = labelencoder.fit_transform(data["smoker"])
data["sex"] = labelencoder.fit_transform(data["sex"])
data["region"] = labelencoder.fit_transform(data["region"])
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


## Division of the Data Set
**Purpose:**
- To separate the features (X) from the target variable (y).
- Split the data into training and test sets to evaluate model performance.

In [9]:
# Separate the characteristics (X) and the target variable (y).
x = data[['age','sex','bmi','children','smoker','region']]
y = data[['charges']]
# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.30)

## Data Scaling
**Purpose:**
- Standardize features so that models converge faster and perform better.

In [10]:
# Scale the features to have a mean of 0 and a standard deviation of 1.
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(x_train)
scaled_x_test = scaler.fit_transform(x_test)

In [39]:
# Define a function for calculating and displaying model errors
def modelresults(predictions):
    i = mean_absolute_error(y_test,predictions)
    j = np.sqrt(mean_squared_error(y_test, predictions))
    print(f"Mean absolute error on model is: {i}")
    print(f"Root mean squared error on model is: {j}")

# Training and Evaluation of Different Models

## Linear Regression
Linear regression is a basic model that assumes a linear relationship between the characteristics and the target variable. Its performance is evaluated using metrics such as mean absolute error and mean square error.

In [14]:
# Training a linear regression model
lr = LinearRegression()
lr.fit(scaled_x_train, y_train)

In [16]:
# Making predictions with linear regression modeling
predslr = lr.predict(scaled_x_test)
# Evaluate the linear regression model
modelresults(predslr)

Mean absolute error on model is: 4317.670041723096
Roor mean squared error on model is: 6195.900519918849


## Support Vector Machine (SVR)
SVR is useful for regression problems, especially when linearity is not assumed. A hyperparameter search is performed to find the best combination that optimizes model performance. Its performance is evaluated using metrics such as mean absolute error and mean square error.

In [27]:
# Training a Support Vector Machine (SVR) model with hyperparameter search
svrmodel = SVR()
param_gridsvr = {'C':[0.001, 0.01,0.1,0.5,1],'kernel':['linear','rbf','poly'],'gamma':['scale','auto'],'degree':[2,3,4,5]}
gridsvr = GridSearchCV(svrmodel,param_gridsvr)
gridsvr.fit(scaled_x_train,y_train)
a = gridsvr.best_params_
print(f"Best parameters for model are {a}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best parameters for model are {'C': 1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}


  y = column_or_1d(y, warn=True)


In [26]:
#Best parameters for model are {'C': 1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
# Making predictions with the best SVR model
predsgridsvr = gridsvr.predict(scaled_x_test)
# Evaluating the SVR model
modelresults(predsgridsvr)

Mean absolute error on model is: 8146.552264675966
Roor mean squared error on model is: 12523.40716553332


## Random Forest
Random Forests are robust models that can capture nonlinear relationships and handle features with different scales. Hyperparameter search techniques are used to optimize their performance. Its performance is evaluated using metrics such as mean absolute error and mean square error.

In [29]:
# Training a Random Forest Regression model (RandomForestRegressor) with hyperparameter search
rfmodel = RandomForestRegressor()
param_gridrfr = {'bootstrap': [True], 'max_depth':[5,10,15], 'max_features': ['auto', 'log2'], 'n_estimators':[2,3,4,5,6,7,8,9,10]}
gridrfr = GridSearchCV(rfmodel, param_gridrfr)
gridrfr.fit(scaled_x_train,y_train)

  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)


In [30]:
# Making predictions with the best model of random forests
predsgridrfr = gridrfr.predict(scaled_x_test)
# Evaluating the random forest model
modelresults(predsgridrfr)

Mean absolute error on model is: 2689.355493340766
Roor mean squared error on model is: 4699.819241753503


## Evaluation of selected model
Random Forest has lower errors (MAE and RMSE) compared to Linear Regression and SVR. This indicates that the Random Forest model has a better ability to predict insurance charges more accurately in this specific data set.

In [31]:
x.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')

In [32]:
# Iterate through the columns of X and display the average of each column.
columiterate = 1
for index in x.columns:
    mean = data[index].mean()
    print(f"The mean of the column {columiterate} is {mean}")
    columiterate+=1

The mean of the column 1 is 39.222139117427076
The mean of the column 2 is 0.5048616305160808
The mean of the column 3 is 30.66345175766642
The mean of the column 4 is 1.0957367240089753
The mean of the column 5 is 0.2049364248317128
The mean of the column 6 is 1.5160807778608825


In [33]:
newcostumer = np.array([39,0,30,1,0,1])

In [35]:
# Predict the cost of insurance for a new client
p = gridrfr.predict(newcostumer.reshape(1,-1))

In [38]:
print(f"The insurance cost of the new costumer is {p[0]}")

The insurance cost of the new costumer is 17856.830896164927
