<a href="https://colab.research.google.com/github/mallowww/insurance-prediction/blob/main/MedicalCost_Prediction_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Insurance prediction

In [None]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Source of the medical-cost (insurance) dataset: a raw CSV hosted on GitHub.
URL = "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv"

In [None]:
# Download the CSV and parse it into a DataFrame (requires network access).
df = pd.read_csv(filepath_or_buffer=URL)

## EDA

In [None]:
# Display the loaded frame; pandas truncates the rendering to the first and last rows.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [None]:
# Summarize the columns: dtype and non-null count for each.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
# Count missing values per column (`isna` is the canonical name for `isnull`).
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [None]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): the duplicate reported here is not dropped in the cells shown below —
# confirm that keeping it is intended.
df.duplicated().sum()

1

In [None]:
# Separate the regression target (`charges`) from the predictor columns.
# `drop` returns a new frame, so later edits to X leave `df` untouched.
y = df['charges']
X = df.drop(columns=['charges'])

In [None]:
# Check the feature frame: column dtypes and non-null counts before encoding.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 62.8+ KB


Encode categorical features as an integer array

In [None]:
# Pick out the string-valued (object dtype) feature columns that need encoding.
categorical_X = X.select_dtypes(include=['object'])
categorical_X.columns

Index(['sex', 'smoker', 'region'], dtype='object')

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Replace each categorical column with integer codes (stored as floats).
# NOTE(review): `region` looks nominal rather than ordered; ordinal codes impose an
# arbitrary ranking that linear models will treat as meaningful — consider one-hot
# encoding if that matters for the comparison below.
enc = OrdinalEncoder()
X[categorical_X.columns] = enc.fit_transform(X[categorical_X.columns])

In [None]:
# Display the feature frame after the categorical columns were encoded.
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0.0,27.900,0,1.0,3.0
1,18,1.0,33.770,1,0.0,2.0
2,28,1.0,33.000,3,0.0,2.0
3,33,1.0,22.705,0,0.0,1.0
4,32,1.0,28.880,0,0.0,1.0
...,...,...,...,...,...,...
1333,50,1.0,30.970,3,0.0,1.0
1334,18,0.0,31.920,0,0.0,0.0
1335,18,0.0,36.850,0,0.0,2.0
1336,21,0.0,25.800,0,0.0,3.0


In [None]:
# Verify that encoding introduced no missing values (`isna` is the canonical name for `isnull`).
X.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

Inspect the target distribution (`charges` is a continuous regression target, so there are no classes to balance)

In [None]:
# Tabulate each distinct target value next to the number of times it occurs.
# The counts are upcast to float when the two columns are combined.
unique_elements, counts_elements = np.unique(y, return_counts=True)
print(np.column_stack((unique_elements, counts_elements)))

[[1.12187390e+03 1.00000000e+00]
 [1.13150660e+03 1.00000000e+00]
 [1.13594070e+03 1.00000000e+00]
 ...
 [6.00213990e+04 1.00000000e+00]
 [6.25928731e+04 1.00000000e+00]
 [6.37704280e+04 1.00000000e+00]]


In [None]:
# Re-check dtypes after encoding: the former object columns should now be numeric.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   float64
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   float64
 5   region    1338 non-null   float64
dtypes: float64(4), int64(2)
memory usage: 62.8 KB


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=104,
)

Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardization = mean removal and variance scaling.
# Many scikit-learn estimators assume features that look roughly like standard
# normal data (zero mean, unit variance) and can behave badly otherwise.

# Learn the per-feature mean and variance from the training split only, so that
# no information from the test split leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same training-derived transformation to both splits.
# Both right-hand sides are evaluated before either name is rebound.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Show the standardized training features (a NumPy array after `scaler.transform`).
X_train

array([[-0.07946364,  0.99573559, -1.84722724, -0.91727834,  1.97497432,
         1.34378726],
       [ 1.50304626,  0.99573559, -1.02819505, -0.91727834, -0.5063357 ,
        -0.45433265],
       [ 1.431114  ,  0.99573559, -0.84219969, -0.91727834, -0.5063357 ,
        -0.45433265],
       ...,
       [ 1.28724946,  0.99573559, -1.12935042, -0.91727834, -0.5063357 ,
         1.34378726],
       [ 0.13633317, -1.00428267,  0.89049392, -0.08613359, -0.5063357 ,
         0.44472731],
       [-0.79878632,  0.99573559, -1.08366735,  1.57615591,  1.97497432,
         0.44472731]])

## Choosing the right estimator

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: ordinary least squares on the standardized training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out test split.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean squared error:", mse)
print("R2 score:", r2)


Mean squared error: 31720801.346055053
R2 score: 0.7783016183871759


ElasticNet

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

# Linear model with combined L1/L2 regularization, using the library defaults.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = ElasticNet().fit(X_train, y_train)

# Score the model on the held-out test split.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean squared error:", mse)
print("R2 score:", r2)


Mean squared error: 86014273.28871724
R2 score: 0.39884163153137653


Lasso

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Linear model with L1 regularization, using the library defaults.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = Lasso().fit(X_train, y_train)

# Score the model on the held-out test split.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean squared error:", mse)
print("R2 score:", r2)


Mean squared error: 31717524.758911017
R2 score: 0.7783245186304303


SVR (kernel='linear')

In [None]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# SVR's default C=1.0 and epsilon=0.1 suit targets of roughly unit scale.
# `charges` is in the thousands, so fitting on the raw target badly underfits
# (that setup scored R2 = -0.17 on this test split). The wrapper standardizes
# the target for fitting and maps predictions back to the original units, so
# the metrics below stay comparable with the other models.
model = TransformedTargetRegressor(
    regressor=SVR(kernel='linear'),
    transformer=StandardScaler(),
)
model.fit(X_train, y_train)

# predict the target variable using test data (returned in original units)
y_pred = model.predict(X_test)

# evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean squared error:", mse)
print("R2 score:", r2)

Mean squared error: 166760393.56031412
R2 score: -0.16549733299966163


SVR (kernel='rbf')

In [None]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# SVR's default C=1.0 and epsilon=0.1 suit targets of roughly unit scale.
# `charges` is in the thousands, so fitting on the raw target badly underfits
# (that setup scored R2 = -0.10 on this test split). The wrapper standardizes
# the target for fitting and maps predictions back to the original units, so
# the metrics below stay comparable with the other models.
model = TransformedTargetRegressor(
    regressor=SVR(kernel='rbf'),
    transformer=StandardScaler(),
)
model.fit(X_train, y_train)

# predict the target variable using test data (returned in original units)
y_pred = model.predict(X_test)

# evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean squared error:", mse)
print("R2 score:", r2)


Mean squared error: 157320307.15834376
R2 score: -0.09952006291842097


VotingRegressor (ensemble of RandomForest, AdaBoost and GradientBoosting)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import VotingRegressor

# create individual models
# All three learners are stochastic; seeding them makes the reported scores
# reproducible from one run to the next.
model1 = RandomForestRegressor(random_state=42)
model2 = AdaBoostRegressor(random_state=42)
model3 = GradientBoostingRegressor(random_state=42)

# create ensemble model: the prediction is the average of the three base models
ensemble_model = VotingRegressor(estimators=[('rf', model1), ('ada', model2), ('gb', model3)])

# fit the model
ensemble_model.fit(X_train, y_train)

# predict the target variable using test data
y_pred = ensemble_model.predict(X_test)

# evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean squared error:", mse)
print("R2 score:", r2)


Mean squared error: 18070964.104027204
R2 score: 0.873701062834451


Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Linear model with L2 regularization of strength alpha=1.0.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = Ridge(alpha=1.0).fit(X_train, y_train)

# Score the model on the held-out test split.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean squared error:", mse)
print("R2 score:", r2)


Mean squared error: 31684079.761372153
R2 score: 0.7785582675116934


MLPRegressor

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Feed-forward network with two hidden layers of 100 units each, trained with Adam.
# verbose=True prints the training loss after every iteration, so this cell
# produces a long log.
model = MLPRegressor(
    hidden_layer_sizes=(100, 100),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    shuffle=True,
    random_state=42,
    tol=0.0001,
    verbose=True,
    max_iter=2000,
)
model.fit(X_train, y_train)

# Score the model on the held-out test split.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean squared error:", mse)
print("R2 score:", r2)


Iteration 1, loss = 162282053.84390649
Iteration 2, loss = 162142853.16770035
Iteration 3, loss = 162015470.36818638
Iteration 4, loss = 161881611.32208744
Iteration 5, loss = 161732604.70378876
Iteration 6, loss = 161558318.24786866
Iteration 7, loss = 161357191.22129709
Iteration 8, loss = 161114857.16129392
Iteration 9, loss = 160826875.97026318
Iteration 10, loss = 160474404.74140573
Iteration 11, loss = 160058244.37816665
Iteration 12, loss = 159569873.04883951
Iteration 13, loss = 158981119.36753285
Iteration 14, loss = 158302508.33471650
Iteration 15, loss = 157525871.15469831
Iteration 16, loss = 156634682.13688010
Iteration 17, loss = 155587877.12772968
Iteration 18, loss = 154432268.76683149
Iteration 19, loss = 153105634.89138255
Iteration 20, loss = 151638539.25354871
Iteration 21, loss = 149983406.48173210
Iteration 22, loss = 148145005.80860263
Iteration 23, loss = 146202535.60051629
Iteration 24, loss = 144036946.66518158
Iteration 25, loss = 141689203.18733022
Iteration