In [48]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score,mean_absolute_error
import joblib

In [3]:
# Load the insurance dataset; the path is relative to the notebook's working directory.
df=pd.read_csv('insurance.csv')

In [4]:
# Preview the first rows to see the raw values of each column.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# Column dtypes and non-null counts: sex, smoker and region are text (object) columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
# (rows, columns) of the raw frame.
df.shape


(1338, 7)

In [7]:
# Summary statistics for the numeric columns only (text columns are omitted by default).
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [8]:
# Missing-value count per column; all zeros means no imputation is needed.
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [9]:
# Column names of the raw frame.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [10]:
# Distinct labels in 'sex'; these are the keys the encoding in the next cell must cover.
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [11]:
# Encode 'sex' as an integer flag: female -> 1, male -> 0.
# The column is overwritten in place, so this cell is meant to run once per
# freshly loaded `df`; a second run would find no matching string keys.
sex_codes = {"female": 1, "male": 0}
df['sex'] = df['sex'].map(sex_codes).astype('int')

In [12]:
# Check the encoded column: it should now hold only the integers 0 and 1.
df['sex']

0       1
1       0
2       0
3       0
4       0
       ..
1333    0
1334    1
1335    1
1336    1
1337    1
Name: sex, Length: 1338, dtype: int32

In [14]:
# Distinct labels in 'smoker'; these are the keys the encoding in the next cell must cover.
df['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [15]:
# Encode 'smoker' as an integer flag.
# NOTE the direction: "yes" -> 0 and "no" -> 1, i.e. smoker == 1 means NON-smoker.
# Any hand-built input for the trained model must follow the same convention.
# The column is overwritten in place, so this cell is meant to run once per
# freshly loaded `df`.
smoker_codes = {"yes": 0, "no": 1}
df['smoker'] = df['smoker'].map(smoker_codes).astype('int')

In [18]:
# Distinct labels in 'region'; four unordered categories, so they are one-hot encoded below.
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [22]:
# One-hot encode 'region': the text column is replaced by one indicator column
# per region (region_<name>). The result is a new frame; `df` is left untouched.
df_new = pd.get_dummies(data=df, columns=['region'])

In [23]:
# Confirm every column is numeric after encoding.
df_new.dtypes

age                   int64
sex                   int32
bmi                 float64
children              int64
smoker                int32
charges             float64
region_northeast      uint8
region_northwest      uint8
region_southeast      uint8
region_southwest      uint8
dtype: object

## Getting the feature matrix and the target vector

In [24]:
# The target is the billed amount; every other column is a predictor.
y = df_new['charges']
X = df_new.drop(columns=['charges'])

## Splitting the data 

In [26]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=22,
)

## Model training  

In [28]:
# Candidate regressors, all with default hyperparameters.
# The two tree ensembles are stochastic; they are given a fixed seed (the same
# value used for the train/test split) so that scores are reproducible on a
# fresh kernel run.
lr=LinearRegression()
sv=SVR()  # NOTE: trained on the features as-is (no scaling step in this notebook)
rfr=RandomForestRegressor(random_state=22)
gbr=GradientBoostingRegressor(random_state=22)

In [29]:
# Fit every candidate on the training split, in the same order as they were created.
for estimator in (lr, sv, rfr):
    estimator.fit(X_train, y_train)
# Kept as the final expression so the cell still displays the fitted estimator.
gbr.fit(X_train, y_train)

GradientBoostingRegressor()

## Predictions using test data 

In [31]:
# Predict charges for the held-out rows with each fitted model.
lr_result, sv_result, rfr_result, gbr_result = (
    estimator.predict(X_test) for estimator in (lr, sv, rfr, gbr)
)

In [32]:
# Side-by-side table of the true charges and each model's prediction.
# 'actual' is a Series, so the resulting frame carries y_test's row labels.
results = dict(
    actual=y_test,
    lr_result=lr_result,
    sv_result=sv_result,
    rfr_result=rfr_result,
    gbr_result=gbr_result,
)
res = pd.DataFrame(data=results)

In [34]:
# Show actual vs. predicted charges; rows are labelled with their original dataset index.
res

Unnamed: 0,actual,lr_result,sv_result,rfr_result,gbr_result
1231,20167.33603,23196.042130,9122.304714,14788.456132,15340.533412
768,14319.03100,17482.967370,9339.977954,14263.868798,13661.251332
847,2438.05520,10348.659988,9153.405965,6477.134738,4038.523838
510,11763.00090,13651.924766,9309.433109,11974.813498,13618.262808
363,2597.77900,1954.470791,9123.414435,3074.910544,3772.851159
...,...,...,...,...,...
882,2585.85065,708.413526,9124.662434,2550.727691,2499.646738
1039,22493.65964,2413.130204,9118.078501,5832.610340,4856.333335
874,8891.13950,7827.117793,9237.208295,9591.136803,8133.740302
757,23065.42070,32157.680787,9258.113904,25051.769527,26273.425043


## Model evaluation

In [42]:
# Coefficient of determination (R^2) on the test split for each model; higher is better.
lr_score, sv_score, rfr_score, gbr_score = (
    r2_score(y_test, predicted)
    for predicted in (lr_result, sv_result, rfr_result, gbr_result)
)


In [43]:
# R^2 per model, in the order: linear regression, SVR, random forest, gradient boosting.
print(lr_score, sv_score, rfr_score, gbr_score)

0.7573007577902942 -0.1186582084979746 0.8158596556680113 0.8337886489844972


In [44]:
# Mean absolute error on the test split for each model; lower is better.
# NOTE: the *_score names are reused here, so the R^2 values computed earlier
# are overwritten from this cell onward.
lr_score, sv_score, rfr_score, gbr_score = (
    mean_absolute_error(y_test, predicted)
    for predicted in (lr_result, sv_result, rfr_result, gbr_result)
)

In [45]:
# Prints whatever the *_score names currently hold, in the order: linear
# regression, SVR, random forest, gradient boosting.
print(lr_score, sv_score, rfr_score, gbr_score)

4102.86488244744 8396.712866572545 2994.1048526621275 2739.9608155739074


## GradientBoostingRegressor is the best-performing model for this dataset, so we retrain it on the entire dataset

In [46]:
# Fresh, unfitted estimator for the final model (replaces the one evaluated above).
# A fixed seed makes the persisted model reproducible across notebook runs.
gbr=GradientBoostingRegressor(random_state=22)

In [47]:
# Train on all rows (train + test); after this there is no held-out data left to score this model.
gbr.fit(X,y)

GradientBoostingRegressor()

In [62]:
# Persist the fitted model to the working directory.
joblib.dump(gbr,"insurance_model_final.pkl")

['insurance_model_final.pkl']

In [63]:
# Reload the model that was just saved, to confirm the file round-trips.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
model=joblib.load("insurance_model_final.pkl")

In [64]:
# List the encoded column names to build a prediction input by hand.
# 'charges' appears here but is the target, not a model feature.
df_new.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges',
       'region_northeast', 'region_northwest', 'region_southeast',
       'region_southwest'],
      dtype='object')

In [66]:
# One hand-written record to score with the saved model.
# Keys are listed in the same order as the training feature columns.
sample_record = {
    'age': 40,
    'sex': 1,
    'bmi': 40.30,
    'children': 4,
    'smoker': 1,
    'region_northeast': 1,
    'region_northwest': 0,
    'region_southeast': 0,
    'region_southwest': 0,
}
# Scalars need an explicit index to form a one-row frame.
df_pred = pd.DataFrame(sample_record, index=[0])
df_pred

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,40,1,40.3,4,1,1,0,0,0


In [73]:
# Predicted charges for the single hand-built row; the result is a one-element array.
prediction=model.predict(df_pred)
prediction

array([11599.03599307])