In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [3]:
data=pd.read_csv("insurance.csv")
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
data.shape

(1338, 7)

In [5]:
data.size

9366

In [6]:
data.ndim

2

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [8]:
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [9]:
data.nunique()

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

In [10]:
# Data Processing

data=pd.get_dummies(data,columns=['sex','smoker','region'])
data.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,True,False,False,True,False,False,False,True
1,18,33.77,1,1725.5523,False,True,True,False,False,False,True,False
2,28,33.0,3,4449.462,False,True,True,False,False,False,True,False
3,33,22.705,0,21984.47061,False,True,True,False,False,True,False,False
4,32,28.88,0,3866.8552,False,True,True,False,False,True,False,False


In [11]:
data.isnull().sum()

age                 0
bmi                 0
children            0
charges             0
sex_female          0
sex_male            0
smoker_no           0
smoker_yes          0
region_northeast    0
region_northwest    0
region_southeast    0
region_southwest    0
dtype: int64

In [12]:
data.duplicated().sum()

1

In [13]:
#Splitting data into features and target

X=data.drop(columns=['charges'])
y=data['charges']

In [14]:
#Splitting data into train and test sets

Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2,random_state=24)

In [15]:
# Feature Scaling

scaler=StandardScaler()
Xtrain_scaled=scaler.fit_transform(Xtrain)
Xtest_scaled=scaler.transform(Xtest)

In [16]:
Xtrain_scaled

array([[ 0.9514509 ,  0.90451524,  0.74091307, ..., -0.56943606,
        -0.59822071,  1.74729228],
       [-1.10278799, -0.87051774, -0.92283717, ..., -0.56943606,
        -0.59822071, -0.5723141 ],
       [ 0.03058519,  0.38269228, -0.09096205, ..., -0.56943606,
        -0.59822071, -0.5723141 ],
       ...,
       [-1.03195217, -0.79203557, -0.92283717, ..., -0.56943606,
         1.67162383, -0.5723141 ],
       [-1.45696711, -1.33055686, -0.92283717, ...,  1.75612342,
        -0.59822071, -0.5723141 ],
       [ 1.73064496,  1.44888095, -0.09096205, ..., -0.56943606,
         1.67162383, -0.5723141 ]])

In [18]:
# Modelling using LR(OLS Model)

ols_model=LinearRegression()
ols_model.fit(Xtrain_scaled,ytrain)

In [19]:
# Training prediction

ols_pred=ols_model.predict(Xtrain_scaled)

In [22]:
df=pd.DataFrame({'Actual':ytrain,'OLS predicted':ols_pred})

In [24]:
df.head()

Unnamed: 0,Actual,OLS predicted
132,11163.568,13918.788247
508,3044.2133,3089.568762
422,39125.33225,33745.568762
613,6753.038,4625.568762
1111,41949.2441,34881.568762


In [25]:
# Test prediction

test_pred=ols_model.predict(Xtest_scaled)

In [26]:
df_test=pd.DataFrame({'Actual':ytest,'OLS predicted':test_pred})

In [27]:
df_test.head()

Unnamed: 0,Actual,OLS predicted
736,40419.0191,33377.568762
561,10923.9332,13329.568762
930,2927.0647,10401.568762
271,42856.838,35998.788247
933,7348.142,10846.788247


In [29]:
# Evaluation

train_r2=r2_score(ytrain,ols_pred)
test_r2=r2_score(ytest,test_pred)
print("Training set r2 score: ",train_r2)
print("Test set r2 score: ",test_r2)
print("**********"*5)

train_mse=mean_squared_error(ytrain,ols_pred)
test_mse=mean_squared_error(ytest,test_pred)
print("Training set mse score: ",train_mse)
print("Test set mse score: ",test_mse)
print("**********"*5)

print("Training set rmse score: ",np.sqrt(train_mse))
print("Test set rmse score: ",np.sqrt(test_mse))
print("**********"*5)

train_mae=mean_absolute_error(ytrain,ols_pred)
test_mae=mean_absolute_error(ytest,test_pred)
print("Training set mae score: ",train_mae)
print("Test set mae score: ",test_mae)
print("**********"*5)

Training set r2 score:  0.7434308010774544
Test set r2 score:  0.7769835249510696
**************************************************
Training set mse score:  37224600.623151645
Test set mse score:  33948664.02833704
**************************************************
Training set rmse score:  6101.196655013805
Test set rmse score:  5826.548208702734
**************************************************
Training set mae score:  4219.43610064218
Test set mae score:  4307.106824001074
**************************************************
