In [6]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn

In [7]:
## reading data 

data=pd.read_csv('insurance.csv')

In [8]:
data.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [9]:
data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [10]:
from sklearn.linear_model import LinearRegression

In [11]:
## independent variables 
X=data.drop('charges',axis=1)

In [12]:
# traget variables
Y=data.charges

In [13]:
# dummy variable creation 
data_dummy=pd.get_dummies(data[['sex','smoker','region']])

In [14]:
X=X.drop(['sex','smoker','region'],axis=1)

In [15]:
data_dummy

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1,0,0,1,0,0,0,1
1,0,1,1,0,0,0,1,0
2,0,1,1,0,0,0,1,0
3,0,1,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,0,1,1,0,0,1,0,0
1334,1,0,1,0,1,0,0,0
1335,1,0,1,0,0,0,1,0
1336,1,0,1,0,0,0,0,1


In [16]:
## combining dummy variable to other i.v
X=pd.concat([X,data_dummy],axis=1)

In [17]:
X.describe()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,0.494768,0.505232,0.795217,0.204783,0.242152,0.2429,0.272048,0.2429
std,14.04996,6.098187,1.205493,0.50016,0.50016,0.403694,0.403694,0.428546,0.428995,0.445181,0.428995
min,18.0,15.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,26.29625,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,39.0,30.4,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,51.0,34.69375,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
max,64.0,53.13,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
X.shape

(1338, 11)

In [19]:
## trying model on whole data 
lm=LinearRegression()

In [20]:
model=lm.fit(X,Y) 

In [21]:
?model

In [22]:
model

LinearRegression()

In [23]:
## predicting on whole data 
pred_charge=model.predict(X)

In [24]:
pred_charge

array([25293.7130284 ,  3448.60283431,  6706.9884907 , ...,
        4149.13248568,  1246.58493898, 37085.62326757])

In [25]:
## r square 
model.score(X,Y)

0.7509130345985207

In [26]:
 pd.Series(model.coef_)

0       256.856353
1       339.193454
2       475.500545
3        65.657180
4       -65.657180
5    -11924.267271
6     11924.267271
7       587.009235
8       234.045336
9      -448.012814
10     -373.041756
dtype: float64

In [27]:
X.columns

Index(['age', 'bmi', 'children', 'sex_female', 'sex_male', 'smoker_no',
       'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest'],
      dtype='object')

In [28]:
## variable with different cofficients 
pd.concat([pd.Series(X.columns),pd.Series(model.coef_)],axis=1)

Unnamed: 0,0,1
0,age,256.856353
1,bmi,339.193454
2,children,475.500545
3,sex_female,65.65718
4,sex_male,-65.65718
5,smoker_no,-11924.267271
6,smoker_yes,11924.267271
7,region_northeast,587.009235
8,region_northwest,234.045336
9,region_southeast,-448.012814


In [37]:
## building model using statsmodels 
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     500.8
Date:                Sat, 17 Dec 2022   Prob (F-statistic):               0.00
Time:                        21:18:56   Log-Likelihood:                -13548.
No. Observations:                1338   AIC:                         2.711e+04
Df Residuals:                    1329   BIC:                         2.716e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             -296.4168    430.507  

In [38]:
## using significant variables based on P values 
X1=X.loc[:,['age','bmi','children','smoker_no','smoker_yes']]
X21 = sm.add_constant(X1)
est = sm.OLS(Y, X21)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.750
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     998.1
Date:                Sat, 17 Dec 2022   Prob (F-statistic):               0.00
Time:                        21:18:57   Log-Likelihood:                -13551.
No. Observations:                1338   AIC:                         2.711e+04
Df Residuals:                    1333   BIC:                         2.714e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -131.3796    629.912     -0.209      0.8

In [39]:
## dividing data in train and test 
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3)

In [40]:
x_train

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
547,54,46.700,2,1,0,1,0,0,0,0,1
169,27,18.905,3,0,1,1,0,1,0,0,0
1244,18,33.330,0,0,1,1,0,0,0,1,0
306,28,27.500,2,1,0,1,0,0,0,0,1
1267,24,31.065,0,0,1,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
550,63,30.800,0,0,1,1,0,0,0,0,1
274,25,27.550,0,0,1,1,0,0,1,0,0
143,29,29.735,2,0,1,1,0,0,1,0,0
1034,61,38.380,0,0,1,1,0,0,1,0,0


In [41]:
## model building on train data 
X2_train = sm.add_constant(x_train)
est = sm.OLS(y_train, X2_train)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.762
Model:                            OLS   Adj. R-squared:                  0.760
Method:                 Least Squares   F-statistic:                     370.4
Date:                Sat, 17 Dec 2022   Prob (F-statistic):          1.47e-282
Time:                        21:18:59   Log-Likelihood:                -9464.2
No. Observations:                 936   AIC:                         1.895e+04
Df Residuals:                     927   BIC:                         1.899e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             -378.1574    498.497  

In [42]:
## model using only signicant variables based on p values 
X2_train1=x_train.loc[:,['age','bmi','children','smoker_no','smoker_yes']]

In [43]:
X2_train2 = sm.add_constant(X2_train1)
est = sm.OLS(y_train, X2_train2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.761
Model:                            OLS   Adj. R-squared:                  0.760
Method:                 Least Squares   F-statistic:                     740.1
Date:                Sat, 17 Dec 2022   Prob (F-statistic):          2.48e-287
Time:                        21:19:01   Log-Likelihood:                -9466.0
No. Observations:                 936   AIC:                         1.894e+04
Df Residuals:                     931   BIC:                         1.897e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -315.6667    724.811     -0.436      0.6

In [44]:
## prediction on train data 
x_train_pred=est2.predict(X2_train2)

In [45]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms_train = sqrt(mean_squared_error(y_train, x_train_pred))

In [46]:
## error on train data 
rms_train

5969.016717384857

In [47]:
X2_test1=x_test.loc[:,['age','bmi','children','smoker_no','smoker_yes']]
X2_test2 = sm.add_constant(X2_test1)

In [48]:
## prediction on test data 
x_test_pred=est2.predict(X2_test2)

In [49]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms_test = sqrt(mean_squared_error(y_test, x_test_pred))

In [50]:
## error on test data 
rms_test

6260.040161209079