In [259]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [260]:
# Load the insurance data; the path is relative, so the CSV must sit in the
# notebook's working directory.
dataset = pd.read_csv('insurance.csv')
# Show (rows, columns) as a quick sanity check on the load.
dataset.shape

(1338, 7)

In [261]:
# List the column names; `charges` is the last column and is used as the target below.
dataset.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [262]:
# Preview the first five rows to see the raw values of each column.
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [263]:
# Inspect column dtypes: sex, smoker and region are `object` columns, so they
# have to be encoded numerically before fitting a regression.
dataset.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [264]:
# Count missing values per column. Using the DataFrame's own column-wise sum
# keeps the per-column result explicit instead of depending on how np.sum
# forwards its default axis=None to pandas.
dataset.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [265]:
# Split the frame into a feature matrix (every column except the last) and the
# target vector (the last column, `charges`), both as plain NumPy arrays.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values
X

array([[19, 'female', 27.9, 0, 'yes', 'southwest'],
       [18, 'male', 33.77, 1, 'no', 'southeast'],
       [28, 'male', 33.0, 3, 'no', 'southeast'],
       ...,
       [18, 'female', 36.85, 0, 'no', 'southeast'],
       [21, 'female', 25.8, 0, 'no', 'southwest'],
       [61, 'female', 29.07, 0, 'yes', 'northwest']], dtype=object)

In [266]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical columns at positions 5 (region), 4 (smoker)
# and 1 (sex); the remaining columns (age, bmi, children) are passed through
# and end up after the encoded ones.
# NOTE(review): every category gets its own dummy column here (no level is
# dropped). Once an intercept column is added further down, each dummy group
# sums to that intercept — confirm this is intended for the OLS fits.
ct = ColumnTransformer( transformers = [('encoder', OneHotEncoder(), [5, 4, 1])], remainder = 'passthrough')
# Overwrites X with the encoded matrix, so this cell must not be re-run
# without first re-running the cell that builds X from `dataset`.
X = np.array(ct.fit_transform(X))
X[:9,:] # column layout: 4 region dummies, 2 smoker dummies, 2 sex dummies, then age, bmi, children

array([[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 19, 27.9, 0],
       [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 18, 33.77, 1],
       [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 28, 33.0, 3],
       [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 33, 22.705, 0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 32, 28.88, 0],
       [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 31, 25.74, 0],
       [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 46, 33.44, 1],
       [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 37, 27.74, 3],
       [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 37, 29.83, 2]],
      dtype=object)

In [239]:
# Keep columns 4, 5, 8, 9, 10 of the encoded matrix; going by the encoder
# cell's layout these are the two smoker dummies plus age, bmi and children.
# The indices only match that layout while X still has 11 columns, i.e.
# before the intercept column is prepended.
# NOTE(review): reg_data is not referenced again in the visible notebook and
# this cell's execution count (239) is out of sequence — confirm it is still needed.
reg_data = X[:,[4,5,8,9,10]]
reg_data

array([[0.0, 1.0, 19, 27.9, 0],
       [1.0, 0.0, 18, 33.77, 1],
       [1.0, 0.0, 28, 33.0, 3],
       ...,
       [1.0, 0.0, 18, 36.85, 0],
       [1.0, 0.0, 21, 25.8, 0],
       [0.0, 1.0, 61, 29.07, 0]], dtype=object)

In [267]:
# Shape after one-hot encoding: 8 dummy columns plus age, bmi, children.
X.shape

(1338, 11)

### Building a model

In [268]:
import statsmodels.api as sm

In [269]:
# Prepend a column of ones so the statsmodels OLS fits below include an
# intercept term. The row count is taken from X itself rather than being
# hard-coded, so the cell keeps working if the dataset size changes.
# Caution: this overwrites X, so re-running the cell would add a second
# column of ones; re-run the encoding cell first.
X = np.append(arr=np.ones((X.shape[0], 1)), values=X, axis=1).astype('float64')
X[:9, :]

array([[ 1.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ,  1.  ,  0.  ,
        19.  , 27.9 ,  0.  ],
       [ 1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,
        18.  , 33.77,  1.  ],
       [ 1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,
        28.  , 33.  ,  3.  ],
       [ 1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,
        33.  , 22.7 ,  0.  ],
       [ 1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,
        32.  , 28.88,  0.  ],
       [ 1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,
        31.  , 25.74,  0.  ],
       [ 1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,
        46.  , 33.44,  1.  ],
       [ 1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  1.  ,  0.  ,
        37.  , 27.74,  3.  ],
       [ 1.  ,  1.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,
        37.  , 29.83,  2.  ]])

In [270]:
# Shape after prepending the intercept column (one more column than before).
X.shape

(1338, 12)

In [271]:
# Confirm the target is already numeric (float64) before fitting.
y.dtype

dtype('float64')

In [272]:
# Starting point for backward elimination: select all 12 columns
# (intercept, the 8 dummy columns, then age, bmi, children).
X_opt = X[:, list(range(12))]
X_opt

array([[ 1.  ,  0.  ,  0.  , ..., 19.  , 27.9 ,  0.  ],
       [ 1.  ,  0.  ,  0.  , ..., 18.  , 33.77,  1.  ],
       [ 1.  ,  0.  ,  0.  , ..., 28.  , 33.  ,  3.  ],
       ...,
       [ 1.  ,  0.  ,  0.  , ..., 18.  , 36.85,  0.  ],
       [ 1.  ,  0.  ,  0.  , ..., 21.  , 25.8 ,  0.  ],
       [ 1.  ,  0.  ,  1.  , ..., 61.  , 29.07,  0.  ]])

In [273]:
# Fit ordinary least squares of y (charges) on all 12 design-matrix columns.
# NOTE(review): the summary reports Cond. No. around 7.76e+16 and Df Model 8
# although 11 non-constant columns were supplied, which suggests the full
# dummy sets are collinear with the intercept column — confirm before
# reading individual coefficients or p-values.
reg_ols = sm.OLS(y,X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.751
Model:,OLS,Adj. R-squared:,0.749
Method:,Least Squares,F-statistic:,500.8
Date:,"Wed, 07 Jun 2023",Prob (F-statistic):,0.0
Time:,19:12:26,Log-Likelihood:,-13548.0
No. Observations:,1338,AIC:,27110.0
Df Residuals:,1329,BIC:,27160.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-296.4168,430.507,-0.689,0.491,-1140.964,548.130
x1,512.9050,300.348,1.708,0.088,-76.303,1102.113
x2,159.9411,301.334,0.531,0.596,-431.201,751.083
x3,-522.1170,330.759,-1.579,0.115,-1170.983,126.749
x4,-447.1459,310.933,-1.438,0.151,-1057.119,162.827
x5,-1.207e+04,282.338,-42.759,0.000,-1.26e+04,-1.15e+04
x6,1.178e+04,313.530,37.560,0.000,1.12e+04,1.24e+04
x7,-82.5512,269.226,-0.307,0.759,-610.706,445.604
x8,-213.8656,274.976,-0.778,0.437,-753.299,325.568

0,1,2,3
Omnibus:,300.366,Durbin-Watson:,2.088
Prob(Omnibus):,0.0,Jarque-Bera (JB):,718.887
Skew:,1.211,Prob(JB):,7.860000000000001e-157
Kurtosis:,5.651,Cond. No.,7.76e+16


In [274]:
# Backward elimination, step 1: drop column 7 of the 12-column design matrix
# (the first of the two sex dummies) and refit.
X_opt = np.delete(X, [7], axis=1)
reg_ols = sm.OLS(endog=y, exog=X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.751
Model:,OLS,Adj. R-squared:,0.749
Method:,Least Squares,F-statistic:,500.8
Date:,"Wed, 07 Jun 2023",Prob (F-statistic):,0.0
Time:,19:12:33,Log-Likelihood:,-13548.0
No. Observations:,1338,AIC:,27110.0
Df Residuals:,1329,BIC:,27160.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-343.5889,559.574,-0.614,0.539,-1441.333,754.155
x1,501.1120,310.125,1.616,0.106,-107.276,1109.500
x2,148.1481,311.252,0.476,0.634,-462.450,758.746
x3,-533.9100,347.973,-1.534,0.125,-1216.546,148.726
x4,-458.9390,323.880,-1.417,0.157,-1094.311,176.433
x5,-1.21e+04,327.918,-36.887,0.000,-1.27e+04,-1.15e+04
x6,1.175e+04,366.577,32.060,0.000,1.1e+04,1.25e+04
x7,-131.3144,332.945,-0.394,0.693,-784.470,521.842
x8,256.8564,11.899,21.587,0.000,233.514,280.199

0,1,2,3
Omnibus:,300.366,Durbin-Watson:,2.088
Prob(Omnibus):,0.0,Jarque-Bera (JB):,718.887
Skew:,1.211,Prob(JB):,7.860000000000001e-157
Kurtosis:,5.651,Cond. No.,7e+16


In [275]:
# Backward elimination, step 2: columns 7 and 8 (both sex dummies) are now
# left out of the 12-column design matrix; refit on the rest.
X_opt = np.delete(X, [7, 8], axis=1)
reg_ols = sm.OLS(endog=y, exog=X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.751
Model:,OLS,Adj. R-squared:,0.75
Method:,Least Squares,F-statistic:,572.7
Date:,"Wed, 07 Jun 2023",Prob (F-statistic):,0.0
Time:,19:12:34,Log-Likelihood:,-13548.0
No. Observations:,1338,AIC:,27110.0
Df Residuals:,1330,BIC:,27150.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-376.3422,553.201,-0.680,0.496,-1461.584,708.900
x1,492.3937,309.238,1.592,0.112,-114.254,1099.041
x2,140.2116,310.502,0.452,0.652,-468.915,749.338
x3,-541.9665,347.262,-1.561,0.119,-1223.208,139.275
x4,-466.9810,323.135,-1.445,0.149,-1100.890,166.928
x5,-1.211e+04,326.781,-37.047,0.000,-1.27e+04,-1.15e+04
x6,1.173e+04,361.998,32.403,0.000,1.1e+04,1.24e+04
x7,256.9736,11.891,21.610,0.000,233.646,280.301
x8,338.6646,28.559,11.858,0.000,282.639,394.690

0,1,2,3
Omnibus:,300.735,Durbin-Watson:,2.089
Prob(Omnibus):,0.0,Jarque-Bera (JB):,720.516
Skew:,1.212,Prob(JB):,3.48e-157
Kurtosis:,5.654,Cond. No.,6.77e+16


In [276]:
# Backward elimination, step 3: additionally drop column 2 (one of the
# region dummies), leaving out columns 2, 7 and 8; refit.
X_opt = np.delete(X, [2, 7, 8], axis=1)
reg_ols = sm.OLS(endog=y, exog=X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.751
Model:,OLS,Adj. R-squared:,0.75
Method:,Least Squares,F-statistic:,572.7
Date:,"Wed, 07 Jun 2023",Prob (F-statistic):,0.0
Time:,19:12:34,Log-Likelihood:,-13548.0
No. Observations:,1338,AIC:,27110.0
Df Residuals:,1330,BIC:,27150.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-282.8678,654.466,-0.432,0.666,-1566.766,1001.030
x1,352.1821,476.120,0.740,0.460,-581.847,1286.211
x2,-682.1780,478.807,-1.425,0.154,-1621.477,257.121
x3,-607.1926,477.052,-1.273,0.203,-1543.050,328.664
x4,-1.206e+04,365.859,-32.962,0.000,-1.28e+04,-1.13e+04
x5,1.178e+04,406.353,28.981,0.000,1.1e+04,1.26e+04
x6,256.9736,11.891,21.610,0.000,233.646,280.301
x7,338.6646,28.559,11.858,0.000,282.639,394.690
x8,474.5665,137.740,3.445,0.001,204.355,744.778

0,1,2,3
Omnibus:,300.735,Durbin-Watson:,2.089
Prob(Omnibus):,0.0,Jarque-Bera (JB):,720.516
Skew:,1.212,Prob(JB):,3.48e-157
Kurtosis:,5.654,Cond. No.,6.12e+16


In [277]:
# Backward elimination, step 4: additionally drop column 1 (another region
# dummy), leaving out columns 1, 2, 7 and 8; refit.
X_opt = np.delete(X, [1, 2, 7, 8], axis=1)
reg_ols = sm.OLS(endog=y, exog=X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.751
Model:,OLS,Adj. R-squared:,0.75
Method:,Least Squares,F-statistic:,668.3
Date:,"Wed, 07 Jun 2023",Prob (F-statistic):,0.0
Time:,19:12:35,Log-Likelihood:,-13548.0
No. Observations:,1338,AIC:,27110.0
Df Residuals:,1331,BIC:,27150.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-162.2966,633.734,-0.256,0.798,-1405.524,1080.930
x1,-858.4696,415.206,-2.068,0.039,-1672.998,-43.941
x2,-782.7452,413.756,-1.892,0.059,-1594.430,28.940
x3,-1.2e+04,357.736,-33.553,0.000,-1.27e+04,-1.13e+04
x4,1.184e+04,396.946,29.830,0.000,1.11e+04,1.26e+04
x5,257.0064,11.889,21.617,0.000,233.683,280.330
x6,338.6413,28.554,11.860,0.000,282.625,394.657
x7,471.5441,137.656,3.426,0.001,201.498,741.590

0,1,2,3
Omnibus:,300.125,Durbin-Watson:,2.092
Prob(Omnibus):,0.0,Jarque-Bera (JB):,716.587
Skew:,1.211,Prob(JB):,2.4800000000000003e-156
Kurtosis:,5.643,Cond. No.,7.02e+16


In [278]:
# Backward elimination, step 5: additionally drop column 4 (a third region
# dummy). What remains is the intercept, one region dummy, both smoker
# dummies, and age, bmi, children.
# NOTE(review): the intercept (column 0) and both smoker dummies (columns 5
# and 6) are all still present, and the two dummies sum to the intercept; the
# reported Cond. No. of about 2.9e+16 is consistent with that. Consider
# dropping one smoker dummy before interpreting these coefficients.
X_opt = np.delete(X, [1, 2, 4, 7, 8], axis=1)
reg_ols = sm.OLS(endog=y, exog=X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.75
Model:,OLS,Adj. R-squared:,0.749
Method:,Least Squares,F-statistic:,799.7
Date:,"Wed, 07 Jun 2023",Prob (F-statistic):,0.0
Time:,19:12:35,Log-Likelihood:,-13550.0
No. Observations:,1338,AIC:,27110.0
Df Residuals:,1332,BIC:,27140.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-232.4071,633.262,-0.367,0.714,-1474.707,1009.892
x1,-578.8620,388.384,-1.490,0.136,-1340.774,183.050
x2,-1.204e+04,357.453,-33.692,0.000,-1.27e+04,-1.13e+04
x3,1.181e+04,397.013,29.749,0.000,1.1e+04,1.26e+04
x4,257.1365,11.901,21.607,0.000,233.791,280.482
x5,333.4448,28.449,11.721,0.000,277.635,389.255
x6,468.0668,137.777,3.397,0.001,197.783,738.350

0,1,2,3
Omnibus:,302.906,Durbin-Watson:,2.089
Prob(Omnibus):,0.0,Jarque-Bera (JB):,726.891
Skew:,1.22,Prob(JB):,1.44e-158
Kurtosis:,5.662,Cond. No.,2.9e+16


In [361]:
# Second pass, this time staying in pandas: features are every column except
# the last, the target is the last column (`charges`).
Xr, yr = dataset.iloc[:, :-1], dataset.iloc[:, -1]
Xr

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest
1334,18,female,31.920,0,no,northeast
1335,18,female,36.850,0,no,southeast
1336,21,female,25.800,0,no,southwest


In [362]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# Categorical columns to one-hot encode, each with an explicit category order
# so the order of the generated dummy columns is fixed by this list.
categories = [('sex',['male','female']),
              ('smoker',['yes','no']),
              ('region',['southwest', 'southeast', 'northwest', 'northeast'])]

ohe_columns = [name for name, _ in categories]
ohe_categories = [levels for _, levels in categories]
# sparse_output=False yields a dense array that can go straight into a DataFrame.
enc = OneHotEncoder(sparse_output=False, categories=ohe_categories)

# Encode the categorical columns; age, bmi and children are passed through.
transformer = make_column_transformer((enc, ohe_columns), remainder='passthrough')

# Fit and transform once, then reuse that result for the labelled frame
# instead of transforming the same data a second time.
transformed = transformer.fit_transform(Xr)
Xr_enc = pd.DataFrame(
    transformed,
    columns=transformer.get_feature_names_out(),
    index=Xr.index
)

In [363]:
# Display the encoded frame; column names carry the transformer prefix
# (`onehotencoder__` for dummies, `remainder__` for passed-through columns).
Xr_enc

Unnamed: 0,onehotencoder__sex_male,onehotencoder__sex_female,onehotencoder__smoker_yes,onehotencoder__smoker_no,onehotencoder__region_southwest,onehotencoder__region_southeast,onehotencoder__region_northwest,onehotencoder__region_northeast,remainder__age,remainder__bmi,remainder__children
0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,19.0,27.900,0.0
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,18.0,33.770,1.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,28.0,33.000,3.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,33.0,22.705,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,32.0,28.880,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1333,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,50.0,30.970,3.0
1334,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,18.0,31.920,0.0
1335,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,18.0,36.850,0.0
1336,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,21.0,25.800,0.0


In [364]:
# Keep only the features used for the final model: age, bmi, children and the
# smoker dummies. Age and bmi come first because the scaling cell further
# down standardises the first two columns by position.
# NOTE(review): both smoker dummies are kept; together with the model's
# intercept they are redundant — consider keeping only one of them.
Xr_needed = Xr_enc[['remainder__age', 'remainder__bmi', 'remainder__children', 'onehotencoder__smoker_no','onehotencoder__smoker_yes']]
Xr_needed

Unnamed: 0,remainder__age,remainder__bmi,remainder__children,onehotencoder__smoker_no,onehotencoder__smoker_yes
0,19.0,27.900,0.0,0.0,1.0
1,18.0,33.770,1.0,1.0,0.0
2,28.0,33.000,3.0,1.0,0.0
3,33.0,22.705,0.0,1.0,0.0
4,32.0,28.880,0.0,1.0,0.0
...,...,...,...,...,...
1333,50.0,30.970,3.0,1.0,0.0
1334,18.0,31.920,0.0,1.0,0.0
1335,18.0,36.850,0.0,1.0,0.0
1336,21.0,25.800,0.0,1.0,0.0


In [365]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(Xr_needed,yr, test_size= 0.2, random_state = 0)

In [366]:
# Report the shape of each piece of the train/test split.
for label, part in (('X_train:', X_train),
                    ('X_test:', X_test),
                    ('y_train:', y_train),
                    ('y_test:', y_test)):
    print(label, part.shape)

X_train: (1070, 5)
X_test: (268, 5)
y_train: (1070,)
y_test: (268,)


In [375]:

from sklearn.preprocessing import StandardScaler
# Standardise the continuous features age and bmi (the first two columns of
# Xr_needed). The scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into training.
# NOTE(review): `remainder__children` is left unscaled, as in the original
# positional `:2` selection — confirm that is intended.
sc = StandardScaler()
scaled_cols = ['remainder__age', 'remainder__bmi']

# Work on explicit copies: the split frames are derived from Xr_needed, and
# assigning into them in place can trigger SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()
X_train.loc[:, scaled_cols] = sc.fit_transform(X_train[scaled_cols])
X_test.loc[:, scaled_cols] = sc.transform(X_test[scaled_cols])

print("X_train\n",X_train)
print("X_test\n",X_test)

X_train
       remainder__age  remainder__bmi  remainder__children  \
621        -0.148533        0.545305                  4.0   
194        -1.497808        0.598672                  0.0   
240        -1.142736        0.960921                  2.0   
1168       -0.503605        0.723195                  2.0   
1192        1.342771        0.269575                  1.0   
...              ...             ...                  ...   
763        -0.858678       -0.759761                  0.0   
835         0.206539        0.847718                  2.0   
1216        0.064510       -0.913394                  0.0   
559        -1.426793        0.776562                  0.0   
684        -0.432591       -1.977500                  1.0   

      onehotencoder__smoker_no  onehotencoder__smoker_yes  
621                        0.0                        1.0  
194                        1.0                        0.0  
240                        0.0                        1.0  
1168              

In [376]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the (partly standardised) training features.
# NOTE(review): X_train contains both smoker dummies, and the coefficients
# reported further down for them are equal and opposite (-11793.78 / 11793.78),
# so only their difference is meaningful — confirm before interpreting them.
regressor = LinearRegression()
regressor.fit(X_train,y_train)

In [377]:
# Predict charges for the held-out test rows. The np.array(...) wrapper
# guarantees an ndarray (presumably predict already returns one — confirm).
y_pred = np.array(regressor.predict(X_test))
y_pred

array([11461.22,  9920.25, 37782.65, 15948.08,  6880.18,  3611.72,
        1116.04, 14050.37,  8755.91,  7245.24,  4141.89, 10105.76,
        9070.29,  4297.77, 27698.89, 11158.3 , 11185.2 ,  5566.18,
        8020.78, 26614.64, 33491.4 , 14072.29, 11247.09, 32583.5 ,
        4526.14,  8649.41,   772.17, 10160.38,  4246.31, 10201.55,
        8778.52, 40341.15, 15888.84, 14071.98, 24375.44,  4693.49,
       12917.27, 30917.3 , 33245.26,  3865.5 ,  3634.32,  4441.73,
       30088.45, 39174.93, 28081.32,  4782.78, 11052.5 ,  7665.9 ,
        3205.14, 10741.82,  5132.19,  3180.45, 33323.53, 37851.14,
       16314.95,  6589.8 ,  6075.05,  9305.58,  9388.16, 11615.75,
        2135.69, 38676.  , 15295.08, 12055.12, 13472.33, 14188.  ,
       26220.2 , 31898.86,  1323.3 ,  9909.62, 12684.18, 12011.48,
       25529.35, 15967.47, 11093.83, 12220.8 ,  6967.95, 10009.1 ,
       29734.93, 39299.29, 11596.88, 37453.39,  3993.91,  9676.42,
       34280.18, 29318.8 ,  8743.51,  4678.75, 12380.35, 30644

In [378]:
# Actual charges for the test rows; note the Series keeps the original
# (shuffled) row labels from `dataset` as its index.
y_test

578      9724.53000
610      8547.69130
569     45702.02235
1034    12950.07120
198      9644.25250
           ...     
1084    15019.76005
726      6664.68595
1132    20709.02034
725     40932.42950
963      9500.57305
Name: charges, Length: 268, dtype: float64

In [379]:
# Side-by-side table of predicted vs. actual charges for the test rows.
# y_test keeps the shuffled row labels from the split, so it is converted to
# a plain array: the two columns are then paired strictly by position and the
# result gets a fresh 0..n-1 index.
y_res = pd.DataFrame({'pred': y_pred, 'test': y_test.to_numpy()})
y_res

Unnamed: 0,pred,test
0,11461.217784,9724.53000
1,9920.246705,8547.69130
2,37782.650918,45702.02235
3,15948.078800,12950.07120
4,6880.179534,9644.25250
...,...,...
263,14536.060672,15019.76005
264,8081.020601,6664.68595
265,15537.369105,20709.02034
266,33137.972547,40932.42950


In [380]:
# Fitted coefficients, in X_train column order: age, bmi, children,
# smoker_no, smoker_yes (age and bmi are on the standardised scale).
regressor.coef_

array([  3590.42,   1982.58,    430.55, -11793.78,  11793.78])

In [381]:
# Fitted intercept of the linear model.
regressor.intercept_

19702.479042114937

In [383]:
# Residual sum of squares on the test set. Computed with a vectorised NumPy
# reduction on plain arrays rather than Python's built-in sum over a Series;
# the value may differ from the built-in sum only in floating-point rounding.
np.sum((y_pred - y_test.to_numpy()) ** 2)

8622017897.418137