In [4]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import statsmodels.api as sm 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [6]:
# The Statsmodel API is good for inference type models, such as interpreting coeffcients, confidence intervals and p-values
data = pd.read_csv("Data/insurance.csv")
data.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [7]:
y = data["charges"]
X = data.drop(columns = ["charges"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state = 42)

ohe = OneHotEncoder(handle_unknown= "ignore", sparse_output = False)

X_train_encoded = ohe.fit_transform(X_train[X.select_dtypes(exclude = "number").columns])
X_test_encoded = ohe.transform(X_test[X.select_dtypes(exclude = "number").columns])

X_train = np.hstack([X_train_encoded, X_train[X.select_dtypes(include = "number").columns]])
X_test = np.hstack([X_test_encoded, X_test[X.select_dtypes(include = "number").columns]])

In [8]:
X_train = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train).fit()

X_test = sm.add_constant(X_test)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.7835929767120724

In [9]:
model.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.742
Model:,OLS,Adj. R-squared:,0.74
Method:,Least Squares,F-statistic:,380.9
Date:,"Tue, 10 Feb 2026",Prob (F-statistic):,1.32e-305
Time:,08:00:31,Log-Likelihood:,-10845.0
No. Observations:,1070,AIC:,21710.0
Df Residuals:,1061,BIC:,21750.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-255.3492,486.430,-0.525,0.600,-1209.824,699.125
x1,-118.3788,306.741,-0.386,0.700,-720.266,483.509
x2,-136.9704,308.173,-0.444,0.657,-741.669,467.728
x3,-1.195e+04,320.386,-37.309,0.000,-1.26e+04,-1.13e+04
x4,1.17e+04,352.808,33.157,0.000,1.1e+04,1.24e+04
x5,395.7479,334.472,1.183,0.237,-260.555,1052.051
x6,25.0706,344.485,0.073,0.942,-650.878,701.019
x7,-262.1164,373.002,-0.703,0.482,-994.022,469.789
x8,-414.0514,351.224,-1.179,0.239,-1103.225,275.122

0,1,2,3
Omnibus:,252.33,Durbin-Watson:,2.085
Prob(Omnibus):,0.0,Jarque-Bera (JB):,613.798
Skew:,1.253,Prob(JB):,5.19e-134
Kurtosis:,5.737,Cond. No.,1.15e+17


In [10]:
model.cov_params()

Unnamed: 0,const,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11
const,236614.339201,117866.708309,118747.630892,107394.143621,129220.19558,37231.795784,46207.6839,90028.195354,63146.664163,-2338.226823,-13341.644023,-10098.840374
x1,117866.708309,94089.955563,23776.752746,50365.466418,67501.241891,18966.283772,24312.695446,43863.342449,30724.386643,-1190.448253,-6538.687992,-4587.785906
x2,118747.630892,23776.752746,94970.878146,57028.677203,61718.953689,18265.512013,21894.988454,46164.852905,32422.27752,-1147.778571,-6802.956031,-5511.054468
x3,107394.143621,50365.466418,57028.677203,102647.268726,4746.874895,16624.188065,17894.721639,46353.828992,26521.404925,-1324.617535,-6743.861304,-4413.482743
x4,129220.19558,67501.241891,61718.953689,4746.874895,124473.320685,20607.607719,28312.962261,43674.366362,36625.259238,-1013.609288,-6597.782719,-5685.357631
x5,37231.795784,18966.283772,18265.512013,16624.188065,20607.607719,111871.805765,-28845.468436,-20185.929068,-25608.612478,-583.071628,-1743.932345,-2250.507283
x6,46207.6839,24312.695446,21894.988454,17894.721639,28312.962261,-28845.468436,118669.689284,-18964.789733,-24651.747216,-690.163969,-2085.936302,-4154.643173
x7,90028.195354,43863.342449,46164.852905,46353.828992,43674.366362,-20185.929068,-18964.789733,139130.434988,-9951.520834,-398.065322,-6039.598864,-1353.018405
x8,63146.664163,30724.386643,32422.27752,26521.404925,36625.259238,-25608.612478,-24651.747216,-9951.520834,123358.54469,-666.925904,-3472.176511,-2340.671513
x9,-2338.226823,-1190.448253,-1147.778571,-1324.617535,-1013.609288,-583.071628,-690.163969,-398.065322,-666.925904,181.642773,-54.275526,-128.480823
