In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
df = pd.read_csv('insurance_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1250 non-null   int64  
 1   gender    1250 non-null   object 
 2   BMI       1250 non-null   float64
 3   weight    1248 non-null   float64
 4   children  1250 non-null   int64  
 5   smoker    1249 non-null   object 
 6   region    1250 non-null   object 
 7   expenses  1248 non-null   float64
dtypes: float64(3), int64(2), object(3)
memory usage: 78.3+ KB


In [3]:
# comments:

# (1) there is a small amount of rows with missing values - they can be dropped

# (2) you may want to make use of https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# read through the function documentation carefully, and use dtype=float for the "dtype" parameter

# (3) perform all your computations (solve the task) before the questions part, in a complete, clear and effective manner

# (4) in the questions part only print answers based on your solution

In [27]:
from sklearn.preprocessing import StandardScaler

df = df.dropna()
df_encoded = pd.get_dummies(df,columns=['gender','smoker','region'], drop_first=True, dtype=float)
scaler = StandardScaler()


x_multi_coll = np.array(df_encoded.drop(columns=['expenses']))
x_non_multi_coll = np.array(df_encoded.drop(columns=['expenses', 'weight']))
x_only_significant = np.array(df_encoded.drop(columns=['expenses', 'weight','children', 'region_northwest']))
x_only_significant_scaling = scaler.fit_transform(x_only_significant)

y = np.array(df_encoded['expenses'])

x_multi_coll = sm.add_constant(x_multi_coll)
x_non_multi_coll = sm.add_constant(x_non_multi_coll)
x_only_significant = sm.add_constant(x_only_significant)
x_only_significant_scaling = sm.add_constant(x_only_significant_scaling)

model_multi_coll = sm.OLS(y, x_multi_coll)
model_non_multi_coll = sm.OLS(y, x_non_multi_coll)
model_only_significant = sm.OLS(y, x_only_significant)
model_only_significant_scaling = sm.OLS(y, x_only_significant_scaling)

results_multi_coll = model_multi_coll.fit()
results_non_multi_coll = model_non_multi_coll.fit()
results_only_significant = model_only_significant.fit()
results_only_significant_scaling = model_only_significant_scaling.fit()


# print(df_encoded.drop(columns=['expenses', 'BMI']).columns)
results_multi_coll.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.75
Model:,OLS,Adj. R-squared:,0.748
Method:,Least Squares,F-statistic:,412.0
Date:,"Mon, 12 May 2025",Prob (F-statistic):,0.0
Time:,17:20:57,Log-Likelihood:,-12603.0
No. Observations:,1245,AIC:,25230.0
Df Residuals:,1235,BIC:,25280.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.435e+04,1112.067,-12.904,0.000,-1.65e+04,-1.22e+04
x1,259.5958,12.354,21.012,0.000,235.358,283.834
x2,386.4056,40.218,9.608,0.000,307.503,465.308
x3,-20.7583,12.618,-1.645,0.100,-45.513,3.996
x4,124.4372,139.801,0.890,0.374,-149.836,398.711
x5,-123.5982,344.596,-0.359,0.720,-799.656,552.460
x6,2.397e+04,431.116,55.595,0.000,2.31e+04,2.48e+04
x7,-312.6693,493.827,-0.633,0.527,-1281.503,656.164
x8,-1113.1248,496.451,-2.242,0.025,-2087.106,-139.144

0,1,2,3
Omnibus:,268.86,Durbin-Watson:,2.07
Prob(Omnibus):,0.0,Jarque-Bera (JB):,608.893
Skew:,1.187,Prob(JB):,6.0300000000000005e-133
Kurtosis:,5.469,Cond. No.,607.0


#### Questions (answer the questions, all computations should precede this part)

#### Question 1

In [5]:
# did you remove any numerical predictor from the data based on multi-collinearity considerations?
# if not - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command


#### Question 1 - solution

In [6]:
print("We removed the 'weight' predictor because it is highly correlated with the 'BMI' predictor, and we decided to keep 'BMI' because Pearson correlation to expanses is higher.")

We removed the 'weight' predictor because it is highly correlated with the 'BMI' predictor, and we decided to keep 'BMI' because Pearson correlation to expanses is higher.


#### Question 2

In [7]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your inspection of the regression summary (after taking care of multi-collinearity)
# display your answer as a dataframe (as in assignment 2)

#### Question 2 - solution

In [8]:
print("the amount of money a person is likely to spend on medical expenses with each additional year of age is 259.5958")

the amount of money a person is likely to spend on medical expenses with each additional year of age is 259.5958


#### Question 3

In [9]:
# consider the predictors: age, gender, BMI, weight, children, smoker
# what predictors (out of this list) have significant contribution to predicting medical expenses?

# report only signifnicant predictors sorted by their contribution to the prediction from highest to lowest
# for each predictor specify if it has a positive or a negative effect on the medical expenses

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value

#### Question 4

In [10]:
# compute R-squared for four regression versions:
# (1) including all predictors from the csv file
# (2) including predictors after taking care of the multi-collineraity issue
# (3) (2) above + including only predictors with signficant contribution to the model
# (4) (3) above + after preditor scaling

#### Question 4—solution

In [20]:
answer = [("R-squared for model with multi-collinearity", results_multi_coll.rsquared),
          ("R-squared for model without multi-collinearity",results_non_multi_coll.rsquared),
          ("R-squared for model with only significant predictors" ,results_only_significant.rsquared),
          ("R-squared for model with only significant predictors and scaling" ,results_only_significant_scaling.rsquared)]
df_answer = pd.DataFrame(answer, columns=["model type", "R-squared"])
df_answer


Unnamed: 0,model type,R-squared
0,R-squared for model with multi-collinearity,0.750133
1,R-squared for model without multi-collinearity,0.749585
2,R-squared for model with only significant pred...,0.749354
3,R-squared for model with only significant pred...,0.749354


In [12]:
# what medical expenses may expect a person with the following data?
# age=66, gender=female, BMI=35.4, weight=70.5, children=1, smoker=no, region=southeast

# for this question only, include you computation *in the answer below* using model (3) from Question 4

# !! you may face difficuly adding a constant (sm.add_constant()) to a DataFrame with a single row
# try to search for solution, and in case you need a hint, you may find these links useful - read carefully:
# https://github.com/statsmodels/statsmodels/issues/7057
# https://www.statsmodels.org/0.9.0/generated/statsmodels.tools.tools.add_constant.html
# in this specific case add_constant() has a somewhat unexpected behavior

In [26]:
x_test = pd.DataFrame([{
    'age': 66,
    'BMI': 35.4,
    'gender_male': 0,
    'smoker_yes': 0,
    'region_southeast': 1,
    'region_southwest': 0
}])
# Add a constant column
x_test = sm.add_constant(x_test,has_constant='add')

# Predict using the model
predict = results_only_significant.predict(x_test)
print(predict)

0    13920.90848
dtype: float64
