In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('insurance_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1250 non-null   int64  
 1   gender    1250 non-null   object 
 2   BMI       1250 non-null   float64
 3   weight    1248 non-null   float64
 4   children  1250 non-null   int64  
 5   smoker    1249 non-null   object 
 6   region    1250 non-null   object 
 7   expenses  1248 non-null   float64
dtypes: float64(3), int64(2), object(3)
memory usage: 78.3+ KB


In [3]:
# comments:

# (1) there is a small amount of rows with missing values - they can be dropped

# (2) you may want to make use of https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# read through the function documentation carefully, and use dtype=float for the "dtype" parameter

# (3) perform all your computations (solve the task) before the questions part, in a complete, clear and effective manner

# (4) in the questions part only print answers based on your solution

#### Drop missing values, change categorical values to numerical and check multi-colinearity

In [4]:
# Remove all the rows with the missing values
df.dropna(inplace=True) 

# Change categorical values to numerical
df_train = pd.get_dummies(df, columns=['gender', 'smoker', 'region'], drop_first=True, dtype=float)

#check multi-colinearity
print(df_train.corr())

                       age       BMI    weight  children  expenses  \
age               1.000000  0.110487  0.095153 -0.047744  0.297255   
BMI               0.110487  1.000000  0.702751  0.019617  0.204042   
weight            0.095153  0.702751  1.000000  0.041461  0.139946   
children         -0.047744  0.019617  0.041461  1.000000 -0.017320   
expenses          0.297255  0.204042  0.139946 -0.017320  1.000000   
gender_male      -0.027751  0.040130  0.031041 -0.035967  0.048353   
smoker_yes       -0.028560  0.009467  0.018486 -0.022356  0.786897   
region_northwest -0.009372 -0.143934 -0.098967  0.025904 -0.044917   
region_southeast -0.018734  0.278327  0.234791  0.042594  0.081188   
region_southwest  0.021022 -0.014695 -0.006866 -0.041533 -0.049730   

                  gender_male  smoker_yes  region_northwest  region_southeast  \
age                 -0.027751   -0.028560         -0.009372         -0.018734   
BMI                  0.040130    0.009467         -0.143934        

In [5]:
# Convert categorical variables into numerical dummy variables

#df_train.head()

# Shuffle the dataframe and reset the index
#df_train = df_train.sample(frac=1).reset_index(drop=True)
#df_train.head
#x = df_train.drop(['weight', 'expenses', 'region_northwest', 'region_southwest', 'children', 'gender_male'], axis=1)
#x = df_train.drop(['weight', 'expenses', 'children', 'gender_male'], axis=1)
#x = df_train.drop(['weight', 'expenses', 'region_northwest', 'region_southwest', 'region_southeast', 'children', 'gender_male'], axis=1)
x_train = df_train.drop(['weight', 'expenses', 'region_northwest', 'children', 'gender_male'], axis=1)
y_train = df['expenses']

#### Train model

In [6]:
x_train = sm.add_constant(x_train)
model = sm.OLS(y_train, x_train)
result = model.fit()

result.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.749
Model:,OLS,Adj. R-squared:,0.748
Method:,Least Squares,F-statistic:,740.7
Date:,"Fri, 16 May 2025",Prob (F-statistic):,0.0
Time:,12:26:11,Log-Likelihood:,-12605.0
No. Observations:,1245,AIC:,25220.0
Df Residuals:,1239,BIC:,25250.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.432e+04,1031.334,-13.889,0.000,-1.63e+04,-1.23e+04
age,258.7057,12.327,20.987,0.000,234.522,282.890
BMI,341.7059,29.583,11.551,0.000,283.668,399.744
smoker_yes,2.394e+04,429.783,55.712,0.000,2.31e+04,2.48e+04
region_southeast,-991.5815,428.987,-2.311,0.021,-1833.203,-149.960
region_southwest,-938.8055,430.381,-2.181,0.029,-1783.161,-94.450

0,1,2,3
Omnibus:,268.187,Durbin-Watson:,2.081
Prob(Omnibus):,0.0,Jarque-Bera (JB):,604.425
Skew:,1.187,Prob(JB):,5.63e-132
Kurtosis:,5.453,Cond. No.,359.0


#### Questions (answer the questions, all computations should precede this part)

#### Question 1

In [7]:
# did you remove any numerical predictor from the data based on multi-collinearity considerations?
# if not - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command

In [8]:
#### Question 1 - Solution

In [9]:
print("Yes we removed 'weight' because the multi-collinearity with BMI, we also removed 'children' and 'gender_male' because they had low correlation with 'expenses' and we removed 'region_northwest because it has 0 in the CI ")

Yes we removed 'weight' because the multi-collinearity with BMI, we also removed 'children' and 'gender_male' because they had low correlation with 'expenses' and we removed 'region_northwest because it has 0 in the CI 


#### Question 2

In [10]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your inspection of the regression summary (after taking care of multi-collinearity)
# display your answer as a dataframe (as in assignment 2)

#### Question 2 - Solution

In [11]:
data = {
    "Description": ["The amount of money a person is likely to spend on medical expenses with each additional year of age"],
    "Amount": [258.7057]
}

df_amount_per_year = pd.DataFrame(data)

df_amount_per_year

Unnamed: 0,Description,Amount
0,The amount of money a person is likely to spen...,258.7057


#### Question 3

In [12]:
# consider the predictors: age, gender, BMI, weight, children, smoker
# what predictors (out of this list) have significant contribution to predicting medical expenses?

# report only signifnicant predictors sorted by their contribution to the prediction from highest to lowest
# for each predictor specify if it has a positive or a negative effect on the medical expenses

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value

#### Question 3 - Solution

In [13]:
scaler= StandardScaler()    
scaler.fit(x_train)
predictors_scaled=scaler.transform(x_train)


#### Question 4

In [14]:
# compute R-squared for four regression versions:
# (1) including all predictors from the csv file
# (2) including predictors after taking care of the multi-collineraity issue
# (3) (2) above + including only predictors with signficant contribution to the model
# (4) (3) above + after preditor scaling

#### Question 4(1) - Solution

In [None]:
x = df_train.drop(['expenses'], axis=1)
y = df_train['expenses']

x = sm.add_constant(x)
model = sm.OLS(y, x)
result = model.fit()

result.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.75
Model:,OLS,Adj. R-squared:,0.748
Method:,Least Squares,F-statistic:,412.0
Date:,"Fri, 16 May 2025",Prob (F-statistic):,0.0
Time:,12:26:11,Log-Likelihood:,-12603.0
No. Observations:,1245,AIC:,25230.0
Df Residuals:,1235,BIC:,25280.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.435e+04,1112.067,-12.904,0.000,-1.65e+04,-1.22e+04
age,259.5958,12.354,21.012,0.000,235.358,283.834
BMI,386.4056,40.218,9.608,0.000,307.503,465.308
weight,-20.7583,12.618,-1.645,0.100,-45.513,3.996
children,124.4372,139.801,0.890,0.374,-149.836,398.711
gender_male,-123.5982,344.596,-0.359,0.720,-799.656,552.460
smoker_yes,2.397e+04,431.116,55.595,0.000,2.31e+04,2.48e+04
region_northwest,-312.6693,493.827,-0.633,0.527,-1281.503,656.164
region_southeast,-1113.1248,496.451,-2.242,0.025,-2087.106,-139.144

0,1,2,3
Omnibus:,268.86,Durbin-Watson:,2.07
Prob(Omnibus):,0.0,Jarque-Bera (JB):,608.893
Skew:,1.187,Prob(JB):,6.0300000000000005e-133
Kurtosis:,5.469,Cond. No.,607.0


#### Question 4(2) - Solution

In [19]:
df_train.corr()

Unnamed: 0,age,BMI,weight,children,expenses,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
age,1.0,0.110487,0.095153,-0.047744,0.297255,-0.027751,-0.02856,-0.009372,-0.018734,0.021022
BMI,0.110487,1.0,0.702751,0.019617,0.204042,0.04013,0.009467,-0.143934,0.278327,-0.014695
weight,0.095153,0.702751,1.0,0.041461,0.139946,0.031041,0.018486,-0.098967,0.234791,-0.006866
children,-0.047744,0.019617,0.041461,1.0,-0.01732,-0.035967,-0.022356,0.025904,0.042594,-0.041533
expenses,0.297255,0.204042,0.139946,-0.01732,1.0,0.048353,0.786897,-0.044917,0.081188,-0.04973
gender_male,-0.027751,0.04013,0.031041,-0.035967,0.048353,1.0,0.070157,-0.008706,0.014111,-0.002994
smoker_yes,-0.02856,0.009467,0.018486,-0.022356,0.786897,0.070157,1.0,-0.04132,0.080366,-0.041536
region_northwest,-0.009372,-0.143934,-0.098967,0.025904,-0.044917,-0.008706,-0.04132,1.0,-0.350439,-0.318148
region_southeast,-0.018734,0.278327,0.234791,0.042594,0.081188,0.014111,0.080366,-0.350439,1.0,-0.346617
region_southwest,0.021022,-0.014695,-0.006866,-0.041533,-0.04973,-0.002994,-0.041536,-0.318148,-0.346617,1.0


In [17]:
x = df_train.drop(['expenses','weight'], axis=1)
y = df_train['expenses']

x = sm.add_constant(x)
model = sm.OLS(y, x)
result = model.fit()

result.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.75
Model:,OLS,Adj. R-squared:,0.748
Method:,Least Squares,F-statistic:,462.5
Date:,"יום ו, 16 מאי 2025",Prob (F-statistic):,0.0
Time:,12:29:42,Log-Likelihood:,-12605.0
No. Observations:,1245,AIC:,25230.0
Df Residuals:,1236,BIC:,25270.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.433e+04,1112.737,-12.874,0.000,-1.65e+04,-1.21e+04
age,258.9759,12.357,20.958,0.000,234.733,283.219
BMI,341.6386,29.635,11.528,0.000,283.499,399.778
children,115.6520,139.795,0.827,0.408,-158.610,389.914
gender_male,-126.6326,344.829,-0.367,0.714,-803.147,549.882
smoker_yes,2.396e+04,431.366,55.538,0.000,2.31e+04,2.48e+04
region_northwest,-346.5612,493.738,-0.702,0.483,-1315.219,622.096
region_southeast,-1174.3269,495.397,-2.370,0.018,-2146.239,-202.415
region_southwest,-1102.3264,496.904,-2.218,0.027,-2077.194,-127.459

0,1,2,3
Omnibus:,268.981,Durbin-Watson:,2.073
Prob(Omnibus):,0.0,Jarque-Bera (JB):,608.281
Skew:,1.189,Prob(JB):,8.19e-133
Kurtosis:,5.465,Cond. No.,392.0


#### Question 4(3) - Solution

In [20]:
x = df_train.drop(['expenses','weight','children','gender_male','region_northwest'], axis=1)
y = df_train['expenses']

x = sm.add_constant(x)
model = sm.OLS(y, x)
result = model.fit()

result.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.749
Model:,OLS,Adj. R-squared:,0.748
Method:,Least Squares,F-statistic:,740.7
Date:,"יום ו, 16 מאי 2025",Prob (F-statistic):,0.0
Time:,12:40:01,Log-Likelihood:,-12605.0
No. Observations:,1245,AIC:,25220.0
Df Residuals:,1239,BIC:,25250.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.432e+04,1031.334,-13.889,0.000,-1.63e+04,-1.23e+04
age,258.7057,12.327,20.987,0.000,234.522,282.890
BMI,341.7059,29.583,11.551,0.000,283.668,399.744
smoker_yes,2.394e+04,429.783,55.712,0.000,2.31e+04,2.48e+04
region_southeast,-991.5815,428.987,-2.311,0.021,-1833.203,-149.960
region_southwest,-938.8055,430.381,-2.181,0.029,-1783.161,-94.450

0,1,2,3
Omnibus:,268.187,Durbin-Watson:,2.081
Prob(Omnibus):,0.0,Jarque-Bera (JB):,604.425
Skew:,1.187,Prob(JB):,5.63e-132
Kurtosis:,5.453,Cond. No.,359.0


#### Question 4(4) - Solution

In [27]:
x = df_train.drop(['expenses','weight','children','gender_male','region_northwest'], axis=1)
scaler = StandardScaler()
scaler.fit(x)
predictors_scaled = scaler.transform(x)

df_scaler=pd.DataFrame(predictors_scaled, columns=x.columns, index=df_train.index)
x = sm.add_constant(df_scaler)
y= df_train['expenses']


model = sm.OLS(y, x)
result = model.fit()

result.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.749
Model:,OLS,Adj. R-squared:,0.748
Method:,Least Squares,F-statistic:,740.7
Date:,"יום ו, 16 מאי 2025",Prob (F-statistic):,0.0
Time:,13:07:53,Log-Likelihood:,-12605.0
No. Observations:,1245,AIC:,25220.0
Df Residuals:,1239,BIC:,25250.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.326e+04,171.527,77.290,0.000,1.29e+04,1.36e+04
age,3628.1328,172.875,20.987,0.000,3288.973,3967.292
BMI,2086.3898,180.627,11.551,0.000,1732.021,2440.759
smoker_yes,9592.0390,172.172,55.712,0.000,9254.259,9929.819
region_southeast,-443.4051,191.830,-2.311,0.021,-819.753,-67.057
region_southwest,-400.5801,183.640,-2.181,0.029,-760.859,-40.301

0,1,2,3
Omnibus:,268.187,Durbin-Watson:,2.081
Prob(Omnibus):,0.0,Jarque-Bera (JB):,604.425
Skew:,1.187,Prob(JB):,5.63e-132
Kurtosis:,5.453,Cond. No.,1.63


#### Question 5

In [16]:
# what medical expenses may expect a person with the following data?
# age=66, gender=female, BMI=35.4, weight=70.5, children=1, smoker=no, region=southeast

# for this question only, include you computation *in the answer below* using model (3) from Question 4

# !! you may face difficuly adding a constant (sm.add_constant()) to a DataFrame with a single row
# try to search for solution, and in case you need a hint, you may find these links useful - read carefully:
# https://github.com/statsmodels/statsmodels/issues/7057
# https://www.statsmodels.org/0.9.0/generated/statsmodels.tools.tools.add_constant.html
# in this specific case add_constant() has a somewhat unexpected behavior