In [11]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [12]:
df = pd.read_csv('insurance_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1250 non-null   int64  
 1   gender    1250 non-null   object 
 2   BMI       1250 non-null   float64
 3   weight    1248 non-null   float64
 4   children  1250 non-null   int64  
 5   smoker    1249 non-null   object 
 6   region    1250 non-null   object 
 7   expenses  1248 non-null   float64
dtypes: float64(3), int64(2), object(3)
memory usage: 78.3+ KB


In [13]:
# comments:

# (1) there is a small amount of rows with missing values - they can be dropped

# (2) you may want to make use of https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# read through the function documentation carefully, and use dtype=float for the "dtype" parameter

# (3) perform all your computations (solve the task) before the questions part, in a complete, clear and effective manner

# (4) in the questions part only print answers based on your solution

#### Drop missing values, change categorical values to numerical and check multi-colinearity

In [14]:
# Remove all the rows with the missing values
df.dropna(inplace=True) 

# Change categorical values to numerical
df_no_dummies = pd.get_dummies(df, columns=['gender', 'smoker', 'region'], drop_first=True, dtype=float)

#check multi-colinearity
print(df_no_dummies.corr())

                       age       BMI    weight  children  expenses  \
age               1.000000  0.110487  0.095153 -0.047744  0.297255   
BMI               0.110487  1.000000  0.702751  0.019617  0.204042   
weight            0.095153  0.702751  1.000000  0.041461  0.139946   
children         -0.047744  0.019617  0.041461  1.000000 -0.017320   
expenses          0.297255  0.204042  0.139946 -0.017320  1.000000   
gender_male      -0.027751  0.040130  0.031041 -0.035967  0.048353   
smoker_yes       -0.028560  0.009467  0.018486 -0.022356  0.786897   
region_northwest -0.009372 -0.143934 -0.098967  0.025904 -0.044917   
region_southeast -0.018734  0.278327  0.234791  0.042594  0.081188   
region_southwest  0.021022 -0.014695 -0.006866 -0.041533 -0.049730   

                  gender_male  smoker_yes  region_northwest  region_southeast  \
age                 -0.027751   -0.028560         -0.009372         -0.018734   
BMI                  0.040130    0.009467         -0.143934        

In [15]:
# Convert categorical variables into numerical dummy variables

df_train=df_no_dummies

# Shuffle the dataframe and reset the index
#df_train = df_train.sample(frac=1).reset_index(drop=True)
#df_train.head
#x = df_train.drop(['weight', 'expenses', 'region_northwest', 'region_southwest', 'children', 'gender_male'], axis=1)
#x = df_train.drop(['weight', 'expenses', 'children', 'gender_male'], axis=1)
#x = df_train.drop(['weight', 'expenses', 'region_northwest', 'region_southwest', 'region_southeast', 'children', 'gender_male'], axis=1)
x_train = df_train.drop(['weight', 'expenses', 'region_northwest', 'children', 'gender_male'], axis=1)
y_train = df['expenses']

#### Train model

In [16]:
x_train = sm.add_constant(x_train)
model = sm.OLS(y_train, x_train)
result = model.fit()

result.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.749
Model:,OLS,Adj. R-squared:,0.748
Method:,Least Squares,F-statistic:,740.7
Date:,"Sat, 17 May 2025",Prob (F-statistic):,0.0
Time:,12:45:50,Log-Likelihood:,-12605.0
No. Observations:,1245,AIC:,25220.0
Df Residuals:,1239,BIC:,25250.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.432e+04,1031.334,-13.889,0.000,-1.63e+04,-1.23e+04
age,258.7057,12.327,20.987,0.000,234.522,282.890
BMI,341.7059,29.583,11.551,0.000,283.668,399.744
smoker_yes,2.394e+04,429.783,55.712,0.000,2.31e+04,2.48e+04
region_southeast,-991.5815,428.987,-2.311,0.021,-1833.203,-149.960
region_southwest,-938.8055,430.381,-2.181,0.029,-1783.161,-94.450

0,1,2,3
Omnibus:,268.187,Durbin-Watson:,2.081
Prob(Omnibus):,0.0,Jarque-Bera (JB):,604.425
Skew:,1.187,Prob(JB):,5.63e-132
Kurtosis:,5.453,Cond. No.,359.0


#### Question 3 - Calculation

In [20]:
df_train.head()
df_scaled = df_train.copy()
df_scaled.drop(['region_northwest', 'region_southwest', 'region_southeast',], axis=1, inplace=True)

scaler = StandardScaler()
scaler.fit(df_scaled)
predictors_scaled = scaler.transform(df_scaled)

df = df.sample(frac=1).reset_index(drop = True)
df_scaled = pd.DataFrame(predictors_scaled, columns=["age","gender_male","BMI","weight","children","smoker_yes","expenses"])

x = df_scaled[["age","BMI","children","smoker_yes"]] #without weight and gender
y = df_scaled[["expenses"]]
x = sm.add_constant(x)
model = sm.OLS(y,x)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.701
Model:,OLS,Adj. R-squared:,0.7
Method:,Least Squares,F-statistic:,727.5
Date:,"Sat, 17 May 2025",Prob (F-statistic):,2.5e-323
Time:,12:47:00,Log-Likelihood:,-1014.6
No. Observations:,1245,AIC:,2039.0
Df Residuals:,1240,BIC:,2065.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.822e-16,0.016,-1.82e-14,1.000,-0.030,0.030
age,-0.2823,0.016,-17.316,0.000,-0.314,-0.250
BMI,-0.0786,0.016,-5.004,0.000,-0.109,-0.048
children,0.8807,0.016,53.714,0.000,0.849,0.913
smoker_yes,0.0222,0.016,1.425,0.154,-0.008,0.053

0,1,2,3
Omnibus:,152.967,Durbin-Watson:,2.008
Prob(Omnibus):,0.0,Jarque-Bera (JB):,640.308
Skew:,0.516,Prob(JB):,9.1e-140
Kurtosis:,6.358,Cond. No.,1.41


#### Question 4(1) - Calculation

In [None]:
x = df_train.drop(['expenses'], axis=1)
y = df_train['expenses']

x = sm.add_constant(x)
model = sm.OLS(y, x)
result_4_1  = model.fit()

result_4_1 .summary()

#### Question 4(2) - Calculation

In [None]:
x = df_train.drop(['expenses','weight'], axis=1)
y = df_train['expenses']

x = sm.add_constant(x)
model = sm.OLS(y, x)
result_4_2  = model.fit()

result_4_2 .summary()

In [None]:
df_train.corr()

#### Question 4(3) - Calculation

In [None]:
x = df_train.drop(['expenses','weight','children','gender_male','region_northwest'], axis=1)
y = df_train['expenses']

x = sm.add_constant(x)
model = sm.OLS(y, x)
result_4_3  = model.fit()

result_4_3 .summary()

#### Question 4(4) - Calculation

In [None]:
x = df_train.drop(['expenses','weight','children','gender_male','region_northwest'], axis=1)
scaler = StandardScaler()
scaler.fit(x)
predictors_scaled = scaler.transform(x)

df_scaler=pd.DataFrame(predictors_scaled, columns=x.columns, index=df_train.index)
x = sm.add_constant(df_scaler)
y= df_train['expenses']


model = sm.OLS(y, x)
result_4_4 = model.fit()

result_4_4.summary()

#### Questions (answer the questions, all computations should precede this part)

#### Question 1

In [None]:
# did you remove any numerical predictor from the data based on multi-collinearity considerations?
# if not - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command

#### Question 1 - Solution

In [None]:
print("Yes we removed 'weight' because the multi-collinearity with BMI, we also removed 'children' and 'gender_male' because they had low correlation with 'expenses' and we removed 'region_northwest because it has 0 in the CI ")

#### Question 2

In [None]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your inspection of the regression summary (after taking care of multi-collinearity)
# display your answer as a dataframe (as in assignment 2)

#### Question 2 - Solution

In [None]:
data = {
    "Description": ["The amount of money a person is likely to spend on medical expenses with each additional year of age"],
    "Amount": [258.7057]
}

df_amount_per_year = pd.DataFrame(data)

df_amount_per_year

#### Question 3

In [None]:
# consider the predictors: age, gender, BMI, weight, children, smoker
# what predictors (out of this list) have significant contribution to predicting medical expenses?

# report only signifnicant predictors sorted by their contribution to the prediction from highest to lowest
# for each predictor specify if it has a positive or a negative effect on the medical expenses

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value

#### Question 3 - solution

In [23]:
data_q3 = {
    "predictor": ["smoking","age", "BMI"],
    "Amount": ["positive", "positive","positive"]
}

pd.DataFrame(data_q3)

Unnamed: 0,predictor,Amount
0,smoking,positive
1,age,positive
2,BMI,positive


#### Question 4

In [None]:
# compute R-squared for four regression versions:
# (1) including all predictors from the csv file
# (2) including predictors after taking care of the multi-collineraity issue
# (3) (2) above + including only predictors with signficant contribution to the model
# (4) (3) above + after preditor scaling

#### Question 4 - Solution

In [None]:
data_q4={
    "predictors": ["1. including all predictors", "2.predictors after taking care of the multi-collineraity issue", "3. only [redictors with signficant contribution +(2)",
                   "4. after preditor scaling +(3)"], "R-squared": ["0.750", "0.749", "0.749", "0.749"]
                  
}
df_q4 = pd.DataFrame(data_q4)
df_q4

#### Question 5

In [None]:
# what medical expenses may expect a person with the following data?
# age=66, gender=female, BMI=35.4, weight=70.5, children=1, smoker=no, region=southeast

# for this question only, include you computation *in the answer below* using model (3) from Question 4

# !! you may face difficuly adding a constant (sm.add_constant()) to a DataFrame with a single row
# try to search for solution, and in case you need a hint, you may find these links useful - read carefully:
# https://github.com/statsmodels/statsmodels/issues/7057
# https://www.statsmodels.org/0.9.0/generated/statsmodels.tools.tools.add_constant.html
# in this specific case add_constant() has a somewhat unexpected behavior

#### Question 5 - Solution

In [None]:
person_medical_data = {
    "age": 66,
    "BMI": 35.4,
    "smoker_yes": 0,
    "region_southeast": 1,
    "region_southwest": 0 
    }

person_medical_df = pd.DataFrame([person_medical_data])

person_medical_df = sm.add_constant(person_medical_df, has_constant='add')

predicted_medical_expenses = result_4_3.predict(person_medical_df)

pd.DataFrame(predicted_medical_expenses, columns=["Predicted Medical Expenses"])