In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw insurance records, then print column dtypes and non-null counts
df = pd.read_csv(filepath_or_buffer='insurance_data.csv')
df.info()

In [None]:
# comments:

# (1) there is a small number of rows with missing values - they can be dropped

# (2) you may want to make use of https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# read through the function documentation carefully, and use dtype=float for the "dtype" parameter

# (3) perform all your computations (solve the task) before the questions part, in a complete, clear and effective manner

# (4) in the questions part only print answers based on your solution

#### Drop missing values, convert categorical values to numerical, and check multicollinearity

In [None]:
# Discard every row that contains at least one missing value
df = df.dropna()

# One-hot encode the categorical columns as floats; drop_first=True keeps
# k-1 dummy columns for a variable with k levels
categorical_columns = ['gender', 'smoker', 'region']
df_no_dummies = pd.get_dummies(
    df,
    columns=categorical_columns,
    drop_first=True,
    dtype=float,
)

# Pairwise correlations between all columns, used to check multicollinearity
df_no_dummies.corr(method='pearson')

In [None]:
# Work on the frame whose categorical variables are already dummy-encoded
df_train = df_no_dummies

# Several feature combinations were tried; the one below was kept based on
# multicollinearity, correlation with expenses, and confidence intervals.
# Other combinations that were tested (listed by the columns dropped):
#   ['weight', 'expenses', 'region_northwest', 'region_southwest', 'children', 'gender_male']
#   ['weight', 'expenses', 'children', 'gender_male']
#   ['weight', 'expenses', 'region_northwest', 'region_southwest', 'region_southeast', 'children', 'gender_male']

# Final selected features: everything except the target and the excluded predictors
x_train = df_train.drop(
    columns=['weight', 'expenses', 'region_northwest', 'children', 'gender_male']
)

# Target variable (insurance cost), read from the same frame as the features
# so that the rows of x_train and y_train are guaranteed to line up
y_train = df_train['expenses']

#### Train model

In [None]:
# Add the intercept column to the design matrix, then fit ordinary least squares
x_train = sm.add_constant(x_train)
model = sm.OLS(endog=y_train, exog=x_train)
result = model.fit()

# Regression table: coefficients, p-values, confidence intervals, R-squared
result.summary()

#### Question 3 - Calculation

In [None]:
# Work on a copy so the unscaled training frame stays untouched
df_scaled = df_train.copy()

# Split into target and predictors
y = df_scaled[["expenses"]]
x = df_scaled.drop(columns=["expenses"])

# Standardize every predictor, then restore the column labels and row index
scaler = StandardScaler()
predictors_scaled = scaler.fit(x).transform(x)
x_scaled = pd.DataFrame(predictors_scaled, columns=x.columns, index=x.index)

# Fit OLS on the standardized predictors plus an intercept
x_scaled = sm.add_constant(x_scaled)
model = sm.OLS(y, x_scaled)
result = model.fit()
result.summary()

#### Question 4(1) - Calculation

In [None]:
# Model (1): regress expenses on every available predictor
y = df_train['expenses']
x = df_train.drop(columns=['expenses'])

# Add the intercept and fit OLS
x = sm.add_constant(x)
model = sm.OLS(endog=y, exog=x)
result_4_1 = model.fit()

result_4_1.summary()

#### Question 4(2) - Calculation

In [None]:
# Model (2): same as model (1) but without 'weight', which addresses the
# multicollinearity issue
y = df_train['expenses']
x = df_train.drop(columns=['expenses', 'weight'])

# Add the intercept and fit OLS
x = sm.add_constant(x)
model = sm.OLS(endog=y, exog=x)
result_4_2 = model.fit()

result_4_2.summary()

In [None]:
# Pairwise correlation matrix of the training frame (predictors and target)
df_train.corr(method='pearson')

#### Question 4(3) - Calculation

In [None]:
# Model (3): model (2) restricted to the predictors with a significant
# contribution to the model
excluded_columns = ['expenses', 'weight', 'children', 'gender_male', 'region_northwest']
y = df_train['expenses']
x = df_train.drop(columns=excluded_columns)

# Add the intercept and fit OLS
x = sm.add_constant(x)
model = sm.OLS(endog=y, exog=x)
result_4_3 = model.fit()

result_4_3.summary()

#### Question 4(4) - Calculation

In [None]:
# Model (4): the predictors of model (3), standardized before fitting
x = df_train.drop(
    columns=['expenses', 'weight', 'children', 'gender_male', 'region_northwest']
)

# Standardize the predictors and rebuild a labelled frame on the original index
scaler = StandardScaler()
predictors_scaled = scaler.fit(x).transform(x)
df_scaler = pd.DataFrame(predictors_scaled, columns=x.columns, index=df_train.index)

# Add the intercept and fit OLS against the unscaled target
x = sm.add_constant(df_scaler)
y = df_train['expenses']
model = sm.OLS(endog=y, exog=x)
result_4_4 = model.fit()

result_4_4.summary()

#### Questions (answer the questions, all computations should precede this part)

#### Question 1

In [None]:
# did you remove any numerical predictor from the data based on multi-collinearity considerations?
# if not - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command

#### Question 1 - Solution

In [None]:
# Answer to Question 1. The relation between a predictor and the target is a
# correlation (not collinearity), so the wording below uses that term.
print("Yes, we removed 'weight' because of its multicollinearity with 'BMI'. We removed 'weight' rather than 'BMI' because 'weight' has a weaker correlation with 'expenses' (0.139946) than 'BMI' does (0.204042).")

#### Question 2

In [None]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your inspection of the regression summary (after taking care of multi-collinearity)
# display your answer as a dataframe (as in assignment 2)

#### Question 2 - Solution

In [None]:
# Value read off the regression summary and hardcoded, as the question requires
description = "The amount of money a person is likely to spend on medical expenses with each additional year of age"
amount_per_year = 258.7057

# One-row table: what the number means, and the number itself
data = {"Description": [description], "Amount": [amount_per_year]}
df_amount_per_year = pd.DataFrame(data)

df_amount_per_year

#### Question 3

In [None]:
# consider the predictors: age, gender, BMI, weight, children, smoker
# what predictors (out of this list) have significant contribution to predicting medical expenses?

# report only significant predictors sorted by their contribution to the prediction from highest to lowest
# for each predictor specify if it has a positive or a negative effect on the medical expenses

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value

#### Question 3 - solution

In [None]:
# Significant predictors, ordered from highest to lowest contribution,
# each paired with the direction of its effect on medical expenses
significant_predictors = [
    ("smoking", "positive"),
    ("age", "positive"),
    ("BMI", "positive"),
]

data_q3 = {
    "predictor": [name for name, _ in significant_predictors],
    "effect": [direction for _, direction in significant_predictors],
}

pd.DataFrame(data_q3)

#### Question 4

In [None]:
# compute R-squared for four regression versions:
# (1) including all predictors from the csv file
# (2) including predictors after taking care of the multicollinearity issue
# (3) (2) above + including only predictors with significant contribution to the model
# (4) (3) above + after predictor scaling

#### Question 4 - Solution

In [None]:
# Pair each regression version with its fitted result so the reported
# R-squared always comes from the model itself instead of a hand-copied number
q4_models = [
    ("1. including all predictors", result_4_1),
    ("2. predictors after taking care of the multi-collineraity issue", result_4_2),
    ("3. (2) + predictors with signficant contribution", result_4_3),
    ("4. (3) + after preditor scaling", result_4_4),
]

# R-squared is formatted to three decimals as a string, matching the
# precision shown in the regression summary
data_q4 = {
    "predictors": [label for label, _ in q4_models],
    "R-squared": [f"{fitted.rsquared:.3f}" for _, fitted in q4_models],
}
df_q4 = pd.DataFrame(data_q4)
df_q4

#### Question 5

In [None]:
# what medical expenses may a person with the following data expect?
# age=66, gender=female, BMI=35.4, weight=70.5, children=1, smoker=no, region=southeast

# for this question only, include your computation *in the answer below* using model (3) from Question 4

# !! you may face difficulty adding a constant (sm.add_constant()) to a DataFrame with a single row
# try to search for solution, and in case you need a hint, you may find these links useful - read carefully:
# https://github.com/statsmodels/statsmodels/issues/7057
# https://www.statsmodels.org/0.9.0/generated/statsmodels.tools.tools.add_constant.html
# in this specific case add_constant() has a somewhat unexpected behavior

#### Question 5 - Solution

In [None]:
# The new person's data, restricted to the predictors that model (3) uses;
# the dummy columns encode smoker=no and region=southeast
person_medical_data = {
    "age": 66,
    "BMI": 35.4,
    "smoker_yes": 0,
    "region_southeast": 1,
    "region_southwest": 0
}

# Build a single-row frame and force the intercept column with
# has_constant='add' (the default setting does not add it for a one-row frame)
person_medical_df = pd.DataFrame([person_medical_data])
person_medical_df = sm.add_constant(person_medical_df, has_constant='add')

# Put the columns in the fitted model's own order so that each value is
# paired with the coefficient of the same name; this also raises a KeyError
# if a required predictor is missing from the dict above
person_medical_df = person_medical_df[result_4_3.model.exog_names]

# Predict medical expenses using the result_4_3 model
predicted_medical_expenses = result_4_3.predict(person_medical_df)

# Display the prediction as a DataFrame
pd.DataFrame({"Predicted Medical Expenses": predicted_medical_expenses})