In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

import wrangle as w
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [2]:
def insurance(data):
    
    origin_df = pd.read_csv(data)

    df = origin_df.copy()
    
    numerical = df.select_dtypes(include='number').columns.tolist()
    categorical = df.select_dtypes(exclude='number').columns.tolist()
    to_encode = ('sex', 'smoker')
    
    categorical = [col for col in categorical if col not in to_encode]

    for cat in to_encode:
        df[f'{cat}_encoded'] = df[cat].apply(lambda x: 0 if x in ['no', 'female'] else 1)
    
    dummies_df = pd.get_dummies(df['region'], drop_first=False, dtype='int')

    df = pd.concat([df,dummies_df], axis = 1)


    return origin_df, df

In [3]:
origin_df, df= insurance('insurance.csv')


In [4]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex_encoded,smoker_encoded,northeast,northwest,southeast,southwest
0,19,female,27.900,0,yes,southwest,16884.92400,0,1,0,0,0,1
1,18,male,33.770,1,no,southeast,1725.55230,1,0,0,0,1,0
2,28,male,33.000,3,no,southeast,4449.46200,1,0,0,0,1,0
3,33,male,22.705,0,no,northwest,21984.47061,1,0,0,1,0,0
4,32,male,28.880,0,no,northwest,3866.85520,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,1,0,0,1,0,0
1334,18,female,31.920,0,no,northeast,2205.98080,0,0,1,0,0,0
1335,18,female,36.850,0,no,southeast,1629.83350,0,0,0,0,1,0
1336,21,female,25.800,0,no,southwest,2007.94500,0,0,0,0,0,1


In [5]:
origin_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [6]:
train, validate, test = w.data_split(df)
train.shape, validate.shape, test.shape

((936, 13), (201, 13), (201, 13))

In [7]:
def model_split(train, validate, test):

    X_train= train.drop(columns = ['sex','smoker','region','charges'])
    y_train = train.charges
    
    
    X_validate = validate.drop(columns = ['sex','smoker','region','charges'])
    y_validate = validate.charges
    
    X_test = test.drop(columns = ['sex','smoker','region','charges'])
    y_test = test.charges



    return X_train, y_train, X_validate, y_validate, X_test, y_test


In [8]:
X_train, y_train, X_validate, y_validate, X_test, y_test = model_split (train,validate, test)

In [9]:
X_train_scaled = X_train.copy()
X_validate_scaled = X_validate.copy()
X_test_scaled = X_test.copy()


scaler = StandardScaler()

X_train_scaled[['age','bmi','children']] = scaler.fit_transform(X_train_scaled[['age','bmi','children']])
X_validate_scaled[['age','bmi','children']] = scaler.transform(X_validate_scaled[['age','bmi','children']])
X_test_scaled[['age','bmi','children']] = scaler.transform(X_test_scaled[['age','bmi','children']])

X_train_scaled

Unnamed: 0,age,bmi,children,sex_encoded,smoker_encoded,northeast,northwest,southeast,southwest
463,1.182926,-0.802371,-0.878534,1,0,1,0,0,0
315,0.898367,0.421054,-0.878534,1,0,1,0,0,0
71,-0.595567,-0.373378,3.200375,1,0,1,0,0,0
741,-0.880126,-0.264666,-0.878534,1,1,0,0,1,0
423,-1.022406,-0.023828,-0.878534,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
481,0.684948,1.133534,0.753029,1,0,0,0,1,0
552,1.609765,-1.560845,-0.878534,1,0,0,0,0,1
1213,0.898367,0.429417,0.753029,0,0,0,0,0,1
418,1.752045,1.409495,-0.062752,1,0,0,0,1,0


In [10]:
y_train_log = np.log1p(y_train)
y_validate_log = np.log1p(y_validate)


In [11]:
y_train_log.mean()

np.float64(9.064400266552193)

In [12]:
# Put logged y into DataFrames for convenience
base_y_train = pd.DataFrame({'charges_log': y_train_log})
base_y_validate = pd.DataFrame({'charges_log': y_validate_log})

# --- Baseline predictions in LOG space ---
pred_mean_log = y_train_log.mean()
pred_median_log = y_train_log.median()

base_y_train['pred_mean_log'] = pred_mean_log
base_y_validate['pred_mean_log'] = pred_mean_log

base_y_train['pred_median_log'] = pred_median_log
base_y_validate['pred_median_log'] = pred_median_log

# --- RMSE in LOG space ---
rmse_train_mean_log = mean_squared_error(base_y_train['charges_log'], base_y_train['pred_mean_log'])**(1/2)
rmse_validate_mean_log = mean_squared_error(base_y_validate['charges_log'], base_y_validate['pred_mean_log'])**(1/2)

rmse_train_median_log = mean_squared_error(base_y_train['charges_log'], base_y_train['pred_median_log'])**(1/2)
rmse_validate_median_log = mean_squared_error(base_y_validate['charges_log'], base_y_validate['pred_median_log'])**(1/2)

print("Baseline RMSE (log scale)")
print(f"Mean   - Train: {rmse_train_mean_log:.4f} | Validate: {rmse_validate_mean_log:.4f}")
print(f"Median - Train: {rmse_train_median_log:.4f} | Validate: {rmse_validate_median_log:.4f}")

# --- Convert baseline predictions back to DOLLARS ---
base_y_train['base_mean_dollars'] = np.expm1(base_y_train['pred_mean_log'])
base_y_validate['base_mean_dollars'] = np.expm1(base_y_validate['pred_mean_log'])

base_y_train['base_median_dollars'] = np.expm1(base_y_train['pred_median_log'])
base_y_validate['base_median_dollars'] = np.expm1(base_y_validate['pred_median_log'])

# --- RMSE in DOLLARS ---
rmse_train_mean_dollars = mean_squared_error(y_train, base_y_train.base_mean_dollars)**(1/2)
rmse_validate_mean_dollars = mean_squared_error(y_validate, base_y_validate.base_mean_dollars)**(1/2)

rmse_train_median_dollars = mean_squared_error(y_train, base_y_train.base_median_dollars)**(1/2)
rmse_validate_median_dollars = mean_squared_error(y_validate, base_y_validate.base_median_dollars)**(1/2)

print("\nBaseline RMSE (dollars)")
print(f"Mean   - Train: ${rmse_train_mean_dollars:,.2f} | Validate: ${rmse_validate_mean_dollars:,.2f}")
print(f"Median - Train: ${rmse_train_median_dollars:,.2f} | Validate: ${rmse_validate_median_dollars:,.2f}")



baseline_df = pd.DataFrame({
    'Train_Log': [rmse_train_mean_log, rmse_train_median_log],
    'Validate_Log': [rmse_validate_mean_log, rmse_validate_median_log],
    'Train_Dollars': [rmse_train_mean_dollars, rmse_train_median_dollars],
    'Validate_Dollars': [rmse_validate_mean_dollars, rmse_validate_median_dollars]
}, index=['Mean Baseline', 'Median Baseline'])

baseline_df

Baseline RMSE (log scale)
Mean   - Train: 0.8999 | Validate: 0.9788
Median - Train: 0.9024 | Validate: 0.9769

Baseline RMSE (dollars)
Mean   - Train: $12,198.50 | Validate: $14,343.50
Median - Train: $12,015.51 | Validate: $14,121.89


Unnamed: 0,Train_Log,Validate_Log,Train_Dollars,Validate_Dollars
Mean Baseline,0.899949,0.978814,12198.495794,14343.499589
Median Baseline,0.902419,0.976881,12015.511161,14121.892882


In [13]:
# Put logged y into DataFrames for convenience
linear_y_train = pd.DataFrame({'charges_log': y_train_log})
linear_y_validate = pd.DataFrame({'charges_log': y_validate_log})


# create the model object
lm = LinearRegression()

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm.fit(X_train_scaled, linear_y_train.charges_log)

# predict train
linear_y_train['pred_lm'] = lm.predict(X_train_scaled)

# evaluate: rmse
rmse_train = mean_squared_error(linear_y_train.charges_log, linear_y_train.pred_lm)**(1/2)

# predict validate
linear_y_validate['pred_lm'] = lm.predict(X_validate_scaled)

# evaluate: rmse
rmse_validate = mean_squared_error(linear_y_validate.charges_log, linear_y_validate.pred_lm)**(1/2)


# Convert predictions back to dollars
linear_y_train['pred_dollars'] = np.expm1(linear_y_train['pred_lm'])
linear_y_validate['pred_dollars'] = np.expm1(linear_y_validate['pred_lm'])

# evaluate: rmse in dollars
# train
rmse_train_dollars = mean_squared_error(y_train, linear_y_train['pred_dollars'])**(1/2)

# validate
rmse_validate_dollars = mean_squared_error(y_validate, linear_y_validate['pred_dollars'])**(1/2)

print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)
print("\n")
print("RMSE in Dollars")
print(f"Train: ${round(rmse_train_dollars, 2)}")
print(f"Validate: ${round(rmse_validate_dollars, 2)}")

linear_df = pd.DataFrame({
    'Train_Log': [rmse_train],
    'Validate_Log': [rmse_validate],
    'Train_Dollars': [rmse_train_dollars],
    'Validate_Dollars': [rmse_validate_dollars]
}, index=['linear_regression'])

linear_df

RMSE for OLS using LinearRegression
Training/In-Sample:  0.41272005156274627 
Validation/Out-of-Sample:  0.5125243131208743


RMSE in Dollars
Train: $8232.95
Validate: $9894.56


Unnamed: 0,Train_Log,Validate_Log,Train_Dollars,Validate_Dollars
linear_regression,0.41272,0.512524,8232.950363,9894.557731


In [14]:
# Put logged y into DataFrames for convenience
lars_y_train = pd.DataFrame({'charges_log': y_train_log})
lars_y_validate = pd.DataFrame({'charges_log': y_validate_log})

# create the model object
lars = LassoLars(alpha=1.0)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lars.fit(X_train_scaled, lars_y_train.charges_log)

# predict train
lars_y_train['pred_lars'] = lars.predict(X_train_scaled)

# evaluate: rmse
rmse_train = mean_squared_error(lars_y_train.charges_log, lars_y_train.pred_lars)**(1/2)

# predict validate
lars_y_validate['pred_lars'] = lars.predict(X_validate_scaled)

# evaluate: rmse
rmse_validate = mean_squared_error(lars_y_validate.charges_log, lars_y_validate.pred_lars)**(1/2)


# Convert predictions back to dollars
lars_y_train['pred_dollars'] = np.expm1(lars_y_train['pred_lars'])
lars_y_validate['pred_dollars'] = np.expm1(lars_y_validate['pred_lars'])

# evaluate: rmse in dollars
# train
rmse_train_dollars = mean_squared_error(y_train, lars_y_train['pred_dollars'])**(1/2)

# validate
rmse_validate_dollars = mean_squared_error(y_validate, lars_y_validate['pred_dollars'])**(1/2)


print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)
print("\n")
print("RMSE in Dollars")
print(f"Train: ${round(rmse_train_dollars, 2)}")
print(f"Validate: ${round(rmse_validate_dollars, 2)}")


lasso_df = pd.DataFrame({
    'Train_Log': [rmse_train],
    'Validate_Log': [rmse_validate],
    'Train_Dollars': [rmse_train_dollars],
    'Validate_Dollars': [rmse_validate_dollars]
}, index=['lasso'])

lasso_df

RMSE for Lasso + Lars
Training/In-Sample:  0.8999488362007172 
Validation/Out-of-Sample:  0.9788138556303694


RMSE in Dollars
Train: $12198.5
Validate: $14343.5


Unnamed: 0,Train_Log,Validate_Log,Train_Dollars,Validate_Dollars
lasso,0.899949,0.978814,12198.495794,14343.499589


In [15]:
# Put logged y into DataFrames for convenience
tweedie_y_train = pd.DataFrame({'charges_log': y_train_log})
tweedie_y_validate = pd.DataFrame({'charges_log': y_validate_log})


# create the model object
glm = TweedieRegressor(power=1, alpha=0)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
glm.fit(X_train, tweedie_y_train.charges_log)

# predict train
tweedie_y_train['pred_glm'] = glm.predict(X_train_scaled)

# evaluate: rmse
rmse_train = mean_squared_error(tweedie_y_train.charges_log, tweedie_y_train.pred_glm)**(1/2)

# predict validate
tweedie_y_validate['pred_glm'] = glm.predict(X_validate_scaled)

# evaluate: rmse
rmse_validate = mean_squared_error(tweedie_y_validate.charges_log, tweedie_y_validate.pred_glm)**(1/2)

print("RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)


# Convert predictions back to dollars
tweedie_y_train['pred_dollars'] = np.expm1(tweedie_y_train['pred_glm'])
tweedie_y_validate['pred_dollars'] = np.expm1(tweedie_y_validate['pred_glm'])

# evaluate: rmse in dollars
# train
rmse_train_dollars = mean_squared_error(y_train, tweedie_y_train['pred_dollars'])**(1/2)

# validate
rmse_validate_dollars = mean_squared_error(y_validate, tweedie_y_validate['pred_dollars'])**(1/2)




print("\n")
print("RMSE in Dollars")
print(f"Train: ${round(rmse_train_dollars, 2)}")
print(f"Validate: ${round(rmse_validate_dollars, 2)}")

tweedie_df = pd.DataFrame({
    'Train_Log': [rmse_train],
    'Validate_Log': [rmse_validate],
    'Train_Dollars': [rmse_train_dollars],
    'Validate_Dollars': [rmse_validate_dollars]
}, index=['tweedie'])

tweedie_df


RMSE for GLM using Tweedie, power=1 & alpha=0
Training/In-Sample:  1.8783331160447307 
Validation/Out-of-Sample:  1.9216877625295417


RMSE in Dollars
Train: $15194.98
Validate: $17413.55


Unnamed: 0,Train_Log,Validate_Log,Train_Dollars,Validate_Dollars
tweedie,1.878333,1.921688,15194.97912,17413.549408


#### Polynomial Regression 

In [16]:
# make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree=2)

# fit and transform X_train_scaled
X_train_degree2 = pf.fit_transform(X_train_scaled)

# transform X_validate_scaled & X_test_scaled
X_validate_degree2 = pf.transform(X_validate_scaled)
X_test_degree2 = pf.transform(X_test_scaled)

In [17]:
# Put logged y into DataFrames for convenience
lm2_y_train = pd.DataFrame({'charges_log': y_train_log})
lm2_y_validate = pd.DataFrame({'charges_log': y_validate_log})


# create the model object
lm2 = LinearRegression()

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm2.fit(X_train_degree2, lm2_y_train.charges_log)

# predict train
lm2_y_train['pred_lm2'] = lm2.predict(X_train_degree2)

# evaluate: rmse
rmse_train = mean_squared_error(lm2_y_train.charges_log, lm2_y_train.pred_lm2)**(1/2)

# predict validate
lm2_y_validate['pred_lm2'] = lm2.predict(X_validate_degree2)

# evaluate: rmse
rmse_validate = mean_squared_error(lm2_y_validate.charges_log, lm2_y_validate.pred_lm2)**(1/2)

print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)


# Convert predictions back to dollars
lm2_y_train['pred_dollars'] = np.expm1(lm2_y_train['pred_lm2'])
lm2_y_validate['pred_dollars'] = np.expm1(lm2_y_validate['pred_lm2'])

# evaluate: rmse in dollars
# train
rmse_train_dollars = mean_squared_error(y_train, lm2_y_train['pred_dollars'])**(1/2)

# validate
rmse_validate_dollars = mean_squared_error(y_validate, lm2_y_validate['pred_dollars'])**(1/2)




print("\n")
print("RMSE in Dollars")
print(f"Train: ${round(rmse_train_dollars, 2)}")
print(f"Validate: ${round(rmse_validate_dollars, 2)}")


poly_df = pd.DataFrame({
    'Train_Log': [rmse_train],
    'Validate_Log': [rmse_validate],
    'Train_Dollars': [rmse_train_dollars],
    'Validate_Dollars': [rmse_validate_dollars]
}, index=['polynomial'])

poly_df

RMSE for Polynomial Model, degrees=2
Training/In-Sample:  0.32720413410626215 
Validation/Out-of-Sample:  0.4349071848955806


RMSE in Dollars
Train: $4565.37
Validate: $6277.46


Unnamed: 0,Train_Log,Validate_Log,Train_Dollars,Validate_Dollars
polynomial,0.327204,0.434907,4565.373974,6277.455744


In [20]:
models_df = pd.concat([baseline_df,linear_df,lasso_df,tweedie_df,poly_df], axis=0)

models_df

Unnamed: 0,Train_Log,Validate_Log,Train_Dollars,Validate_Dollars
Mean Baseline,0.899949,0.978814,12198.495794,14343.499589
Median Baseline,0.902419,0.976881,12015.511161,14121.892882
linear_regression,0.41272,0.512524,8232.950363,9894.557731
lasso,0.899949,0.978814,12198.495794,14343.499589
tweedie,1.878333,1.921688,15194.97912,17413.549408
polynomial,0.327204,0.434907,4565.373974,6277.455744


In [22]:
models_df.sort_values('Train_Dollars', ascending = True)

Unnamed: 0,Train_Log,Validate_Log,Train_Dollars,Validate_Dollars
polynomial,0.327204,0.434907,4565.373974,6277.455744
linear_regression,0.41272,0.512524,8232.950363,9894.557731
Median Baseline,0.902419,0.976881,12015.511161,14121.892882
Mean Baseline,0.899949,0.978814,12198.495794,14343.499589
lasso,0.899949,0.978814,12198.495794,14343.499589
tweedie,1.878333,1.921688,15194.97912,17413.549408


#### Takeaways:

- The polynomial regression model achieved the lowest validation RMSE in dollar terms, indicating the best predictive performance among the evaluated models.