<a href="https://colab.research.google.com/github/mwang-cmn/Prediction-of-Health-Costs/blob/main/Prediction%20of%20Health%20Expenses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Plotting, numerical and data-frame libraries used throughout the notebook.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Apply one seaborn theme (white grid, pastel palette) to every figure below.
sns.set_theme(style="whitegrid", palette="pastel")
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation and fit-failure warnings raised by the libraries used below.
warnings.filterwarnings("ignore")


In [None]:
# Import data
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
dataset = pd.read_csv('insurance.csv')
dataset.tail()

In [None]:
# Print column names, dtypes and non-null counts of the raw frame.
dataset.info()

In [None]:
# Work on a copy so the frame loaded from disk (`dataset`) stays untouched.
data = dataset.copy()
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
data.duplicated().sum()

In [None]:
# Keep only the first occurrence of each fully duplicated row ...
data = data.drop_duplicates()
# ... and confirm that no duplicates are left.
data.duplicated().sum()


### **Exploratory Data Analysis**

In [None]:

# Pairwise scatter plots (with per-column distributions on the diagonal) for
# the columns of `data` that seaborn's pairplot selects.
sns.pairplot(data)
# NOTE(review): plt.grid acts on the current axes only, so this presumably
# switches the grid off for a single panel rather than the whole figure - confirm.
plt.grid(False)
plt.show()


In [None]:
# Histograms with a KDE overlay for age, bmi and expenses, one panel per column.

import matplotlib.pyplot as plt
import seaborn as sns

# (column, panel title) pairs in left-to-right panel order.
hist_panels = [
    ('age', 'Age Distribution'),
    ('bmi', 'BMI Distribution'),
    ('expenses', 'Expenses Distribution'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (column, title) in zip(axes, hist_panels):
    sns.histplot(data[column], kde=True, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()


In [None]:
# Box plots of age, bmi and expenses, one panel per column.

# (column, panel title) pairs in left-to-right panel order.
box_panels = [
    ('age', 'Age Distribution'),
    ('bmi', 'BMI Distribution'),
    ('expenses', 'Expenses Distribution'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (column, title) in zip(axes, box_panels):
    sns.boxplot(data[column], ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()


In [None]:
# Count plots for the discrete columns, laid out on a 2x2 grid.

# (column, panel title) pairs in row-major panel order.
count_panels = [
    ('children', 'Children Distribution'),
    ('smoker', 'Smoker Distribution'),
    ('sex', 'Sex Distribution'),
    ('region', 'Region Distribution'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for ax, (column, title) in zip(axes.flat, count_panels):
    sns.countplot(x=column, data=data, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()


In [None]:
# Bar plots of expenses against each discrete column, laid out on a 2x2 grid.

# (column, panel title) pairs in row-major panel order.
bar_panels = [
    ('children', 'Expenses vs Children'),
    ('smoker', 'Expenses vs Smoker'),
    ('sex', 'Expenses vs Sex'),
    ('region', 'Expenses vs Region'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for ax, (column, title) in zip(axes.flat, bar_panels):
    sns.barplot(x=column, y='expenses', data=data, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()


In [None]:
# Separate copy for encoding and feature engineering; `data` keeps the original string columns.
health = data.copy()


In [None]:
# Replace the string-valued columns of `health` with integer codes.

from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
for categorical in ('sex', 'smoker', 'region'):
    # The shared encoder is refitted on each column in turn.
    health[categorical] = label.fit_transform(health[categorical])
health.head()


In [None]:
# Correlation heatmap of the encoded frame, with each coefficient annotated.

import matplotlib.pyplot as plt
import seaborn as sns

corr_matrix = health.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Health Insurance Dataset')
plt.show()


In [None]:
# Feature Engineering
# Bin the continuous 'bmi' and 'age' columns into labelled categories.
# pd.cut intervals are right-closed by default: (0, 18.5], (18.5, 24.9], ...
bmi_edges = [0, 18.5, 24.9, 29.9, 100]
bmi_names = ['Underweight', 'Healthy', 'Overweight', 'Obese']
health['bmi_category'] = pd.cut(health['bmi'], bins=bmi_edges, labels=bmi_names)

age_edges = [0, 25, 40, 64, 100]
age_names = ['Young Adult', 'Adult', 'Middle Aged', 'Senior']
health['age_group'] = pd.cut(health['age'], bins=age_edges, labels=age_names)

health.head()


In [None]:
# Drop the raw 'age' and 'bmi' columns now that their binned versions exist.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
health = health.drop(columns=['age', 'bmi'], errors='ignore')

# No further encoding is needed here: 'sex', 'smoker' and 'region' were already
# label encoded above, and the pd.cut outputs have categorical (not object)
# dtype, so the former loop over object columns never had anything to encode.
health.head()


In [None]:
# Encode 'bmi_category' and 'age_group' as integers that follow the bin order
# handed to pd.cut (Underweight < Healthy < Overweight < Obese, and
# Young Adult < Adult < Middle Aged < Senior).
# LabelEncoder would sort the labels alphabetically instead (Healthy=0, Obese=1,
# Overweight=2, Underweight=3), which scrambles the ordinal meaning that the
# linear models below rely on.
for column in ['bmi_category', 'age_group']:
    # Guard keeps the cell re-runnable: after the first run the column already
    # holds integer codes and no longer has a .cat accessor.
    if isinstance(health[column].dtype, pd.CategoricalDtype):
        # Values that fell outside every bin (NaN) get code -1.
        health[column] = health[column].cat.codes


In [None]:
# Correlation heatmap again, this time on `health` after the preceding
# feature-engineering and encoding cells have run.

corr_matrix = health.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Health Insurance Dataset')
plt.show()


In [None]:
# Target is the 'expenses' column; every other column of `health` is a feature.
y = health['expenses']
X = health.drop(columns='expenses')


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline model: ordinary least squares on standardised features.

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")


**2. Lasso Regression**

Lasso regression is a linear model that incorporates an L1 penalty to both regularize the model and perform feature selection. By adding a penalty proportional to the absolute values of the coefficients, Lasso effectively shrinks less important coefficients to zero, thus simplifying the model and reducing overfitting.

In [None]:
# Fit an L1-regularised linear model (alpha is the penalty strength and can be tuned).
lasso_model = Lasso(alpha=1.0).fit(X_train, y_train)

# Predict on the hold-out rows.
y_pred_lasso = lasso_model.predict(X_test)

# Hold-out metrics.
r2_lasso = r2_score(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
print(f"Lasso R-squared: {r2_lasso}")
print(f"Lasso Mean Absolute Error: {mae_lasso}")


**Results**

**3. Random Forest Model**


In [None]:

# Fit a seeded 100-tree random forest (hyperparameters can be adjusted).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the hold-out rows.
y_pred_rf = rf_model.predict(X_test)

# Hold-out metrics.
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"Random Forest R-squared: {r2_rf}")
print(f"Random Forest Mean Absolute Error: {mae_rf}")


In [None]:
# Show the hyperparameters the baseline forest was built with.
rf_model.get_params()

In [None]:

from sklearn.model_selection import RandomizedSearchCV

# Search space for RandomForestRegressor.
# max_features: 1.0 (consider every feature at each split) replaces the old
# 'auto' alias, which meant the same thing for regressors but was deprecated in
# scikit-learn 1.1 and removed in 1.3, where candidates using it fail to fit.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0, 'sqrt', 'log2'],
}

# Randomised search with 5-fold CV, ranked by (negated) mean absolute error.
# random_state fixes which parameter combinations are sampled, so the chosen
# best_params_ are the same on every run.
rf_model2 = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    random_state=42,
)
rf_model2.fit(X_train, y_train)
params_rf = rf_model2.best_params_
params_rf



In [None]:
# Refit a forest with the best hyperparameters found by the search and score it
# on the hold-out rows.
# random_state=42 matches the seed of the estimator used inside the search;
# without a seed the reported R-squared / MAE change on every run.
rf = RandomForestRegressor(**params_rf, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf2 = rf.predict(X_test)
r2_best_rf2 = r2_score(y_test, y_pred_rf2)
mae_best_rf2 = mean_absolute_error(y_test, y_pred_rf2)

In [None]:
# Hold-out metrics of the tuned forest. The R-squared variable is r2_best_rf2;
# the previously referenced r2_best_rf is never defined and raised NameError.
print(f"Best Random Forest R-squared: {r2_best_rf2}")
print(f"Best Random Forest Mean Absolute Error: {mae_best_rf2}")

**Results**

The tuned Random Forest model performs well. Its R-squared is approximately 0.8435, meaning the model explains around 84% of the variance in health expenses. The Mean Absolute Error (MAE) of roughly \$3428 is below the target threshold of \$3500, which is a strong outcome.

In [None]:
# Feature Importance of the tuned Random Forest model
importances = rf.feature_importances_
feature_names = X.columns

# Pair each importance with its feature name and sort ascending: barh draws
# from the bottom up, so the most important feature ends up at the top.
ranked = pd.Series(importances, index=feature_names).sort_values()

plt.figure(figsize=(10, 6))
plt.barh(ranked.index, ranked.values)
plt.xlabel('Importance')
plt.ylabel('Features')
plt.title('Feature Importance')
plt.show()