<a href="https://colab.research.google.com/github/namuduris/Prediction-of-Health-Insurance-Using-ML/blob/main/Medical_insurance_staple.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Install Required Libraries
# Run this in your terminal or command prompt:
# pip install xgboost pandas scikit-learn

# Step 2: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Step 3: Load and Preprocess Data
# Load the dataset
data = pd.read_csv('health_insurance.csv')

# Display the first few rows
print(data.head())

# Define features (X) and target (y) for insurance cost prediction.
# The ground-truth disease labels are excluded from X along with the target:
# Step 5 adds columns of the same names holding model-predicted probabilities,
# and leaving the true labels in X would leak them into the cost model if
# that step were skipped or only partially run.
label_columns = ['diabetes_risk', 'cvd_risk', 'cancer_risk']
X = data.drop(columns=['insurance_cost'] + label_columns)
y = data['insurance_cost']

# Split the data into training and testing sets (80/20, seeded so the split
# is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on independent copies so that adding columns to the splits later does
# not operate on (or warn about) slices of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Step 4: Train Disease Risk Prediction Models
# 4.1: Diabetes Risk Prediction
# Classifier inputs; the label is the 'diabetes_risk' column of the dataset.
diabetes_features = ['age', 'bmi', 'glucose', 'family_history']
X_diabetes = X[diabetes_features]
y_diabetes = data['diabetes_risk']

# Hold out 20% of the rows for evaluation (seeded for reproducibility).
(
    X_train_diabetes,
    X_test_diabetes,
    y_train_diabetes,
    y_test_diabetes,
) = train_test_split(X_diabetes, y_diabetes, test_size=0.2, random_state=42)

# Fit a default-configured logistic regression on the training portion.
diabetes_model = LogisticRegression()
diabetes_model.fit(X_train_diabetes, y_train_diabetes)

# Evaluate on the held-out rows: predicted labels drive accuracy, and
# column 1 of predict_proba drives ROC AUC.
y_pred_diabetes = diabetes_model.predict(X_test_diabetes)
diabetes_accuracy = accuracy_score(y_test_diabetes, y_pred_diabetes)
print("Diabetes Risk Model Accuracy:", diabetes_accuracy)

diabetes_test_proba = diabetes_model.predict_proba(X_test_diabetes)[:, 1]
diabetes_auc = roc_auc_score(y_test_diabetes, diabetes_test_proba)
print("Diabetes Risk Model AUC:", diabetes_auc)

# 4.2: Cardiovascular Disease (CVD) Risk Prediction
# Classifier inputs; the label is the 'cvd_risk' column of the dataset.
cvd_features = ['age', 'bmi', 'blood_pressure', 'cholesterol', 'smoker']
X_cvd = X[cvd_features]
y_cvd = data['cvd_risk']

# Hold out 20% of the rows for evaluation (seeded for reproducibility).
(
    X_train_cvd,
    X_test_cvd,
    y_train_cvd,
    y_test_cvd,
) = train_test_split(X_cvd, y_cvd, test_size=0.2, random_state=42)

# Fit a default-configured logistic regression on the training portion.
cvd_model = LogisticRegression()
cvd_model.fit(X_train_cvd, y_train_cvd)

# Evaluate on the held-out rows: predicted labels drive accuracy, and
# column 1 of predict_proba drives ROC AUC.
y_pred_cvd = cvd_model.predict(X_test_cvd)
cvd_accuracy = accuracy_score(y_test_cvd, y_pred_cvd)
print("CVD Risk Model Accuracy:", cvd_accuracy)

cvd_test_proba = cvd_model.predict_proba(X_test_cvd)[:, 1]
cvd_auc = roc_auc_score(y_test_cvd, cvd_test_proba)
print("CVD Risk Model AUC:", cvd_auc)

# 4.3: Cancer Risk Prediction
# Classifier inputs; the label is the 'cancer_risk' column of the dataset.
cancer_features = ['age', 'bmi', 'smoker', 'family_history']
X_cancer = X[cancer_features]
y_cancer = data['cancer_risk']

# Hold out 20% of the rows for evaluation (seeded for reproducibility).
(
    X_train_cancer,
    X_test_cancer,
    y_train_cancer,
    y_test_cancer,
) = train_test_split(X_cancer, y_cancer, test_size=0.2, random_state=42)

# Fit a default-configured logistic regression on the training portion.
cancer_model = LogisticRegression()
cancer_model.fit(X_train_cancer, y_train_cancer)

# Evaluate on the held-out rows: predicted labels drive accuracy, and
# column 1 of predict_proba drives ROC AUC.
y_pred_cancer = cancer_model.predict(X_test_cancer)
cancer_accuracy = accuracy_score(y_test_cancer, y_pred_cancer)
print("Cancer Risk Model Accuracy:", cancer_accuracy)

cancer_test_proba = cancer_model.predict_proba(X_test_cancer)[:, 1]
cancer_auc = roc_auc_score(y_test_cancer, cancer_test_proba)
print("Cancer Risk Model AUC:", cancer_auc)

# Step 5: Add Disease Risk Predictions as Features
# Each entry pairs an output column name with the fitted classifier that
# produces it and the input columns that classifier was trained on.
risk_feature_specs = [
    ('diabetes_risk', diabetes_model, ['age', 'bmi', 'glucose', 'family_history']),
    ('cvd_risk', cvd_model, ['age', 'bmi', 'blood_pressure', 'cholesterol', 'smoker']),
    ('cancer_risk', cancer_model, ['age', 'bmi', 'smoker', 'family_history']),
]

# Score the training split, then the test split, for every disease model and
# store column 1 of predict_proba under the disease's column name.
for risk_column, risk_model, input_columns in risk_feature_specs:
    for split_frame in (X_train, X_test):
        split_frame[risk_column] = risk_model.predict_proba(split_frame[input_columns])[:, 1]

# Step 6: Train the XGBoost Regressor for Insurance Cost Prediction
# Hyperparameters are gathered in one place and unpacked into the constructor.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
}
insurance_model = XGBRegressor(**xgb_params)

# Fit on the training split (which now includes the disease-risk columns).
insurance_model.fit(X_train, y_train)

# Predict costs for the held-out split.
y_pred = insurance_model.predict(X_test)

# Report mean squared error and R^2 on the held-out split.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Step 7: Make Predictions on New Data
# Example new data: a single record holding the raw input columns.
new_data = pd.DataFrame({
    'age': [28],
    'bmi': [36],
    'smoker': [0],
    'blood_pressure': [136],
    'cholesterol': [281],
    'glucose': [145],
    'family_history': [1],
})

# Predict disease risks with the fitted classifiers (column 1 of
# predict_proba), using the same input columns each one was trained on.
new_data['diabetes_risk'] = diabetes_model.predict_proba(new_data[['age', 'bmi', 'glucose', 'family_history']])[:, 1]
new_data['cvd_risk'] = cvd_model.predict_proba(new_data[['age', 'bmi', 'blood_pressure', 'cholesterol', 'smoker']])[:, 1]
new_data['cancer_risk'] = cancer_model.predict_proba(new_data[['age', 'bmi', 'smoker', 'family_history']])[:, 1]

# Predict insurance cost.
# Select the columns in the exact order the regressor was trained on instead
# of relying on the dict literal above happening to match it; a missing
# column then fails here with a KeyError naming it.
new_features = new_data[list(X_train.columns)]
predicted_cost = insurance_model.predict(new_features)
print(f'Predicted Insurance Cost: {predicted_cost[0]}')

   age   bmi  smoker  blood_pressure  cholesterol  glucose  family_history  \
0   56  25.0       1             111          155      132               0   
1   46  36.0       1             137          294      189               1   
2   32  24.0       0             140          223      105               0   
3   60  33.0       0              92          263       77               0   
4   25  35.0       1             131          176      138               0   

   diabetes_risk  cvd_risk  cancer_risk  insurance_cost  
0              0         1            1    76051.073141  
1              1         1            1   105443.712109  
2              0         1            0    75109.671634  
3              0         0            0    71126.420767  
4              0         1            1    77566.818698  
Diabetes Risk Model Accuracy: 0.9
Diabetes Risk Model AUC: 0.9616268382352942
CVD Risk Model Accuracy: 0.855
CVD Risk Model AUC: 0.9320987654320987
Cancer Risk Model Accuracy: 0.875
C