In [5]:
# ✅ 1️⃣ Import Required Libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# ✅ 2️⃣ Disable GPU (If CUDA Errors Occur)
tf.config.set_visible_devices([], 'GPU')

# ✅ 3️⃣ Load Dataset
df = pd.read_csv("insurance_cleaned.csv")  # Ensure this file exists in the correct directory

# ✅ 4️⃣ Check for Missing Values & Handle Them
print("Checking for missing values...")
print(df.isnull().sum())  # Show missing values per column

# Fill missing numerical values with median
df.fillna(df.median(numeric_only=True), inplace=True)

# ✅ 5️⃣ Fix Log Transformation Issues
# Select numerical features prone to skewness
skewed_features = ['Savings Amount', 'BMI']  
for feature in skewed_features:
    df[feature] = np.log1p(df[feature].clip(lower=0))  # Avoid log(negative)

# ✅ 6️⃣ Feature Engineering
df['BMI_Smoker_Interaction'] = df['BMI'] * df['Smoking Status']

# ✅ 7️⃣ Select Features & Target
selected_features = ['Age', 'BMI', 'Smoking Status', 'Hypertension', 'Employment Type', 
                     'Savings Amount', 'Policy Type', 'Policy Renewal Status', 'Hospital Visits Per Year', 
                     'BMI_Smoker_Interaction']

X = df[selected_features]
y = df['Insurance Cost']  # ✅ Corrected from 'Target' to 'Insurance Cost'

# ✅ 8️⃣ Normalize Data (Scaling)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ✅ 9️⃣ Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# ✅ 🔟 Build Deep Learning Model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')  # Linear activation for regression
])

# ✅ 1️⃣1️⃣ Compile Model with Huber Loss (Handles Outliers)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
              loss='huber_loss',
              metrics=['mae'])

# ✅ 1️⃣2️⃣ Train Model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    verbose=1
)

# ✅ 1️⃣3️⃣ Plot Training Loss
plt.plot(history.history['loss'], label='Training Loss', color='blue')
plt.plot(history.history['val_loss'], label='Validation Loss', color='red')
plt.xlabel("Epochs")
plt.ylabel("Loss (Huber)")
plt.title("Deep Learning Model Training Loss")
plt.legend()
plt.show()

# ✅ 1️⃣4️⃣ Model Evaluation
val_loss, val_mae = model.evaluate(X_val, y_val)
print(f"\n✅ Final Validation Loss: {val_loss:.4f}, Validation MAE: {val_mae:.4f}")


Checking for missing values...
Name                         0
Age                          0
Gender                       0
BMI                          0
Smoking Status               0
Region                       0
Diabetes                     0
Hypertension                 0
Heart Disease                0
Cancer History               0
Stroke                       0
Liver Disease                0
Kidney Disease               0
COPD                         0
TB                           0
HIV/AIDS                     0
Alcohol Consumption          0
Exercise Frequency           0
Diet Type                    0
Stress Level                 0
Medical History Score        0
Annual Income                0
Employment Type              0
Credit Score                 0
Savings Amount               0
Number of Dependents         0
Previous Insurance Claims    0
Policy Type                  0
Policy Renewal Status        0
Hospital Visits Per Year     0
Medication Costs Per Year    0
Insuranc

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Could not interpret loss identifier: huber_loss

In [None]:
print("Available columns in dataset:", df.columns)


Available columns in dataset: Index(['Name', 'Age', 'Gender', 'BMI', 'Smoking Status', 'Region', 'Diabetes',
       'Hypertension', 'Heart Disease', 'Cancer History', 'Stroke',
       'Liver Disease', 'Kidney Disease', 'COPD', 'TB', 'HIV/AIDS',
       'Alcohol Consumption', 'Exercise Frequency', 'Diet Type',
       'Stress Level', 'Medical History Score', 'Annual Income',
       'Employment Type', 'Credit Score', 'Savings Amount',
       'Number of Dependents', 'Previous Insurance Claims', 'Policy Type',
       'Policy Renewal Status', 'Hospital Visits Per Year',
       'Medication Costs Per Year', 'Insurance Cost', 'BMI Smoker',
       'Income Dependents', 'BMI_Smoker_Interaction'],
      dtype='object')
