In [74]:
# Side-by-side diagnostics: how many features the fitted forest expects versus
# how many the hand-built sample supplies, followed by the column names of X
# and the per-feature importances reported by the model.
print(
    f"Expected Features: {rf_best.n_features_in_}",
    f"New Sample Features: {len(new_sample[0])}",
    f"Feature Names: {list(X.columns)}",
    f"Feature Importances: {rf_best.feature_importances_}",
    sep="\n",
)

Expected Features: 30
New Sample Features: 23
Feature Names: ['id', 'age', 'gender', 'region', 'bmi', 'smoker', 'alcohol_consumption', 'exercise_frequency', 'diet_type', 'stress_level', 'medical_history_score', 'has_diabetes', 'has_hypertension', 'has_heart_disease', 'has_cancer_history', 'annual_income', 'employment_type', 'credit_score', 'savings_amount', 'num_dependents', 'previous_insurance_claims', 'policy_type', 'policy_renewal_status', 'hospital_visits_per_year', 'medication_costs_per_year', 'health_risk_score', 'expense_ratio', 'age_group', 'bmi_smoker', 'income_dependents']
Feature Importances: [0.00677489 0.01184977 0.00091368 0.00232317 0.03605451 0.00592407
 0.00186855 0.00233431 0.00164336 0.00177144 0.00901317 0.00106701
 0.00073658 0.00086735 0.00365695 0.32764648 0.00172867 0.00670596
 0.00677259 0.01593856 0.00440567 0.00191195 0.00093491 0.09000561
 0.00667636 0.17007337 0.24986982 0.         0.00969835 0.02083291]


In [75]:
# Summary statistics (count, mean, std, quartiles, extremes) for the target column
print(df.loc[:, ["insurance_cost"]].describe())

# The ten rows with the largest insurance_cost, most expensive first
print(df.sort_values("insurance_cost", ascending=False).iloc[:10])


       insurance_cost
count     5000.000000
mean     70709.883600
std      18449.361519
min      17844.000000
25%      57088.750000
50%      70442.000000
75%      83899.750000
max     127106.000000
        id       age  gender  region       bmi  smoker  alcohol_consumption  \
4834  4835  1.369086       1       0  1.535331       1                    0   
4195  4196 -0.098340       0       1  0.953032       1                    2   
1901  1902 -1.452888       0       0  0.675747       1                    0   
2315  2316  0.691813       1       2  0.537104       1                    2   
386    387 -0.380538       0       1  0.370733       1                    1   
2840  2841  1.538405       1       1  0.939168       1                    0   
4842  4843  0.522494       1       2  1.063946       1                    1   
237    238  0.635373       1       2  0.828254       1                    1   
3315  3316  1.651284       1       3  1.507603       1                    1   
2291  2292 -

In [76]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Input artifacts read by this cell.
MODEL_PATH = "optimized_random_forest.pkl"
DATA_PATH = "medical_insurance_features.csv"

# 🔹 1️⃣ Load the Optimized Model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
rf_optimized = joblib.load(MODEL_PATH)

# 🔹 2️⃣ Load the Test Dataset
df = pd.read_csv(DATA_PATH)

# Columns that are dropped before the frame is handed to the optimized model.
weak_features = ["age_group", "has_hypertension", "has_heart_disease", "gender",
                 "policy_renewal_status", "diet_type", "employment_type"]

# Separate features & target
X = df.drop(columns=["insurance_cost"])
y = df["insurance_cost"]

# Re-create the hold-out split; the fixed random_state makes it deterministic.
# Named throwaways are used instead of `_` so IPython's last-output variable
# is not shadowed for the rest of the session.
_X_train, X_test, _y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Drop the weak features from the test frame.
X_test_filtered = X_test.drop(columns=weak_features, errors='ignore')

# Align the test frame with the columns the model was fitted on *by name*.
# Overwriting the column labels afterwards would hide a genuine mismatch, so
# any difference is reported explicitly instead. Models fitted without feature
# names do not expose `feature_names_in_`; in that case the frame is left as-is.
model_features = getattr(rf_optimized, "feature_names_in_", None)
if model_features is not None:
    model_features = list(model_features)
    missing = [c for c in model_features if c not in X_test_filtered.columns]
    unexpected = [c for c in X_test_filtered.columns if c not in model_features]
    if missing or unexpected:
        raise ValueError(
            f"Feature mismatch: missing from data {missing}, "
            f"not known to the model {unexpected}"
        )
    X_test_filtered = X_test_filtered[model_features]

# 🔹 Debug: Print all feature names for comparison
expected_columns = X_test_filtered.columns.tolist()
print("Features used for testing:", expected_columns)

# 🔹 3️⃣ Test on a Single Data Point from the Test Set
# `.iloc[[0]]` keeps a one-row DataFrame (column names intact) rather than a
# bare NumPy array, so scikit-learn can still validate the feature names.
sample_input = X_test_filtered.iloc[[0]]
predicted_cost = rf_optimized.predict(sample_input)

print(f"🔹 Predicted Insurance Cost: ₹{predicted_cost[0]:.2f}")
print(f"🔹 Actual Insurance Cost: ₹{y_test.iloc[0]:.2f}")

# 🔹 4️⃣ Test on Multiple Test Cases (10 Samples)
sample_inputs = X_test_filtered.iloc[:10]
predictions = rf_optimized.predict(sample_inputs)

# Compare predictions with actual values
comparison = pd.DataFrame({
    "Actual Cost": y_test.iloc[:10].values,
    "Predicted Cost": predictions
})
print("\n🔹 Model Predictions on 10 Samples:")
print(comparison)

# 🔹 5️⃣ Test with Completely New Unseen Data
# The values below are assigned to `expected_columns` strictly by position.
# NOTE(review): with the column order printed above, the fifth value (25.4)
# lands in the `smoker` slot, and values such as age=35 look like raw units
# while the dataset columns appear standardized. Confirm the ordering and the
# preprocessing of this sample before trusting the prediction.
new_sample = np.array([[1, 35, 1, 2, 25.4, 0, 1, 1, 0, 0.3, 0, 1,
                        500000, 700, 200000, 1, 0, 1, 1, 2, 3000, 0.01, 250000]])

# Debug: Check feature mismatch
print("\nExpected features for new sample:")
print(expected_columns)
print("\nActual features in new sample:")
print(new_sample[0])

# Ensure shape matches before creating DataFrame
if new_sample.shape[1] != len(expected_columns):
    raise ValueError(f"Feature mismatch: Expected {len(expected_columns)} features, but got {new_sample.shape[1]}")

# Label the sample with the (already model-aligned) feature names.
new_sample_df = pd.DataFrame(new_sample, columns=expected_columns).astype(float)

# Predict insurance cost
predicted_new_cost = rf_optimized.predict(new_sample_df)

# Print the prediction correctly
print(f"\n🔹 Predicted Insurance Cost for New Data: ₹{predicted_new_cost[0]:.2f}")



Features used for testing: ['id', 'age', 'region', 'bmi', 'smoker', 'alcohol_consumption', 'exercise_frequency', 'stress_level', 'medical_history_score', 'has_diabetes', 'has_cancer_history', 'annual_income', 'credit_score', 'savings_amount', 'num_dependents', 'previous_insurance_claims', 'policy_type', 'hospital_visits_per_year', 'medication_costs_per_year', 'health_risk_score', 'expense_ratio', 'bmi_smoker', 'income_dependents']
🔹 Predicted Insurance Cost: ₹82019.48
🔹 Actual Insurance Cost: ₹83920.00

🔹 Model Predictions on 10 Samples:
   Actual Cost  Predicted Cost
0        83920    82019.479722
1        22880    26486.598000
2        94728    90673.897000
3        92815    89866.157587
4        61498    60674.308000
5        39273    42158.174000
6        82205    81219.886000
7        81639    85237.862000
8        60773    61746.574000
9        67903    61265.116800

Expected features for new sample:
['id', 'age', 'region', 'bmi', 'smoker', 'alcohol_consumption', 'exercise_freque

