In [54]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)


In [None]:
# Absolute location of the engineered dataset CSV; change this when the file lives elsewhere.
DATA_PATH = "/content/engineered_health_dataset.csv"
df = pd.read_csv(DATA_PATH)

In [55]:
# Column holding the label the models are trained to predict.
TARGET_COLUMN = "health_risk_label"

# Input columns handed to the estimators, in the order the models will see them.
SAFE_FEATURES = [
    "heart_rate", "resting_heart_rate", "hrv_rmssd", "spo2",
    "steps", "calories_burned",
    "sleep_duration", "deep_sleep_ratio", "rem_sleep_ratio",
    "air_quality_index", "pm2_5", "noise_level", "temperature",
    "sleep_pressure", "parasympathetic_score", "environmental_load",
]

# Feature matrix (only the columns listed above) and the label vector.
X = df.loc[:, SAFE_FEATURES]
y = df[TARGET_COLUMN]


In [56]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions aligned between the two partitions, and the fixed seed makes
# the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [57]:
# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to both partitions.
le = LabelEncoder().fit(y_train)

y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

print("Classes:", le.classes_)


Classes: [0 1]


In [58]:
# Baseline model: standardise every feature, then fit a class-weighted
# logistic regression.
#
# The target is binary (the encoder reports classes [0 1]), so the
# `multi_class="multinomial"` argument is not needed. That parameter is
# deprecated since scikit-learn 1.5 (it emits a FutureWarning and is scheduled
# for removal), so it is omitted here to keep the cell runnable on current
# and future releases.
lr_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(
        max_iter=1000,             # generous cap so the solver can converge
        class_weight="balanced",   # reweight classes inversely to their frequency
        random_state=42
    ))
])

lr_pipeline.fit(X_train, y_train_enc)


In [60]:
# Score the held-out rows with the logistic-regression pipeline.
y_pred_lr = lr_pipeline.predict(X_test)

lr_accuracy = accuracy_score(y_test_enc, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)

print("\nConfusion Matrix:")
lr_confusion = confusion_matrix(y_test_enc, y_pred_lr)
print(lr_confusion)

print("\nClassification Report:")
# The report labels its rows with the encoder's classes rendered as strings.
lr_class_names = list(map(str, le.classes_))
lr_report = classification_report(
    y_test_enc, y_pred_lr, target_names=lr_class_names
)
print(lr_report)


Logistic Regression Accuracy: 0.86826

Confusion Matrix:
[[42190  6430]
 [  157  1223]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.87      0.93     48620
           1       0.16      0.89      0.27      1380

    accuracy                           0.87     50000
   macro avg       0.58      0.88      0.60     50000
weighted avg       0.97      0.87      0.91     50000



In [67]:
# Random-forest hyperparameters gathered in one place so they are easy to
# inspect; the fixed seed keeps the fitted forest repeatable.
rf_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    class_weight="balanced",
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_params)

rf_model.fit(X_train, y_train_enc)


In [68]:
# Column 1 of predict_proba holds the probability of encoded class 1.
rf_proba = rf_model.predict_proba(X_test)
y_probs = rf_proba[:, 1]

# Use a cut-off below the default 0.5 so that more high-risk users are flagged.
threshold = 0.35
y_pred_rf_thresh = np.greater_equal(y_probs, threshold).astype(int)


In [71]:
# Evaluate the thresholded random-forest predictions on the held-out rows.
rf_confusion = confusion_matrix(y_test_enc, y_pred_rf_thresh)
print(rf_confusion)

rf_class_names = ["Low Risk", "High Risk"]
rf_report = classification_report(
    y_test_enc, y_pred_rf_thresh, target_names=rf_class_names
)
print(rf_report)



[[45626  2994]
 [   66  1314]]
              precision    recall  f1-score   support

    Low Risk       1.00      0.94      0.97     48620
   High Risk       0.31      0.95      0.46      1380

    accuracy                           0.94     50000
   macro avg       0.65      0.95      0.71     50000
weighted avg       0.98      0.94      0.95     50000



In [72]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Threshold-independent metrics computed from the raw class-1 probabilities.
roc = roc_auc_score(y_test_enc, y_probs)

# Area under the precision-recall curve, integrated over recall by auc().
precision, recall, _ = precision_recall_curve(y_test_enc, y_probs)
pr_auc = auc(recall, precision)

print("ROC-AUC:", roc)
print("PR-AUC:", pr_auc)


ROC-AUC: 0.9770113092363732
PR-AUC: 0.6331976779672517


In [93]:
# Hand-written example records used to exercise batch prediction. Every record
# supplies the same sixteen feature keys, in the same order.
unseen_batch = [
    dict(
        heart_rate=98, resting_heart_rate=72, hrv_rmssd=28, spo2=94,
        steps=3100, calories_burned=1600,
        sleep_duration=5.5, deep_sleep_ratio=0.14, rem_sleep_ratio=0.19,
        air_quality_index=160, pm2_5=55, noise_level=68, temperature=33,
        sleep_pressure=0.78, parasympathetic_score=0.36, environmental_load=0.72,
    ),
    dict(
        heart_rate=112, resting_heart_rate=85, hrv_rmssd=18, spo2=90,
        steps=1200, calories_burned=1100,
        sleep_duration=4.2, deep_sleep_ratio=0.08, rem_sleep_ratio=0.12,
        air_quality_index=210, pm2_5=95, noise_level=78, temperature=36,
        sleep_pressure=0.91, parasympathetic_score=0.22, environmental_load=0.88,
    ),
    # High Risk User
    dict(
        heart_rate=110, resting_heart_rate=82, hrv_rmssd=20, spo2=91,
        steps=1500, calories_burned=1300,
        sleep_duration=4.5, deep_sleep_ratio=0.10, rem_sleep_ratio=0.15,
        air_quality_index=200, pm2_5=90, noise_level=75, temperature=35,
        sleep_pressure=0.88, parasympathetic_score=0.25, environmental_load=0.85,
    ),
    dict(
        heart_rate=90, resting_heart_rate=70, hrv_rmssd=30, spo2=95,
        steps=4000, calories_burned=1800,
        sleep_duration=6.0, deep_sleep_ratio=0.16, rem_sleep_ratio=0.20,
        air_quality_index=120, pm2_5=45, noise_level=60, temperature=30,
        sleep_pressure=0.70, parasympathetic_score=0.40, environmental_load=0.60,
    ),
    # Add more unseen users here...
]


In [94]:
def predict_batch_rf(unseen_data_list, rf_model, feature_list, threshold=0.35,
                     risk_labels=("Low Risk", "High Risk")):
    """
    Score a batch of unseen records and label each one as low or high risk.

    Parameters
    ----------
    unseen_data_list : list of dict
        One dict per record, mapping feature name -> value. Keys that are not
        in ``feature_list`` are ignored.
    rf_model : fitted classifier
        Any object exposing ``predict_proba``; column 1 of its output is taken
        as the probability of the high-risk class.
    feature_list : sequence of str
        Feature names, in the column order the model expects (SAFE_FEATURES).
    threshold : float, default 0.35
        A record is labelled high risk when its probability is >= threshold.
    risk_labels : indexable, default ("Low Risk", "High Risk")
        Display labels looked up by predicted class (0 or 1). Supplied as a
        parameter so the function does not depend on a notebook-level global.

    Returns
    -------
    list of dict
        One entry per input record, in input order, with the keys
        "Predicted Risk", "High Risk Probability" (rounded to 3 decimals)
        and "Threshold Used". An empty input yields an empty list.

    Raises
    ------
    KeyError
        If any name in ``feature_list`` is absent from the input records.
    """
    # Nothing to score: avoid building an empty frame with no columns.
    if len(unseen_data_list) == 0:
        return []

    df_unseen = pd.DataFrame(unseen_data_list)

    # Fail early with a readable message instead of pandas' generic KeyError.
    missing = [name for name in feature_list if name not in df_unseen.columns]
    if missing:
        raise KeyError(f"Unseen records are missing required features: {missing}")

    # Select and order the columns exactly as the model was trained.
    df_unseen = df_unseen[list(feature_list)]

    # Predict probabilities for High Risk (class 1)
    probs = rf_model.predict_proba(df_unseen)[:, 1]

    return [
        {
            "Predicted Risk": risk_labels[int(prob >= threshold)],
            "High Risk Probability": round(float(prob), 3),
            "Threshold Used": threshold,
        }
        for prob in probs
    ]


In [95]:
# Score the example records with the in-memory random forest.
batch_results = predict_batch_rf(unseen_batch, rf_model, SAFE_FEATURES, threshold=0.35)

batch_results


[{'Predicted Risk': 'Low Risk',
  'High Risk Probability': 0.194,
  'Threshold Used': 0.35},
 {'Predicted Risk': 'High Risk',
  'High Risk Probability': 0.882,
  'Threshold Used': 0.35},
 {'Predicted Risk': 'High Risk',
  'High Risk Probability': 0.896,
  'Threshold Used': 0.35},
 {'Predicted Risk': 'Low Risk',
  'High Risk Probability': 0.029,
  'Threshold Used': 0.35}]

In [96]:
import joblib


In [97]:
# Persist the fitted random forest to disk.
joblib.dump(value=rf_model, filename="health_risk_rf_model.pkl")


['health_risk_rf_model.pkl']

In [98]:
# Persist the fitted label encoder alongside the model.
joblib.dump(value=le, filename="health_risk_label_encoder.pkl")


['health_risk_label_encoder.pkl']

In [99]:
# Persist the ordered feature list so inference can rebuild the same columns.
joblib.dump(value=SAFE_FEATURES, filename="health_risk_features.pkl")


['health_risk_features.pkl']

In [100]:
# Reload the three persisted artifacts, in order: random forest, label
# encoder, feature list.
# NOTE: joblib.load unpickles the file contents, so only load files you trust.
rf_model_loaded, le_loaded, features_loaded = (
    joblib.load(artifact_path)
    for artifact_path in (
        "health_risk_rf_model.pkl",
        "health_risk_label_encoder.pkl",
        "health_risk_features.pkl",
    )
)


In [105]:
# Repeat the batch prediction using the model and feature list reloaded from disk.
batch_results = predict_batch_rf(
    unseen_batch, rf_model_loaded, features_loaded, threshold=0.35
)


In [106]:
# Print the predictions produced with the reloaded model as plain text.
print(batch_results)

[{'Predicted Risk': 'Low Risk', 'High Risk Probability': 0.194, 'Threshold Used': 0.35}, {'Predicted Risk': 'High Risk', 'High Risk Probability': 0.882, 'Threshold Used': 0.35}, {'Predicted Risk': 'High Risk', 'High Risk Probability': 0.896, 'Threshold Used': 0.35}, {'Predicted Risk': 'Low Risk', 'High Risk Probability': 0.029, 'Threshold Used': 0.35}]
