# In [14]:
import pandas as pd  # Import pandas for data handling
import xgboost as xgb  # Import XGBoost for model training
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder to encode categorical variables

# Read the raw records; adjust the path if the CSV lives elsewhere
df = pd.read_csv("data.csv")

# ---- DATA PREPROCESSING ----
# "Blood Pressure" holds strings such as "120/80". Pull the two numbers out
# with a regex and store them as float columns; rows that do not match the
# pattern end up as NaN in both new columns.
bp_parts = df['Blood Pressure'].str.extract(r'(\d+)/(\d+)')
df['Systolic_BP'] = bp_parts[0].astype(float)
df['Diastolic_BP'] = bp_parts[1].astype(float)

# The combined text column is redundant once it has been split
df = df.drop(columns=['Blood Pressure'])

# Encode the categorical feature "Stress Level" as integer codes.
# NOTE: LabelEncoder numbers the labels in sorted order, not in order of
# severity -- with the labels Low/Moderate/High that gives
# High -> 0, Low -> 1, Moderate -> 2.
# The fitted encoder is kept so user input can be encoded the same way later.
label_encoder_stress = LabelEncoder().fit(df['Stress Level'])
df['Stress Level'] = label_encoder_stress.transform(df['Stress Level'])

# Encode the target "Health Condition" the same way (codes follow the sorted
# order of whatever labels the CSV contains). The fitted encoder is kept so
# predictions can be mapped back to the original labels.
label_encoder_condition = LabelEncoder().fit(df['Health Condition'])
df['Health Condition'] = label_encoder_condition.transform(df['Health Condition'])

# Split into target (y) and input features (X = every other column,
# in their existing order)
y = df['Health Condition']
feature_columns = [column for column in df.columns if column != 'Health Condition']
X = df[feature_columns]

# ---- MODEL TRAINING ----
# Hyperparameters for the XGBoost classifier, gathered in one place
xgb_params = {
    "n_estimators": 30,          # Number of boosted trees (kept low to limit overfitting)
    "learning_rate": 0.1,        # Step size applied to each boosting round
    "max_depth": 7,              # Maximum depth of each tree
    "random_state": 42,          # Fixed seed for reproducible results
    "subsample": 0.8,            # Fraction of training rows sampled per tree
    "colsample_bytree": 0.8,     # Fraction of feature columns sampled per tree
    "reg_lambda": 1.0,           # L2 regularization strength on the weights
    "eval_metric": "mlogloss",   # Multi-class log loss as the evaluation metric
}
model = xgb.XGBClassifier(**xgb_params)

# Fit on the full dataset (no hold-out split is made in this script)
model.fit(X, y)

# ---- USER INPUT FOR PREDICTION ----
# Prompt the user for one set of health readings. float() tolerates
# surrounding whitespace but raises ValueError on non-numeric text.
heart_rate = float(input("Enter Heart Rate: "))  # Heart rate in beats per minute
spo2 = float(input("Enter SpO2: "))  # Blood oxygen level (percentage)
steps = float(input("Enter Steps: "))  # Number of steps taken in a day
sleep = float(input("Enter Sleep (hrs): "))  # Number of sleep hours per day
stress_level = input("Enter Stress Level (Low, Moderate, High): ")  # Categorical input
calories_burnt = float(input("Enter Calories Burnt: "))  # Calories burnt in a day
systolic_bp = float(input("Enter Systolic BP: "))  # Systolic blood pressure
diastolic_bp = float(input("Enter Diastolic BP: "))  # Diastolic blood pressure

# Resolve the typed stress level to a label the encoder was fitted on.
# LabelEncoder.transform() only accepts exact matches, so stray whitespace or
# different capitalisation ("low", "High ") would otherwise abort the run with
# an opaque "unseen labels" error. An exact match always wins; only when
# there is none do we fall back to a whitespace/case-insensitive lookup.
stress_classes = list(label_encoder_stress.classes_)
if stress_level not in stress_classes:
    normalized_classes = {
        str(label).strip().casefold(): label for label in stress_classes
    }
    stress_key = stress_level.strip().casefold()
    if stress_key not in normalized_classes:
        raise ValueError(
            f"Unknown stress level {stress_level!r}; expected one of: "
            + ", ".join(str(label) for label in stress_classes)
        )
    stress_level = normalized_classes[stress_key]

# Convert the resolved "Stress Level" label to the integer code used in training
stress_level_encoded = label_encoder_stress.transform([stress_level])

# Build a single-row DataFrame from the user's values, keyed by the feature
# names used in the training data
new_data = pd.DataFrame({
    'Heart Rate': [heart_rate],
    'SpO2': [spo2],
    'Steps': [steps],
    'Sleep': [sleep],
    'Stress Level': stress_level_encoded,  # Encoded categorical variable (length-1 array)
    'Calories Burnt': [calories_burnt],
    'Systolic_BP': [systolic_bp],
    'Diastolic_BP': [diastolic_bp]
})

# ---- PREDICT HEALTH CONDITION ----
# XGBoost checks the DataFrame's column names against the ones seen during
# fit(), and the order has to match too. Select the columns in the training
# order (X.columns) so the prediction does not depend on the hand-written
# column order of new_data matching the CSV layout. A training feature that
# is missing from new_data raises KeyError here.
predicted_condition = model.predict(new_data[list(X.columns)])

# Map the predicted integer code back to its original label; this prints a
# one-element array such as ['Moderate']
print("Predicted Health Condition:", label_encoder_condition.inverse_transform(predicted_condition))


# ---- SAMPLE SESSION (recorded notebook output) ----
# Enter Heart Rate:  99
# Enter SpO2:  95
# Enter Steps:  11416
# Enter Sleep (hrs):  5.5
# Enter Stress Level (Low, Moderate, High):  Low
# Enter Calories Burnt:  439
# Enter Systolic BP:  127
# Enter Diastolic BP:  83
#
#
# Predicted Health Condition: ['Moderate']
