# Obesity Prediction ML Project

A step-by-step notebook to predict obesity levels using machine learning.

---

## Load Data

In [ ]:
import pandas as pd
# Load the raw dataset from the CSV path below into `df`; later cells
# overwrite `df` with its cleaned/encoded version.
# NOTE(review): the path is relative -- presumably to the notebook's own
# directory; confirm the working directory before running.
df = pd.read_csv(filepath_or_buffer='../data/obesity_data.csv')
# Last expression of the cell: shows the first five rows as a quick preview.
df.head(n=5)

## Data Preprocessing

In [ ]:
from preprocess import clean_data, encode_features, scale_features, balance_classes
# Clean the raw frame, then encode its features; `df` is rebound to the result.
df = encode_features(clean_data(df))
# Separate the target column from the predictors.
y = df['ObesityLevel']
X = df.drop(columns='ObesityLevel')
# Scale the predictors; the fitted scaler object is kept in `scaler`.
X_scaled, scaler = scale_features(X)
# Rebalance the classes on the scaled data.
# NOTE(review): scaling and balancing both run on the full dataset, before any
# train/test split -- evaluation done later on X_res / y_res may be optimistic.
X_res, y_res = balance_classes(X_scaled, y)

## Feature Selection

In [ ]:
from feature_selection import select_k_best
# Re-attach the original predictor names to the resampled data before passing
# it to the selector.
resampled_frame = pd.DataFrame(data=X_res, columns=X.columns)
# Keep the 5 best features; the helper returns the reduced data and the
# names of the selected columns.
X_best, selected_features = select_k_best(resampled_frame, y_res, k=5)
# Wrap the reduced data in a DataFrame labelled with the selected names.
X_best = pd.DataFrame(data=X_best, columns=selected_features)

## Modeling & Evaluation

In [ ]:
from modeling import train_and_evaluate
# Train and evaluate on the selected features; `results` is iterated below as
# a mapping of model name -> mapping of metric name -> value.
results = train_and_evaluate(X_best, y_res)
# Print one header per model followed by one "name: value" line per metric.
for model_name in results:
    print(f'-- {model_name} --')
    for metric_name, metric_value in results[model_name].items():
        print(f'{metric_name}: {metric_value}')

## Visualize Confusion Matrix (example for Random Forest)

In [ ]:
from sklearn.model_selection import train_test_split
from utils import plot_conf_matrix
from sklearn.ensemble import RandomForestClassifier
# Build the hold-out set from REAL rows only. Earlier cells run
# balance_classes on the whole dataset before any split, so splitting
# X_best / y_res (as this cell used to) lets resampled rows land in the test
# set and inflates the confusion matrix. Split first, then balance the
# training portion only.
# NOTE(review): the scaler and the feature selector are still fitted on all
# rows upstream; fit them on the training fold only to remove that remaining
# leakage. Numbers here may therefore differ from the `results` printed above.
X_real = pd.DataFrame(X_scaled, columns=X.columns)[list(selected_features)]
X_train, X_test, y_train, y_test = train_test_split(
    X_real, y, test_size=0.2, random_state=42, stratify=y
)
# Rebalance the training rows only; the test rows keep the original class mix.
X_train, y_train = balance_classes(X_train, y_train)
# balance_classes may hand back plain arrays (assumption -- confirm); restore
# the column names so fit() and predict() see identically labelled features.
X_train = pd.DataFrame(X_train, columns=X_real.columns)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Axis labels for the plot: every distinct target value, in sorted order.
classes = sorted(df['ObesityLevel'].unique())
plot_conf_matrix(y_test, y_pred, classes)

## Key Learnings
- Handling imbalanced datasets with SMOTE
- Feature selection and model tuning
- Cross-validation for reliability
- Insights on lifestyle predictors for health