# Obesity Risk Prediction using Tree-based models

This notebook trains tree-based models to predict obesity levels, framed as a multi-class classification problem. A baseline decision tree, a random forest and an XGBoost model are tuned and compared.

In [1]:
# Importing libraries 
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, classification_report, 
                            confusion_matrix, f1_score)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the already-split train and test sets from their feather files
processed_dir = os.path.join("..", "processed_data")
train_path = os.path.join(processed_dir, "train_data.feather")
test_path = os.path.join(processed_dir, "test_data.feather")

train_df = pd.read_feather(train_path)
test_df = pd.read_feather(test_path)

# Separate the target column ("obesity_level") from the predictors in each split
X_train, y_train = train_df.drop(columns=["obesity_level"]), train_df["obesity_level"]
X_test, y_test = test_df.drop(columns=["obesity_level"]), test_df["obesity_level"]


In [3]:

# Assign each feature column to a preprocessing group based on its dtype
categorical_cols = X_train.select_dtypes(include=["category"]).columns
numerical_cols = X_train.select_dtypes(include=["float64"]).columns

# Only "category" and "float64" columns are picked up above. The ColumnTransformer
# built from these groups uses the default remainder="drop", so a column of any
# other dtype (int, float32, object, bool, ...) would silently vanish from the
# model inputs. Report such columns so the omission is visible.
# print() is used on purpose: warnings are globally silenced in the import cell.
assigned_cols = set(categorical_cols) | set(numerical_cols)
unassigned_cols = [col for col in X_train.columns if col not in assigned_cols]
if unassigned_cols:
    print(f"WARNING: columns not covered by any transformer (they will be dropped): {unassigned_cols}")

In [4]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (categories not seen during fit are ignored at transform time)
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

##### While `DecisionTreeClassifier` and `RandomForestClassifier` accept both numeric and string class labels, `XGBClassifier` expects numeric class labels. We therefore encode the target as integers and, after making predictions, convert the predicted integers back to the original labels for evaluation.

In [5]:
# Learn the class-name -> integer mapping from the training target ...
label_encoder = LabelEncoder().fit(y_train)

# ... and apply it to obtain the integer-encoded training labels
y_train_encoded = label_encoder.transform(y_train)

#### Decision Trees

In [6]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV


# Decision tree: chain the shared preprocessing with the classifier
pipeline_dt = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", DecisionTreeClassifier(random_state=42)),
])

# Hyperparameter candidates; the "classifier__" prefix routes each setting
# to the pipeline step of that name
param_grid = dict(
    classifier__max_depth=[3, 5, 7, 10, 15, 20, None],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
    classifier__criterion=["gini", "entropy"],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
grid_search = GridSearchCV(
    estimator=pipeline_dt,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the winning pipeline and report what the search found
best_dt = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Accuracy of the tuned tree on the held-out test split
y_pred = best_dt.predict(X_test)
print("\nTest Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Best parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}
Best cross-validation accuracy: 0.9479

Test Set Performance:
Accuracy: 0.9622


#### TODO: Add plots and markdown explanation cells

#### Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Random forest: same preprocessing, followed by the ensemble classifier
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
pipeline_rf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", rf_classifier)])

# Search space for the forest (keys address the "classifier" pipeline step)
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 10, 20, 30],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
    classifier__max_features=["sqrt", "log2"],
)

# 5-fold cross-validated grid search, optimising accuracy
grid_search = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning pipeline plus a summary of the search
best_rf = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Accuracy of the tuned forest on the held-out test split
y_pred = best_rf.predict(X_test)
print("\nTest Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Helper function to get feature names after preprocessing
def get_feature_names(column_transformer):
    """Return the output feature names of a fitted ColumnTransformer.

    Walks ``column_transformer.transformers_`` in order and concatenates the
    names produced by each entry. No transformer-name prefix is added by this
    helper; names are whatever each transformer's ``get_feature_names_out``
    returns for its input columns.

    Parameters
    ----------
    column_transformer : object
        Anything exposing ``transformers_`` as an iterable of
        ``(name, transformer, columns)`` triples.

    Returns
    -------
    list
        Feature names, in the order the transformers appear. Empty if there
        are no contributing transformers.
    """
    col_names = []
    for _name, transformer, orig_columns in column_transformer.transformers_:
        # Dropped columns contribute nothing to the output.
        if isinstance(transformer, str) and transformer == 'drop':
            continue
        # A single column given as a bare string must remain one name;
        # extending a list with a str would otherwise add it character by character.
        if isinstance(orig_columns, str):
            orig_columns = [orig_columns]
        # A transformer with an empty column selection is never fitted, so asking
        # it for feature names would raise; it produces no output columns anyway.
        if hasattr(orig_columns, '__len__') and len(orig_columns) == 0:
            continue
        if hasattr(transformer, 'get_feature_names_out'):
            names = transformer.get_feature_names_out(orig_columns)
        else:
            # e.g. the 'passthrough' marker, or an estimator without the method:
            # the columns pass through under their original names.
            names = orig_columns
        col_names.extend(names)
    return col_names

# Pair every transformed feature with its importance in the tuned forest
rf_steps = best_rf.named_steps
feature_importances = rf_steps['classifier'].feature_importances_
features = get_feature_names(rf_steps['preprocessor'])

# Tabulate the importances, most important feature first
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\nTop 5 Important Features:")
print(importance_df.head(n=5))

Best parameters: {'classifier__max_depth': 20, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Best cross-validation accuracy: 0.9230

Test Set Performance:
Accuracy: 0.9267

Top 5 Important Features:
         Feature  Importance
2      weight_kg    0.283676
0            age    0.101957
1       height_m    0.093542
4    gender_Male    0.043114
3  gender_Female    0.036349


### NB: Use scikit-learn version 1.5.2: add to requirements.txt

#### XGBoost

In [13]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# XGBoost: shared preprocessing followed by a gradient-boosted classifier
xgb_classifier = XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=1,
)
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", xgb_classifier)])

# Search space for the booster (keys address the "classifier" pipeline step)
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[3, 5, 7, 10],
    classifier__learning_rate=[0.01, 0.1, 0.2],
    classifier__subsample=[0.8, 1.0],
    classifier__colsample_bytree=[0.8, 1.0],
    classifier__gamma=[0, 0.1, 0.2],
)

# 5-fold cross-validated grid search on the integer-encoded target
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=1,
)
grid_search.fit(X_train, y_train_encoded)

# Winning pipeline plus a summary of the search
best_xgb = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Predict on the test split and map the integer predictions back to class
# names so they can be compared with the un-encoded y_test
y_pred = label_encoder.inverse_transform(best_xgb.predict(X_test))
print("\nTest Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Pair every transformed feature with its importance in the tuned booster
xgb_steps = best_xgb.named_steps
feature_importances = xgb_steps['classifier'].feature_importances_
features = get_feature_names(xgb_steps['preprocessor'])

# Tabulate the importances, most important feature first
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\nTop 5 Important Features:")
print(importance_df.head(n=5))

Best parameters: {'classifier__colsample_bytree': 0.8, 'classifier__gamma': 0, 'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__n_estimators': 200, 'classifier__subsample': 1.0}
Best cross-validation accuracy: 0.9710

Test Set Performance:
Accuracy: 0.9598

Top 5 Important Features:
                       Feature  Importance
3                gender_Female    0.145731
4                  gender_Male    0.120098
18            snacking_freq_no    0.115379
2                    weight_kg    0.062848
8   high_caloric_food_freq_yes    0.058439


# OLD STUFF

In [9]:
# Define the models to be evaluated.
# Each estimator gets a fixed seed: tree construction and bootstrap sampling are
# randomised, so without it the grid-search results change from run to run.
# 42 matches the seed used by the other model cells in this notebook.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric="mlogloss", random_state=42),
}

# Define the hyperparameters for each model.
# Keys use the "model__" prefix because the estimator is placed in a pipeline
# step named "model" by the tuning loop.
param_grids = {
    "Decision Tree": {
        "model__max_depth": [None, 5, 10, 15],
        "model__min_samples_split": [2, 5, 10],
        "model__min_samples_leaf": [1, 2, 4],
    },
    "Random Forest": {
        "model__n_estimators": [100, 200],
        "model__max_depth": [None, 5, 10],
        "model__min_samples_split": [2, 5],
        "model__min_samples_leaf": [1, 2],
    },
    "XGBoost": {
        "model__n_estimators": [100, 200],
        "model__max_depth": [3, 5],
        "model__learning_rate": [0.01, 0.1],
    },
}


In [10]:
# Best pipeline and cross-validation score per model name
best_models = {}

# Tune each candidate in turn with a cross-validated grid search
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Preprocessing and estimator are wrapped together so both are refit per fold
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    # 5-fold grid search over this model's own parameter grid
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[model_name],
        scoring="accuracy",
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train_encoded)

    # Record the winner for the later test-set evaluation
    best_models[model_name] = dict(
        model=grid_search.best_estimator_,
        score=grid_search.best_score_,
    )
    print(f"Best score for {model_name}: {grid_search.best_score_}")
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

Training Decision Tree...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best score for Decision Tree: 0.9318754060365564
Best parameters for Decision Tree: {'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2}
Training Random Forest...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best score for Random Forest: 0.9253524836268502
Best parameters for Random Forest: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Training XGBoost...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best score for XGBoost: 0.9632732252910294
Best parameters for XGBoost: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200}


In [11]:
  # Evaluate the best models on the test set
for model_name, model_info in best_models.items():
    print(f"Evaluating {model_name} on the test set...")

    # Predict, then map the integer predictions back to the original class
    # names so they are comparable with the un-encoded y_test
    y_pred = label_encoder.inverse_transform(model_info["model"].predict(X_test))

    # Headline metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    print(f"Accuracy for {model_name}: {accuracy}")
    print(f"F1 Score for {model_name}: {f1}")

    # Per-class breakdown followed by the confusion matrix
    for report in (classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred)):
        print(report)

Evaluating Decision Tree on the test set...
Accuracy for Decision Tree: 0.9385342789598109
F1 Score for Decision Tree: 0.9384826114039201
                     precision    recall  f1-score   support

Insufficient_Weight       0.92      0.96      0.94        56
      Normal_Weight       0.90      0.87      0.89        62
     Obesity_Type_I       0.95      0.94      0.94        78
    Obesity_Type_II       0.96      0.95      0.96        58
   Obesity_Type_III       1.00      1.00      1.00        63
 Overweight_Level_I       0.90      0.93      0.91        56
Overweight_Level_II       0.94      0.92      0.93        50

           accuracy                           0.94       423
          macro avg       0.94      0.94      0.94       423
       weighted avg       0.94      0.94      0.94       423

[[54  2  0  0  0  0  0]
 [ 5 54  0  0  0  3  0]
 [ 0  0 73  2  0  0  3]
 [ 0  0  3 55  0  0  0]
 [ 0  0  0  0 63  0  0]
 [ 0  4  0  0  0 52  0]
 [ 0  0  1  0  0  3 46]]
Evaluating Random F