<div style="text-align: left; font-size: 14px; line-height: 1.4">
<strong>Pontificia Universidad Católica de Chile</strong><br>
Facultad de Matemática<br>
Magíster en Inteligencia Artificial - MIA
</div>

<hr style="border: 1px solid #999;">

<div style="text-align: center">
<h2 style="margin-bottom: 0.3em">Proyecto Aplicado: Feature Selection</h2>
<h4 style="margin-top: 0.2em; margin-bottom: 0.2em">EPG4001 – Aprendizaje Supervisado</h4>
<h5 style="margin-top: 0.2em; font-weight: normal">Segundo Bimestre</h5>
</div>

<hr style="border: 1px solid #999;">

<div style="text-align: center; font-size: 15px; margin-bottom: 1em">
<strong>Profesor:</strong> Jonathan Acosta
</div>

<div style="font-size: 14px">
Julio 2025<br>
Glen Restrepo A.<br>
Javiera Vukasovic F.<br>
Marco Gutierrez C.<br>
Maximiliano Zapater C. <br>
Sebastián Silva E.
</div>


## Library Imports

In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import classification_report, ConfusionMatrixDisplay


## Data Import

In [None]:
# Read the raw dataset; the CSV is expected one directory above the notebook.
raw_csv_path = '../ObesityDataSet_raw_and_data_sinthetic.csv'
df_raw = pd.read_csv(raw_csv_path)

# Show (rows, columns) as the cell output to sanity-check the load.
df_raw.shape

In [3]:
# Work on an independent deep copy so df_raw stays exactly as loaded.
df = df_raw.copy(deep=True)

## Data Splitting
- Para la selección de variables utilizaremos una partición 70/30; para entrenar el modelo final podemos separar validación y test.

In [4]:
# 'NObeyesdad' is the target; every other column is a candidate feature.
y = df['NObeyesdad']
X = df.drop(columns='NObeyesdad')

# Expand the non-numeric columns into dummy indicators, dropping the first
# level of each so the encoded columns are not redundant.
X_encoded = pd.get_dummies(X, drop_first=True)

# Stratified 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Feature selection

### Random Forest Feature Importance

In [6]:
# Fit a 100-tree random forest; oob_score=True additionally computes the
# out-of-bag estimate during fitting.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    oob_score=True,
)
rf_model.fit(X_train, y_train)

# Pair every encoded column with its importance, most important first.
importances = rf_model.feature_importances_
feature_names = X_encoded.columns
feature_importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
).sort_values(by='importance', ascending=False)

In [None]:
# Horizontal bar chart of the importances, in the row order of
# feature_importance_df.
importance_fig, importance_ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    data=feature_importance_df,
    x='importance',
    y='feature',
    ax=importance_ax,
)
importance_ax.set_title('Random Forest Feature Importance')
plt.show()

In [None]:
# Report the out-of-bag score of rf_model (it was built with oob_score=True).
oob_message = f"Out-of-Bag (OOB) Score: {rf_model.oob_score_:.4f}"
print(oob_message)

#### Recursive Feature Elimination (RFE) 

In [None]:

# Base estimator whose fitted importances drive the elimination.
rfe_estimator = RandomForestClassifier(n_estimators=50, random_state=42)

# Drop one feature per round (step=1) until 10 remain.
rfe_selector = RFE(
    estimator=rfe_estimator,
    n_features_to_select=10,
    step=1,
)
rfe_selector.fit(X_train, y_train)

# Boolean mask over the training columns -> names of the surviving features.
selected_features_mask = rfe_selector.support_
selected_feature_names = X_train.columns[selected_features_mask]

print("Features selected by RFE:")
print(selected_feature_names.tolist())

In [None]:
# --- Model 1: Full Features ---
# Baseline forest trained on every encoded feature.
rf_full = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_full = rf_full.predict(X_test)

print("--- Classification Report for Full Model ---")
print(classification_report(y_test, y_pred_full))

# Draw the confusion matrix and title the axes it was drawn on.
full_cm_display = ConfusionMatrixDisplay.from_estimator(
    rf_full,
    X_test,
    y_test,
    xticks_rotation='vertical',
)
full_cm_display.ax_.set_title('Confusion Matrix - Full Model')
plt.show()

In [None]:
# --- Model 2: RFE-Selected Features ---
# Restrict both splits to the columns kept by RFE.
X_train_rfe = X_train.loc[:, selected_feature_names]
X_test_rfe = X_test.loc[:, selected_feature_names]

# Same forest configuration as the full model, so only the feature set differs.
rf_rfe = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_rfe, y_train)
y_pred_rfe = rf_rfe.predict(X_test_rfe)

print("\n--- Classification Report for RFE-Selected Model ---")
print(classification_report(y_test, y_pred_rfe))

# Draw the confusion matrix and title the axes it was drawn on.
rfe_cm_display = ConfusionMatrixDisplay.from_estimator(
    rf_rfe,
    X_test_rfe,
    y_test,
    xticks_rotation='vertical',
)
rfe_cm_display.ax_.set_title('Confusion Matrix - RFE Model')
plt.show()

### Resultado: mejor modelo con menos variables

## Train models

In [20]:
# Derive a body-mass-index column: weight divided by squared height.
# NOTE(review): assumes Weight is in kg and Height in metres — confirm against
# the dataset documentation.
df['BMI'] = df['Weight'].div(df['Height'].pow(2))

In [21]:
# Rebuild target and feature matrix from the current df for the modelling
# stage; X keeps the original (non one-hot) columns.
y = df['NObeyesdad']
X = df.drop(columns='NObeyesdad')

In [27]:
# First cut: 60% for training, 40% held back, stratified on the target.
holdout_split = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)
X_train, X_temp, y_train, y_temp = holdout_split

# Second cut: halve the held-back 40% into validation (20%) and test (20%),
# again stratified so class proportions are kept.
val_test_split = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)
X_val, X_test, y_val, y_test = val_test_split

In [None]:
# Identify categorical features
def _is_categorical_dtype(dtype):
    """Return True when a column dtype holds text/categorical data.

    Comparing against 'object' alone misses pandas ``category`` columns and
    dedicated string dtypes; those columns would then be handed to CatBoost
    as if they were numeric.
    """
    return (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )


# Positional indices (not names) of the non-numeric columns of X, in column order.
categorical_features_indices = [
    position
    for position, col_type in enumerate(X.dtypes)
    if _is_categorical_dtype(col_type)
]
print(f"Categorical feature column indices: {categorical_features_indices}")

In [29]:
# CatBoost consumes the categorical columns directly (declared by position
# through cat_features), so no one-hot encoding is applied here.
cat_model = CatBoostClassifier(iterations=500,
                               learning_rate=0.1,
                               depth=6,
                               cat_features=categorical_features_indices,
                               random_state=42,
                               verbose=0)

# Train, monitoring the validation split: boosting stops once the validation
# metric has not improved for 50 rounds, so the 500 iterations act as an upper
# bound instead of a fixed budget. The test split stays untouched for the
# final evaluation.
cat_model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
)

# Predict; flatten so the predictions are a 1-D label array regardless of the
# shape CatBoost returns for multiclass targets.
y_pred_cat = np.ravel(cat_model.predict(X_test))

In [None]:
# Evaluate the model
# Flatten once so the report and the confusion matrix are built from exactly
# the same 1-D array of predicted labels.
y_pred_cat_flat = np.ravel(y_pred_cat)

print("--- Classification Report for CatBoost Model ---")
print(classification_report(y_test, y_pred_cat_flat))

# Reuse the stored predictions instead of asking the estimator to predict a
# second time; this also avoids depending on scikit-learn accepting the
# third-party CatBoost object as a classifier.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_cat_flat, xticks_rotation='vertical')
plt.title('Confusion Matrix - CatBoost Model')
plt.show()