In [1]:
import pandas as pd

# Load the cleaned dataset; the path is relative to the notebook's working directory
df = pd.read_csv('../data/processed/cleaned_data.csv')

# Keep the preview in `head` (DataFrame.head() returns the first five rows by default)
head = df.head()
print(head)

   Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC  \
0  Female  21.0    1.62    64.0                            yes   no   2.0   
1  Female  21.0    1.52    56.0                            yes   no   3.0   
2    Male  23.0    1.80    77.0                            yes   no   2.0   
3    Male  27.0    1.80    87.0                             no   no   3.0   
4    Male  22.0    1.78    89.8                             no   no   2.0   

   NCP       CAEC SMOKE  CH2O  SCC  FAF  TUE        CALC  \
0  3.0  Sometimes    no   2.0   no  0.0  1.0          no   
1  3.0  Sometimes   yes   3.0  yes  3.0  0.0   Sometimes   
2  3.0  Sometimes    no   2.0   no  2.0  1.0  Frequently   
3  3.0  Sometimes    no   2.0   no  2.0  0.0  Frequently   
4  1.0  Sometimes    no   2.0   no  0.0  0.0   Sometimes   

                  MTRANS           NObeyesdad  
0  Public_Transportation        Normal_Weight  
1  Public_Transportation        Normal_Weight  
2  Public_Transportation        

# Handling Data Types

In [2]:
# Keep an untouched copy of the target column; it is label-encoded separately later
y_original = df['NObeyesdad'].copy()

# Classify every non-target column by dtype. Dropping the target up front means
# it can never end up in either list, whatever its dtype is.
feature_df = df.drop(columns='NObeyesdad')
cat_cols = feature_df.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = feature_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Columns whose dtype matches neither selection (e.g. bool or float32) would be
# neither scaled nor encoded downstream. Report them here instead of letting
# them slip through unnoticed.
unclassified_cols = [
    col for col in feature_df.columns
    if col not in cat_cols and col not in num_cols
]

print("\n===== FEATURE TYPES =====")
print(f"Identified {len(cat_cols)} categorical columns: {cat_cols}")
print(f"Identified {len(num_cols)} numerical columns: {num_cols}")
if unclassified_cols:
    print(f"WARNING: {len(unclassified_cols)} columns are neither categorical nor numerical: {unclassified_cols}")
print("=" * 60)


===== FEATURE TYPES =====
Identified 8 categorical columns: ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
Identified 8 numerical columns: ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']


# DATA PREPROCESSING

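This section prepares the data for model training: the target variable is label-encoded,
the data is split into training and testing sets, and a preprocessor fitted on the training set only
scales the numerical features and one-hot encodes the categorical ones.
These steps ensure the data is in the correct format for machine learning algorithms.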

### Encoding the Target Variable

In [3]:
import os
import pickle

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# Label-encode the target: each distinct class name becomes an integer code
le = LabelEncoder()
y = le.fit_transform(y_original)

# Cast the codes to plain int so the printed mapping looks the same regardless
# of how the installed NumPy version renders its scalar types
class_mapping = {
    label: int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print(f"Classes mapping: {class_mapping}")

# Drop the target variable from the dataframe before one-hot encoding
X_df = df.drop('NObeyesdad', axis=1)

# Save the fitted label encoder for future use. Create the output directory
# if it is missing, and write through a context manager so the file handle is
# flushed and closed even if pickling fails.
os.makedirs('../models', exist_ok=True)
with open('../models/label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)
print("Label encoder saved to '../models/label_encoder.pkl'")


Classes mapping: {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Obesity_Type_I': 2, 'Obesity_Type_II': 3, 'Obesity_Type_III': 4, 'Overweight_Level_I': 5, 'Overweight_Level_II': 6}
Label encoder saved to '../models/label_encoder.pkl'


### Split the data into train/test

In [4]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. Stratifying on the encoded labels
# keeps the class proportions the same in both partitions, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


Training set: (1669, 16), Test set: (418, 16)


### Preprocessing pipeline

In [5]:
from sklearn.compose import ColumnTransformer

# Build the preprocessing step: standardise numerical columns and one-hot
# encode categorical ones (first level dropped). Any other column is passed
# through unchanged. verbose_feature_names_out=False keeps the output names
# free of the 'num__' / 'cat__' prefixes so they match the plain column names.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), cat_cols)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)
# Fit on the training split only, then apply the same fitted transform to the test split
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Per-transformer name lists, kept for any later cell that may reference them
num_feature_names = num_cols
cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols)
# Ask the fitted transformer for the full, ordered list of output names. Unlike
# concatenating the two lists above, this also covers passthrough columns, so
# the number of names always matches the number of output columns.
feature_names = preprocessor.get_feature_names_out().tolist()
print(f"Training set after preprocessing: {X_train_processed.shape}")
print(f"Test set after preprocessing: {X_test_processed.shape}")

# Convert to DataFrames for easier saving
X_train_df = pd.DataFrame(X_train_processed, columns=feature_names)
X_test_df = pd.DataFrame(X_test_processed, columns=feature_names)

# Save the fitted preprocessor for future use; the context manager closes the file handle
with open('../models/preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)
print("Preprocessor pipeline saved to '../models/preprocessor.pkl'")

# Save the raw and encoded column names for future reference
feature_columns = {
    'categorical': cat_cols,
    'numerical': num_cols,
    'encoded': feature_names
}
with open('../models/feature_columns.pkl', 'wb') as feature_columns_file:
    pickle.dump(feature_columns, feature_columns_file)
print("Feature column names saved to '../models/feature_columns.pkl'")
print(feature_columns)

Training set after preprocessing: (1669, 23)
Test set after preprocessing: (418, 23)
Preprocessor pipeline saved to '../models/preprocessor.pkl'
Feature column names saved to '../models/feature_columns.pkl'
{'categorical': ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS'], 'numerical': ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'], 'encoded': ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'Gender_Male', 'family_history_with_overweight_yes', 'FAVC_yes', 'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no', 'SMOKE_yes', 'SCC_yes', 'CALC_Frequently', 'CALC_Sometimes', 'CALC_no', 'MTRANS_Bike', 'MTRANS_Motorbike', 'MTRANS_Public_Transportation', 'MTRANS_Walking']}


### Save train/test data

In [6]:
import os

# Write the processed splits to disk as four CSV files
save_dir = '../data/processed/'
os.makedirs(save_dir, exist_ok=True)

# Build the four output paths in one pass
train_features_path, train_labels_path, test_features_path, test_labels_path = (
    os.path.join(save_dir, file_name)
    for file_name in (
        "train_features.csv",
        "train_labels.csv",
        "test_features.csv",
        "test_labels.csv",
    )
)

# Pair each table with its destination; labels are wrapped in a one-column
# 'target' frame so all four are written the same way (no index column).
outputs = (
    (X_train_df, train_features_path),
    (pd.DataFrame(y_train, columns=['target']), train_labels_path),
    (X_test_df, test_features_path),
    (pd.DataFrame(y_test, columns=['target']), test_labels_path),
)
for frame, out_path in outputs:
    frame.to_csv(out_path, index=False)
    print(f"Data saved to: {out_path}")

Data saved to: ../data/processed/train_features.csv
Data saved to: ../data/processed/train_labels.csv
Data saved to: ../data/processed/test_features.csv
Data saved to: ../data/processed/test_labels.csv
