In [14]:
import pandas as pd

# Read the cleaned dataset from the processed-data folder.
df = pd.read_csv('../data/processed/cleaned_data.csv')

# Preview the leading rows (five, which is also the pandas default) as text.
head = df.head(n=5)
print(head)

   Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC  \
0  Female  21.0    1.62    64.0                            yes   no   2.0   
1  Female  21.0    1.52    56.0                            yes   no   3.0   
2    Male  23.0    1.80    77.0                            yes   no   2.0   
3    Male  27.0    1.80    87.0                             no   no   3.0   
4    Male  22.0    1.78    89.8                             no   no   2.0   

   NCP       CAEC SMOKE  CH2O  SCC  FAF  TUE        CALC  \
0  3.0  Sometimes    no   2.0   no  0.0  1.0          no   
1  3.0  Sometimes   yes   3.0  yes  3.0  0.0   Sometimes   
2  3.0  Sometimes    no   2.0   no  2.0  1.0  Frequently   
3  3.0  Sometimes    no   2.0   no  2.0  0.0  Frequently   
4  1.0  Sometimes    no   2.0   no  0.0  0.0   Sometimes   

                  MTRANS           NObeyesdad  
0  Public_Transportation        Normal_Weight  
1  Public_Transportation        Normal_Weight  
2  Public_Transportation        

# Handling Data Types

In [15]:
# Keep an untouched copy of the target column before any preprocessing.
y_original = df['NObeyesdad'].copy()

# Split the feature columns by dtype. The target is filtered out of both lists
# so it is never treated as a feature.
# 'number' matches every numeric dtype (int32, float32, unsigned, ...); listing
# only int64/float64 would silently leave other numeric columns out of
# num_cols.
cat_cols = [
    col
    for col in df.select_dtypes(include=['object', 'category']).columns
    if col != 'NObeyesdad'
]
num_cols = [
    col
    for col in df.select_dtypes(include='number').columns
    if col != 'NObeyesdad'
]

print("\n===== FEATURE TYPES =====")
print(f"Identified {len(cat_cols)} categorical columns: {cat_cols}")
print(f"Identified {len(num_cols)} numerical columns: {num_cols}")
print("=" * 60)


===== FEATURE TYPES =====
Identified 8 categorical columns: ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
Identified 8 numerical columns: ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']


# DATA PREPROCESSING

This section prepares the data for model training: it encodes the categorical variables,
splits the data into training and testing sets, and scales the numerical features
(the scaler is fitted on the training set only). These steps put the data in the format that machine learning algorithms expect.

### Encoding Categorical Variables

In [16]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

# Label-encode the target: each class name is replaced by an integer code.
le = LabelEncoder()
y = le.fit_transform(y_original)
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(f"Classes mapping: {class_mapping}")

# Separate the features from the target before one-hot encoding.
X_df = df.drop('NObeyesdad', axis=1)

# One-hot encode every categorical column; drop_first=True omits the first
# level of each column, so k levels become k-1 dummy columns.
X_encoded = pd.get_dummies(X_df, columns=cat_cols, drop_first=True)
print(f"Dataset shape after one-hot encoding: {X_encoded.shape}")

# Save the fitted label encoder for future use. The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
with open('../models/label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)
print("Label encoder saved to '../models/label_encoder.pkl'")


Classes mapping: {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Obesity_Type_I': 2, 'Obesity_Type_II': 3, 'Obesity_Type_III': 4, 'Overweight_Level_I': 5, 'Overweight_Level_II': 6}
Dataset shape after one-hot encoding: (2087, 23)
Label encoder saved to '../models/label_encoder.pkl'


### Split the data into train/test

In [17]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


Training set: (1669, 23), Test set: (418, 23)


### Feature Scaling

In [19]:
# Standardise the numerical features; the one-hot dummy columns are left as-is.
scaler = StandardScaler()

# Positions of the numerical columns within the one-hot encoded frame.
num_feature_indices = [X_train.columns.get_loc(col) for col in num_cols if col in X_train.columns]

# Work on array copies so X_train / X_test themselves stay unscaled.
# NOTE(review): if the dummy columns are bool (newer pandas), .values yields an
# object-dtype array -- confirm downstream consumers are fine with that.
X_train_array = X_train.values.copy()
X_test_array = X_test.values.copy()

# Fit on the training rows only, then reuse those statistics for the test rows
# so the test set does not influence the scaling parameters.
X_train_array[:, num_feature_indices] = scaler.fit_transform(X_train_array[:, num_feature_indices])
X_test_array[:, num_feature_indices] = scaler.transform(X_test_array[:, num_feature_indices])

# Save the fitted scaler for future use. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open('../models/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
print("Scaler saved to '../models/scaler.pkl'")

Scaler saved to '../models/scaler.pkl'


In [20]:

# Record which columns were treated as categorical / numerical, together with
# the final one-hot encoded column order, for future reference.
feature_columns = {
    'categorical': cat_cols,
    'numerical': num_cols,
    'encoded': X_encoded.columns.tolist()
}

# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open('../models/feature_columns.pkl', 'wb') as feature_columns_file:
    pickle.dump(feature_columns, feature_columns_file)
print("Feature column names saved to '../models/feature_columns.pkl'")
print("=" * 60)

Feature column names saved to '../models/feature_columns.pkl'


### Save train/test data

In [25]:
import os

def save_df(df, path):
    """Write ``df`` to ``path`` as a CSV file (index omitted) and report it.

    Args:
        df: DataFrame to persist.
        path: Destination file path.

    Returns:
        None.

    The confirmation is printed only after ``to_csv`` returns, so a failed
    write raises without first claiming the data was saved.
    """
    df.to_csv(path, index=False)
    print("Data saved to:", path)

save_dir = '../data/processed/'

# Destination CSVs, all inside save_dir.
train_features_path, train_labels_path, test_features_path, test_labels_path = (
    os.path.join(save_dir, filename)
    for filename in (
        "train_features.csv",
        "train_labels.csv",
        "test_features.csv",
        "test_labels.csv",
    )
)

# The feature arrays are wrapped back into DataFrames so the CSVs carry the
# encoded column names; the labels are written as single-column frames.
save_df(df=pd.DataFrame(X_train_array, columns=X_train.columns), path=train_features_path)
save_df(df=pd.DataFrame(y_train), path=train_labels_path)
save_df(df=pd.DataFrame(X_test_array, columns=X_test.columns), path=test_features_path)
save_df(df=pd.DataFrame(y_test), path=test_labels_path)

Data saved to: ../data/processed/train_features.csv
Data saved to: ../data/processed/train_labels.csv
Data saved to: ../data/processed/test_features.csv
Data saved to: ../data/processed/test_labels.csv
