In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import pickle

**Preprocessing**

In [2]:
# Load data
df = pd.read_csv('/Users/debby/Downloads/ObesityDataSet1.csv')

# Preprocessing
# 1. Perbaiki kolom Age
df['Age'] = df['Age'].str.replace('years','',case=False).str.strip()
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# 2. Drop data yang value di target labelnya (NObeyesdad) NaN
df = df.dropna(subset=['NObeyesdad'])

# Define fitur (input) dan target variable (output)
df_input = df.drop('NObeyesdad', axis=1)
df_output = df['NObeyesdad']

# Split data
x_train, x_test, y_train, y_test = train_test_split(
    df_input, df_output, test_size=0.2, random_state=42, stratify=df_output
)

# Define obesity levels untuk referensi
obesity_levels = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

# Encode target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

**Define steps preprocessing**

In [3]:
# Variabel numerikal
num_cols = ['Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Variabel kategorikal
cat_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'CAEC', 'CALC']
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(
        categories=[
            ['Male', 'Female'],
            ['no', 'yes'],
            ['no', 'yes'],
            ['no', 'yes'],
            ['no', 'yes'],
            ['no', 'Sometimes', 'Frequently', 'Always'],
            ['no', 'Sometimes', 'Frequently']
        ]
    ))
])

# One-hot encoding kolom MTRANS 
mtrans_features = ['MTRANS']
mtrans_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first'))
])

# Satukan semua preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols),
        ('mtrans', mtrans_transformer, mtrans_features)
    ])

**Full pipeline XGBoost**

In [4]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier())
])

# Fit pipeline dengan y_train_encoded
pipeline.fit(x_train, y_train_encoded)

# Save pipeline, label encoder, dan obesity levels
with open('xgb_pipeline.pkl', 'wb') as file:
    pickle.dump({
        'pipeline': pipeline,
        'label_encoder': label_encoder,
        'obesity_levels': obesity_levels
    }, file)

In [5]:
# Prediksi
y_pred_encoded = pipeline.predict(x_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Classification report
print('\nClassification Report\n')
print(classification_report(y_test, y_pred, target_names=obesity_levels))


Classification Report

                     precision    recall  f1-score   support

Insufficient_Weight       0.93      1.00      0.96        27
      Normal_Weight       0.90      0.90      0.90        29
 Overweight_Level_I       0.94      0.97      0.96        35
Overweight_Level_II       1.00      0.97      0.98        30
     Obesity_Type_I       1.00      1.00      1.00        32
    Obesity_Type_II       0.90      0.90      0.90        29
   Obesity_Type_III       0.96      0.90      0.93        29

           accuracy                           0.95       211
          macro avg       0.95      0.95      0.95       211
       weighted avg       0.95      0.95      0.95       211

