In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import pickle

In [2]:
df = pd.read_csv(r"C:\Users\Lenovo\OneDrive\Documents\Data science\smst 4\Model Deployment\UAS Project\ObesityDataSet2.csv")
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

cat_cols = ['MTRANS']
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])
df['FCVC'] = df['FCVC'].fillna(df['FCVC'].median())

y = df['NObeyesdad']
X = df.drop(columns='NObeyesdad')

cat_features = X.select_dtypes(include='object').columns.tolist()
num_features = X.select_dtypes(include='number').columns.tolist()

In [3]:
X = df.drop(columns="NObeyesdad")
y_raw = df["NObeyesdad"]

# Encode target label
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)

# Simpan label encoder untuk dipakai saat inference di FastAPI
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

# Identifikasi fitur kategorik dan numerik
cat_features = X.select_dtypes(include="object").columns.tolist()
num_features = X.select_dtypes(exclude="object").columns.tolist()

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), cat_features),
    ('num', Pipeline([
        ('scaler', StandardScaler())
    ]), num_features)
])

# Random Forest pipeline
pipeline_rf = Pipeline([
    ('preprocess', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

# XGBoost pipeline
pipeline_xgb = Pipeline([
    ('preprocess', preprocessor),
    ('model', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))
])

# Fit dan evaluasi RF
pipeline_rf.fit(X_train, y_train)
y_pred_rf = pipeline_rf.predict(X_test)
print("Random Forest:")
print(classification_report(y_test, y_pred_rf))

# Fit dan evaluasi XGB
pipeline_xgb.fit(X_train, y_train)
y_pred_xgb = pipeline_xgb.predict(X_test)
print("XGBoost:")
print(classification_report(y_test, y_pred_xgb))

# Simpan model terbaik (misal XGB)
with open("model.pkl", "wb") as f:
    pickle.dump(pipeline_xgb, f)

Random Forest:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        27
           1       0.77      0.93      0.84        29
           2       0.97      0.86      0.91        35
           3       0.97      0.97      0.97        30
           4       0.97      0.97      0.97        33
           5       0.96      0.90      0.93        29
           6       0.89      0.86      0.88        29

    accuracy                           0.92       212
   macro avg       0.93      0.93      0.93       212
weighted avg       0.93      0.92      0.93       212

XGBoost:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       0.93      0.97      0.95        29
           2       0.87      0.94      0.90        35
           3       0.94      0.97      0.95        30
           4       1.00      0.97      0.98        33
           5       0.93      0.93      0.93        29


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
