In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.utils import resample
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
data = pd.read_csv("data/burnout_submissions.csv", keep_default_na=False)
data.isnull().sum()

usia                 0
jenis_kelamin        0
kota_asal            0
status_pernikahan    0
jumlah_anak          0
                    ..
PV                   0
event_load_genap     0
EV                   0
skor_total           0
risiko_stres         0
Length: 63, dtype: int64

In [3]:
features_to_drop = [
    'risiko_stres', 'personal_vulnerability_ganjil', 'PV',
    'event_load_genap', 'EV', 'skor_total'
]

X = data.drop(columns=features_to_drop)
y = data['risiko_stres']

In [4]:
print(f"Number of Features: {len(X.columns)}")

Number of Features: 57


In [5]:
print(f"Unique values in risiko_stres: {y.unique()}")
print(f"Value counts: \n{y.value_counts()}")

le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"Original unique values in encoded labels: {np.unique(y_encoded)}")
if y_encoded.min() != 0:
    y_encoded = y_encoded - y_encoded.min()
print(f"Adjusted unique values in encoded labels: {np.unique(y_encoded)}")

numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'bool']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

Unique values in risiko_stres: ['High Stress (High Risk)' 'Low Stress (Lowest Risk)'
 'Challenged (Low Risk)']
Value counts: 
risiko_stres
High Stress (High Risk)     6
Low Stress (Lowest Risk)    4
Challenged (Low Risk)       1
Name: count, dtype: int64
Original unique values in encoded labels: [0 1 2]
Adjusted unique values in encoded labels: [0 1 2]


In [6]:
print(f"Numerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")

Numerical Features: ['usia', 'jumlah_anak', 'usia_anak', 'lama_bekerja', 'waktu_bekerja_seminggu', 'beban_sks', 'mhs_bimbingan', 'work_life_balance', 'gaji_sesuai', '1_tidak_mampu', '2_kewalahan_tanggung_jawab', '3_keadaan_tidak_berpihak', '4_waktu_tidak_cukup', '5_tidak_berjalan_baik', '6_terburu_buru', '7_tidak_ada_jalan_keluar', '8_masalah_menumpuk', '9_ingin_menyerah', '10_memikul_beban_berat']
Categorical Features: ['jenis_kelamin', 'kota_asal', 'status_pernikahan', 'tinggal_dengan_siapa', 'tinggal_sendiri', 'tinggal_pasangan', 'tinggal_anak', 'tinggal_ortu', 'tinggal_mertua', 'tinggal_saudara', 'tinggal_teman', 'profesi', 'bidang', 'mode_bekerja', 'jarak', 'jabatan_struktural', 'jabatan_fungsional', 'sertifikasi', 'status_keaktifan', 'kesehatan_fisik', 'fisik_mata', 'fisik_punggung', 'fisik_tensi', 'fisik_lemah', 'fisik_kepala', 'fisik_obesitas', 'fisik_imun', 'fisik_carpal', 'kondisi_mental', 'mental_anxiety', 'mental_burnout', 'mental_depresi', 'mental_distress', 'mental_konsen

In [7]:
kf = KFold(n_splits=5, shuffle=True, random_state=2024)

In [13]:
models = {
    'RFC': Pipeline(steps=[('preprocessor', preprocessor), ('classifier', RandomForestClassifier(n_estimators=100, random_state=2024))]),
    'SVC': Pipeline(steps=[('preprocessor', preprocessor), ('classifier', SVC(kernel='rbf', C=1.0,))]),   
    'GBC': Pipeline(steps=[('preprocessor', preprocessor), ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=2024))]),    
    'XGB': Pipeline(steps=[('preprocessor', preprocessor), 
                           ('classifier', XGBClassifier(objective='multi:softmax',
                                                        num_class=4,
                                                        use_label_encoder=False, 
                                                        eval_metric='mlogloss',
                                                        n_estimators=100,
                                                        random_state=2024))]),
}

In [14]:
results = {model: {'accuracy': [], 'precision': [], 'recall': [], 'f1' : []} for model in models}
trained_models = {}

In [15]:
for train_index, test_index in kf.split(X, y_encoded):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        results[name]['accuracy'].append(accuracy_score(y_test, y_pred))
        results[name]['precision'].append(precision_score(y_test, y_pred, average='weighted'))
        results[name]['recall'].append(recall_score(y_test, y_pred, average='weighted'))
        results[name]['f1'].append(f1_score(y_test, y_pred, average='weighted'))

        # Save the last trained version of each model
        trained_models[name] = model

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got [1 2]

In [12]:
for name, metrics in results.items():
    print(f"{name} Results:")
    print(f"Accuracy:  {np.mean(metrics['accuracy'])*100:.4f}%")
    print(f"Precision: {np.mean(metrics['precision'])*100:.4f}%")
    print(f"Recall:    {np.mean(metrics['recall'])*100:.4f}%")
    print(f"F1 Score:  {np.mean(metrics['f1'])*100:.4f}%")
    print()

RFC Results:
Accuracy:  80.0000%
Precision: 75.0000%
Recall:    80.0000%
F1 Score:  76.6667%

SVC Results:
Accuracy:  70.0000%
Precision: 55.0000%
Recall:    70.0000%
F1 Score:  60.0000%

GBC Results:
Accuracy:  83.3333%
Precision: 85.0000%
Recall:    83.3333%
F1 Score:  82.2222%



In [None]:
def get_feature_importance(model, model_name, X, y, sample_size=10000):
    feature_names = model.named_steps['preprocessor'].get_feature_names_out()
    

    if model_name in ['XGB', 'RFC', 'GBC']:
        importances = model.named_steps['classifier'].feature_importances_
    elif model_name == 'SVC':
        # --- Ambil sample data untuk mempercepat ---
        if len(X) > sample_size:
            X_sample, y_sample = resample(
                X, y,
                n_samples=sample_size,
                stratify=y,
                random_state=2024
            )
        else:
            X_sample, y_sample = X, y
        perm_importance = permutation_importance(
            model, 
            X_sample, y_sample, 
            n_repeats=5,          # kurangi dari 10 → 5 biar lebih cepat
            random_state=2024, 
            n_jobs=-1             # pakai semua core CPU
        )
        importances = perm_importance.importances_mean

    else:
        return None

    feature_importance = dict(zip(feature_names, importances))
    return dict(sorted(feature_importance.items(), key=lambda item: abs(item[1]), reverse=True))

In [None]:
X = data[X]
y = data[y]
for name, model in trained_models.items():
    print(f"\nFeature Importance for {name} (Hybrid):")
    importance = get_feature_importance(model, name, X, y)
    if importance:
        for feature, value in importance.items():
            print(f"{feature}: {value}")
    print()

ValueError: Boolean array expected for the condition, not int64

In [None]:
encoding_scheme = "hybrid"

for name, model in trained_models.items():
    filename = f"{name}_{encoding_scheme}.pkl"
    with open(filename, "wb") as f:
        pickle.dump(model, f)

print("✅ Models saved separately with Hybrid encoding")

✅ Models saved separately with Hybrid encoding
