In [1]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings
# Suppress every warning for the rest of the session: no category, message or
# module filter is given, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides warnings raised by the libraries imported
# above — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Model inputs, grouped by type so each group can be preprocessed differently.
numeric_features = [
    'Age',
    'WorkHoursPerWeek',
    'Experience',
    'RemoteRatio',
    'SatisfactionLevel',
    'StressLevel',
]
categorical_features = ['JobRole', 'Gender']
features = [*numeric_features, *categorical_features]

# Read the dataset and separate the feature matrix from the 'Burnout' label.
df = pd.read_csv('synthetic_employee_burnout.csv')
X = df[features]
y = df['Burnout']

# 80/20 hold-out split; seeded for repeatability and stratified on the label
# so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [4]:
# Preprocessing applied inside the pipeline, one branch per column type.
# Each branch imputes before transforming so that missing values are filled
# instead of being passed on to the classifiers as NaN; on complete data the
# imputers leave the values unchanged.
numeric_transformer = Pipeline([
    # Median is used rather than mean so outliers do not skew the fill value.
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories not seen during fit are encoded as all-zero rows at predict time.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Branch names 'num' and 'cat' are kept so output feature-name prefixes stay the same.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [5]:
# Two tree-based learners, both seeded so repeated fits give the same model.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
xgb = XGBClassifier(eval_metric='logloss', random_state=42)

# Soft voting combines the members through their predicted class probabilities.
ensemble = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb)],
    voting='soft',
)

# Chain preprocessing and the ensemble so both are fitted together in one call.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', ensemble),
])



In [6]:
# Fit preprocessing and the ensemble on the training split only.
pipeline.fit(X_train, y_train)

# Column 1 of predict_proba is the score for the second entry of classes_
# (presumably Burnout == 1 — confirm the label encoding).
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
roc_pipeline = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

# The Day 23 figure below is hard-coded, not recomputed; the pipeline counts
# as matching it when its AUC lies within 0.01 of 1.0.
# NOTE(review): an AUC of 1.000 on held-out data is unusual — worth checking
# that none of the features encodes the label.
auc_gap = abs(roc_pipeline - 1.0)
status = 'OK' if auc_gap < 0.01 else 'Revisar'

print("\n PIPELINE RESULTS:")
print("Day23 manual:   1.000")
print(f"Day24 Pipeline: {roc_pipeline:.3f}")
print(f"AUTOMATIZADO:   {status}")
print("\n DAY 24 PIPELINE COMPLETED!")


 PIPELINE RESULTS:
Day23 manual:   1.000
Day24 Pipeline: 1.000
AUTOMATIZADO:   OK

 DAY 24 PIPELINE COMPLETED!
