In [None]:
import pandas as pd
import numpy as np
import joblib
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.preprocessing import LabelEncoder

In [None]:
raw_path = "datasets/dataset_raw.csv"
df = pd.read_csv(raw_path).drop_duplicates().dropna(how="all")

df.columns = df.columns.str.replace(r"[\[\]]", "", regex=True).str.replace(" ", "_")

df["quality"] = df["Product_ID"].str[0]
df["serial"] = df["Product_ID"].str[1:].astype(int)

if "UDI" in df.columns:
    df = df.drop(columns=["UDI"])
if "Product_ID" in df.columns:
    df = df.drop(columns=["Product_ID"])

y_binary = df['Target']
y_multi = df['Failure_Type']
X = df.drop(columns=['Target','Failure_Type'])

In [129]:
X = X.copy()

if 'process_temperature_K' in X.columns and 'air_temperature_K' in X.columns:
    X['temp_diff'] = X['process_temperature_K'] - X['air_temperature_K']
if 'torque_Nm' in X.columns and 'rotational_speed_rpm' in X.columns:
    X['load'] = X['torque_Nm'] * X['rotational_speed_rpm']
if 'load' in X.columns and 'process_temperature_K' in X.columns:
    X['stress'] = X['load'] / (X['process_temperature_K'] + 1e-9)
if 'tool_wear_min' in X.columns and 'serial' in X.columns:
    X['wear_rate'] = X['tool_wear_min'] / (X['serial'] + 1e-9)
if 'torque_Nm' in X.columns and 'process_temperature_K' in X.columns:
    X['torque_temp_inter'] = X['torque_Nm'] * X['process_temperature_K']

if 'rotational_speed_rpm' in X.columns:
    X['high_rpm'] = (X['rotational_speed_rpm'] > X['rotational_speed_rpm'].median()).astype(int)

X['quality_ord'] = X['quality'].map({'L':0, 'M':1, 'H':2})

In [130]:
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

for c in cat_cols:
    X[c] = X[c].fillna(X[c].mode().iloc[0])
for c in num_cols:
    X[c] = X[c].fillna(X[c].median())

In [131]:
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y_binary, test_size=0.15, random_state=42, stratify=y_binary
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1764706, random_state=42, stratify=y_temp
)

In [132]:
categorical_features = [c for c in cat_cols if c in X_train.columns and c != 'quality']

if 'quality' in X_train.columns:
    categorical_features.append('quality')

numeric_features = [c for c in X_train.columns if c not in categorical_features]

numeric_transformer = StandardScaler()
cat_transformer = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', cat_transformer, categorical_features)
    ], remainder='drop'
)

In [133]:
scale_pos = (len(y_train) - y_train.sum()) / (y_train.sum() + 1e-9)

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 800),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42,
        'use_label_encoder': False,
        'verbosity': 0,
        'scale_pos_weight': scale_pos,
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(**params)

    pipeline = ImbPipeline(steps=[
        ('pre', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])

    pipeline.fit(X_train, y_train)
    probs = pipeline.predict_proba(X_val)[:,1]

    prec, rec, th = precision_recall_curve(y_val, probs)
    f1s = 2 * (prec * rec) / (prec + rec + 1e-9)
    best_idx = np.nanargmax(f1s)
    best_f1 = f1s[best_idx]

    return best_f1

In [134]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=60, n_jobs=1)
print('Best params:', study.best_params)

[I 2025-11-19 00:26:03,854] A new study created in memory with name: no-name-2857bc22-bfad-44a4-bea0-7da9906fe057
[I 2025-11-19 00:26:04,181] Trial 0 finished with value: 0.7142857138086735 and parameters: {'n_estimators': 755, 'learning_rate': 0.06582847322713758, 'max_depth': 4, 'subsample': 0.6007297703430753, 'colsample_bytree': 0.8431356874966419, 'gamma': 3.737105564528071, 'min_child_weight': 5}. Best is trial 0 with value: 0.7142857138086735.
[I 2025-11-19 00:26:04,497] Trial 1 finished with value: 0.6999999995002001 and parameters: {'n_estimators': 726, 'learning_rate': 0.032723600669412364, 'max_depth': 4, 'subsample': 0.7668929332821297, 'colsample_bytree': 0.8745026351158875, 'gamma': 0.36365022167228034, 'min_child_weight': 5}. Best is trial 0 with value: 0.7142857138086735.
[I 2025-11-19 00:26:04,743] Trial 2 finished with value: 0.7173913038537334 and parameters: {'n_estimators': 313, 'learning_rate': 0.020381133244332838, 'max_depth': 7, 'subsample': 0.7032427340846792,

Best params: {'n_estimators': 645, 'learning_rate': 0.09793649799715745, 'max_depth': 10, 'subsample': 0.7720399962788242, 'colsample_bytree': 0.9963533146023291, 'gamma': 1.9232140192005087, 'min_child_weight': 3}


In [None]:
best_params = study.best_params
best_params.update({'random_state':42, 'use_label_encoder':False, 'verbosity':0, 'scale_pos_weight': scale_pos, 'eval_metric':'logloss'})

final_model = XGBClassifier(**best_params)
final_pipeline = ImbPipeline(steps=[
    ('pre', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', final_model)
])

X_comb = pd.concat([X_train, X_val])
y_comb = pd.concat([y_train, y_val])
final_pipeline.fit(X_comb, y_comb)

In [None]:
probs_val = final_pipeline.predict_proba(X_val)[:,1]
prec, rec, th = precision_recall_curve(y_val, probs_val)
f1s = 2 * (prec * rec) / (prec + rec + 1e-9)
best_idx = np.nanargmax(f1s)
best_threshold = th[best_idx]
print('Best threshold (val):', best_threshold, 'Best F1 (val):', f1s[best_idx])

Best threshold (val): 0.95347446 Best F1 (val): 0.9902912616359694


In [137]:
probs_test = final_pipeline.predict_proba(X_test)[:,1]
final_pred = (probs_test >= best_threshold).astype(int)

print('\n=== Final classification report (test) ===')
print(classification_report(y_test, final_pred))


=== Final classification report (test) ===
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1449
           1       0.69      0.88      0.78        51

    accuracy                           0.98      1500
   macro avg       0.84      0.93      0.88      1500
weighted avg       0.99      0.98      0.98      1500



In [None]:
joblib.dump({'pipeline': final_pipeline, 'threshold': best_threshold, 'optuna_study': study}, 'datasets/model.pkl')

['model.pkl']

In [139]:
df_fail = df[df["Target"] == 1].reset_index(drop=True)

X_fail = X.loc[df["Target"] == 1].copy()
y_fail = df_fail["Failure_Type"].copy()

le = LabelEncoder()
y_fail_enc = le.fit_transform(y_fail)
num_classes = len(le.classes_)

X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_fail,
    y_fail_enc,
    test_size=0.15,
    random_state=42,
    stratify=y_fail_enc
)

multi_model = XGBClassifier(
    objective="multi:softprob",
    num_class=num_classes,
    eval_metric="mlogloss",
    n_estimators=400,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42
)

multi_pipeline = ImbPipeline(steps=[
    ('pre', preprocessor),
    ('model', multi_model)
])

multi_pipeline.fit(X_train_m, y_train_m)

0,1,2
,steps,"[('pre', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,False


In [140]:
probs_test_m = multi_pipeline.predict_proba(X_test_m)
final_pred_m = np.argmax(probs_test_m, axis=1)

target_names = le.classes_

print('\n=== Final classification report (test) ===')
print(classification_report(    
    y_test_m,
    final_pred_m,
    target_names=target_names
))


=== Final classification report (test) ===
                          precision    recall  f1-score   support

Heat Dissipation Failure       0.89      1.00      0.94        17
              No Failure       1.00      1.00      1.00         1
      Overstrain Failure       0.86      0.50      0.63        12
           Power Failure       0.82      1.00      0.90        14
       Tool Wear Failure       0.86      0.86      0.86         7

                accuracy                           0.86        51
               macro avg       0.89      0.87      0.87        51
            weighted avg       0.86      0.86      0.85        51



In [None]:
joblib.dump({
    "pipeline": multi_pipeline,
    "label_encoder": le
}, "datasets/model_multiclass.pkl")

['model_multiclass.pkl']

In [None]:
df_raw = pd.read_csv(raw_path).drop_duplicates().dropna(how="all")

df_raw.columns = df_raw.columns.str.replace(r"[\[\]]", "", regex=True).str.replace(" ", "_")

df_raw_full = df_raw.copy()

df_raw_full = df_raw_full.reset_index(drop=True)

X_temp_raw, X_test_raw, y_temp_raw, y_test_raw_raw = train_test_split(
    df_raw_full, 
    df_raw_full["Target"], 
    test_size=0.15, 
    random_state=42, 
    stratify=df_raw_full["Target"]
)

inference_df = X_test_raw.copy()

if 'Target' in inference_df.columns:
    inference_df = inference_df.drop(columns=['Target'])
if 'Failure_Type' in inference_df.columns:
    inference_df = inference_df.drop(columns=['Failure_Type'])

inference_df.to_csv("datasets/inference.csv", index=False)

inference_failure_df = X_test_raw[X_test_raw['Target'] == 1].copy()

if 'Target' in inference_failure_df.columns:
    inference_failure_df = inference_failure_df.drop(columns=['Target'])
if 'Failure_Type' in inference_failure_df.columns:
    inference_failure_df = inference_failure_df.drop(columns=['Failure_Type'])

inference_failure_df.to_csv("datasets/inference_failure.csv", index=False)

print("inference berhasil dibuat!")

inference berhasil dibuat!
