In [None]:
import pandas as pd
import numpy as np
import joblib
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, roc_auc_score

In [32]:
raw_path = "dataset_raw.csv"
df = pd.read_csv(raw_path).drop_duplicates().dropna(how="all")

df.columns = df.columns.str.replace(r"[\[\]]", "", regex=True).str.replace(" ", "_")

df["quality"] = df["Product_ID"].str[0]
df["serial"] = df["Product_ID"].str[1:].astype(int)

if "UDI" in df.columns:
    df = df.drop(columns=["UDI"])
if "Product_ID" in df.columns:
    df = df.drop(columns=["Product_ID"])

y_binary = df['Target']
y_multi = df['Failure_Type']
X = df.drop(columns=['Target','Failure_Type'])

In [33]:
X = X.copy()

if 'process_temperature_K' in X.columns and 'air_temperature_K' in X.columns:
    X['temp_diff'] = X['process_temperature_K'] - X['air_temperature_K']
if 'torque_Nm' in X.columns and 'rotational_speed_rpm' in X.columns:
    X['load'] = X['torque_Nm'] * X['rotational_speed_rpm']
if 'load' in X.columns and 'process_temperature_K' in X.columns:
    X['stress'] = X['load'] / (X['process_temperature_K'] + 1e-9)
if 'tool_wear_min' in X.columns and 'serial' in X.columns:
    X['wear_rate'] = X['tool_wear_min'] / (X['serial'] + 1e-9)
if 'torque_Nm' in X.columns and 'process_temperature_K' in X.columns:
    X['torque_temp_inter'] = X['torque_Nm'] * X['process_temperature_K']

if 'rotational_speed_rpm' in X.columns:
    X['high_rpm'] = (X['rotational_speed_rpm'] > X['rotational_speed_rpm'].median()).astype(int)

X['quality_ord'] = X['quality'].map({'L':0, 'M':1, 'H':2})

In [34]:
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

for c in cat_cols:
    X[c] = X[c].fillna(X[c].mode().iloc[0])
for c in num_cols:
    X[c] = X[c].fillna(X[c].median())

In [35]:
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y_binary, test_size=0.15, random_state=42, stratify=y_binary
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1764706, random_state=42, stratify=y_temp
)

In [36]:
categorical_features = [c for c in cat_cols if c in X_train.columns and c != 'quality']

if 'quality' in X_train.columns:
    categorical_features.append('quality')

numeric_features = [c for c in X_train.columns if c not in categorical_features]

numeric_transformer = StandardScaler()
cat_transformer = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', cat_transformer, categorical_features)
    ], remainder='drop'
)

In [37]:
scale_pos = (len(y_train) - y_train.sum()) / (y_train.sum() + 1e-9)


def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 800),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42,
        'use_label_encoder': False,
        'verbosity': 0,
        'scale_pos_weight': scale_pos,
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(**params)

    pipeline = ImbPipeline(steps=[
        ('pre', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])

    pipeline.fit(X_train, y_train)
    probs = pipeline.predict_proba(X_val)[:,1]

    prec, rec, th = precision_recall_curve(y_val, probs)
    f1s = 2 * (prec * rec) / (prec + rec + 1e-9)
    best_idx = np.nanargmax(f1s)
    best_f1 = f1s[best_idx]

    return best_f1

In [38]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=60, n_jobs=1)
print('Best params:', study.best_params)

[I 2025-11-15 14:34:16,802] A new study created in memory with name: no-name-8faaf011-d960-40e7-9bb8-f92c0b189d97
[I 2025-11-15 14:34:17,349] Trial 0 finished with value: 0.7058823524611764 and parameters: {'n_estimators': 800, 'learning_rate': 0.024087434618909366, 'max_depth': 7, 'subsample': 0.9629085306405927, 'colsample_bytree': 0.7929265853236285, 'gamma': 1.2337289698883025, 'min_child_weight': 4}. Best is trial 0 with value: 0.7058823524611764.
[I 2025-11-15 14:34:17,520] Trial 1 finished with value: 0.6913580242249657 and parameters: {'n_estimators': 454, 'learning_rate': 0.09121130830703934, 'max_depth': 5, 'subsample': 0.9983368365798114, 'colsample_bytree': 0.7257980602914909, 'gamma': 2.045341136481559, 'min_child_weight': 1}. Best is trial 0 with value: 0.7058823524611764.
[I 2025-11-15 14:34:17,707] Trial 2 finished with value: 0.7021276590780897 and parameters: {'n_estimators': 354, 'learning_rate': 0.09385462160767698, 'max_depth': 5, 'subsample': 0.61268755549987, 'co

Best params: {'n_estimators': 475, 'learning_rate': 0.1607341706633555, 'max_depth': 5, 'subsample': 0.6610008104305312, 'colsample_bytree': 0.9775141884659192, 'gamma': 1.6794750437072037, 'min_child_weight': 4}


In [39]:
best_params = study.best_params
best_params.update({'random_state':42, 'use_label_encoder':False, 'verbosity':0, 'scale_pos_weight': scale_pos, 'eval_metric':'logloss'})

final_model = XGBClassifier(**best_params)
final_pipeline = ImbPipeline(steps=[
    ('pre', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', final_model)
])

X_comb = pd.concat([X_train, X_val])
y_comb = pd.concat([y_train, y_val])
final_pipeline.fit(X_comb, y_comb)

0,1,2
,steps,"[('pre', ...), ('smote', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9775141884659192
,device,
,early_stopping_rounds,
,enable_categorical,False


In [40]:
probs_val = final_pipeline.predict_proba(X_val)[:,1]
prec, rec, th = precision_recall_curve(y_val, probs_val)
f1s = 2 * (prec * rec) / (prec + rec + 1e-9)
best_idx = np.nanargmax(f1s)
best_threshold = th[best_idx]
print('Best threshold (val):', best_threshold, 'Best F1 (val):', f1s[best_idx])

Best threshold (val): 0.9587665 Best F1 (val): 0.9999999995


In [41]:
probs_test = final_pipeline.predict_proba(X_test)[:,1]
final_pred = (probs_test >= best_threshold).astype(int)

print('\n=== Final classification report (test) ===')
print(classification_report(y_test, final_pred))
print('\nConfusion matrix:')
print(confusion_matrix(y_test, final_pred))
print('ROC AUC test:', roc_auc_score(y_test, probs_test))


=== Final classification report (test) ===
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1449
           1       0.76      0.86      0.81        51

    accuracy                           0.99      1500
   macro avg       0.88      0.93      0.90      1500
weighted avg       0.99      0.99      0.99      1500


Confusion matrix:
[[1435   14]
 [   7   44]]
ROC AUC test: 0.9874423199231382


In [42]:
joblib.dump({'pipeline': final_pipeline, 'threshold': best_threshold, 'optuna_study': study}, 'model.pkl')

['model.pkl']

In [None]:
df_raw = pd.read_csv(raw_path).drop_duplicates().dropna(how="all")

df_raw.columns = df_raw.columns.str.replace(r"[\[\]]", "", regex=True).str.replace(" ", "_")

df_raw_full = df_raw.copy()

df_raw_full = df_raw_full.reset_index(drop=True)

X_temp_raw, X_test_raw, y_temp_raw, y_test_raw_raw = train_test_split(
    df_raw_full, 
    df_raw_full["Target"], 
    test_size=0.15, 
    random_state=42, 
    stratify=df_raw_full["Target"]
)

inference_df = X_test_raw.copy()

inference_df.to_csv("inference.csv", index=False)

print("inference.csv berhasil dibuat!")


inference.csv berhasil dibuat!
