In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import optuna


In [3]:
train = pd.read_csv('train.csv',index_col='id')
test = pd.read_csv('test.csv', index_col='id')

# steel_plate_faults = pd.read_csv('/kaggle/input/steel-plates-faults/SteelPlatesFaults.csv')
# train = pd.concat([train, steel_plate_faults], axis=0)
# train = train.drop('id', axis=1)

display(train.head())

Unnamed: 0_level_0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,584,590,909972,909977,16,8,5,2274,113,140,...,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,808,816,728350,728372,433,20,54,44478,70,111,...,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,...,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,781,789,3353146,3353173,210,16,29,3202,114,134,...,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,1540,1560,618457,618502,521,72,67,48231,82,111,...,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [4]:
target_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

X, y = train.drop(target_columns, axis=1), train[target_columns]

In [7]:
def auc_score(estimator, X, y):
    y_prob = estimator.predict_proba(X)
    return roc_auc_score(y, y_prob, multi_class="ovr")


def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 1, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        "min_child_weight": trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, log=True),
        'random_state': trial.suggest_int('random_state', 0, 1000),
    }

    model = XGBClassifier(objective='binary:logistic',**params)
    score = cross_val_score(model, X, y, scoring=auc_score, cv=5).mean()
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Best trial:', study.best_trial.params)
print('Best score:', study.best_value)

[I 2024-03-17 14:56:50,156] A new study created in memory with name: no-name-6dac320e-20fd-473e-bcb5-f12b6274c084
[I 2024-03-17 14:56:52,647] Trial 0 finished with value: 0.8810813304074188 and parameters: {'n_estimators': 64, 'learning_rate': 0.3242964432674124, 'gamma': 0.011976211989786842, 'reg_alpha': 0.30287646221211273, 'reg_lambda': 0.058375496877015716, 'max_depth': 3, 'min_child_weight': 2, 'subsample': 0.6077578908491438, 'colsample_bytree': 0.5237148341226315, 'random_state': 97}. Best is trial 0 with value: 0.8810813304074188.
[I 2024-03-17 14:58:22,715] Trial 1 finished with value: 0.8728201457992133 and parameters: {'n_estimators': 762, 'learning_rate': 0.047166768363066505, 'gamma': 0.004737509531487607, 'reg_alpha': 0.046718090190635546, 'reg_lambda': 0.01787716043882973, 'max_depth': 10, 'min_child_weight': 3, 'subsample': 0.6354104500107249, 'colsample_bytree': 0.9643309162274946, 'random_state': 757}. Best is trial 0 with value: 0.8810813304074188.
[I 2024-03-17 14:

Best trial: {'n_estimators': 642, 'learning_rate': 0.011401406811339021, 'gamma': 0.03019990623254004, 'reg_alpha': 0.326231409485988, 'reg_lambda': 0.061503595803754064, 'max_depth': 5, 'min_child_weight': 4, 'subsample': 0.7438314539513564, 'colsample_bytree': 0.5660718425034847, 'random_state': 683}
Best score: 0.8887613844672508


In [8]:
params = study.best_trial.params
xgb = XGBClassifier(objective='binary:logistic', **params)

scores = cross_val_score(xgb, X, y, scoring=auc_score, cv=5)
print(f'ROC AUC: {scores.mean()}')

xgb.fit(X, y)

ROC AUC: 0.8887613844672508


In [None]:
y_pred = xgb.predict_proba(test)

submission = pd.DataFrame(y_pred, columns=target_columns, index=test.index)
submission.to_csv('test.csv', index=True)