In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score, f1_score

from xgboost import XGBClassifier

import optuna
from sklearn.model_selection import cross_val_score

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')


train.shape, test.shape, submission.shape

((19219, 35), (12814, 28), (12814, 8))

In [3]:
train.columns

Index(['id', 'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum',
       'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
       'Dirtiness', 'Bumps', 'Other_Faults'],
      dtype='object')

In [4]:
target_1 = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains','Dirtiness', 'Bumps', 'Other_Faults']

X = train.drop(target_1, axis=1)
target_bin = train[target_1]

In [5]:
target_bin

Unnamed: 0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...
19214,0,0,0,1,0,0,0
19215,0,0,0,0,0,0,1
19216,1,0,0,0,0,0,0
19217,0,0,0,0,0,0,1


In [6]:
target_bin.sum(axis=1).value_counts()

1    18380
0      818
2       21
Name: count, dtype: int64

In [7]:
target_2 = ["Zero_Defects", "Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"]

In [8]:
target = target_bin @ (np.arange(target_bin.shape[1])+1)
target[target_bin.sum(axis=1) == 2] = 7

In [9]:
target = np.reshape(target, (len(target),1))

In [10]:
X

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,0,584,590,909972,909977,16,8,5,2274,113,...,0.0059,1.0000,1.0000,0.0,1.2041,0.9031,0.6990,-0.5000,-0.0104,0.1417
1,1,808,816,728350,728372,433,20,54,44478,70,...,0.0044,0.2500,1.0000,1.0,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,...,0.1077,0.2363,0.3857,0.0,4.0564,2.1790,2.2095,-0.0105,-0.0944,1.0000
3,3,781,789,3353146,3353173,210,16,29,3202,114,...,0.0044,0.3750,0.9310,1.0,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025
4,4,1540,1560,618457,618502,521,72,67,48231,82,...,0.0192,0.2105,0.9861,1.0,2.7694,1.4150,1.8808,0.9158,-0.2455,0.9998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19214,19214,749,757,143210,143219,17,4,4,2193,122,...,0.0044,1.0000,0.8000,0.0,1.2305,0.7782,0.6021,-0.1429,0.0044,0.2901
19215,19215,723,735,2488529,2488541,231,17,26,27135,104,...,0.0065,0.7333,0.9216,1.0,2.3636,1.0414,1.4150,0.7222,-0.0989,0.5378
19216,19216,6,31,1578055,1578129,780,114,98,71112,41,...,0.0199,0.1862,0.9554,1.0,2.8921,1.4314,1.8692,0.7719,-0.4283,0.9997
19217,19217,9,18,1713172,1713184,126,13,26,14808,88,...,0.0068,0.7692,1.0000,1.0,2.1004,1.0414,1.4150,0.9610,-0.1162,0.3509


# MinMax

In [11]:
scaler = MinMaxScaler()
X_norm = scaler.fit_transform(X)

# Split

In [12]:
x_train, x_valid, y_train, y_valid = train_test_split(X_norm, target, stratify=target, test_size=0.2, random_state=8)

# Model

In [14]:
xgb = XGBClassifier(booster='gbtree',
                        device = "cuda",
                        tree_method="hist",
                        objective='multi:softmax',  # 'multi:softmax' or 'multi:softprob' for multiclass classification
                        num_class=7,  # Number of classes
                        n_estimators=300,
                        max_depth=6,
                        verbosity=0)

xgb.fit(x_train, y_train)

yhat = xgb.predict(x_valid)

print(f1_score(y_valid, yhat, average='weighted'))
print(accuracy_score(y_valid, yhat))

0.5347369758314938
0.554630593132154


# Optimization

In [16]:
def objective(trial):
    booster=trial.suggest_categorical('booster', ['gbtree','dart'])
    
    model = XGBClassifier(booster='gbtree',
                        device = "cuda",
                        tree_method="hist",
                        objective='multi:softmax',  # 'multi:softmax' or 'multi:softprob' for multiclass classification
                        num_class=7,  # Number of classes
                        n_estimators=300,
                        max_depth=6,
                        verbosity=0)
    model.fit(x_train, y_train)
    score = f1_score(y_valid, model.predict(x_valid), average='weighted')
    return score

In [17]:
study = optuna.create_study(direction='maximize')

[I 2024-03-17 17:50:46,253] A new study created in memory with name: no-name-8ae6800d-182f-4c5e-b4b1-f215ec16af0a


In [None]:
study.optimize(objective, n_trials=5)