In [28]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score, f1_score

from xgboost import XGBClassifier
from sklearn.ensemble import *

import optuna
from sklearn.model_selection import cross_val_score

from sklearn.multioutput import MultiOutputClassifier

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')


train.shape, test.shape, submission.shape

((19219, 35), (12814, 28), (12814, 8))

In [5]:
train.columns

Index(['id', 'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum',
       'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
       'Dirtiness', 'Bumps', 'Other_Faults'],
      dtype='object')

In [6]:
target_classes = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
targets_bin = train[target_classes]

In [7]:
X_train = train.drop(target_classes, axis=1)

In [8]:
target = targets_bin @ (np.arange(targets_bin.shape[1]) + 1)
target[targets_bin.sum(axis=1)==2] = 2
target.shape

(19219,)

In [9]:
train = np.array(train)

In [16]:
x_train, x_valid, y_train, y_valid = train_test_split(X_train, target, stratify=target, test_size=0.25, random_state=2024)

x_train.shape, y_train.shape

((14414, 28), (14414,))

In [None]:
clf = RandomForestClassifier()
model = MultiOutputClassifier(clf, n_jobs=-1)

In [17]:
def objective(trial):
        params = {
        'n_estimators' : trial.suggest_int('n_estimators', 100, 400, step=25),
        'max_depth' : trial.suggest_int('max_depth', 15, 50),
        'max_features' : trial.suggest_float('max_features', 0.1, 0.9),
        'max_samples' : trial.suggest_float('max_samples', 0.5, 0.9),
        }
        model = RandomForestClassifier(random_state=2024, n_jobs=-1, **params)
        model.fit(x_train, y_train.ravel())
        yhat = model.predict(x_valid)
        f1 = f1_score(y_valid, yhat, average='weighted')
        return f1   

In [18]:
study = optuna.create_study()
study.optimize(objective, n_trials=50)

[I 2024-03-29 08:47:27,732] A new study created in memory with name: no-name-d3fa50af-b56b-47be-829b-1013b405a742
[I 2024-03-29 08:47:42,033] Trial 0 finished with value: 0.5415779386318661 and parameters: {'n_estimators': 225, 'max_depth': 34, 'max_features': 0.4404206238834436, 'max_samples': 0.828466056247344}. Best is trial 0 with value: 0.5415779386318661.
[I 2024-03-29 08:47:48,110] Trial 1 finished with value: 0.5390835706047437 and parameters: {'n_estimators': 250, 'max_depth': 36, 'max_features': 0.19947775890369412, 'max_samples': 0.5647867931315}. Best is trial 1 with value: 0.5390835706047437.
[I 2024-03-29 08:47:59,408] Trial 2 finished with value: 0.5402720835616844 and parameters: {'n_estimators': 200, 'max_depth': 17, 'max_features': 0.397309162383073, 'max_samples': 0.8440493403963685}. Best is trial 1 with value: 0.5390835706047437.
[I 2024-03-29 08:48:06,355] Trial 3 finished with value: 0.5439389587397817 and parameters: {'n_estimators': 150, 'max_depth': 33, 'max_f

[I 2024-03-29 08:52:40,498] Trial 32 finished with value: 0.538754901241109 and parameters: {'n_estimators': 125, 'max_depth': 21, 'max_features': 0.15259334492527052, 'max_samples': 0.6264572347776736}. Best is trial 12 with value: 0.5257607943276896.
[I 2024-03-29 08:52:42,268] Trial 33 finished with value: 0.5311809215954747 and parameters: {'n_estimators': 100, 'max_depth': 27, 'max_features': 0.11622742590282027, 'max_samples': 0.5396958608620884}. Best is trial 12 with value: 0.5257607943276896.
[I 2024-03-29 08:52:46,114] Trial 34 finished with value: 0.5385968526801702 and parameters: {'n_estimators': 150, 'max_depth': 28, 'max_features': 0.21027942987536316, 'max_samples': 0.5506407077655359}. Best is trial 12 with value: 0.5257607943276896.
[I 2024-03-29 08:52:51,357] Trial 35 finished with value: 0.5438017306447521 and parameters: {'n_estimators': 125, 'max_depth': 26, 'max_features': 0.3442950698556836, 'max_samples': 0.5369701793166474}. Best is trial 12 with value: 0.5257

In [26]:
clf = RandomForestClassifier(n_estimators=100,
                             max_depth=23, max_features=0.10074256444840124, max_samples=0.6468679663949485,
                             random_state=2024, n_jobs=-1)
clf.fit(x_train, y_train.ravel())
yhat = clf.predict(x_valid)
f1 = f1_score(y_valid, yhat, average='weighted')
acc_score = accuracy_score(y_valid, yhat)
print(f'f1-score: {f1}, accuracy_score: {acc_score}')

f1-score: 0.5257607943276896, accuracy_score: 0.5529656607700312
