In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score, f1_score

from xgboost import XGBClassifier
from sklearn.ensemble import *

import optuna
from sklearn.model_selection import cross_val_score

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')


train.shape, test.shape, submission.shape

((19219, 35), (12814, 28), (12814, 8))

In [4]:
train.columns

Index(['id', 'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum',
       'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
       'Dirtiness', 'Bumps', 'Other_Faults'],
      dtype='object')

In [5]:
target_classes = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
targets_bin = train[target_classes]

In [6]:
train = train.drop(target_classes, axis=1)

In [7]:
target = targets_bin @ (np.arange(targets_bin.shape[1]) + 1)
target[targets_bin.sum(axis=1)==2] = 2
target.shape

(19219,)

In [8]:
np.arange(targets_bin.shape[1]) + 1

array([1, 2, 3, 4, 5, 6, 7])

In [14]:
train = np.array(train)

In [15]:
kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [16]:
train_idx, valid_idx = kfold.split(train, target)

ValueError: too many values to unpack (expected 2)

In [17]:
for i, (train_idx, valid_idx) in enumerate(kfold.split(train, target)):
    print(train_idx)
    
    

[    0     2     3 ... 19212 19213 19217]
[    1     2     3 ... 19216 19217 19218]
[    0     1     2 ... 19216 19217 19218]
[    0     1     3 ... 19216 19217 19218]
[    0     1     2 ... 19215 19216 19218]


In [None]:
for i, (train_idx, valid_idx) in enumerate(kfold.split(train, target)):
    x_train_s , y_train_s = train[train_idx] , target[train_idx]
    x_valid_s , y_valid_s = train[valid_idx] , target[valid_idx]
    
    def objective(trial):
        params = {
        'n_estimators' : trial.suggest_int('n_estimators', 100, 400, step=25),
        'max_depth' : trial.suggest_int('max_depth', 15, 50),
        'max_features' : trial.suggest_float('max_features', 0.1, 0.9),
        'max_samples' : trial.suggest_float('max_samples', 0.5, 0.9),
        'random_state' : trial.suggest_int('random_state', 50, 50),
        'n_jobs' : trial.suggest_int('n_jobs', 3, 3)
        }
        model = RandomForestClassifier(**params)
        model.fit(x_train_s, y_train_s.ravel())
        yhat = model.predict(x_valid_s)
        
        return f1_score(y_valid_s, yhat, average='weighted')
    
    study = optuna.create_study()
    study.optimize(objective, n_trials=10)
    

[I 2024-03-22 13:51:11,111] A new study created in memory with name: no-name-71d5557b-d33e-4a14-914e-15437d2295d4
[I 2024-03-22 13:52:28,812] Trial 0 finished with value: 0.5424665221662149 and parameters: {'n_estimators': 375, 'max_depth': 30, 'max_features': 0.3999704427595234, 'max_samples': 0.6335746871202735, 'random_state': 50, 'n_jobs': 3}. Best is trial 0 with value: 0.5424665221662149.
[I 2024-03-22 13:52:48,255] Trial 1 finished with value: 0.5414067934332669 and parameters: {'n_estimators': 150, 'max_depth': 40, 'max_features': 0.23699290481025265, 'max_samples': 0.7880248312751861, 'random_state': 50, 'n_jobs': 3}. Best is trial 1 with value: 0.5414067934332669.
[I 2024-03-22 13:53:08,042] Trial 2 finished with value: 0.5352793437577562 and parameters: {'n_estimators': 175, 'max_depth': 46, 'max_features': 0.1902890648169292, 'max_samples': 0.8230379701907431, 'random_state': 50, 'n_jobs': 3}. Best is trial 2 with value: 0.5352793437577562.
[I 2024-03-22 13:53:20,483] Trial

In [None]:
RandomForestClassifier()