# 베이지안 최적화 기준 모델 파라미터 튜닝 함수(Optuna)

!! 유의 사항 !!

1. trian, test(최종 예측할 test)는 각각 같은 열을 가져야 함.[train의 경우 target까지]
2. 모두 데이터 프레임이여야함
3. 함수에 넣을 때 [데이터1, 데이터2, 데이터3]과 같이 리스트안에 넣어줘야함
4. target변수는 모두 0,1 인코딩

In [58]:
def final_voting_model(datasets, test_datasets, random_state=42, num_model=3, n_trials=10):
    import optuna
    from catboost import CatBoostClassifier
    from xgboost import XGBClassifier
    from lightgbm import LGBMClassifier
    import lightgbm as lgb
    from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
    from sklearn.metrics import f1_score, make_scorer
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.preprocessing import LabelEncoder
    import numpy as np
    from collections import defaultdict
    from joblib import Parallel, delayed

    optuna.logging.set_verbosity(optuna.logging.WARNING) # log 숨김
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    final_test_preds_list = []

    for i, (data, test_data) in enumerate(zip(datasets, test_datasets)):
        print(f'Processing dataset {i+1}...')

        y = data['target']
        X = data.drop('target', axis=1)
        X = X.apply(lambda x: x.astype('category') if x.dtype == 'object' else x)
        test_data = test_data.apply(lambda x: x.astype('category') if x.dtype == 'object' else x)

        class_count = y.value_counts()
        weight = class_count[0] / class_count[1]

        train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=random_state, stratify=y)

        cat_features = [col for col in train_X.columns if train_X[col].dtype == 'object' or train_X[col].dtype.name == 'category']

        # 원본 데이터의 복사본 생성 [사이킷런 모델에 사용할 것]
        train_X_encoded = train_X.copy()
        test_X_encoded = test_X.copy()
        test_data_encoded = test_data.copy()

        # LabelEncoder를 cat_features에 있는 각 열에 적용
        label_encoders = defaultdict(LabelEncoder)
        for col in cat_features:
            train_X_encoded[col] = label_encoders[col].fit_transform(train_X_encoded[col])

        # Test 데이터에서 Train 데이터에 있는 범주형 값을 인코딩하고, 새로운 값은 -1로 인코딩
        for col in cat_features:
            test_X_encoded[col] = test_X_encoded[col].map(lambda s: label_encoders[col].transform([s])[0] if s in label_encoders[col].classes_ else -1)
            test_data_encoded[col] = test_data_encoded[col].map(lambda s: label_encoders[col].transform([s])[0] if s in label_encoders[col].classes_ else -1)

        f1_scorer = make_scorer(f1_score, average='macro')

        print(f'Train shape : {train_X.shape}')
        print(f'Test shape : {test_X.shape}', '\n')

        print('+++++' * 2, 'Model Fitting', '+++++' * 2)

        model_performance = []

        # CatBoost 파라미터 최적화와 모델 학습
        def catboost_objective(trial):
            param = {
                'objective': 'Logloss',
                'eval_metric': 'F1',
                'iterations': trial.suggest_int('iterations', 100, 3000),
                'depth': trial.suggest_int('depth', 4, 16),
                'learning_rate': trial.suggest_float('learning_rate', 0.002, 0.5, log=True),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
                'random_state': random_state,
                'silent': True,
                'use_best_model': True,
                'auto_class_weights': 'Balanced',
                'early_stopping_rounds': 100,
                'cat_features': cat_features
            }

            def train_and_evaluate(train_idx, val_idx):
                X_train_fold, X_val_fold = train_X_encoded.iloc[train_idx], train_X_encoded.iloc[val_idx]
                y_train_fold, y_val_fold = train_y.iloc[train_idx], train_y.iloc[val_idx]

                model = CatBoostClassifier(**param)
                model.fit(
                    X_train_fold, y_train_fold,
                    eval_set=(X_val_fold, y_val_fold),
                    early_stopping_rounds=100,
                    verbose=0
                )
                val_pred = model.predict(X_val_fold)
                score = f1_score(y_val_fold, val_pred, average='macro')
                return score

            scores = Parallel(n_jobs=-1)(delayed(train_and_evaluate)(train_idx, val_idx) for train_idx, val_idx in skf.split(train_X_encoded, train_y))

            return np.mean(scores)

        catboost_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.HyperbandPruner())
        print('VVV' * 2, 'Optimizing CatBoost', 'VVV' * 2, '\n')
        catboost_study.optimize(catboost_objective, n_trials=n_trials)
        catboost_best_params = catboost_study.best_params
        print('Done', '\n')
        print('Fitting CatBoost...')
        catboost_model = CatBoostClassifier(**catboost_best_params, cat_features=cat_features, thread_count=-1, silent= True)
        catboost_model.fit(train_X, train_y)
        catboost_train_preds = catboost_model.predict(train_X)
        catboost_score = f1_score(train_y, catboost_train_preds, average='macro')
        model_performance.append(('catboost', catboost_model, catboost_score))
        print('Done', '\n')

        # XGBoost 파라미터 최적화와 모델 학습
        def xgb_objective(trial):
            param = {
                'objective': 'binary:logistic',
                'eval_metric': 'logloss',
                'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
                'max_depth': trial.suggest_int('max_depth', 3, 20),
                'learning_rate': trial.suggest_float('learning_rate', 0.002, 0.5, log=True),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'gamma': trial.suggest_float('gamma', 0, 5),
                'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
                'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
                'random_state': 42,
                'scale_pos_weight': weight,
                'enable_categorical': True
            }

            def train_and_evaluate(train_idx, val_idx):
                X_train_fold, X_val_fold = train_X.iloc[train_idx], train_X.iloc[val_idx]
                y_train_fold, y_val_fold = train_y.iloc[train_idx], train_y.iloc[val_idx]

                model = XGBClassifier(**param)
                model.set_params(early_stopping_rounds=100)
                model.fit(
                    X_train_fold, y_train_fold,
                    eval_set=[(X_val_fold, y_val_fold)],
                    verbose=0
                )
                val_pred = model.predict(X_val_fold)
                score = f1_score(y_val_fold, val_pred, average='macro')
                return score

            scores = Parallel(n_jobs=-1)(delayed(train_and_evaluate)(train_idx, val_idx) for train_idx, val_idx in skf.split(train_X, train_y))
            return np.mean(scores)

        xgb_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.HyperbandPruner())
        print('VVV' * 2, 'Optimizing XGBoost', 'VVV' * 2, '\n')
        xgb_study.optimize(xgb_objective, n_trials=n_trials)
        xgb_best_params = xgb_study.best_params
        print('Done', '\n')
        print('Fitting XGBoost...')

        # 학습 데이터를 train과 validation으로 나누기
        X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

        # 최종 모델 학습
        xgb_model = XGBClassifier(**xgb_best_params, enable_categorical=True, n_jobs=-1)
        xgb_model.set_params(early_stopping_rounds=100)
        xgb_model.fit(X_train_part, y_train_part, eval_set=[(X_val_part, y_val_part)], verbose=0)
        xgb_train_preds = xgb_model.predict(train_X)
        xgb_score = f1_score(train_y, xgb_train_preds, average='macro')
        model_performance.append(('xgb', xgb_model, xgb_score))
        print('Done', '\n')

        # LightGBM 파라미터 최적화와 모델 학습
        def lgb_objective(trial):
            param = {
                'objective': 'binary',
                'metric': 'binary_logloss',
                'max_depth': trial.suggest_int('max_depth', 3, 20),
                'num_leaves': trial.suggest_int('num_leaves', 2, 2000),
                'learning_rate': trial.suggest_float('learning_rate', 0.002, 0.5, log=True),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
                'random_state': 42,
                'is_unbalanced': True
            }

            def train_and_evaluate(train_idx, val_idx):
                X_train_fold, X_val_fold = train_X.iloc[train_idx], train_X.iloc[val_idx]
                y_train_fold, y_val_fold = train_y.iloc[train_idx], train_y.iloc[val_idx]

                # LightGBM 데이터셋 생성
                dtrain = lgb.Dataset(X_train_fold, label=y_train_fold)
                dvalid = lgb.Dataset(X_val_fold, label=y_val_fold, reference=dtrain)

                model = lgb.train(
                    params=param,
                    train_set=dtrain,
                    valid_sets=[dvalid],
                    num_boost_round=1000,
                    callbacks=[
                        lgb.early_stopping(stopping_rounds=100, verbose=0)
                    ],
                )

                # 검증 데이터로 예측
                val_pred = model.predict(X_val_fold, num_iteration=model.best_iteration)
                val_pred_binary = (val_pred >= 0.5).astype(int)
                score = f1_score(y_val_fold, val_pred_binary, average='macro')
                return score

            scores = Parallel(n_jobs=-1)(delayed(train_and_evaluate)(train_idx, val_idx) for train_idx, val_idx in skf.split(train_X, train_y))
            return np.mean(scores)

        lgb_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.HyperbandPruner())
        print('VVV' * 2, 'Optimizing LGBM', 'VVV' * 2, '\n')
        lgb_study.optimize(lgb_objective, n_trials=n_trials)
        lgb_best_params = lgb_study.best_params
        print('Done', '\n')
        print('Fitting LGBM...')

        # 전체 데이터를 사용하여 최종 모델 훈련
        dtrain = lgb.Dataset(train_X, label=train_y)
        model = lgb.train(
            params=lgb_best_params,
            train_set=dtrain,
            num_boost_round=1000,
            valid_sets=[dtrain],
            valid_names=["train"],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=0)
            ],
        )

        lgb_train_preds = model.predict(train_X, num_iteration=model.best_iteration)
        lgb_score = f1_score(train_y, (lgb_train_preds >= 0.5).astype(int), average='macro')
        model_performance.append(('lgb', model, lgb_score))
        print('Done', '\n')

        # GradientBoostingClassifier 파라미터 최적화와 모델 학습
        def gbm_objective(trial):
            param = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
                'max_depth': trial.suggest_int('max_depth', 2, 20),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
                'random_state': random_state,
                'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0)
            }

            def train_and_evaluate(train_idx, val_idx):
                X_train_fold, X_val_fold = train_X_encoded.iloc[train_idx], train_X_encoded.iloc[val_idx]
                y_train_fold, y_val_fold = train_y.iloc[train_idx], train_y.iloc[val_idx]

                model = GradientBoostingClassifier(**param)
                model.fit(X_train_fold, y_train_fold)
                val_pred = model.predict(X_val_fold)
                score = f1_score(y_val_fold, val_pred, average='macro')
                return score

            scores = Parallel(n_jobs=-1)(delayed(train_and_evaluate)(train_idx, val_idx) for train_idx, val_idx in skf.split(train_X_encoded, train_y))
            return np.mean(scores)

        gbm_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.HyperbandPruner())
        print('VVV' * 2, 'Optimizing GBM', 'VVV' * 2, '\n')
        gbm_study.optimize(gbm_objective, n_trials=n_trials)
        gbm_best_params = gbm_study.best_params
        print('Done', '\n')
        print('Fitting GBM...')
        gbm_model = GradientBoostingClassifier(**gbm_best_params)
        gbm_model.fit(train_X_encoded, train_y)
        gbm_train_preds = gbm_model.predict(train_X_encoded)
        gbm_score = f1_score(train_y, gbm_train_preds, average='macro')
        model_performance.append(('gbm', gbm_model, gbm_score))
        print('Done', '\n')

        # RF 파라미터 최적화와 모델 학습
        def rf_objective(trial):
            param = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
                'max_depth': trial.suggest_int('max_depth', 2, 20),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
                'random_state': random_state,
                'class_weight': 'balanced'
            }

            def train_and_evaluate(train_idx, val_idx):
                X_train_fold, X_val_fold = train_X_encoded.iloc[train_idx], train_X_encoded.iloc[val_idx]
                y_train_fold, y_val_fold = train_y.iloc[train_idx], train_y.iloc[val_idx]

                model = RandomForestClassifier(**param)
                model.fit(X_train_fold, y_train_fold)
                val_pred = model.predict(X_val_fold)
                score = f1_score(y_val_fold, val_pred, average='macro')
                return score

            scores = Parallel(n_jobs=-1)(delayed(train_and_evaluate)(train_idx, val_idx) for train_idx, val_idx in skf.split(train_X_encoded, train_y))
            return np.mean(scores)

        rf_study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.HyperbandPruner())
        print('VVV' * 2, 'Optimizing RF', 'VVV' * 2, '\n')
        rf_study.optimize(rf_objective, n_trials=n_trials)
        rf_best_params = rf_study.best_params
        print('Done', '\n')
        print('Fitting RF...')
        rf_model = RandomForestClassifier(**rf_best_params, n_jobs=-1)
        rf_model.fit(train_X_encoded, train_y)
        rf_train_preds = rf_model.predict(train_X_encoded)
        rf_score = f1_score(train_y, rf_train_preds, average='macro')
        model_performance.append(('rf', rf_model, rf_score))
        print('Done', '\n')

        # 모델 성능에 따른 상위 num_model 모델 선택
        model_performance.sort(key=lambda x: x[2], reverse=True)
        top_models = model_performance[:num_model]

        print("Selected models for final voting:")
        for name, model, score in top_models:
            print(f"Model: {name}, F1 Score: {score}")

        # 선택된 모델로 최종 예측
        print('\n', '===================== 임계값 계산 =====================')
        preds = []
        for name, model, score in top_models:
            if name in ['rf', 'gbm']:
                preds.append(model.predict_proba(test_X_encoded)[:, 1])
                print(model.predict_proba(test_X_encoded)[:, 1])
            elif name == 'lgb':
                preds.append(model.predict(test_X, num_iteration=model.best_iteration))
                print(model.predict(test_X, num_iteration=model.best_iteration))
            else:
                preds.append(model.predict_proba(test_X)[:, 1])
                print(model.predict_proba(test_X)[:, 1])


        # 평균 예측 확률 계산
        avg_preds = np.mean(preds, axis=0)

        # 임계값 최적화
        best_f1 = 0
        best_threshold = 0.5
        thresholds = np.arange(0.1, 0.9, 0.001)
        for threshold in thresholds:
            pred_labels = (avg_preds >= threshold).astype(int)
            f1 = f1_score(test_y, pred_labels, average='macro')
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold

        print(f'Best F1 score: {best_f1} at threshold: {best_threshold}')
        print('========================================================', '\n')

        # 테스트 데이터 예측
        test_preds = []
        for name, model, score in top_models:
            if name in ['rf', 'gbm']:
                test_preds.append(model.predict_proba(test_data_encoded)[:, 1])
            elif name == 'lgb':
                test_preds.append(model.predict(test_data, num_iteration=model.best_iteration))
            else:
                test_preds.append(model.predict_proba(test_data)[:, 1])

        avg_test_preds = np.mean(test_preds, axis=0)
        final_test_pred = (avg_test_preds >= best_threshold).astype(int)

        # Store the predictions for hard voting later
        final_test_preds_list.append(final_test_pred)

    
    final_hard_voting_pred = np.apply_along_axis(lambda x: 1 if np.sum(x) >= len(x) / 2 else 0, axis=0, arr=np.array(final_test_preds_list))
    final_super_voting_pred = np.apply_along_axis(lambda x: 1 if np.any(x == 1) else 0, axis=0, arr=np.array(final_test_preds_list))

    # 배열에서 0을 "Normal"로, 1을 "AbNormal"로 변환
    final_hard_voting_result = np.where(final_hard_voting_pred == 0, "Normal", "AbNormal")
    final_super_voting_result = np.where(final_super_voting_pred == 0, "Normal", "AbNormal")


    return final_hard_voting_result, final_super_voting_result

## 예시

In [50]:
import pandas as pd

# Load data
train_df = pd.read_csv("train_new.csv")
test_df = pd.read_csv("test_new.csv")
train_df['target'] = train_df['target'].map({'AbNormal': 1, 'Normal': 0})
# 필요없는 컬럼 삭제
test_df = test_df.drop(['Set ID', 'target'], axis=1)

In [51]:
train_df['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'].replace('OK', 1, inplace=True)
test_df['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'].replace('OK', 1, inplace=True)

In [52]:
train_df['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'].fillna(0, inplace=True)
test_df['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'].fillna(0, inplace=True)

In [53]:
# DAm
train_Dam = train_df[['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
                     'THICKNESS 1 Collect Result_Dam',
                     'Workorder_Dam',
                     'Production Qty Collect Result_Dam', 'target']]
test_Dam = test_df[['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
                     'THICKNESS 1 Collect Result_Dam',
                     'Workorder_Dam',
                     'Production Qty Collect Result_Dam']]

In [54]:
# Fill1
train_F1 = train_df[['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
                     'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
                     'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
                     'DISCHARGED SPEED OF RESIN Collect Result_Fill1',
                     'Workorder_Dam',
                     'Production Qty Collect Result_Dam', 'target']]
test_F1 = test_df[['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
                     'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
                   'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
                   'DISCHARGED SPEED OF RESIN Collect Result_Fill1',
                     'Workorder_Dam',
                     'Production Qty Collect Result_Dam']]

In [55]:
# Fill2
train_F2 = train_df[['CURE SPEED Collect Result_Fill2',
                     'Head Purge Position Z Collect Result_Fill2',
                     'Workorder_Dam',
                     'Production Qty Collect Result_Dam', 'target']]
test_F2 = test_df[['CURE SPEED Collect Result_Fill2',
                     'Head Purge Position Z Collect Result_Fill2',
                     'Workorder_Dam',
                     'Production Qty Collect Result_Dam']]

In [56]:
# AC
train_AC = train_df[['1st Pressure Collect Result_AutoClave',
                     '2nd Pressure Collect Result_AutoClave',
                     'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
                     'Workorder_Dam',
                     'Production Qty Collect Result_Dam', 'target']]
test_AC = test_df[['1st Pressure Collect Result_AutoClave',
                     '2nd Pressure Collect Result_AutoClave',
                   'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
                     'Workorder_Dam',
                     'Production Qty Collect Result_Dam']]

In [None]:
# voting_type = super / hard
## super : 어떤 모델이든 불량으로 분류한다면 최종 에측을 불량
## hard : 다수결

# num_models : xgb, rf, cat, lgbm, rf에서 선정할 모델 수(train performance 기준)
## 단 gbm이 과적합 되는 경우 확인. 

# n_trials : 베이지안 최적화 과정에서 반복할 횟수(파라미터 서칭)


final_hard_pred, final_super_pred = final_voting_model([train_Dam, train_F1, train_F2, train_AC],
                                [test_Dam, test_F1, test_F2, test_AC],
                                random_state=42,
                                num_model=5,
                                n_trials = 50)

Processing dataset 1...
Train shape : (32404, 4)
Test shape : (8102, 4) 

++++++++++ Model Fitting ++++++++++
VVVVVV Optimizing CatBoost VVVVVV 

Done 

Fitting CatBoost...
Done 

VVVVVV Optimizing XGBoost VVVVVV 

Done 

Fitting XGBoost...
Done 

VVVVVV Optimizing LGBM VVVVVV 

[LightGBM] [Info] Number of positive: 1504, number of negative: 24419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 879
[LightGBM] [Info] Number of data points in the train set: 25923, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058018 -> initscore=-2.787233
[LightGBM] [Info] Start training from score -2.787233
[LightGBM] [Info] Number of positive: 1504, number of negative: 24420
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000189 



Done 

VVVVVV Optimizing GBM VVVVVV 

[LightGBM] [Info] Number of data points in the train set: 25923, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058018 -> initscore=-2.787233
[LightGBM] [Info] Start training from score -2.787233
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058016 -> initscore=-2.787274
[LightGBM] [Info] Start training from score -2.787274
[LightGBM] [Info] Number of positive: 1504, number of negative: 24419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000236 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 877
[LightGBM] [Info] Number of data points in the train set: 25923, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058018 -> initscore=-2.787233
[LightGBM] [Info] Start training from score -2.787233
[LightGBM] [Info] Number of positive: 1504, num



[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058018 -> initscore=-2.787233
[LightGBM] [Info] Start training from score -2.787233




Done 

Fitting GBM...
Done 

VVVVVV Optimizing RF VVVVVV 

Done 

Fitting RF...
Done 

Selected models for final voting:
Model: gbm, F1 Score: 0.9990105822137947
Model: lgb, F1 Score: 0.9533446317678972
Model: rf, F1 Score: 0.647180267868559
Model: catboost, F1 Score: 0.6073737358717806
Model: xgb, F1 Score: 0.5496365079793233

[1.00970485e-20 8.86680329e-07 1.25700839e-11 ... 7.93229067e-10
 6.03019389e-01 1.52068232e-14]
[ 0.11170029 -0.0897768   0.00235718 ...  0.01209261 -0.04382213
 -0.0581413 ]
[0.00947657 0.0777691  0.01191195 ... 0.01871697 0.25838278 0.00749272]
[0.00490756 0.02108379 0.02121041 ... 0.07613384 0.00268697 0.0920433 ]
[0.04765977 0.11783523 0.05076955 ... 0.04776588 0.08834039 0.02390354]
Best F1 score: 0.5754764329145959 at threshold: 0.28200000000000014

Processing dataset 2...
Train shape : (32404, 6)
Test shape : (8102, 6) 

++++++++++ Model Fitting ++++++++++
VVVVVV Optimizing CatBoost VVVVVV 





Done 

Fitting CatBoost...
Done 

VVVVVV Optimizing XGBoost VVVVVV 





Done 

Fitting XGBoost...
Done 

VVVVVV Optimizing LGBM VVVVVV 

[LightGBM] [Info] Number of positive: 1504, number of negative: 24419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 914
[LightGBM] [Info] Number of data points in the train set: 25923, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058018 -> initscore=-2.787233
[LightGBM] [Info] Start training from score -2.787233
[LightGBM] [Info] Number of positive: 1504, number of negative: 24419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000336 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 915
[LightGBM] [Info] Number of data points in the trai



Done 

Fitting CatBoost...
Done 

VVVVVV Optimizing XGBoost VVVVVV 





Done 

Fitting XGBoost...
Done 

VVVVVV Optimizing LGBM VVVVVV 

[LightGBM] [Info] Number of positive: 1504, number of negative: 24419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000710 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 868
[LightGBM] [Info] Number of data points in the train set: 25923, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058018 -> initscore=-2.787233
[LightGBM] [Info] Start training from score -2.787233
[LightGBM] [Info] Number of positive: 1504, number of negative: 24420
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000717 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 867
[LightGBM] [Info] Number of data points in the train set: 25924, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058016 -> initscore=-2.787274
[LightG



[LightGBM] [Info] Number of positive: 1504, number of negative: 24420
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000719 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 867
[LightGBM] [Info] Number of data points in the train set: 25924, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058016 -> initscore=-2.787274
[LightGBM] [Info] Start training from score -2.787274
[LightGBM] [Info] Number of positive: 1504, number of negative: 24419
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000899 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 868
[LightGBM] [Info] Number of data points in the train set: 25923, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058018 -> initscore=-2.787233
[LightGBM] [Info] Start training from score -2.787233
[LightGBM] [Info] 



Done 

VVVVVV Optimizing GBM VVVVVV 

[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058018 -> initscore=-2.787233
[LightGBM] [Info] Start training from score -2.787233
[LightGBM] [Info] Start training from score -2.787233
[LightGBM] [Info] Number of data points in the train set: 25923, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.058018 -> initscore=-2.787233
[LightGBM] [Info] Start training from score -2.787233












In [26]:
# 일관성 검사 적용해줘야함
Abnormal_rows = [64, 562, 1460, 1530, 1892, 2505, 2710, 3457, 3682, 3732, 4928, 4932, 6092,
 7001, 7287, 7666, 7836, 8253, 8898, 10989, 12439, 12585, 12844, 14756, 15180, 15406, 15811, 15964]


# 주어진 인덱스에 해당하는 값이 이미 "AbNormal"인지 확인하고, 아니라면 "AbNormal"로 변경
for idx in Abnormal_rows:
    if final_pred[idx] != "AbNormal":
        final_pred[idx] = "AbNormal"
    else:
        print(f'{idx} is passed')

sub_data = pd.read_csv('submission.csv')

# 수정된 결과를 CSV로 저장
sub_data['target'] = final_pred
sub_data.to_csv('submission.csv', index=False)


64 is passed
562 is passed
1460 is passed
1530 is passed
1892 is passed
2505 is passed
2710 is passed
3457 is passed
3682 is passed
3732 is passed
4928 is passed
4932 is passed
6092 is passed
7001 is passed
7287 is passed
7666 is passed
7836 is passed
8253 is passed
8898 is passed
10989 is passed
12439 is passed
12585 is passed
12844 is passed
14756 is passed
15180 is passed
15406 is passed
15811 is passed
15964 is passed


In [28]:
sub_data['target'].value_counts()

target
Normal      15930
AbNormal     1431
Name: count, dtype: int64

In [24]:
final_pred

array(['Normal', 'Normal', 'Normal', ..., 'Normal', 'Normal', 'Normal'],
      dtype='<U8')