In [None]:
# 파일이름 : model_evaluation_vary_models.ipynb
# 코드설명 : 물성값 및 불량여부 예측 AI 모델의 성능 비교 및 최적(상이한 기술) 모델 선정, 필요없는 모델 제거
# 입/출력 : 입/출력 : 예측 대상별 학습된 모델들 / 예측 대상별 최종 선정 모델(다른 기술)
# 유의 사항 :
# 1. 경량화에서 실수로 제거 시 재학습에 오랜 시간이 소요되므로, 복사해두고 진행
# 2. autogluon(0.8.2)과 python(3.9.18) 버전을 맞추어야 함
# 3. 모델 저장 경로를 잘 수정하여 진행
# 최종수정 : 2023년 11월 23일
# 제 작 자 : 홍민성 (mshong@micube.co.kr), 맹영준 (myj6223@micube.co.kr)
# Copyright : MICUBE Solution, Inc.

# python 버전 : 3.9.18
# autogluon 버전 : 0.8.2

In [1]:
# 관련 라이브러리 로드
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 500)
pd.set_option('display.max_rows', 50)

import os
import random
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, recall_score, accuracy_score

# autoML
from autogluon.tabular import TabularPredictor

# 동일 결과 재생성을 위한 랜덤성 고정
seed = 42
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)

In [2]:
# SMAPE 계산 함수 정의
def smape_cal(y_true, y_pred):
    return np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

# 0을 제외한 MAPE 계산 함수 정의
def mape_non_zero(y_true, y_pred):
    mask = y_true != 0
    return np.mean(np.abs((y_pred[mask] - y_true[mask]) / y_true[mask]))

## 1. AutoML (Autogluon) 회귀 모델 로드 및 모델 조정

### 1) 물성값 예측에 대한 회귀 모델 로드 및 leaderboard 출력

In [6]:
# 물성값: regression, 불량여부: binary
problem_type_ = 'regression' # 'regression', 'binary'

# 모델 인덱스
file_suffix='combined'

if problem_type_ == 'regression':
    # 물성값 예측
    # yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    yCols=['MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    data_type_ = 'SFT_regr'
else:
    # 불량여부 예측
    # yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'

for yCol in yCols:
    print(f'{yCol}!!!!')
    print(f'='*80)
    # 데이터셋 로드 및 모델 입출력 데이터 생성
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(train.drop(['ID', 'REAL_VAL','TYPE'], axis=1), train['REAL_VAL'], test_size=0.2, random_state=seed)
    train_data = train.iloc[X_train.index].drop(['ID','TYPE'], axis=1)

    # 모델 로드    
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity = 0)

    # 테스트 세트에 대한 예측
    y_pred = predictor.predict(X_test)

    res_df = pd.DataFrame()
    res_df[yCol+'_real'] = y_test
    res_df[yCol+'_pred'] = y_pred

    # 성능 평가
    if problem_type_ == 'regression':
        mape = mean_absolute_percentage_error(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        zero_mape = mape_non_zero(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        smape = smape_cal(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        print(f"{yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape:.4f}, smape: {smape:.4f}")
    else:
        recall = recall_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        accuracy = accuracy_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        print(f"{yCol} >> reall: {recall:.4f}, accuracy: {accuracy:.4f}")

    # 학습된 모델의 성능 요약
    predictor.leaderboard()
    print(f'='*80,'\n')

MNY!!!!
MNY >> MAPE: 0.0946, zero_mape: 0.0946, smape: 0.0637
                    model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     WeightedEnsemble_L2  -6.379864      12.463250  142.153263                0.000805           0.429593            2       True          7
1     WeightedEnsemble_L3  -6.418823      16.148978  185.673658                0.000760           0.241785            3       True         10
2         LightGBM_BAG_L2  -6.443068      15.937715  180.487148                0.130140           3.616469            2       True          9
3       LightGBMXT_BAG_L2  -6.501688      16.018078  181.815404                0.210503           4.944726            2       True          8
4  RandomForestMSE_BAG_L1  -6.605629       3.912031   61.144779                3.912031          61.144779            1       True          5
5         LightGBM_BAG_L1  -6.637255       3.348607   36.976955                3.34860

### 2) 모델 선정

In [3]:
# 선정된 모델명 리스트
trg_model_dic = {}
trg_model_dic['MNY'] = ['LightGBM_BAG_L1', 'LightGBMXT_BAG_L1', 'LightGBM_BAG_L2', 'LightGBMXT_BAG_L2']
trg_model_dic['REHO_MAX'] = ['LightGBM_BAG_L1', 'LightGBMXT_BAG_L1', 'LightGBM_BAG_L2', 'LightGBMXT_BAG_L2']
trg_model_dic['REHO_MIN'] = ['LightGBM_BAG_L1', 'LightGBMXT_BAG_L1']
trg_model_dic['REHO_TS2'] = ['LightGBM_BAG_L1', 'LightGBMXT_BAG_L1', 'LightGBM_BAG_L2', 'LightGBMXT_BAG_L2']
trg_model_dic['REHO_TC90'] = ['LightGBM_BAG_L1', 'LightGBMXT_BAG_L1', 'LightGBM_BAG_L2', 'LightGBMXT_BAG_L2']
trg_model_dic['SCR'] = ['LightGBM_BAG_L1', 'LightGBMXT_BAG_L1', 'LightGBM_BAG_L2', 'LightGBMXT_BAG_L2']

### 3) 개별 데이터의 최적 예측 모델(들) 선정 및 모델별 성능 요약
- 개별 데이터(LAB, MES-CMB, MES-FMB)

In [4]:
# 물성값: regression, 불량여부: binary
problem_type_ = 'regression' # 'regression', 'binary'

# 모델 인덱스
file_suffix='combined'

max_model_num = len(max(trg_model_dic.values(), key=len))

if problem_type_ == 'regression':
    # 물성값 예측
    # yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    yCols=['MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    data_type_ = 'SFT_regr'
    
    col_Nms = ['type','target_y']
    for i in np.arange(max_model_num):
        col_Nms = col_Nms + ['mape','mape0','smape']    
else:
    # 불량여부 예측
    yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    # yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'
    
    col_Nms = ['type','target_y']
    for i in np.arange(max_model_num):
        col_Nms = col_Nms + ['recall','accuracy','']
    
col_Nms.append('데이터수')
fin_df = pd.DataFrame(columns=col_Nms)

for yCol in yCols:
    print(f'{yCol}!!!!')
    print(f'='*80)    
    # 데이터셋 로드 및 모델 입출력 데이터 생성
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(train.drop(['ID', 'REAL_VAL'], axis=1), train[['REAL_VAL','TYPE']], test_size=0.2, random_state=seed)

    # 모델 로드    
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity = 0)
        
    types = X_test['TYPE'].unique().tolist()
    types.sort()
    display(types)
    
    for type_ in types:
        tmp_X_test = X_test[X_test['TYPE'] == type_]
        tmp_y_test = y_test[y_test['TYPE'] == type_]
        tmp_X_test = tmp_X_test.drop(columns=['TYPE'])
        tmp_y_test = tmp_y_test['REAL_VAL']

        result_ls = []
        model_ls = []
        for i in np.arange(max_model_num):
            try :
                model_name = trg_model_dic[yCol][i]
            
                # 테스트 세트에 대한 예측
                y_pred = predictor.predict(tmp_X_test, model=model_name)

                res_df = pd.DataFrame()
                res_df[yCol+'_real'] = tmp_y_test
                res_df[yCol+'_pred'] = y_pred

                # 성능 평가
                if problem_type_ == 'regression':
                    mape = mean_absolute_percentage_error(res_df[yCol+'_real'], res_df[yCol+'_pred'])
                    zero_mape = mape_non_zero(res_df[yCol+'_real'], res_df[yCol+'_pred'])
                    smape = smape_cal(res_df[yCol+'_real'], res_df[yCol+'_pred'])
                    print(f"{type_}: {yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape:.4f}, smape: {smape:.4f}")
                    model_ls = model_ls + [model_name, '', '']
                    result_ls = result_ls + [mape, zero_mape, smape]
                else:
                    recall = recall_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
                    accuracy = accuracy_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
                    print(f"{type_}: {yCol} >> reall: {recall:.4f}, accuracy: {accuracy:.4f}")
                    model_ls = model_ls + [model_name, '', '']
                    result_ls = result_ls + [recall, accuracy, '']
                
            except:
                model_ls = model_ls + ['', '', '']
                result_ls = result_ls + ['', '', '']                

        fin_df.loc[len(fin_df)] = [type_, yCol] + model_ls + ['']
        fin_df.loc[len(fin_df)] = [type_, yCol] + result_ls + [len(tmp_X_test)]
        # predictor.leaderboard()

fin_df['index'] = fin_df.index
fin_df.sort_values(['type','index'])

MNY!!!!


['LAB', 'mCMB', 'mFMB']

LAB: MNY >> MAPE: 0.2157, zero_mape: 0.2157, smape: 0.1894
LAB: MNY >> MAPE: 0.2259, zero_mape: 0.2259, smape: 0.1993
LAB: MNY >> MAPE: 0.1926, zero_mape: 0.1926, smape: 0.1659
LAB: MNY >> MAPE: 0.1851, zero_mape: 0.1851, smape: 0.1623
mCMB: MNY >> MAPE: 0.0973, zero_mape: 0.0973, smape: 0.0661
mCMB: MNY >> MAPE: 0.0976, zero_mape: 0.0976, smape: 0.0665
mCMB: MNY >> MAPE: 0.0951, zero_mape: 0.0951, smape: 0.0640
mCMB: MNY >> MAPE: 0.0954, zero_mape: 0.0954, smape: 0.0650
mFMB: MNY >> MAPE: 0.0588, zero_mape: 0.0588, smape: 0.0592
mFMB: MNY >> MAPE: 0.0595, zero_mape: 0.0595, smape: 0.0601
mFMB: MNY >> MAPE: 0.0605, zero_mape: 0.0605, smape: 0.0602
mFMB: MNY >> MAPE: 0.0593, zero_mape: 0.0593, smape: 0.0591
REHO_MIN!!!!


['LAB', 'mCMB', 'mFMB']

LAB: REHO_MIN >> MAPE: 0.3358, zero_mape: 0.3358, smape: 0.2751
LAB: REHO_MIN >> MAPE: 0.3339, zero_mape: 0.3339, smape: 0.2744
mCMB: REHO_MIN >> MAPE: 0.2698, zero_mape: 0.2698, smape: 0.2304
mCMB: REHO_MIN >> MAPE: 0.2524, zero_mape: 0.2524, smape: 0.2203
mFMB: REHO_MIN >> MAPE: 0.1639, zero_mape: 0.1639, smape: 0.1484
mFMB: REHO_MIN >> MAPE: 0.1573, zero_mape: 0.1573, smape: 0.1428
REHO_MAX!!!!


['LAB', 'mCMB', 'mFMB']

LAB: REHO_MAX >> MAPE: 0.4166, zero_mape: 0.4166, smape: 0.3779
LAB: REHO_MAX >> MAPE: 0.2269, zero_mape: 0.2269, smape: 0.1638
LAB: REHO_MAX >> MAPE: 0.2094, zero_mape: 0.2094, smape: 0.1545
LAB: REHO_MAX >> MAPE: 0.2017, zero_mape: 0.2017, smape: 0.1527
mCMB: REHO_MAX >> MAPE: 0.2789, zero_mape: 0.2789, smape: 0.2834
mCMB: REHO_MAX >> MAPE: 0.0787, zero_mape: 0.0787, smape: 0.0755
mCMB: REHO_MAX >> MAPE: 0.0816, zero_mape: 0.0816, smape: 0.0779
mCMB: REHO_MAX >> MAPE: 0.0787, zero_mape: 0.0787, smape: 0.0753
mFMB: REHO_MAX >> MAPE: 0.5603, zero_mape: 0.5603, smape: 0.3913
mFMB: REHO_MAX >> MAPE: 0.0789, zero_mape: 0.0789, smape: 0.0738
mFMB: REHO_MAX >> MAPE: 0.0775, zero_mape: 0.0775, smape: 0.0726
mFMB: REHO_MAX >> MAPE: 0.0747, zero_mape: 0.0747, smape: 0.0699
REHO_TS2!!!!


['mCMB', 'mFMB']

mCMB: REHO_TS2 >> MAPE: 0.0714, zero_mape: 0.0714, smape: 0.0703
mCMB: REHO_TS2 >> MAPE: 0.0727, zero_mape: 0.0727, smape: 0.0715
mCMB: REHO_TS2 >> MAPE: 0.0722, zero_mape: 0.0722, smape: 0.0710
mFMB: REHO_TS2 >> MAPE: 0.0525, zero_mape: 0.0525, smape: 0.0514
mFMB: REHO_TS2 >> MAPE: 0.0518, zero_mape: 0.0518, smape: 0.0507
mFMB: REHO_TS2 >> MAPE: 0.0514, zero_mape: 0.0514, smape: 0.0504
REHO_TC90!!!!


['mCMB', 'mFMB']

mCMB: REHO_TC90 >> MAPE: 0.0672, zero_mape: 0.0672, smape: 0.0669
mCMB: REHO_TC90 >> MAPE: 0.0687, zero_mape: 0.0687, smape: 0.0682
mCMB: REHO_TC90 >> MAPE: 0.0674, zero_mape: 0.0674, smape: 0.0670
mFMB: REHO_TC90 >> MAPE: 0.0485, zero_mape: 0.0485, smape: 0.0480
mFMB: REHO_TC90 >> MAPE: 0.0480, zero_mape: 0.0480, smape: 0.0475
mFMB: REHO_TC90 >> MAPE: 0.0477, zero_mape: 0.0477, smape: 0.0472
SCR!!!!


['mCMB', 'mFMB']

mCMB: SCR >> MAPE: 0.1427, zero_mape: 0.1427, smape: 0.1263
mCMB: SCR >> MAPE: 0.1483, zero_mape: 0.1483, smape: 0.1314
mCMB: SCR >> MAPE: 0.1228, zero_mape: 0.1228, smape: 0.1100
mCMB: SCR >> MAPE: 0.1276, zero_mape: 0.1276, smape: 0.1153
mFMB: SCR >> MAPE: 0.0681, zero_mape: 0.0681, smape: 0.0658
mFMB: SCR >> MAPE: 0.0716, zero_mape: 0.0716, smape: 0.0689
mFMB: SCR >> MAPE: 0.0521, zero_mape: 0.0521, smape: 0.0504
mFMB: SCR >> MAPE: 0.0561, zero_mape: 0.0561, smape: 0.0544


Unnamed: 0,type,target_y,mape,mape0,smape,mape.1,mape0.1,smape.1,mape.2,mape0.2,smape.2,mape.3,mape0.3,smape.3,데이터수
0,LAB,MNY,LightGBM_BAG_L1,,,LightGBMXT_BAG_L1,,,LightGBM_BAG_L2,,,LightGBMXT_BAG_L2,,,
1,LAB,MNY,0.215721,0.215721,0.1894,0.225863,0.225863,0.199317,0.192589,0.192589,0.165886,0.185053,0.185053,0.162262,157.0
6,LAB,REHO_MIN,LightGBM_BAG_L1,,,LightGBMXT_BAG_L1,,,,,,,,,
7,LAB,REHO_MIN,0.335824,0.335824,0.27508,0.333926,0.333926,0.274417,,,,,,,1551.0
13,LAB,REHO_MAX,0.416624,0.416624,0.377907,0.2269,0.2269,0.163763,0.209388,0.209388,0.154535,0.20168,0.20168,0.152739,1559.0
12,LAB,REHO_MAX,LightGBM_BAG_L1,,,LightGBMXT_BAG_L1,,,LightGBM_BAG_L2,,,LightGBMXT_BAG_L2,,,
27,mCMB,SCR,0.142662,0.142662,0.126297,0.148257,0.148257,0.131356,0.122777,0.122777,0.110007,0.127581,0.127581,0.115257,2169.0
26,mCMB,SCR,LightGBM_BAG_L1,,,LightGBMXT_BAG_L1,,,LightGBM_BAG_L2,,,LightGBMXT_BAG_L2,,,
23,mCMB,REHO_TC90,,,,0.067186,0.067186,0.066912,0.068688,0.068688,0.068191,0.067364,0.067364,0.066987,2239.0
22,mCMB,REHO_TC90,,,,LightGBMXT_BAG_L1,,,LightGBM_BAG_L2,,,LightGBMXT_BAG_L2,,,


### 4) 통합 데이터의 최적 예측 모델(들) 선정 및 모델별 성능 요약

In [None]:
# 선정된 모델명 리스트
# trg_model_ls = ['LightGBM_BAG_L2']
trg_model_ls = ['LightGBMXT_BAG_L1']
print(f'선정된 모델 : {trg_model_ls}')

In [14]:
# 물성값: regression, 불량여부: binary
problem_type_ = 'regression' # 'regression', 'binary'

# 모델 인덱스
file_suffix='combined'

if problem_type_ == 'regression':
    # 물성값 예측
    # yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    yCols=['MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    data_type_ = 'SFT_regr'
    
    col_Nms = ['target_y']
    for trg_model in trg_model_ls:
        col_Nms += ['mape','mape0','smape']    
else:
    # 불량여부 예측
    # yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'
    
    col_Nms = ['target_y']
    for trg_model in trg_model_ls:
        col_Nms += ['recall','accuracy','']
    
col_Nms.append('데이터수')
fin_df = pd.DataFrame(columns=col_Nms)

for yCol in yCols:    
    print(f'{yCol}!!!!')
    print(f'='*80)
    # 데이터셋 로드 및 모델 입출력 데이터 생성
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(train.drop(['ID', 'REAL_VAL','TYPE'], axis=1), train[['REAL_VAL']], test_size=0.2, random_state=seed)

    # 모델 로드    
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity = 0)
        
    result_ls = []
    for model_name in trg_model_ls:
        # 테스트 세트에 대한 예측
        y_pred = predictor.predict(X_test, model=model_name)

        res_df = pd.DataFrame()
        res_df[yCol+'_real'] = y_test
        res_df[yCol+'_pred'] = y_pred

        # 성능 평가
        if problem_type_ == 'regression':
            mape = mean_absolute_percentage_error(res_df[yCol+'_real'], res_df[yCol+'_pred'])
            zero_mape = mape_non_zero(res_df[yCol+'_real'], res_df[yCol+'_pred'])
            smape = smape_cal(res_df[yCol+'_real'], res_df[yCol+'_pred'])
            print(f"{yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape:.4f}, smape: {smape:.4f}")
            result_ls = result_ls + [mape, zero_mape, smape]
        else:
            recall = recall_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
            accuracy = accuracy_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
            print(f"{yCol} >> reall: {recall:.4f}, accuracy: {accuracy:.4f}")
            result_ls = result_ls + [recall, accuracy, '']

    fin_df.loc[len(fin_df)] = [yCol] + result_ls + [len(y_test)]
    # predictor.leaderboard()

fin_df

MNY!!!!
mFMB: MNY >> MAPE: 0.0990, zero_mape: 0.0990, smape: 0.0682
REHO_MIN!!!!
mFMB: REHO_MIN >> MAPE: 0.1694, zero_mape: 0.1694, smape: 0.1522
REHO_MAX!!!!
mFMB: REHO_MAX >> MAPE: 0.0846, zero_mape: 0.0846, smape: 0.0773
REHO_TS2!!!!
mFMB: REHO_TS2 >> MAPE: 0.0536, zero_mape: 0.0536, smape: 0.0525
REHO_TC90!!!!
mFMB: REHO_TC90 >> MAPE: 0.0496, zero_mape: 0.0496, smape: 0.0491
SCR!!!!
mFMB: SCR >> MAPE: 0.0889, zero_mape: 0.0889, smape: 0.0830


Unnamed: 0,target_y,mape,mape0,smape,데이터수
0,MNY,0.099046,0.099046,0.06817,12003
1,REHO_MIN,0.169355,0.169355,0.152186,40450
2,REHO_MAX,0.084561,0.084561,0.077318,40461
3,REHO_TS2,0.053611,0.053611,0.05251,38862
4,REHO_TC90,0.049587,0.049587,0.049066,38863
5,SCR,0.088869,0.088869,0.083005,9615


### 5) 최종적으로 선정된 모델 외 삭제 (모델 로딩 시간 단축)
<p style="font-weight:bold"> <span style="color:red">** 주의 : 실수로 모델 제거시, 다시 학습해야하므로 복사 해두고 진행하기 바랍니다.</span> </p>

In [5]:
print(f'선정된 모델 : {trg_model_ls}')

선정된 모델 : ['LightGBM_BAG_L2']


In [7]:
# 물성값: regression, 불량여부: binary
problem_type_ = 'regression' # 'regression', 'binary'

# 모델 인덱스
file_suffix='combined'

if problem_type_ == 'regression':
    # 물성값 예측
    yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    yCols=['MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    data_type_ = 'SFT_regr'
else:
    # 불량여부 예측
    yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'

for yCol in yCols:
    print(yCol)
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity = 0)
    predictor.delete_models(models_to_keep=trg_model_ls, models_to_delete=None, dry_run=False)
    predictor.leaderboard()

HS
                    model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0         LightGBM_BAG_L2  -2.568964      26.542925  195.168303                0.135264           2.928477            2       True          8
1         LightGBM_BAG_L1  -2.824011       3.190365   44.580211                3.190365          44.580211            1       True          4
2       LightGBMXT_BAG_L1  -2.844179      16.409028   70.873482               16.409028          70.873482            1       True          3
3    ExtraTreesMSE_BAG_L1  -3.053433       2.502457   16.746434                2.502457          16.746434            1       True          7
4  RandomForestMSE_BAG_L1  -3.127833       2.451457   31.652997                2.451457          31.652997            1       True          5
5         CatBoost_BAG_L1  -3.954185       0.242229   28.159337                0.242229          28.159337            1       True          6
6  

## 2. AutoML (Autogluon) 분류 모델 로드 및 모델 조정

### 1) 불량여부 예측에 대한 분류 모델 로드 및 leaderboard 출력

In [13]:
# 물성값: regression, 불량여부: binary
problem_type_ = 'binary' # 'regression', 'binary'

# 모델 인덱스
file_suffix='combined'

if problem_type_ == 'regression':
    # 물성값 예측
    yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    yCols=['MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    data_type_ = 'SFT_regr'
else:
    # 불량여부 예측
    # yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'

for yCol in yCols:
    print(f'{yCol}!!!!')
    print(f'='*80)
    # 데이터셋 로드 및 모델 입출력 데이터 생성
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(train.drop(['ID', 'REAL_VAL','TYPE'], axis=1), train['REAL_VAL'], test_size=0.2, random_state=seed)
    train_data = train.iloc[X_train.index].drop(['ID','TYPE'], axis=1)

    # 모델 로드    
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity = 0)

    # 테스트 세트에 대한 예측
    y_pred = predictor.predict(X_test)

    res_df = pd.DataFrame()
    res_df[yCol+'_real'] = y_test
    res_df[yCol+'_pred'] = y_pred

    # 성능 평가
    if problem_type_ == 'regression':
        mape = mean_absolute_percentage_error(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        zero_mape = mape_non_zero(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        smape = smape_cal(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        print(f"{yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape:.4f}, smape: {smape:.4f}")
    else:
        recall = recall_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        accuracy = accuracy_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        print(f"{yCol} >> reall: {recall:.4f}, accuracy: {accuracy:.4f}")

    # 학습된 모델의 성능 요약
    predictor.leaderboard()
    print(f'='*80,'\n')

MNY_RESULT!!!!
MNY_RESULT >> reall: 0.9652, accuracy: 0.8645
                      model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2   0.865768      12.766993   41.689779                0.055805           7.947603            2       True         12
1     ExtraTreesEntr_BAG_L1   0.864793       3.151191    5.702573                3.151191           5.702573            1       True          9
2     ExtraTreesGini_BAG_L1   0.864233       3.074118    5.727385                3.074118           5.727385            1       True          8
3   RandomForestEntr_BAG_L1   0.863030       3.007013    8.045981                3.007013           8.045981            1       True          6
4   RandomForestGini_BAG_L1   0.862553       3.067676    7.913852                3.067676           7.913852            1       True          5
5    NeuralNetFastAI_BAG_L1   0.840734       1.956752   19.641088          

### 2) 모델 선정

In [14]:
# 선정된 모델명 리스트
trg_model_dic = {}
trg_model_dic['HS_RESULT'] = ['RandomForestEntr_BAG_L1', 'ExtraTreesEntr_BAG_L1', 'XGBoost_BAG_L1', 'LightGBM_BAG_L1']
trg_model_dic['SG_RESULT'] = ['RandomForestEntr_BAG_L1', 'ExtraTreesEntr_BAG_L1', 'XGBoost_BAG_L1', 'LightGBM_BAG_L1']
trg_model_dic['TS_RESULT'] = ['RandomForestEntr_BAG_L1', 'ExtraTreesEntr_BAG_L1', 'XGBoost_BAG_L1', 'LightGBM_BAG_L1']
trg_model_dic['EB_RESULT'] = ['RandomForestEntr_BAG_L1', 'ExtraTreesEntr_BAG_L1', 'XGBoost_BAG_L1', 'LightGBM_BAG_L1']

trg_model_dic['MNY_RESULT'] = ['ExtraTreesEntr_BAG_L1', 'RandomForestEntr_BAG_L1', 'LightGBMXT_BAG_L1', 'LightGBM_BAG_L1']
trg_model_dic['REHO_RESULT'] = ['ExtraTreesEntr_BAG_L1', 'RandomForestEntr_BAG_L1', 'LightGBMXT_BAG_L1', 'LightGBM_BAG_L1']
trg_model_dic['SCR_RESULT'] = ['ExtraTreesEntr_BAG_L1', 'RandomForestEntr_BAG_L1', 'LightGBMXT_BAG_L1', 'LightGBM_BAG_L1']

### 3) 개별 데이터의 최적 예측 모델(들) 선정 및 모델별 성능 요약
- 개별 데이터(LAB, MES-CMB, MES-FMB)

In [15]:
# 물성값: regression, 불량여부: binary
problem_type_ = 'binary' # 'regression', 'binary'

# 모델 인덱스
file_suffix='combined'

max_model_num = len(max(trg_model_dic.values(), key=len))

if problem_type_ == 'regression':
    # 물성값 예측
    # yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    yCols=['MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    data_type_ = 'SFT_regr'
    
    col_Nms = ['type','target_y']
    for i in np.arange(max_model_num):
        col_Nms = col_Nms + ['mape','mape0','smape']    
else:
    # 불량여부 예측
    # yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'
    
    col_Nms = ['type','target_y']
    for i in np.arange(max_model_num):
        col_Nms = col_Nms + ['recall','accuracy','']
    
col_Nms.append('데이터수')
fin_df = pd.DataFrame(columns=col_Nms)

for yCol in yCols:
    print(f'{yCol}!!!!')
    print(f'='*80)    
    # 데이터셋 로드 및 모델 입출력 데이터 생성
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(train.drop(['ID', 'REAL_VAL'], axis=1), train[['REAL_VAL','TYPE']], test_size=0.2, random_state=seed)

    # 모델 로드    
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity = 0)
        
    types = X_test['TYPE'].unique().tolist()
    types.sort()
    display(types)
    
    for type_ in types:
        tmp_X_test = X_test[X_test['TYPE'] == type_]
        tmp_y_test = y_test[y_test['TYPE'] == type_]
        tmp_X_test = tmp_X_test.drop(columns=['TYPE'])
        tmp_y_test = tmp_y_test['REAL_VAL']

        result_ls = []
        model_ls = []
        for i in np.arange(max_model_num):
            try :
                model_name = trg_model_dic[yCol][i]
            
                # 테스트 세트에 대한 예측
                y_pred = predictor.predict(tmp_X_test, model=model_name)

                res_df = pd.DataFrame()
                res_df[yCol+'_real'] = tmp_y_test
                res_df[yCol+'_pred'] = y_pred

                # 성능 평가
                if problem_type_ == 'regression':
                    mape = mean_absolute_percentage_error(res_df[yCol+'_real'], res_df[yCol+'_pred'])
                    zero_mape = mape_non_zero(res_df[yCol+'_real'], res_df[yCol+'_pred'])
                    smape = smape_cal(res_df[yCol+'_real'], res_df[yCol+'_pred'])
                    print(f"{type_}: {yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape:.4f}, smape: {smape:.4f}")
                    model_ls = model_ls + [model_name, '', '']
                    result_ls = result_ls + [mape, zero_mape, smape]
                else:
                    recall = recall_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
                    accuracy = accuracy_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
                    print(f"{type_}: {yCol} >> reall: {recall:.4f}, accuracy: {accuracy:.4f}")
                    model_ls = model_ls + [model_name, '', '']
                    result_ls = result_ls + [recall, accuracy, '']
                
            except:
                model_ls = model_ls + ['', '', '']
                result_ls = result_ls + ['', '', '']                

        # fin_df.loc[len(fin_df)] = [type_, yCol] + model_ls + ['']
        fin_df.loc[len(fin_df)] = [type_, yCol] + result_ls + [len(tmp_X_test)]
        # predictor.leaderboard()

fin_df['index'] = fin_df.index
display(model_ls, fin_df.sort_values(['type','index']))

MNY_RESULT!!!!


['mCMB', 'mFMB']

mCMB: MNY_RESULT >> reall: 0.9662, accuracy: 0.8638
mCMB: MNY_RESULT >> reall: 0.9689, accuracy: 0.8633
mCMB: MNY_RESULT >> reall: 0.7797, accuracy: 0.7792
mCMB: MNY_RESULT >> reall: 0.7954, accuracy: 0.7888
mFMB: MNY_RESULT >> reall: 1.0000, accuracy: 0.8554
mFMB: MNY_RESULT >> reall: 1.0000, accuracy: 0.8554
mFMB: MNY_RESULT >> reall: 0.8406, accuracy: 0.7831
mFMB: MNY_RESULT >> reall: 0.8986, accuracy: 0.8313
REHO_RESULT!!!!


['mCMB', 'mFMB']

mCMB: REHO_RESULT >> reall: 0.9757, accuracy: 0.8763
mCMB: REHO_RESULT >> reall: 0.9304, accuracy: 0.8833
mCMB: REHO_RESULT >> reall: 0.9148, accuracy: 0.8767
mFMB: REHO_RESULT >> reall: 0.9295, accuracy: 0.8730
mFMB: REHO_RESULT >> reall: 0.8305, accuracy: 0.8412
mFMB: REHO_RESULT >> reall: 0.8296, accuracy: 0.8409
SCR_RESULT!!!!


['mCMB', 'mFMB']

mCMB: SCR_RESULT >> reall: 0.9777, accuracy: 0.8819
mCMB: SCR_RESULT >> reall: 0.9761, accuracy: 0.8802
mCMB: SCR_RESULT >> reall: 0.8869, accuracy: 0.8642
mCMB: SCR_RESULT >> reall: 0.8842, accuracy: 0.8602
mFMB: SCR_RESULT >> reall: 0.9574, accuracy: 0.8454
mFMB: SCR_RESULT >> reall: 0.9594, accuracy: 0.8506
mFMB: SCR_RESULT >> reall: 0.7777, accuracy: 0.7684
mFMB: SCR_RESULT >> reall: 0.7922, accuracy: 0.7756


['ExtraTreesEntr_BAG_L1',
 '',
 '',
 'RandomForestEntr_BAG_L1',
 '',
 '',
 'LightGBMXT_BAG_L1',
 '',
 '',
 'LightGBM_BAG_L1',
 '',
 '']

Unnamed: 0,type,target_y,recall,accuracy,Unnamed: 5,recall.1,accuracy.1,Unnamed: 8,recall.2,accuracy.2,Unnamed: 11,recall.3,accuracy.3,Unnamed: 14,데이터수,index
0,mCMB,MNY_RESULT,0.966205,0.863838,,0.968852,0.863253,,0.779723,0.779216,,0.795399,0.788823,,11971,0
2,mCMB,REHO_RESULT,,,,0.975728,0.876266,,0.930421,0.883311,,0.914779,0.876706,,2271,2
4,mCMB,SCR_RESULT,0.977695,0.881935,,0.976102,0.88016,,0.886883,0.864181,,0.884227,0.860186,,2253,4
1,mFMB,MNY_RESULT,1.0,0.855422,,1.0,0.855422,,0.84058,0.783133,,0.898551,0.831325,,83,1
3,mFMB,REHO_RESULT,,,,0.929502,0.872974,,0.830464,0.841176,,0.829648,0.84085,,36827,3
5,mFMB,SCR_RESULT,0.957435,0.845365,,0.959448,0.850614,,0.777682,0.768405,,0.792206,0.775564,,8381,5


### 4) 통합 데이터의 최적 예측 모델(들) 선정 및 모델별 성능 요약

In [16]:
# 선정된 모델명 리스트
trg_model_ls = ['RandomForestEntr_BAG_L1']
print(f'선정된 모델 : {trg_model_ls}')

선정된 모델 : ['RandomForestEntr_BAG_L1']


In [17]:
# 물성값: regression, 불량여부: binary
problem_type_ = 'binary' # 'regression', 'binary'

# 모델 인덱스
file_suffix='combined'

if problem_type_ == 'regression':
    # 물성값 예측
    yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    # yCols=['MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    data_type_ = 'SFT_regr'
    
    col_Nms = ['target_y']
    for trg_model in trg_model_ls:
        col_Nms = col_Nms + ['mape','mape0','smape']
else:
    # 불량여부 예측
    # yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'
    
    col_Nms = ['target_y']
    for trg_model in trg_model_ls:
        col_Nms = col_Nms + ['recall','accuracy','']
    
col_Nms.append('데이터수')
fin_df = pd.DataFrame(columns=col_Nms)

for yCol in yCols:    
    print(f'{yCol}!!!!')
    print(f'='*80)
    # 데이터셋 로드 및 모델 입출력 데이터 생성
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(train.drop(['ID', 'REAL_VAL','TYPE'], axis=1), train[['REAL_VAL']], test_size=0.2, random_state=seed)

    # 모델 로드    
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity = 0)
        
    result_ls = []
    for model_name in trg_model_ls:
        # 테스트 세트에 대한 예측
        y_pred = predictor.predict(X_test, model=model_name)

        res_df = pd.DataFrame()
        res_df[yCol+'_real'] = y_test
        res_df[yCol+'_pred'] = y_pred

        # 성능 평가
        if problem_type_ == 'regression':
            mape = mean_absolute_percentage_error(res_df[yCol+'_real'], res_df[yCol+'_pred'])
            zero_mape = mape_non_zero(res_df[yCol+'_real'], res_df[yCol+'_pred'])
            smape = smape_cal(res_df[yCol+'_real'], res_df[yCol+'_pred'])
            print(f"{yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape:.4f}, smape: {smape:.4f}")
            result_ls = result_ls + [mape, zero_mape, smape]
        else:
            recall = recall_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
            accuracy = accuracy_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
            print(f"{yCol} >> reall: {recall:.4f}, accuracy: {accuracy:.4f}")
            result_ls = result_ls + [recall, accuracy, '']

    fin_df.loc[len(fin_df)] = [yCol] + result_ls + [len(y_test)]
    # predictor.leaderboard()

fin_df

MNY_RESULT!!!!
MNY_RESULT >> reall: 0.9691, accuracy: 0.8632
REHO_RESULT!!!!
REHO_RESULT >> reall: 0.9324, accuracy: 0.8732
SCR_RESULT!!!!
SCR_RESULT >> reall: 0.9630, accuracy: 0.8569


Unnamed: 0,target_y,recall,accuracy,Unnamed: 4,데이터수
0,MNY_RESULT,0.969069,0.863199,,12054
1,REHO_RESULT,0.932356,0.873165,,39098
2,SCR_RESULT,0.962996,0.856874,,10634


### 5) 최종적으로 선정된 모델 외 삭제 (모델 로딩 시간 단축)
<p style="font-weight:bold"> <span style="color:red">** 주의 : 실수로 모델 제거시, 다시 학습해야하므로 복사 해두고 진행하기 바랍니다.</span> </p>

In [18]:
print(f'선정된 모델 : {trg_model_ls}')

선정된 모델 : ['RandomForestEntr_BAG_L1']


In [19]:
# 물성값: regression, 불량여부: binary
problem_type_ = 'binary' # 'regression', 'binary'

# 모델 인덱스
file_suffix='combined'

if problem_type_ == 'regression':
    # 물성값 예측
    yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    yCols=['MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    data_type_ = 'SFT_regr'
else:
    # 불량여부 예측
    # yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'

for yCol in yCols:
    print(yCol)
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity = 0)
    predictor.delete_models(models_to_keep=trg_model_ls, models_to_delete=None, dry_run=False)
    predictor.leaderboard()

MNY_RESULT
                     model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  RandomForestEntr_BAG_L1    0.86303       3.007013  8.045981                3.007013           8.045981            1       True          1
REHO_RESULT
                     model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  RandomForestEntr_BAG_L1   0.869843       7.861004  24.131513                7.861004          24.131513            1       True          1
SCR_RESULT
                     model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  RandomForestEntr_BAG_L1    0.85767         3.1224  6.746012                  3.1224           6.746012            1       True          1
