In [None]:
# 파일이름 : model_evaluation_same_models.ipynb
# 코드설명 : 물성값 및 불량여부 예측 AI 모델의 성능 비교 및 최적(동일 기술) 모델 선정, 필요없는 모델 제거
# 입/출력 : 예측 대상별 학습된 모델들 / 예측 대상별 최종 선정 모델(동일 기술)
# 유의 사항 :
# 1. 경량화에서 실수로 제거 시 재학습에 오랜 시간이 소요되므로, 복사해두고 진행
# 2. autogluon(0.8.2)과 python(3.9.18) 버전을 맞추어야 함
# 3. 모델 저장 경로를 잘 수정하여 진행
# 최종수정 : 2023년 11월 23일
# 제 작 자 : 홍민성 (mshong@micube.co.kr), 맹영준 (myj6223@micube.co.kr)
# Copyright : MICUBE Solution, Inc.

# python 버전 : 3.9.18
# autogluon 버전 : 0.8.2

In [1]:
# 관련 라이브러리 로드
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 500)
pd.set_option('display.max_rows', 50)

import os
import random
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, recall_score, accuracy_score

# autoML
from autogluon.tabular import TabularPredictor

# Pin every random number source so repeated runs reproduce the same results
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
for _seeder in (random.seed, np.random.seed):
    _seeder(seed)

In [3]:
# SMAPE (symmetric mean absolute percentage error) helper
def smape_cal(y_true, y_pred):
    """Return the symmetric MAPE of ``y_pred`` against ``y_true``.

    Each element contributes ``2 * |pred - true| / (|true| + |pred|)`` and
    the mean over all elements is returned. An element whose true and
    predicted values are both zero is an exact prediction and contributes 0;
    the plain formula would evaluate 0/0 = NaN there and turn the whole mean
    into NaN.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, matched to ``y_true`` by
            position.

    Returns:
        The mean SMAPE (between 0 and 2 for finite inputs); NaN for empty
        input.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numer = 2 * np.abs(y_pred - y_true)
    denom = np.abs(y_true) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0 they were initialised with (true == pred == 0 -> zero error).
    ratio = np.divide(numer, denom, out=np.zeros_like(denom), where=denom != 0)
    return np.mean(ratio)

# MAPE helper that ignores samples whose true value is zero
def mape_non_zero(y_true, y_pred):
    """Return the MAPE computed only over samples with a non-zero true value.

    Samples where ``y_true == 0`` are dropped before averaging because their
    percentage error is undefined (division by zero).

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, matched to ``y_true`` by
            position.

    Returns:
        Mean of ``|pred - true| / |true|`` over the retained samples, or NaN
        when no sample has a non-zero true value.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mask = y_true != 0
    if not mask.any():
        # Nothing left to average; say so explicitly instead of relying on
        # np.mean([]) which warns before returning NaN.
        return np.nan
    return np.mean(np.abs((y_pred[mask] - y_true[mask]) / y_true[mask]))

## 1. AutoML (Autogluon) 회귀 모델 로드 및 모델 조정

### 1) 물성값 예측에 대한 회귀 모델 로드 및 leaderboard 출력

In [None]:
# Task selector - property value: 'regression', defect flag: 'binary'
problem_type_ = 'regression'  # 'regression', 'binary'

# Prefix shared by the dataset files and the model directories
file_suffix = 'combined'

if problem_type_ == 'regression':
    # Property-value targets
    # Alternative target set:
    # yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    yCols = ['MNY', 'REHO_MIN', 'REHO_MAX', 'REHO_TS2', 'REHO_TC90', 'SCR']
    data_type_ = 'SFT_regr'
else:
    # Defect-flag targets
    # Alternative target set:
    # yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    yCols = ['MNY_RESULT', 'REHO_RESULT', 'SCR_RESULT']
    data_type_ = 'SFT_clas'

for yCol in yCols:
    print(f'{yCol}!!!!')
    print('=' * 80)

    # Load the dataset and rebuild the 80/20 hold-out split with the fixed seed
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(
        train.drop(['ID', 'REAL_VAL', 'TYPE'], axis=1),
        train['REAL_VAL'],
        test_size=0.2,
        random_state=seed,
    )

    # Load the trained predictor for this target
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity=0)

    # Predict on the hold-out set (no model given, so the predictor's default is used)
    y_pred = predictor.predict(X_test)

    res_df = pd.DataFrame()
    res_df[yCol + '_real'] = y_test
    res_df[yCol + '_pred'] = y_pred

    # Score the predictions
    if problem_type_ == 'regression':
        mape = mean_absolute_percentage_error(res_df[yCol + '_real'], res_df[yCol + '_pred'])
        zero_mape = mape_non_zero(res_df[yCol + '_real'], res_df[yCol + '_pred'])
        smape = smape_cal(res_df[yCol + '_real'], res_df[yCol + '_pred'])
        print(f"{yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape:.4f}, smape: {smape:.4f}")
    else:
        recall = recall_score(res_df[yCol + '_real'], res_df[yCol + '_pred'])
        accuracy = accuracy_score(res_df[yCol + '_real'], res_df[yCol + '_pred'])
        print(f"{yCol} >> recall: {recall:.4f}, accuracy: {accuracy:.4f}")

    # Summary of every trained model for this target
    predictor.leaderboard()
    print('=' * 80, '\n')

MNY!!!!
MNY >> MAPE: 0.0946, zero_mape: 0.0946, smape: 0.0637
                    model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     WeightedEnsemble_L2  -6.379864      12.463250  142.153263                0.000805           0.429593            2       True          7
1     WeightedEnsemble_L3  -6.418823      16.148978  185.673658                0.000760           0.241785            3       True         10
2         LightGBM_BAG_L2  -6.443068      15.937715  180.487148                0.130140           3.616469            2       True          9
3       LightGBMXT_BAG_L2  -6.501688      16.018078  181.815404                0.210503           4.944726            2       True          8
4  RandomForestMSE_BAG_L1  -6.605629       3.912031   61.144779                3.912031          61.144779            1       True          5
5         LightGBM_BAG_L1  -6.637255       3.348607   36.976955                3.34860

### 2) 모델 선정

In [3]:
# Names of the models selected from the leaderboard
# Alternative candidate: ['LightGBMXT_BAG_L1']
trg_model_ls = ['LightGBM_BAG_L2']
print('선정된 모델 : {}'.format(trg_model_ls))

선정된 모델 : ['LightGBM_BAG_L2']


### 3) 개별 데이터의 최적 예측 모델(들) 선정 및 모델별 성능 요약
- 개별 데이터(LAB, MES-CMB, MES-FMB)

In [5]:
# Task selector - property value: 'regression', defect flag: 'binary'
problem_type_ = 'regression'  # 'regression', 'binary'

# Prefix shared by the dataset files and the model directories
file_suffix = 'combined'

if problem_type_ == 'regression':
    # Property-value targets
    # Alternative target set:
    # yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    yCols = ['MNY', 'REHO_MIN', 'REHO_MAX', 'REHO_TS2', 'REHO_TC90', 'SCR']
    data_type_ = 'SFT_regr'
    metric_cols = ['mape', 'mape0', 'smape']
else:
    # Defect-flag targets
    # Alternative target set:
    # yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    yCols = ['MNY_RESULT', 'REHO_RESULT', 'SCR_RESULT']
    data_type_ = 'SFT_clas'
    # '' is a blank filler so every model contributes three result slots in both branches
    metric_cols = ['recall', 'accuracy', '']

# One row per (data type, target); one group of metric columns per selected model
col_Nms = ['type', 'target_y'] + metric_cols * len(trg_model_ls) + ['데이터수']
fin_df = pd.DataFrame(columns=col_Nms)

for yCol in yCols:
    print(f'{yCol}!!!!')
    print('=' * 80)

    # Load the dataset and rebuild the 80/20 hold-out split with the fixed seed.
    # TYPE is kept on both sides so the hold-out rows can be grouped by data type.
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(
        train.drop(['ID', 'REAL_VAL'], axis=1),
        train[['REAL_VAL', 'TYPE']],
        test_size=0.2,
        random_state=seed,
    )

    # Load the trained predictor for this target
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity=0)

    types = sorted(X_test['TYPE'].unique().tolist())
    display(types)

    for type_ in types:
        # Hold-out rows of this data type; TYPE itself is not a model feature
        tmp_X_test = X_test[X_test['TYPE'] == type_].drop(columns=['TYPE'])
        tmp_y_test = y_test.loc[y_test['TYPE'] == type_, 'REAL_VAL']

        result_ls = []
        for model_name in trg_model_ls:
            # Predict on the hold-out rows with the selected model
            y_pred = predictor.predict(tmp_X_test, model=model_name)

            res_df = pd.DataFrame()
            res_df[yCol + '_real'] = tmp_y_test
            res_df[yCol + '_pred'] = y_pred

            # Score the predictions
            if problem_type_ == 'regression':
                mape = mean_absolute_percentage_error(res_df[yCol + '_real'], res_df[yCol + '_pred'])
                zero_mape = mape_non_zero(res_df[yCol + '_real'], res_df[yCol + '_pred'])
                smape = smape_cal(res_df[yCol + '_real'], res_df[yCol + '_pred'])
                print(f"{type_}: {yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape:.4f}, smape: {smape:.4f}")
                result_ls += [mape, zero_mape, smape]
            else:
                recall = recall_score(res_df[yCol + '_real'], res_df[yCol + '_pred'])
                accuracy = accuracy_score(res_df[yCol + '_real'], res_df[yCol + '_pred'])
                print(f"{type_}: {yCol} >> recall: {recall:.4f}, accuracy: {accuracy:.4f}")
                result_ls += [recall, accuracy, '']

        # Append one summary row; the last field is the number of hold-out rows scored
        fin_df.loc[len(fin_df)] = [type_, yCol] + result_ls + [len(tmp_X_test)]
        # predictor.leaderboard()

# Show the results ordered by data type
fin_df.sort_values(['type'])

MNY!!!!


['LAB', 'mCMB', 'mFMB']

LAB: MNY >> MAPE: 0.2259, zero_mape: 0.2259, smape: 0.1993
mCMB: MNY >> MAPE: 0.0976, zero_mape: 0.0976, smape: 0.0665
mFMB: MNY >> MAPE: 0.0595, zero_mape: 0.0595, smape: 0.0601
REHO_MIN!!!!


['LAB', 'mCMB', 'mFMB']

LAB: REHO_MIN >> MAPE: 0.3339, zero_mape: 0.3339, smape: 0.2744
mCMB: REHO_MIN >> MAPE: 0.2524, zero_mape: 0.2524, smape: 0.2203
mFMB: REHO_MIN >> MAPE: 0.1573, zero_mape: 0.1573, smape: 0.1428
REHO_MAX!!!!


['LAB', 'mCMB', 'mFMB']

LAB: REHO_MAX >> MAPE: 0.2269, zero_mape: 0.2269, smape: 0.1638
mCMB: REHO_MAX >> MAPE: 0.0787, zero_mape: 0.0787, smape: 0.0755
mFMB: REHO_MAX >> MAPE: 0.0789, zero_mape: 0.0789, smape: 0.0738
REHO_TS2!!!!


['mCMB', 'mFMB']

mCMB: REHO_TS2 >> MAPE: 0.0714, zero_mape: 0.0714, smape: 0.0703
mFMB: REHO_TS2 >> MAPE: 0.0525, zero_mape: 0.0525, smape: 0.0514
REHO_TC90!!!!


['mCMB', 'mFMB']

mCMB: REHO_TC90 >> MAPE: 0.0672, zero_mape: 0.0672, smape: 0.0669
mFMB: REHO_TC90 >> MAPE: 0.0485, zero_mape: 0.0485, smape: 0.0480
SCR!!!!


['mCMB', 'mFMB']

mCMB: SCR >> MAPE: 0.1483, zero_mape: 0.1483, smape: 0.1314
mFMB: SCR >> MAPE: 0.0716, zero_mape: 0.0716, smape: 0.0689


Unnamed: 0,type,target_y,mape,mape0,smape,데이터수
0,LAB,MNY,0.225863,0.225863,0.199317,157
3,LAB,REHO_MIN,0.333926,0.333926,0.274417,1551
6,LAB,REHO_MAX,0.2269,0.2269,0.163763,1559
1,mCMB,MNY,0.097623,0.097623,0.066475,11766
4,mCMB,REHO_MIN,0.252432,0.252432,0.220344,2261
7,mCMB,REHO_MAX,0.078738,0.078738,0.075508,2258
9,mCMB,REHO_TS2,0.071393,0.071393,0.070281,2242
11,mCMB,REHO_TC90,0.067186,0.067186,0.066912,2239
13,mCMB,SCR,0.148257,0.148257,0.131356,2169
2,mFMB,MNY,0.059478,0.059478,0.060118,80


### 4) 통합 데이터의 최적 예측 모델(들) 선정 및 모델별 성능 요약

In [5]:
# Task selector - property value: 'regression', defect flag: 'binary'
problem_type_ = 'regression'  # 'regression', 'binary'

# Prefix shared by the dataset files and the model directories
file_suffix = 'combined'

if problem_type_ == 'regression':
    # Property-value targets
    yCols = ['HS', 'SG', 'TS', 'EB']
    # Alternative target set:
    # yCols=['MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    data_type_ = 'SFT_regr'
    metric_cols = ['mape', 'mape0', 'smape']
else:
    # Defect-flag targets
    yCols = ['HS_RESULT', 'SG_RESULT', 'TS_RESULT', 'EB_RESULT']
    # Alternative target set:
    # yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'
    # '' is a blank filler so every model contributes three result slots in both branches
    metric_cols = ['recall', 'accuracy', '']

# One row per target; one group of metric columns per selected model
col_Nms = ['target_y'] + metric_cols * len(trg_model_ls) + ['데이터수']
fin_df = pd.DataFrame(columns=col_Nms)

for yCol in yCols:
    print(f'{yCol}!!!!')
    print('=' * 80)

    # Load the dataset and rebuild the 80/20 hold-out split with the fixed seed
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(
        train.drop(['ID', 'REAL_VAL', 'TYPE'], axis=1),
        train[['REAL_VAL']],
        test_size=0.2,
        random_state=seed,
    )

    # Load the trained predictor for this target
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity=0)

    result_ls = []
    for model_name in trg_model_ls:
        # Predict on the whole hold-out set with the selected model
        y_pred = predictor.predict(X_test, model=model_name)

        res_df = pd.DataFrame()
        res_df[yCol + '_real'] = y_test
        res_df[yCol + '_pred'] = y_pred

        # Score the predictions
        if problem_type_ == 'regression':
            mape = mean_absolute_percentage_error(res_df[yCol + '_real'], res_df[yCol + '_pred'])
            zero_mape = mape_non_zero(res_df[yCol + '_real'], res_df[yCol + '_pred'])
            smape = smape_cal(res_df[yCol + '_real'], res_df[yCol + '_pred'])
            print(f"{yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape:.4f}, smape: {smape:.4f}")
            result_ls += [mape, zero_mape, smape]
        else:
            recall = recall_score(res_df[yCol + '_real'], res_df[yCol + '_pred'])
            accuracy = accuracy_score(res_df[yCol + '_real'], res_df[yCol + '_pred'])
            print(f"{yCol} >> recall: {recall:.4f}, accuracy: {accuracy:.4f}")
            result_ls += [recall, accuracy, '']

    # Append one summary row; the last field is the number of hold-out rows scored
    fin_df.loc[len(fin_df)] = [yCol] + result_ls + [len(y_test)]
    # predictor.leaderboard()

# Show the summary table. This table has no 'type' column, so sorting by
# 'type' (as the per-type cell does) would raise KeyError here.
fin_df

HS!!!!
HS >> MAPE: 0.0245, zero_mape: 0.0245, smape: 0.0241
SG!!!!
SG >> MAPE: 0.0100, zero_mape: 0.0100, smape: 0.0101
TS!!!!
TS >> MAPE: 0.0748, zero_mape: 0.0748, smape: 0.0631
EB!!!!
EB >> MAPE: 0.0776, zero_mape: 0.0776, smape: 0.0719


Unnamed: 0,target_y,mape,mape0,smape,데이터수
0,HS,0.024478,0.024478,0.024132,7792
1,SG,0.009982,0.009982,0.010079,13449
2,TS,0.074823,0.074823,0.063081,7355
3,EB,0.077623,0.077623,0.071922,7359


### 5) 최종적으로 선정된 모델 외 삭제 (모델 로딩 시간 단축)
<p style="font-weight:bold"> <span style="color:red">** 주의 : 실수로 모델 제거시, 다시 학습해야하므로 복사 해두고 진행하기 바랍니다.</span> </p>

In [12]:
# Model(s) to keep when pruning the regression predictors
# Alternative candidate: ['LightGBM_BAG_L2']
trg_model_ls = ['LightGBMXT_BAG_L1']
print('선정된 모델 : {}'.format(trg_model_ls))

선정된 모델 : ['LightGBMXT_BAG_L1']


In [13]:
# Task selector - property value: 'regression', defect flag: 'binary'
problem_type_ = 'regression'  # 'regression', 'binary'

# Prefix shared by the model directories
file_suffix = 'combined'

if problem_type_ == 'regression':
    # Property-value targets
    # Alternative target set:
    # yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    yCols = ['MNY', 'REHO_MIN', 'REHO_MAX', 'REHO_TS2', 'REHO_TC90', 'SCR']
    data_type_ = 'SFT_regr'
else:
    # Defect-flag targets
    # Alternative target set (previously assigned and then immediately overwritten):
    # yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    yCols = ['MNY_RESULT', 'REHO_RESULT', 'SCR_RESULT']
    data_type_ = 'SFT_clas'

for yCol in yCols:
    print(yCol)
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity=0)
    # IRREVERSIBLE: with dry_run=False every model other than those in
    # trg_model_ls is actually deleted. Back up the model directory first.
    predictor.delete_models(models_to_keep=trg_model_ls, models_to_delete=None, dry_run=False)
    # Show what is left after pruning
    predictor.leaderboard()

MNY
               model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  LightGBMXT_BAG_L1  -6.640294       5.201806  43.601936                5.201806          43.601936            1       True          1
REHO_MIN
               model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  LightGBMXT_BAG_L1  -1.877291      10.619226  80.194013               10.619226          80.194013            1       True          1
REHO_MAX
               model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  LightGBMXT_BAG_L1  -3.111157      80.201546  125.472447               80.201546         125.472447            1       True          1
REHO_TS2
               model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  LightGBMXT_

## 2. AutoML (Autogluon) 분류 모델 로드 및 모델 조정

### 1) 불량여부 예측에 대한 분류 모델 로드 및 leaderboard 출력

In [6]:
# Task selector - property value: 'regression', defect flag: 'binary'
problem_type_ = 'binary'  # 'regression', 'binary'

# Prefix shared by the dataset files and the model directories
file_suffix = 'combined'

if problem_type_ == 'regression':
    # Property-value targets
    # Alternative target set (previously assigned and then immediately overwritten):
    # yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    yCols = ['MNY', 'REHO_MIN', 'REHO_MAX', 'REHO_TS2', 'REHO_TC90', 'SCR']
    data_type_ = 'SFT_regr'
else:
    # Defect-flag targets
    # Alternative target set (previously assigned and then immediately overwritten):
    # yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    yCols = ['MNY_RESULT', 'REHO_RESULT', 'SCR_RESULT']
    data_type_ = 'SFT_clas'

for yCol in yCols:
    print(f'{yCol}!!!!')
    print('=' * 80)

    # Load the dataset and rebuild the 80/20 hold-out split with the fixed seed
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(
        train.drop(['ID', 'REAL_VAL', 'TYPE'], axis=1),
        train['REAL_VAL'],
        test_size=0.2,
        random_state=seed,
    )

    # Load the trained predictor for this target
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity=0)

    # Predict on the hold-out set (no model given, so the predictor's default is used)
    y_pred = predictor.predict(X_test)

    res_df = pd.DataFrame()
    res_df[yCol + '_real'] = y_test
    res_df[yCol + '_pred'] = y_pred

    # Score the predictions
    if problem_type_ == 'regression':
        mape = mean_absolute_percentage_error(res_df[yCol + '_real'], res_df[yCol + '_pred'])
        zero_mape = mape_non_zero(res_df[yCol + '_real'], res_df[yCol + '_pred'])
        smape = smape_cal(res_df[yCol + '_real'], res_df[yCol + '_pred'])
        print(f"{yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape:.4f}, smape: {smape:.4f}")
    else:
        recall = recall_score(res_df[yCol + '_real'], res_df[yCol + '_pred'])
        accuracy = accuracy_score(res_df[yCol + '_real'], res_df[yCol + '_pred'])
        print(f"{yCol} >> recall: {recall:.4f}, accuracy: {accuracy:.4f}")

    # Summary of every trained model for this target
    predictor.leaderboard()
    print('=' * 80, '\n')

MNY_RESULT!!!!
MNY_RESULT >> reall: 0.9658, accuracy: 0.8625
                      model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2   0.866079      15.537955   30.924415                0.100324          13.252153            2       True         12
1     ExtraTreesGini_BAG_L1   0.865042       3.593026    3.111706                3.593026           3.111706            1       True          8
2     ExtraTreesEntr_BAG_L1   0.864752       3.561210    3.080195                3.561210           3.080195            1       True          9
3   RandomForestEntr_BAG_L1   0.864586       3.334895    5.696335                3.334895           5.696335            1       True          6
4   RandomForestGini_BAG_L1   0.864275       3.408038    5.652200                3.408038           5.652200            1       True          5
5         LightGBMXT_BAG_L1   0.858011       1.513970   18.055599          

### 2) 모델 선정

In [11]:
# Names of the selected models.
# Only one assignment may be active; the earlier notebook version assigned the
# property-value model first and then immediately overwrote it.
# Final model for basic property-value prediction:
# trg_model_ls = ['LightGBM_BAG_L2']
# Final model for basic property defect-flag prediction:
trg_model_ls = ['XGBoost_BAG_L1']
# Multi-model comparison candidates:
# trg_model_ls = ['LightGBMXT_BAG_L1', 'LightGBM_BAG_L1', 'KNeighborsDist_BAG_L1', 'KNeighborsUnif_BAG_L1']
print(f'선정된 모델 : {trg_model_ls}')

선정된 모델 : ['XGBoost_BAG_L1']


### 3) 개별 데이터의 최적 예측 모델(들) 선정 및 모델별 성능 요약
- 개별 데이터(LAB, MES-CMB, MES-FMB)

In [17]:
# Task selector - property value: 'regression', defect flag: 'binary'
problem_type_ = 'binary'  # 'regression', 'binary'

# Prefix shared by the dataset files and the model directories
file_suffix = 'combined'

if problem_type_ == 'regression':
    # Property-value targets
    # Alternative target set:
    # yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    yCols = ['MNY', 'REHO_MIN', 'REHO_MAX', 'REHO_TS2', 'REHO_TC90', 'SCR']
    data_type_ = 'SFT_regr'
    metric_cols = ['mape', 'mape0', 'smape']
else:
    # Defect-flag targets
    yCols = ['HS_RESULT', 'SG_RESULT', 'TS_RESULT', 'EB_RESULT']
    # Alternative target set:
    # yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'
    # '' is a blank filler so every model contributes three result slots in both branches
    metric_cols = ['recall', 'accuracy', '']

# One row per (data type, target); one group of metric columns per selected model
col_Nms = ['type', 'target_y'] + metric_cols * len(trg_model_ls) + ['데이터수']
fin_df = pd.DataFrame(columns=col_Nms)

for yCol in yCols:
    print(f'{yCol}!!!!')
    print('=' * 80)

    # Load the dataset and rebuild the 80/20 hold-out split with the fixed seed.
    # TYPE is kept on both sides so the hold-out rows can be grouped by data type.
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(
        train.drop(['ID', 'REAL_VAL'], axis=1),
        train[['REAL_VAL', 'TYPE']],
        test_size=0.2,
        random_state=seed,
    )

    # Load the trained predictor for this target
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity=0)

    types = sorted(X_test['TYPE'].unique().tolist())
    display(types)

    for type_ in types:
        # Hold-out rows of this data type; TYPE itself is not a model feature
        tmp_X_test = X_test[X_test['TYPE'] == type_].drop(columns=['TYPE'])
        tmp_y_test = y_test.loc[y_test['TYPE'] == type_, 'REAL_VAL']

        result_ls = []
        for model_name in trg_model_ls:
            # Predict on the hold-out rows with the selected model
            y_pred = predictor.predict(tmp_X_test, model=model_name)

            res_df = pd.DataFrame()
            res_df[yCol + '_real'] = tmp_y_test
            res_df[yCol + '_pred'] = y_pred

            # Score the predictions
            if problem_type_ == 'regression':
                mape = mean_absolute_percentage_error(res_df[yCol + '_real'], res_df[yCol + '_pred'])
                zero_mape = mape_non_zero(res_df[yCol + '_real'], res_df[yCol + '_pred'])
                smape = smape_cal(res_df[yCol + '_real'], res_df[yCol + '_pred'])
                print(f"{type_}: {yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape:.4f}, smape: {smape:.4f}")
                result_ls += [mape, zero_mape, smape]
            else:
                recall = recall_score(res_df[yCol + '_real'], res_df[yCol + '_pred'])
                accuracy = accuracy_score(res_df[yCol + '_real'], res_df[yCol + '_pred'])
                print(f"{type_}: {yCol} >> recall: {recall:.4f}, accuracy: {accuracy:.4f}")
                result_ls += [recall, accuracy, '']

        # Append one summary row; the last field is the number of hold-out rows scored
        fin_df.loc[len(fin_df)] = [type_, yCol] + result_ls + [len(tmp_X_test)]
        # predictor.leaderboard()

# Show the results ordered by data type
fin_df.sort_values(['type'])

MNY_RESULT!!!!


['mCMB', 'mFMB']

mCMB: MNY_RESULT >> reall: 0.9666, accuracy: 0.8587
mCMB: MNY_RESULT >> reall: 0.9718, accuracy: 0.8577
mCMB: MNY_RESULT >> reall: 0.9371, accuracy: 0.8333
mCMB: MNY_RESULT >> reall: 0.9367, accuracy: 0.8319
mFMB: MNY_RESULT >> reall: 1.0000, accuracy: 0.8434
mFMB: MNY_RESULT >> reall: 1.0000, accuracy: 0.8313
mFMB: MNY_RESULT >> reall: 0.9710, accuracy: 0.8313
mFMB: MNY_RESULT >> reall: 0.9565, accuracy: 0.8193
REHO_RESULT!!!!


['mCMB', 'mFMB']

mCMB: REHO_RESULT >> reall: 0.9617, accuracy: 0.8855
mCMB: REHO_RESULT >> reall: 0.9709, accuracy: 0.8864
mCMB: REHO_RESULT >> reall: 0.9444, accuracy: 0.8432
mCMB: REHO_RESULT >> reall: 0.9455, accuracy: 0.8388
mFMB: REHO_RESULT >> reall: 0.9408, accuracy: 0.8761
mFMB: REHO_RESULT >> reall: 0.9446, accuracy: 0.8775
mFMB: REHO_RESULT >> reall: 0.9125, accuracy: 0.8533
mFMB: REHO_RESULT >> reall: 0.9142, accuracy: 0.8542
SCR_RESULT!!!!


['mCMB', 'mFMB']

mCMB: SCR_RESULT >> reall: 0.9703, accuracy: 0.8855
mCMB: SCR_RESULT >> reall: 0.9703, accuracy: 0.8753
mCMB: SCR_RESULT >> reall: 0.9623, accuracy: 0.8651
mCMB: SCR_RESULT >> reall: 0.9649, accuracy: 0.8651
mFMB: SCR_RESULT >> reall: 0.9787, accuracy: 0.8530
mFMB: SCR_RESULT >> reall: 0.9812, accuracy: 0.8522
mFMB: SCR_RESULT >> reall: 0.9390, accuracy: 0.8435
mFMB: SCR_RESULT >> reall: 0.9436, accuracy: 0.8452


Unnamed: 0,type,target_y,recall,accuracy,Unnamed: 5,recall.1,accuracy.1,Unnamed: 8,recall.2,accuracy.2,Unnamed: 11,recall.3,accuracy.3,Unnamed: 14,데이터수
0,mCMB,MNY_RESULT,0.966612,0.858658,,0.971804,0.85774,,0.937093,0.833264,,0.936686,0.831927,,11971
2,mCMB,REHO_RESULT,0.961704,0.885513,,0.970874,0.886394,,0.944444,0.843241,,0.945523,0.838838,,2271
4,mCMB,SCR_RESULT,0.97026,0.885486,,0.97026,0.875277,,0.962294,0.865069,,0.96495,0.865069,,2253
1,mFMB,MNY_RESULT,1.0,0.843373,,1.0,0.831325,,0.971014,0.831325,,0.956522,0.819277,,83
3,mFMB,REHO_RESULT,0.940755,0.876096,,0.944553,0.877454,,0.912499,0.853314,,0.914203,0.85421,,36827
5,mFMB,SCR_RESULT,0.978717,0.853001,,0.981162,0.852166,,0.939028,0.843455,,0.94363,0.845245,,8381


### 4) 통합 데이터의 최적 예측 모델(들) 선정 및 모델별 성능 요약

In [12]:
# Task selector - property value: 'regression', defect flag: 'binary'
problem_type_ = 'binary'  # 'regression', 'binary'

# Prefix shared by the dataset files and the model directories
file_suffix = 'combined'

if problem_type_ == 'regression':
    # Property-value targets
    yCols = ['HS', 'SG', 'TS', 'EB']
    # Alternative target set:
    # yCols=['MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    data_type_ = 'SFT_regr'
    metric_cols = ['mape', 'mape0', 'smape']
else:
    # Defect-flag targets
    yCols = ['HS_RESULT', 'SG_RESULT', 'TS_RESULT', 'EB_RESULT']
    # Alternative target set:
    # yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'
    # '' is a blank filler so every model contributes three result slots in both branches
    metric_cols = ['recall', 'accuracy', '']

# One row per target; one group of metric columns per selected model
col_Nms = ['target_y'] + metric_cols * len(trg_model_ls) + ['데이터수']
fin_df = pd.DataFrame(columns=col_Nms)

for yCol in yCols:
    print(f'{yCol}!!!!')
    print('=' * 80)

    # Load the dataset and rebuild the 80/20 hold-out split with the fixed seed
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(
        train.drop(['ID', 'REAL_VAL', 'TYPE'], axis=1),
        train[['REAL_VAL']],
        test_size=0.2,
        random_state=seed,
    )

    # Load the trained predictor for this target
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity=0)

    result_ls = []
    for model_name in trg_model_ls:
        # Predict on the whole hold-out set with the selected model
        y_pred = predictor.predict(X_test, model=model_name)

        res_df = pd.DataFrame()
        res_df[yCol + '_real'] = y_test
        res_df[yCol + '_pred'] = y_pred

        # Score the predictions
        if problem_type_ == 'regression':
            mape = mean_absolute_percentage_error(res_df[yCol + '_real'], res_df[yCol + '_pred'])
            zero_mape = mape_non_zero(res_df[yCol + '_real'], res_df[yCol + '_pred'])
            smape = smape_cal(res_df[yCol + '_real'], res_df[yCol + '_pred'])
            print(f"{yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape:.4f}, smape: {smape:.4f}")
            result_ls += [mape, zero_mape, smape]
        else:
            recall = recall_score(res_df[yCol + '_real'], res_df[yCol + '_pred'])
            accuracy = accuracy_score(res_df[yCol + '_real'], res_df[yCol + '_pred'])
            print(f"{yCol} >> recall: {recall:.4f}, accuracy: {accuracy:.4f}")
            result_ls += [recall, accuracy, '']

    # Append one summary row; the last field is the number of hold-out rows scored
    fin_df.loc[len(fin_df)] = [yCol] + result_ls + [len(y_test)]
    # predictor.leaderboard()

# Show the summary table
fin_df

HS_RESULT!!!!
HS_RESULT >> reall: 0.9965, accuracy: 0.9741
SG_RESULT!!!!
SG_RESULT >> reall: 0.9995, accuracy: 0.9776
TS_RESULT!!!!
TS_RESULT >> reall: 0.9955, accuracy: 0.9589
EB_RESULT!!!!
EB_RESULT >> reall: 0.9957, accuracy: 0.9575


Unnamed: 0,target_y,recall,accuracy,Unnamed: 4,데이터수
0,HS_RESULT,0.996471,0.974104,,6140
1,SG_RESULT,0.999489,0.977575,,14047
2,TS_RESULT,0.995473,0.958943,,5602
3,EB_RESULT,0.995661,0.957515,,5602


### 5) 최종적으로 선정된 모델 외 삭제 (모델 로딩 시간 단축)
<p style="font-weight:bold"> <span style="color:red">** 주의 : 실수로 모델 제거시, 다시 학습해야하므로 복사 해두고 진행하기 바랍니다.</span> </p>

In [27]:
# Model(s) to keep when pruning the classification predictors.
# NOTE(review): both assignments below are commented out, so this cell only
# prints whatever trg_model_ls is currently bound to in the session. Confirm
# the printed value is the intended final model before running the deletion
# cell that follows, because that deletion cannot be undone.
# # trg_model_ls = ['XGBoost_BAG_L1']
# trg_model_ls = ['LightGBM_BAG_L1']
print(f'선정된 모델 : {trg_model_ls}')

선정된 모델 : ['LightGBM_BAG_L1']


In [31]:
# Task selector - property value: 'regression', defect flag: 'binary'
problem_type_ = 'binary' # 'regression', 'binary'

# Prefix shared by the model directories
file_suffix='combined'

if problem_type_ == 'regression':
    # Property-value targets
    yCols=['HS','SG','TS','EB']#'MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    # yCols=['MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    data_type_ = 'SFT_regr'
else:
    # Defect-flag targets
    yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT']#,'MNY_RESULT','REHO_RESULT','SCR_RESULT']
    # yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'

# For every target: load its predictor, prune it down to the models named in
# trg_model_ls, then print the leaderboard of what remains.
# WARNING: dry_run=False is passed, so the deletion is really carried out;
# keep a backup copy of the model directories before running this cell.
for yCol in yCols:
    print(yCol)
    predictor = TabularPredictor.load(path=f'{data_type_}/{file_suffix}_{yCol}_models', verbosity = 0)
    predictor.delete_models(models_to_keep=trg_model_ls, models_to_delete=None, dry_run=False)
    predictor.leaderboard()

MNY_RESULT
             model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  LightGBM_BAG_L1   0.856912       0.375461  6.541237                0.375461           6.541237            1       True          1
REHO_RESULT
             model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  LightGBM_BAG_L1   0.874089       9.690588  73.899503                9.690588          73.899503            1       True          1
SCR_RESULT
             model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  LightGBM_BAG_L1   0.859198       0.236465  5.013481                0.236465           5.013481            1       True          1
