In [None]:
# 파일이름 : model_learning.ipynb
# 코드설명 : AutoML 기반 고무 배합의 물성값 및 불량여부 예측 AI 모델 학습 및 결과 저장
# 입/출력 : 물성별(물성값, 불량여부) 데이터 세트 / 예측 대상별 학습된 모델들
# 유의 사항 : 
# 1. 기본물성과 가류/점도를 한 번에 학습
# 2. 물성값 회귀모델 학습: regression, 불량여부 분류모델 학습 : binary로 설정
# 3. autogluon(0.8.2)과 python(3.9.18) 버전을 맞추어야 함
# 최종수정 : 2023년 11월 27일
# 제 작 자 : 홍민성 (mshong@micube.co.kr), 맹영준 (myj6223@micube.co.kr)
# Copyright : MICUBE Solution, Inc.

# python 버전 : 3.9.18
# autogluon 버전 : 0.8.2

In [1]:
# 관련 라이브러리 로드
import os
import random
import time
import warnings

# Silence library warnings before the heavier third-party imports below
warnings.filterwarnings(action='ignore')

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 500)
pd.set_option('display.max_rows', 50)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, recall_score, accuracy_score

# autoML
from autogluon.tabular import TabularPredictor

# Pin the random number generators so that reruns reproduce the same results
seed = 42
# Exported for child processes; the hash seed of the already-running
# interpreter is not changed by setting this at runtime.
os.environ['PYTHONHASHSEED'] = f'{seed}'
np.random.seed(seed)   # NumPy legacy global RNG
random.seed(seed)      # Python stdlib RNG

In [2]:
# Symmetric MAPE (SMAPE)
def smape_cal(y_true, y_pred):
    """Return the symmetric mean absolute percentage error as a fraction.

    Each term is ``2 * |y_pred - y_true| / (|y_true| + |y_pred|)``.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers with the same shape
        Actual and predicted values. They are converted to NumPy arrays and
        compared position by position (pandas index alignment is not used).

    Returns
    -------
    float
        Mean of the per-sample terms; NaN when the input is empty.

    Notes
    -----
    When both the actual and the predicted value are 0 the term would be 0/0.
    Such a sample is an exact prediction, so it is counted as 0 error instead
    of producing NaN.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.size == 0:
        return np.nan

    numerator = 2 * np.abs(predicted - actual)
    denominator = np.abs(actual) + np.abs(predicted)
    # Divide only where the denominator is non-zero; 0/0 terms stay at 0.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(terms)

# MAPE computed only over samples whose actual value is non-zero
def mape_non_zero(y_true, y_pred):
    """Return the mean absolute percentage error, ignoring zero targets.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers with the same shape
        Actual and predicted values. They are converted to NumPy arrays so
        that plain lists work too and the comparison is strictly positional.

    Returns
    -------
    float
        Mean of ``|y_pred - y_true| / |y_true|`` over the samples where
        ``y_true != 0``; NaN when no such sample exists.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    mask = actual != 0
    if not mask.any():
        # Nothing to average: avoid the mean-of-empty-slice warning.
        return np.nan
    return np.mean(np.abs((predicted[mask] - actual[mask]) / actual[mask]))

## 1. AutoML (Autogluon) 학습 및 모델 저장

### 1) 물성 값 예측을 위한 회귀(regression) 모델 학습 및 저장

In [3]:
%%time
# Train and save one AutoGluon model per target column.
# 'regression' predicts property values, 'binary' predicts pass/fail labels.
problem_type_ = 'regression' # 'regression', 'binary'

# Prefix shared by the dataset files and the saved model directories
file_suffix='combined'

# Number of bagging folds (3-fold cross-validation)
nfold = 3

if problem_type_ == 'regression':
    # Property-value targets
    yCols=['HS','SG','TS','EB','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','MNY','SCR']
    # yCols=['REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','MNY','SCR']
    data_type_ = 'SFT_regr'
else:
    # Pass/fail (defect) targets
    yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT','MNY_RESULT','REHO_RESULT','SCR_RESULT']
    # yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'

for yCol in yCols:
    start = time.time()
    print(f'{yCol}!!!!')
    print('='*80)

    # Load the per-target dataset and hold out 20% of the rows for testing
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(train.drop(['ID', 'REAL_VAL','TYPE'], axis=1), train['REAL_VAL'], test_size=0.2, random_state=seed)
    # X_train.index holds index labels, so select by label; keep REAL_VAL
    # because AutoGluon expects the label column inside the training frame.
    train_data = train.loc[X_train.index].drop(['ID','TYPE'], axis=1)

    # Constructor arguments are collected in a dict because sample_weight is
    # only passed for binary classification.
    predictor_kwargs = dict(
        label='REAL_VAL',
        problem_type=problem_type_,  # binary, multiclass, regression, quantile
        eval_metric=None,            # None lets AutoGluon pick its default metric for the problem type
        path=f'{data_type_}/{file_suffix}_{yCol}_models',  # directory where the trained models are saved
    )
    if problem_type_ == 'binary':
        predictor_kwargs['sample_weight'] = 'balance_weight'

    # Train the model
    predictor = TabularPredictor(**predictor_kwargs).fit(
        train_data,
        presets='best_quality',     # other presets: 'high_quality', 'optimize_for_deployment',
                                    # 'medium_quality', 'interpretable', 'ignore_text'
        time_limit=300,
        verbosity=0,
        num_bag_folds=nfold,        # n-fold bagging
        # auto_stack=True,
        # num_gpus='auto',          # default : auto
    )

    # Predict on the held-out test set
    y_pred = predictor.predict(X_test)

    res_df = pd.DataFrame()
    res_df[yCol+'_real'] = y_test
    res_df[yCol+'_pred'] = y_pred

    # Evaluate on the held-out test set
    if problem_type_ == 'regression':
        mape = mean_absolute_percentage_error(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        zero_mape2 = mape_non_zero(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        smape = smape_cal(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        print(f"{yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape2:.4f}, smape: {smape:.4f}")
    else:
        recall = recall_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        accuracy = accuracy_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        print(f"{yCol} >> reall: {recall:.4f}, accuracy: {accuracy:.4f}")

    # Summary of the trained models (validation scores)
    predictor.leaderboard()
    print(f'Excution time for {yCol} : {time.time() - start}s')
    print('='*80,'\n')

HS!!!!
HS >> MAPE: 0.0243, zero_mape: 0.0243, smape: 0.0240
                     model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3  -2.491660      36.102399  264.592798                0.000628           0.295189            3       True         14
1     ExtraTreesMSE_BAG_L2  -2.497639      33.413387  208.217978                2.597632          20.944704            2       True         13
2   RandomForestMSE_BAG_L2  -2.538483      33.374247  240.486790                2.558492          53.213516            2       True         11
3          LightGBM_BAG_L2  -2.561137      30.945648  190.139390                0.129892           2.866115            2       True         10
4          CatBoost_BAG_L2  -2.609291      31.039458  206.125921                0.223703          18.852646            2       True         12
5        LightGBMXT_BAG_L2  -2.632496      30.969712  190.231765                0.

		[36mray::_ray_fit()[39m (pid=21879, ip=172.17.0.3)
  File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/autogluon/core/models/ensemble/fold_fitting_strategy.py", line 402, in _ray_fit
    fold_model.fit(X=X_fold, y=y_fold, X_val=X_val_fold, y_val=y_val_fold, time_limit=time_limit_fold, **resources, **kwargs_fold)
  File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/autogluon/core/models/abstract/abstract_model.py", line 829, in fit
    out = self._fit(**kwargs)
  File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/autogluon/tabular/models/fastainn/tabular_nn_fastai.py", line 327, in _fit
    self.model.fit_one_cycle(epochs, params["lr"], cbs=callbacks)
  File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/fastai/callback/schedule.py", line 119, in fit_one_cycle
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
  File "/root/anaconda3/envs/python3.9/lib/python3.9/site-pa

TS >> MAPE: 0.0742, zero_mape: 0.0742, smape: 0.0621
                     model   score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3  -19.938117      14.446964  279.590407                0.000621           0.286074            3       True         15
1     ExtraTreesMSE_BAG_L2  -20.041405      11.192259  200.809494                2.837357          20.916311            2       True         14
2          LightGBM_BAG_L2  -20.199971       8.466099  182.972090                0.111197           3.078907            2       True         11
3          CatBoost_BAG_L2  -20.207609       8.597562  198.437802                0.242660          18.544619            2       True         13
4        LightGBMXT_BAG_L2  -20.236856       8.489715  182.623733                0.134813           2.730550            2       True         10
5   RandomForestMSE_BAG_L2  -20.348482      11.255129  236.764497                2.

		[36mray::_ray_fit()[39m (pid=24581, ip=172.17.0.3)
  File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/autogluon/core/models/ensemble/fold_fitting_strategy.py", line 402, in _ray_fit
    fold_model.fit(X=X_fold, y=y_fold, X_val=X_val_fold, y_val=y_val_fold, time_limit=time_limit_fold, **resources, **kwargs_fold)
  File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/autogluon/core/models/abstract/abstract_model.py", line 829, in fit
    out = self._fit(**kwargs)
  File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/autogluon/tabular/models/fastainn/tabular_nn_fastai.py", line 327, in _fit
    self.model.fit_one_cycle(epochs, params["lr"], cbs=callbacks)
  File "/root/anaconda3/envs/python3.9/lib/python3.9/site-packages/fastai/callback/schedule.py", line 119, in fit_one_cycle
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
  File "/root/anaconda3/envs/python3.9/lib/python3.9/site-pa

EB >> MAPE: 0.0764, zero_mape: 0.0764, smape: 0.0709
                     model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3 -69.876117      14.669178  285.721379                0.000609           0.292038            3       True         14
1          LightGBM_BAG_L2 -70.653436       8.984130  184.497298                0.121230           3.190500            2       True         10
2     ExtraTreesMSE_BAG_L2 -70.799428      11.563234  201.718137                2.700334          20.411338            2       True         13
3          CatBoost_BAG_L2 -71.014563       9.097155  207.511535                0.234256          26.204737            2       True         12
4        LightGBMXT_BAG_L2 -71.174658       8.992180  184.320040                0.129281           3.013242            2       True          9
5   RandomForestMSE_BAG_L2 -71.200602      11.612748  235.622765                2.749848 

### 2) 불량여부 예측을 위한 이진분류(binary classification) 모델 학습 및 저장

In [3]:
%%time
# Train and save one AutoGluon model per target column.
# 'regression' predicts property values, 'binary' predicts pass/fail labels.
problem_type_ = 'binary' # 'regression', 'binary'

# Prefix shared by the dataset files and the saved model directories
file_suffix='combined'

# Number of bagging (cross-validation) folds
nfold = 3

if problem_type_ == 'regression':
    # Property-value targets
    yCols=['HS','SG','TS','EB','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','MNY','SCR']
    # yCols=['MNY','REHO_MIN','REHO_MAX','REHO_TS2','REHO_TC90','SCR']
    data_type_ = 'SFT_regr'
else:
    # Pass/fail (defect) targets
    yCols=['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT','MNY_RESULT','REHO_RESULT','SCR_RESULT']
    # yCols=['MNY_RESULT','REHO_RESULT','SCR_RESULT']
    data_type_ = 'SFT_clas'

for yCol in yCols:
    start = time.time()
    print(f'{yCol}!!!!')
    print('='*80)

    # Load the per-target dataset and hold out 20% of the rows for testing
    train = pd.read_csv(f'./final_dataset/{file_suffix}_data_{yCol}_fin.csv')
    X_train, X_test, y_train, y_test = train_test_split(train.drop(['ID', 'REAL_VAL','TYPE'], axis=1), train['REAL_VAL'], test_size=0.2, random_state=seed)
    # X_train.index holds index labels, so select by label; keep REAL_VAL
    # because AutoGluon expects the label column inside the training frame.
    train_data = train.loc[X_train.index].drop(['ID','TYPE'], axis=1)

    # Constructor arguments are collected in a dict because sample_weight is
    # only passed for binary classification.
    predictor_kwargs = dict(
        label='REAL_VAL',
        problem_type=problem_type_,  # binary, multiclass, regression, quantile
        eval_metric=None,            # None lets AutoGluon pick its default metric for the problem type
        path=f'{data_type_}/{file_suffix}_{yCol}_models',  # directory where the trained models are saved
    )
    if problem_type_ == 'binary':
        predictor_kwargs['sample_weight'] = 'balance_weight'

    # Train the model
    predictor = TabularPredictor(**predictor_kwargs).fit(
        train_data,
        presets='best_quality',     # other presets: 'high_quality', 'optimize_for_deployment',
                                    # 'medium_quality', 'interpretable', 'ignore_text'
        time_limit=300,
        verbosity=0,
        num_bag_folds=nfold,        # n-fold bagging
        # auto_stack=True,
        # num_gpus='auto',          # default : auto
    )

    # Predict on the held-out test set
    y_pred = predictor.predict(X_test)

    res_df = pd.DataFrame()
    res_df[yCol+'_real'] = y_test
    res_df[yCol+'_pred'] = y_pred

    # Evaluate on the held-out test set
    if problem_type_ == 'regression':
        mape = mean_absolute_percentage_error(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        zero_mape2 = mape_non_zero(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        smape = smape_cal(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        print(f"{yCol} >> MAPE: {mape:.4f}, zero_mape: {zero_mape2:.4f}, smape: {smape:.4f}")
    else:
        recall = recall_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        accuracy = accuracy_score(res_df[yCol+'_real'], res_df[yCol+'_pred'])
        print(f"{yCol} >> reall: {recall:.4f}, accuracy: {accuracy:.4f}")

    # Summary of the trained models (validation scores)
    predictor.leaderboard()
    print(f'Excution time for {yCol} : {time.time() - start}s')
    print('='*80,'\n')

HS_RESULT!!!!
HS_RESULT >> reall: 0.9936, accuracy: 0.9726
                      model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2   0.970842      10.400176  223.369455                0.030001           4.778485            2       True         14
1   RandomForestEntr_BAG_L1   0.970028       1.674091    2.389398                1.674091           2.389398            1       True          6
2   RandomForestGini_BAG_L1   0.969458       1.750335    2.279443                1.750335           2.279443            1       True          5
3     KNeighborsUnif_BAG_L1   0.968806       0.935827    0.090096                0.935827           0.090096            1       True          1
4     KNeighborsDist_BAG_L1   0.968439       0.925845    0.087107                0.925845           0.087107            1       True          2
5     ExtraTreesEntr_BAG_L1   0.968236       1.651319    1.546919            