In [8]:
# 파일이름 : data_integration.ipynb
# 코드설명 : 예측 대상별 데이터 통합 및 모델 학습 데이터 세트 생성
# 입/출력 : LAB, MES CMB, MES FMB 데이터 세트 / 물성별 데이터 세트 (17개)
# 유의 사항 : 무한대 값 제외
# 최종수정 : 2023년 11월 20일
# 제 작 자 : 홍민성 (mshong@micube.co.kr), 맹영준 (myj6223@micube.co.kr)
# Copyright : MICUBE Solution, Inc.

In [1]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 500)
pd.set_option('display.max_rows', 50)

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(15, 5)})
%matplotlib inline

import warnings
warnings.filterwarnings(action='ignore')

## 1. 불량여부의 데이터 불균형 체크

In [7]:
# Classification targets whose label balance is inspected (CMB + FMB files together).
mes_clas_yCols = ['HS_RESULT','SG_RESULT','TS_RESULT','EB_RESULT','MNY_RESULT', 'REHO_RESULT', 'SCR_RESULT']

balance_rows = []
for mes_yCol in mes_clas_yCols:
    # Only the label column is needed for counting, so skip parsing every other column.
    cmb_labels = pd.read_csv(f'./tmp_dataset/mes_clas_cmb_data_{mes_yCol}_fin.csv', usecols=[mes_yCol])[mes_yCol]
    fmb_labels = pd.read_csv(f'./tmp_dataset/mes_clas_fmb_data_{mes_yCol}_fin.csv', usecols=[mes_yCol])[mes_yCol]

    total_cnt = len(cmb_labels) + len(fmb_labels)
    # TRUE# counts rows whose label equals 1; everything else is reported as FALSE#.
    true_cnt = int((cmb_labels == 1).sum() + (fmb_labels == 1).sum())
    false_cnt = total_cnt - true_cnt

    balance_rows.append([mes_yCol, true_cnt, false_cnt, total_cnt])

# Build the summary once instead of growing it row by row (keeps integer dtypes for the counts).
show_df = pd.DataFrame(balance_rows, columns=['TARGET','TRUE#','FALSE#','TOTAL#'])

show_df

Unnamed: 0,TARGET,TRUE#,FALSE#,TOTAL#
0,HS_RESULT,29725,971,30696
1,SG_RESULT,68458,1775,70233
2,TS_RESULT,26568,1439,28007
3,EB_RESULT,26568,1439,28007
4,MNY_RESULT,49339,10930,60269
5,REHO_RESULT,149780,45706,195486
6,SCR_RESULT,44257,8912,53169


## 2. 기본물성 데이터 통합

In [2]:
# Per-source input files are read from in_file_path; integrated sets are written to out_file_path.
in_file_path, out_file_path = './tmp_dataset/', './final_dataset/'

# Target names for this section. The lists are parallel: position i in each list
# refers to the same property (LAB column, MES regression column, MES classification column).
mes_regr_yCols = ['HS', 'SG', 'TS', 'EB']
lab_yCols = [f'insp_pps_{prop.lower()}_val' for prop in mes_regr_yCols]
mes_clas_yCols = [f'{prop}_RESULT' for prop in mes_regr_yCols]

### 1) 최종 데이터셋의 형태(변수명) 정의

In [3]:
# Unified column layout of every integrated data set (172 names, order matters):
# four fixed columns, the numbered groups RPM/TEMP/TIME/JUK (1-23) and
# JRCODE/PHR/PUTGB (1-25), and finally the label column REAL_VAL.
fin_cols = (
    ['ID', 'TYPE', 'INSP_TIME', 'INSP_TEMP']
    + [f'{prefix}{n}' for prefix in ('RPM', 'TEMP', 'TIME', 'JUK') for n in range(1, 24)]
    + [f'{prefix}{n}' for prefix in ('JRCODE', 'PHR', 'PUTGB') for n in range(1, 26)]
    + ['REAL_VAL']
)

### 2) regression data set 재구축

In [12]:
# Build one integrated regression data set per target (LAB + MES CMB + MES FMB).
# Each source file is renamed to the shared fin_cols layout, tagged with its origin in
# TYPE, stacked, stripped of rows with a missing/infinite label, and written to disk.
summary_rows = []

for i, mes_yCol in enumerate(mes_regr_yCols):
    lab_yCol = lab_yCols[i]  # lab_yCols and mes_regr_yCols are matched by position

    # Source-specific column names -> unified names.
    # Numbered columns are '<n>_<GROUP>' in the sources and '<GROUP><n>' in fin_cols;
    # the mappings cover 20 slots for LAB, 21 for CMB and 25 for FMB.
    lab_rename_dic = {
        'lab_pk': 'ID', lab_yCol: 'REAL_VAL',
        **{f'{n}_{grp}': f'{grp}{n}' for grp in ('JRCODE', 'PHR') for n in range(1, 21)},
    }
    mes_cmb_rename_dic = {
        'LOTNO': 'ID', mes_yCol: 'REAL_VAL',
        f'{mes_yCol}_TIME': 'INSP_TIME', f'{mes_yCol}_TEMP': 'INSP_TEMP',
        **{f'{n}_{grp}': f'{grp}{n}' for grp in ('JRCODE', 'PHR', 'PUTGB') for n in range(1, 22)},
    }
    # INSP_TIME must come from '<target>_TIME' (same as CMB). Listing '<target>_TEMP' for
    # both INSP_TIME and INSP_TEMP makes the later duplicate key win and leaves INSP_TIME empty.
    mes_fmb_rename_dic = {
        'LOTNO': 'ID', mes_yCol: 'REAL_VAL',
        f'{mes_yCol}_TIME': 'INSP_TIME', f'{mes_yCol}_TEMP': 'INSP_TEMP',
        **{f'{n}_{grp}': f'{grp}{n}' for grp in ('JRCODE', 'PHR', 'PUTGB') for n in range(1, 26)},
    }

    # Load the per-source files for this target.
    lab_regr_df = pd.read_csv(in_file_path+f'lab_regr_data_{lab_yCol}_fin_noZero.csv')
    mes_cmb_df = pd.read_csv(in_file_path+f'mes_regr_cmb_data_{mes_yCol}_fin_noZero.csv')
    mes_fmb_df = pd.read_csv(in_file_path+f'mes_regr_fmb_data_{mes_yCol}_fin_noZero.csv')

    # Rename, tag the origin, then align to fin_cols in one step: reindex adds the missing
    # columns as NaN, discards columns outside fin_cols (e.g. RUBBER) and fixes the order.
    lab_regr_df = lab_regr_df.rename(columns=lab_rename_dic).assign(TYPE='LAB').reindex(columns=fin_cols)
    mes_cmb_df = mes_cmb_df.rename(columns=mes_cmb_rename_dic).assign(TYPE='mCMB').reindex(columns=fin_cols)
    mes_fmb_df = mes_fmb_df.rename(columns=mes_fmb_rename_dic).assign(TYPE='mFMB').reindex(columns=fin_cols)

    # Stack the three sources.
    combined_df = pd.concat([lab_regr_df, mes_cmb_df, mes_fmb_df])

    # Report the shapes before label cleaning.
    display(f'LAB: {lab_regr_df.shape}, CMB: {mes_cmb_df.shape}, FMB: {mes_fmb_df.shape}, TOTAL: {combined_df.shape}')

    # Remove rows whose label is NaN or +/-inf *before* saving, so the file on disk matches
    # the reported TOTAL#. A positional mask is used because the stacked frame keeps the
    # per-source row indices (duplicates), and a chained in-place replace on the column is
    # not guaranteed to modify the frame.
    label = combined_df['REAL_VAL'].replace([np.inf, -np.inf], np.nan)
    combined_df = combined_df.loc[label.notna().to_numpy()]

    # Save the integrated data set.
    combined_df.to_csv(out_file_path+f'combined_data_{mes_yCol}_fin.csv', index=False)

    summary_rows.append([mes_yCol, len(lab_regr_df), len(mes_cmb_df), len(mes_fmb_df), len(combined_df)])

# Per-target row counts: sources before cleaning, TOTAL# after cleaning.
show_df = pd.DataFrame(summary_rows, columns=['TARGET','LAB#','CMB#','FMB#','TOTAL#'])

show_df

'LAB: (7702, 172), CMB: (10144, 172), FMB: (21110, 172), TOTAL: (38956, 172)'

'LAB: (7671, 172), CMB: (41072, 172), FMB: (18500, 172), TOTAL: (67243, 172)'

'LAB: (7606, 172), CMB: (9851, 172), FMB: (19317, 172), TOTAL: (36774, 172)'

'LAB: (7623, 172), CMB: (9852, 172), FMB: (19318, 172), TOTAL: (36793, 172)'

Unnamed: 0,TARGET,LAB#,CMB#,FMB#,TOTAL#
0,HS,7702,10144,21110,38956
1,SG,7671,41072,18500,67243
2,TS,7606,9851,19317,36774
3,EB,7623,9852,19318,36793


### 3) classification data set 재구축

In [13]:
# Build one integrated classification data set per target (MES CMB + MES FMB).
# Each source file is renamed to the shared fin_cols layout, tagged with its origin in
# TYPE, stacked, stripped of rows with a missing/infinite label, and written to disk.
summary_rows = []

for mes_yCol in mes_clas_yCols:
    # Source-specific column names -> unified names.
    # Numbered columns are '<n>_<GROUP>' in the sources and '<GROUP><n>' in fin_cols;
    # the mappings cover 21 slots for CMB and 25 for FMB.
    # NOTE(review): with mes_yCol = '<X>_RESULT' the inspection keys below look for
    # '<X>_RESULT_TIME' / '<X>_RESULT_TEMP'; rename() silently ignores absent keys, so
    # confirm the source files really use those column names — TODO confirm.
    mes_cmb_rename_dic = {
        'LOTNO': 'ID', mes_yCol: 'REAL_VAL',
        f'{mes_yCol}_TIME': 'INSP_TIME', f'{mes_yCol}_TEMP': 'INSP_TEMP',
        **{f'{n}_{grp}': f'{grp}{n}' for grp in ('JRCODE', 'PHR', 'PUTGB') for n in range(1, 22)},
    }
    # INSP_TIME must come from '<target>_TIME' (same as CMB). Listing '<target>_TEMP' for
    # both INSP_TIME and INSP_TEMP makes the later duplicate key win and leaves INSP_TIME empty.
    mes_fmb_rename_dic = {
        'LOTNO': 'ID', mes_yCol: 'REAL_VAL',
        f'{mes_yCol}_TIME': 'INSP_TIME', f'{mes_yCol}_TEMP': 'INSP_TEMP',
        **{f'{n}_{grp}': f'{grp}{n}' for grp in ('JRCODE', 'PHR', 'PUTGB') for n in range(1, 26)},
    }

    # Load the per-source files for this target.
    mes_cmb_df = pd.read_csv(in_file_path+f'mes_clas_cmb_data_{mes_yCol}_fin.csv')
    mes_fmb_df = pd.read_csv(in_file_path+f'mes_clas_fmb_data_{mes_yCol}_fin.csv')

    # Rename, tag the origin, then align to fin_cols in one step: reindex adds the missing
    # columns as NaN, discards columns outside fin_cols (e.g. RUBBER) and fixes the order.
    mes_cmb_df = mes_cmb_df.rename(columns=mes_cmb_rename_dic).assign(TYPE='mCMB').reindex(columns=fin_cols)
    mes_fmb_df = mes_fmb_df.rename(columns=mes_fmb_rename_dic).assign(TYPE='mFMB').reindex(columns=fin_cols)

    # Stack the two sources.
    combined_df = pd.concat([mes_cmb_df, mes_fmb_df])

    # Report the shapes before label cleaning.
    display(f'CMB: {mes_cmb_df.shape}, FMB: {mes_fmb_df.shape}, TOTAL: {combined_df.shape}')

    # Remove rows whose label is NaN or +/-inf *before* saving, so the file on disk matches
    # the reported TOTAL#. A positional mask is used because the stacked frame keeps the
    # per-source row indices (duplicates), and a chained in-place replace on the column is
    # not guaranteed to modify the frame.
    label = combined_df['REAL_VAL'].replace([np.inf, -np.inf], np.nan)
    combined_df = combined_df.loc[label.notna().to_numpy()]

    # Save the integrated data set.
    combined_df.to_csv(out_file_path+f'combined_data_{mes_yCol}_fin.csv', index=False)

    summary_rows.append([mes_yCol, len(mes_cmb_df), len(mes_fmb_df), len(combined_df)])

# Per-target row counts: sources before cleaning, TOTAL# after cleaning.
show_df = pd.DataFrame(summary_rows, columns=['TARGET','CMB#','FMB#','TOTAL#'])

show_df

'CMB: (10001, 172), FMB: (20695, 172), TOTAL: (30696, 172)'

'CMB: (51708, 172), FMB: (18525, 172), TOTAL: (70233, 172)'

'CMB: (9538, 172), FMB: (18469, 172), TOTAL: (28007, 172)'

'CMB: (9538, 172), FMB: (18469, 172), TOTAL: (28007, 172)'

Unnamed: 0,TARGET,CMB#,FMB#,TOTAL#
0,HS_RESULT,10001,20695,30696
1,SG_RESULT,51708,18525,70233
2,TS_RESULT,9538,18469,28007
3,EB_RESULT,9538,18469,28007


## 3. 가류도/점성 데이터 통합

In [4]:
# Per-source input files are read from in_file_path; integrated sets are written to out_file_path.
in_file_path, out_file_path = './tmp_dataset/', './final_dataset/'

# Target names for this section. lab_yCols is shorter than mes_regr_yCols: only the
# leading MES regression targets have a LAB column at the same position.
lab_yCols = [f'insp_{name}_val' for name in ('mny_min', 'rheo_min', 'rheo_max')]
mes_regr_yCols = [
    'MNY',
    'REHO_MIN', 'REHO_MAX', 'REHO_TS2', 'REHO_TC90',
    'SCR',
]
mes_clas_yCols = [f'{group}_RESULT' for group in ('MNY', 'REHO', 'SCR')]

### 1) 최종 데이터셋의 형태(변수명) 정의

In [5]:
# Unified column layout of every integrated data set (172 names, order matters):
# four fixed columns, the numbered groups RPM/TEMP/TIME/JUK (1-23) and
# JRCODE/PHR/PUTGB (1-25), and finally the label column REAL_VAL.
fin_cols = (
    ['ID', 'TYPE', 'INSP_TIME', 'INSP_TEMP']
    + [f'{prefix}{n}' for prefix in ('RPM', 'TEMP', 'TIME', 'JUK') for n in range(1, 24)]
    + [f'{prefix}{n}' for prefix in ('JRCODE', 'PHR', 'PUTGB') for n in range(1, 26)]
    + ['REAL_VAL']
)

### 2) regression data set 재구축

In [6]:
# Build one integrated regression data set per target (LAB when available + MES CMB + MES FMB).
# Each source file is renamed to the shared fin_cols layout, tagged with its origin in
# TYPE, stacked, stripped of rows with a missing/infinite label, and written to disk.
summary_rows = []

for i, mes_yCol in enumerate(mes_regr_yCols):
    # LAB columns exist only for the first len(lab_yCols) targets (the lists are matched by
    # position); the remaining targets are built from MES data alone. An explicit bounds
    # check replaces catching the IndexError with a bare except.
    lab_yCol = lab_yCols[i] if i < len(lab_yCols) else None

    # Source-specific column names -> unified names.
    # Numbered columns are '<n>_<GROUP>' in the sources and '<GROUP><n>' in fin_cols;
    # the mappings cover 20 slots for LAB, 21 for CMB and 25 for FMB.
    mes_cmb_rename_dic = {
        'LOTNO': 'ID', mes_yCol: 'REAL_VAL',
        f'{mes_yCol}_TIME': 'INSP_TIME', f'{mes_yCol}_TEMP': 'INSP_TEMP',
        **{f'{n}_{grp}': f'{grp}{n}' for grp in ('JRCODE', 'PHR', 'PUTGB') for n in range(1, 22)},
    }
    # INSP_TIME must come from '<target>_TIME' (same as CMB). Listing '<target>_TEMP' for
    # both INSP_TIME and INSP_TEMP makes the later duplicate key win and leaves INSP_TIME empty.
    mes_fmb_rename_dic = {
        'LOTNO': 'ID', mes_yCol: 'REAL_VAL',
        f'{mes_yCol}_TIME': 'INSP_TIME', f'{mes_yCol}_TEMP': 'INSP_TEMP',
        **{f'{n}_{grp}': f'{grp}{n}' for grp in ('JRCODE', 'PHR', 'PUTGB') for n in range(1, 26)},
    }

    # Load the MES files, rename, tag the origin, then align to fin_cols in one step:
    # reindex adds the missing columns as NaN, discards columns outside fin_cols
    # (e.g. RUBBER) and fixes the order.
    mes_cmb_df = (
        pd.read_csv(in_file_path+f'mes_regr_cmb_data_{mes_yCol}_fin_noZero.csv')
        .rename(columns=mes_cmb_rename_dic).assign(TYPE='mCMB').reindex(columns=fin_cols)
    )
    mes_fmb_df = (
        pd.read_csv(in_file_path+f'mes_regr_fmb_data_{mes_yCol}_fin_noZero.csv')
        .rename(columns=mes_fmb_rename_dic).assign(TYPE='mFMB').reindex(columns=fin_cols)
    )

    if lab_yCol is not None:
        lab_rename_dic = {
            'lab_pk': 'ID', lab_yCol: 'REAL_VAL',
            **{f'{n}_{grp}': f'{grp}{n}' for grp in ('JRCODE', 'PHR') for n in range(1, 21)},
        }
        lab_regr_df = (
            pd.read_csv(in_file_path+f'lab_regr_data_{lab_yCol}_fin_noZero.csv')
            .rename(columns=lab_rename_dic).assign(TYPE='LAB').reindex(columns=fin_cols)
        )
        parts = [lab_regr_df, mes_cmb_df, mes_fmb_df]
        lab_shape, lab_cnt = lab_regr_df.shape, len(lab_regr_df)
    else:
        # No LAB file for this target: never touch a LAB frame left over from an
        # earlier iteration.
        parts = [mes_cmb_df, mes_fmb_df]
        lab_shape, lab_cnt = 'No', 0

    # Stack the available sources and report the shapes before label cleaning.
    combined_df = pd.concat(parts)
    display(f'{mes_yCol} >> LAB: {lab_shape}, CMB: {mes_cmb_df.shape}, FMB: {mes_fmb_df.shape}, TOTAL: {combined_df.shape}')

    # Remove rows whose label is NaN or +/-inf *before* saving, so the file on disk matches
    # the reported TOTAL#. A positional mask is used because the stacked frame keeps the
    # per-source row indices (duplicates), and a chained in-place replace on the column is
    # not guaranteed to modify the frame.
    label = combined_df['REAL_VAL'].replace([np.inf, -np.inf], np.nan)
    combined_df = combined_df.loc[label.notna().to_numpy()]

    # Save the integrated data set.
    combined_df.to_csv(out_file_path+f'combined_data_{mes_yCol}_fin.csv', index=False)

    summary_rows.append([mes_yCol, lab_cnt, len(mes_cmb_df), len(mes_fmb_df), len(combined_df)])

# Per-target row counts: sources before cleaning, TOTAL# after cleaning.
show_df = pd.DataFrame(summary_rows, columns=['TARGET','LAB#','CMB#','FMB#','TOTAL#'])

show_df

'MNY >> LAB: (787, 172), CMB: (58817, 172), FMB: (410, 172), TOTAL: (60014, 172)'

'REHO_MIN >> LAB: (7909, 172), CMB: (11236, 172), FMB: (183105, 172), TOTAL: (202250, 172)'

'REHO_MAX >> LAB: (7962, 172), CMB: (11251, 172), FMB: (183090, 172), TOTAL: (202303, 172)'

'REHO_TS2 >> LAB: No, CMB: (11225, 172), FMB: (183084, 172), TOTAL: (194309, 172)'

'REHO_TC90 >> LAB: No, CMB: (11228, 172), FMB: (183086, 172), TOTAL: (194314, 172)'

'SCR >> LAB: No, CMB: (10786, 172), FMB: (37288, 172), TOTAL: (48074, 172)'

Unnamed: 0,TARGET,LAB#,CMB#,FMB#,TOTAL#
0,MNY,787,58817,410,60014
1,REHO_MIN,7909,11236,183105,202250
2,REHO_MAX,7962,11251,183090,202303
3,REHO_TS2,0,11225,183084,194309
4,REHO_TC90,0,11228,183086,194314
5,SCR,0,10786,37288,48074


### 3) classification data set 재구축

In [7]:
# Build one integrated classification data set per target (MES CMB + MES FMB).
# Each source file is renamed to the shared fin_cols layout, tagged with its origin in
# TYPE, stacked, stripped of rows with a missing/infinite label, and written to disk.
summary_rows = []

for mes_yCol in mes_clas_yCols:
    # Source-specific column names -> unified names.
    # Numbered columns are '<n>_<GROUP>' in the sources and '<GROUP><n>' in fin_cols;
    # the mappings cover 21 slots for CMB and 25 for FMB.
    # NOTE(review): with mes_yCol = '<X>_RESULT' the inspection keys below look for
    # '<X>_RESULT_TIME' / '<X>_RESULT_TEMP'; rename() silently ignores absent keys, so
    # confirm the source files really use those column names — TODO confirm.
    mes_cmb_rename_dic = {
        'LOTNO': 'ID', mes_yCol: 'REAL_VAL',
        f'{mes_yCol}_TIME': 'INSP_TIME', f'{mes_yCol}_TEMP': 'INSP_TEMP',
        **{f'{n}_{grp}': f'{grp}{n}' for grp in ('JRCODE', 'PHR', 'PUTGB') for n in range(1, 22)},
    }
    # INSP_TIME must come from '<target>_TIME' (same as CMB). Listing '<target>_TEMP' for
    # both INSP_TIME and INSP_TEMP makes the later duplicate key win and leaves INSP_TIME empty.
    mes_fmb_rename_dic = {
        'LOTNO': 'ID', mes_yCol: 'REAL_VAL',
        f'{mes_yCol}_TIME': 'INSP_TIME', f'{mes_yCol}_TEMP': 'INSP_TEMP',
        **{f'{n}_{grp}': f'{grp}{n}' for grp in ('JRCODE', 'PHR', 'PUTGB') for n in range(1, 26)},
    }

    # Load the per-source files for this target.
    mes_cmb_df = pd.read_csv(in_file_path+f'mes_clas_cmb_data_{mes_yCol}_fin.csv')
    mes_fmb_df = pd.read_csv(in_file_path+f'mes_clas_fmb_data_{mes_yCol}_fin.csv')

    # Rename, tag the origin, then align to fin_cols in one step: reindex adds the missing
    # columns as NaN, discards columns outside fin_cols (e.g. RUBBER) and fixes the order.
    mes_cmb_df = mes_cmb_df.rename(columns=mes_cmb_rename_dic).assign(TYPE='mCMB').reindex(columns=fin_cols)
    mes_fmb_df = mes_fmb_df.rename(columns=mes_fmb_rename_dic).assign(TYPE='mFMB').reindex(columns=fin_cols)

    # Stack the two sources.
    combined_df = pd.concat([mes_cmb_df, mes_fmb_df])

    # Report the shapes before label cleaning.
    display(f'CMB: {mes_cmb_df.shape}, FMB: {mes_fmb_df.shape}, TOTAL: {combined_df.shape}')

    # Remove rows whose label is NaN or +/-inf *before* saving, so the file on disk matches
    # the reported TOTAL#. A positional mask is used because the stacked frame keeps the
    # per-source row indices (duplicates), and a chained in-place replace on the column is
    # not guaranteed to modify the frame.
    label = combined_df['REAL_VAL'].replace([np.inf, -np.inf], np.nan)
    combined_df = combined_df.loc[label.notna().to_numpy()]

    # Save the integrated data set.
    combined_df.to_csv(out_file_path+f'combined_data_{mes_yCol}_fin.csv', index=False)

    summary_rows.append([mes_yCol, len(mes_cmb_df), len(mes_fmb_df), len(combined_df)])

# Per-target row counts: sources before cleaning, TOTAL# after cleaning.
show_df = pd.DataFrame(summary_rows, columns=['TARGET','CMB#','FMB#','TOTAL#'])

show_df

'CMB: (59851, 172), FMB: (418, 172), TOTAL: (60269, 172)'

'CMB: (11493, 172), FMB: (183993, 172), TOTAL: (195486, 172)'

'CMB: (11067, 172), FMB: (42102, 172), TOTAL: (53169, 172)'

Unnamed: 0,TARGET,CMB#,FMB#,TOTAL#
0,MNY_RESULT,59851,418,60269
1,REHO_RESULT,11493,183993,195486
2,SCR_RESULT,11067,42102,53169
