In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the project folder on Drive so relative paths (e.g. 'rawdata.xlsx') resolve there.
%cd /content/drive/MyDrive/Dyetec

In [None]:
!pip install colormath

In [None]:
import os
import random
import pandas as pd
import numpy as np
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

import matplotlib.pyplot as plt
import seaborn as sns

# colormath 3.0.0 calls `numpy.asscalar`, which was removed in NumPy 1.23.
# Restore an equivalent shim (ndarray.item()) so the delta-E functions keep working
# on newer NumPy; on older NumPy the attribute exists and nothing is changed.
if not hasattr(np, 'asscalar'):
    np.asscalar = lambda a: a.item()

from colormath.color_objects import LabColor
from colormath.color_diff import delta_e_cie1976, delta_e_cie1994, delta_e_cie2000, delta_e_cmc

from sklearn.model_selection import train_test_split, KFold
# from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMClassifier, early_stopping, plot_importance
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score, recall_score
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # `plot_confusion_matrix` was removed in scikit-learn 1.2. Keep the name defined
    # so this cell still runs on newer versions; no visible cell calls it.
    plot_confusion_matrix = None


In [None]:
# Configs
# Seed passed to seed_everything(), train_test_split() and both RandomForestRegressor instances.
SEED = 42

# RandomForest hyperparameters (forwarded to RandomForestRegressor in the training cells)
N_ESTIMATORS = 100
# N_ESTIMATORS = 200
MAX_DEPTH = 20
CRITERION = 'squared_error'  # alternative tried: 'absolute_error'
MAX_SAMPLES = 1.0  # range (0, 1.0) -- NOTE(review): presumably the fraction of rows drawn per tree; confirm bounds in the sklearn docs

# LightGBM candidate settings -- all commented out, not used by any visible cell
# MAX_DEPTH = 100
# FEATURE_FRACTION = 0.6
# BAGGING_FRACTION = 0.6
# N_ESTIMATORS = 500

In [None]:
def seed_everything(seed=42):
    """Seed the global RNGs of Python's `random` module and of NumPy.

    Args:
        seed: Integer seed applied to both generators (default 42).
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed the global RNGs once, before any data splitting or model training.
seed_everything(SEED)

## Data Loading

In [None]:
# Build the working frame in a single chain instead of mutating it in place:
#   1. read the sheet, skipping the first two rows of the file
#   2. drop the row labelled 0 and renumber the index
#   3. drop the '태그' column
#   4. keep only the first 54 remaining columns
df = (
    pd.read_excel('rawdata.xlsx', skiprows=2)
    .drop(labels=0)
    .reset_index(drop=True)
    .drop(columns=['태그'])
    .iloc[:, :54]
)
df

### 컬럼명 변경

In [None]:
# Mapping from the Korean column headers in the spreadsheet to short English names.
# Columns that do not appear here keep their original header.
# NOTE(review): 'weaving_dough' -- '생지' here most likely means greige (unfinished)
# fabric rather than "dough"; the name is kept unchanged for compatibility.
kr2en = {'Lab 후가공 후 검사_L*': 'L*',
         'Lab 후가공 후 검사_a*': 'a*',
         'Lab 후가공 후 검사_b*': 'b*',
         '전처리_CH3COOH': 'prep_CH3COOH',
         '제직_중량_생지': 'weaving_dough',
         '배합_Sera Fast P-UVC': 'comb_Sera Fast',
         '배합_UVK-400 H/C': 'comb_UVK400',
         '배합_Fadex® TS liq': 'comb_Fadex',
         '배합_빙초산': 'comb_acid',
         '배합_UVK-200': 'comb_UVK200',
         'Lab 염색 상승속도 #1': 'dye_up_rate1',
         'Lab 염색 상승속도 #2': 'dye_up_rate2',
         'Lab 염색 상승속도 #3': 'dye_up_rate3',
         'Lab 염색 상승온도 #3': 'dye_up_temp3',
         'Lab 염색 상승온도 #3 유지시간': 'dye_up_temp3_hold',
         'Lab 염색 하강속도 #1': 'dye_down_rate1',
         '후처리_Sera Con M-FAS': 'after_Sera',
         '후처리_NaOH': 'after_NaOH'
}

# One rename call handles every mapped column; keys that are not present in
# df.columns are ignored by default, so no per-column membership check is needed.
# Rebinding (instead of inplace=True) also avoids mutating a frame derived from .iloc.
df = df.rename(columns=kr2en)

df.columns

### Train & Test Split

In [None]:
# Positional split: the first three columns become the targets, the rest the features.
# NOTE(review): assumes the first three columns are L*, a*, b* -- confirm against df.columns.
y = df.iloc[:, :3]
X = df.iloc[:, 3:]
print(X.shape, y.shape)

In [None]:
# Hold out 20% of the rows as the test set; random_state=SEED fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Sanity check: row counts of features and targets must match within each split.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

## Model Training & Prediction

In [None]:
# Baseline random forest configured from the constants in the Configs cell.
model = RandomForestRegressor(n_estimators=N_ESTIMATORS, 
                              max_depth=MAX_DEPTH,
                              criterion=CRITERION, # split criterion, set via CRITERION in the Configs cell
                              max_samples=MAX_SAMPLES,
                              warm_start=False, # default
                              random_state=SEED)

In [None]:
# Fit on the training split; y_train holds three target columns, so all are learned jointly.
model.fit(X_train, y_train)

In [None]:
# Predict the three targets for the test split and wrap them in a frame.
# df_pred gets a fresh 0..n-1 index, so it aligns with y_test only after y_test's index is reset.
y_pred = model.predict(X_test)
df_pred = pd.DataFrame(y_pred, columns=['L_pred', 'a_pred', 'b_pred'])
df_pred

In [None]:
# Same as the test predictions, but on the training split (used to compare train vs. test error).
y_train_pred = model.predict(X_train)
df_train_pred = pd.DataFrame(y_train_pred, columns=['L_train_pred', 'a_train_pred', 'b_train_pred'])
df_train_pred

## Performance Evaluation
- Lab 한꺼번에 학습 및 예측한 결과만!!
- R-squared, MSE
- CMC 2:1, CMC 1:1 색차값

### R-squared, MSE

In [None]:
# R-squared on the training split via model.score, rounded to 6 decimals.
print(f'[Train] R-squared for L,a,b: {round(model.score(X_train, y_train),6)}')
# print(f'[Test]  R-squared for L,a,b: {round(model.score(X_test, y_test),6)}')  # Is the test R-squared a meaningful value here??

# MSE over the three target columns for train and test, rounded to 4 decimals.
print(f'\n[Train] MSE for L,a,b: {round(mean_squared_error(y_train, y_train_pred), 4)}')
print(f'[Test]  MSE for L,a,b: {round(mean_squared_error(y_test, y_pred),4)}') # The train/test MSE gap is fairly large. TODO: reduce overfitting!

Test 데이터셋의 MSE가 Train 데이터셋 MSE의 약 2배 --> 오버피팅 발생!!

### CMC 색차값, ΔE 계산 (Test)

In [None]:
# CMC colour difference between each measured test colour and its prediction.
delta_E_21, delta_E_11 = [], []

# Walk the measured rows and the predicted rows in lockstep (same row order).
for (L_test, a_test, b_test), (L_pred, a_pred, b_pred) in zip(y_test.to_numpy(), y_pred):
    target_color = LabColor(L_test, a_test, b_test)
    pred_color = LabColor(L_pred, a_pred, b_pred)

    # l:c = 2:1 weighting
    delta_E_21.append(delta_e_cmc(target_color, pred_color, pl=2, pc=1))

    # l:c = 1:1 weighting
    delta_E_11.append(delta_e_cmc(target_color, pred_color, pl=1, pc=1))

assert len(delta_E_21) == len(delta_E_11)

In [None]:
# For each CMC variant, store the delta-E values and a flag marking rows below the threshold of 1.
for delta_col, flag_col, delta_values in (
    ('Delta_E (CMC2:1)', 'Below 1 (CMC2:1)', delta_E_21),
    ('Delta_E (CMC1:1)', 'Below 1 (CMC1:1)', delta_E_11),
):
    df_pred[delta_col] = delta_values
    df_pred[flag_col] = (df_pred[delta_col] < 1)

In [None]:
# Show measured targets next to predictions; y_test's index is reset so rows line up with df_pred.
pd.concat([y_test.reset_index(drop=True), df_pred], axis=1)

In [None]:
# Count and share of test rows whose delta-E is below 1, for each CMC variant.
print(f"[Test] CMC(2:1) ΔE < 1 : {df_pred['Below 1 (CMC2:1)'].sum()} / {len(df_pred)} = {df_pred['Below 1 (CMC2:1)'].mean():.4f}")
print(f"[Test] CMC(1:1) ΔE < 1 : {df_pred['Below 1 (CMC1:1)'].sum()} / {len(df_pred)} = {df_pred['Below 1 (CMC1:1)'].mean():.4f}")

Test 데이터 중 목표치(ΔE < 1)를 만족하는 데이터는 약 29% 뿐이다. \
Train 데이터 중 목표치를 만족하는 데이터의 비율을 구하여 비교해보았다.

### CMC 색차값, ΔE 계산 (Train)

In [None]:
# CMC colour difference between each measured training colour and its prediction.
delta_E_21_train, delta_E_11_train = [], []

# Walk the measured rows and the predicted rows in lockstep (same row order).
for (L_train, a_train, b_train), (L_train_pred, a_train_pred, b_train_pred) in zip(y_train.to_numpy(), y_train_pred):
    target_color = LabColor(L_train, a_train, b_train)
    pred_color = LabColor(L_train_pred, a_train_pred, b_train_pred)

    # l:c = 2:1 weighting
    delta_E_21_train.append(delta_e_cmc(target_color, pred_color, pl=2, pc=1))

    # l:c = 1:1 weighting
    delta_E_11_train.append(delta_e_cmc(target_color, pred_color, pl=1, pc=1))

assert len(delta_E_21_train) == len(delta_E_11_train)

In [None]:
# For each CMC variant, store the delta-E values and a flag marking rows below the threshold of 1.
for delta_col, flag_col, delta_values in (
    ('Delta_E (CMC2:1)', 'Below 1 (CMC2:1)', delta_E_21_train),
    ('Delta_E (CMC1:1)', 'Below 1 (CMC1:1)', delta_E_11_train),
):
    df_train_pred[delta_col] = delta_values
    df_train_pred[flag_col] = (df_train_pred[delta_col] < 1)

# Show measured targets next to predictions; y_train's index is reset so rows line up.
pd.concat([y_train.reset_index(drop=True), df_train_pred], axis=1)

In [None]:
# Count and share of training rows whose delta-E is below 1, for each CMC variant.
print(f"[Train] CMC(2:1) ΔE < 1 : {df_train_pred['Below 1 (CMC2:1)'].sum()} / {len(df_train_pred)} = {df_train_pred['Below 1 (CMC2:1)'].mean():.4f}")
print(f"[Train] CMC(1:1) ΔE < 1 : {df_train_pred['Below 1 (CMC1:1)'].sum()} / {len(df_train_pred)} = {df_train_pred['Below 1 (CMC1:1)'].mean():.4f}")

약 50% 정도의 학습 데이터만 목표치를 만족하므로 모델의 성능 자체가 낮은 것으로 보인다. \
**언더피팅**을 개선하기 위해 다른 모델을 적용해보면 좋을 것 같다.

### Residual Plot

In [None]:
# Set the seaborn theme BEFORE creating the figure: the theme only changes defaults for
# axes created afterwards, so calling it after plt.subplots() left this figure unstyled.
sns.set_theme(style='darkgrid')

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
fig.subplots_adjust(wspace=0.3)
fig.suptitle('[Test] Residual plot', fontsize=15)

# One panel per CMC variant: per-sample delta-E against the row index of df_pred,
# with a red reference line at the delta-E = 1 threshold used for the 'Below 1' flags.
for ax, delta_col in zip(axes, ('Delta_E (CMC2:1)', 'Delta_E (CMC1:1)')):
    sns.scatterplot(ax=ax, x=df_pred.index, y=df_pred[delta_col], size=1, legend=False)
    ax.axhline(1.0, 0, 1, color='red', linewidth=1.5)
    ax.set_xlabel('ID')
plt.show()

In [None]:
# Set the seaborn theme BEFORE creating the figure: the theme only changes defaults for
# axes created afterwards, so the figure is styled even when this cell is run on its own.
sns.set_theme(style='darkgrid')

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
fig.subplots_adjust(wspace=0.3)
fig.suptitle('[Train] Residual plot', fontsize=15)

# One panel per CMC variant: per-sample delta-E against the row index of df_train_pred,
# with a red reference line at the delta-E = 1 threshold used for the 'Below 1' flags.
for ax, delta_col in zip(axes, ('Delta_E (CMC2:1)', 'Delta_E (CMC1:1)')):
    sns.scatterplot(ax=ax, x=df_train_pred.index, y=df_train_pred[delta_col], size=1, legend=False)
    ax.axhline(1.0, 0, 1, color='red', linewidth=1.5)
    ax.set_xlabel('ID')
plt.show()

## Feature Selection

In [None]:
# Feature Importance
topN = 15

importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure()
plt.title(f'Feature Importance Top {topN}')
plt.bar(range(topN), importances[indices[:topN]])
plt.xticks(range(topN), X_train.columns[indices[:topN]], rotation=90)
plt.show()

### 중요도 낮은 피쳐 제거

In [None]:
# 중요도 하위 10개 피쳐 제거
bottomN = 10
select = list(set(X_train.columns.values) - set(X_train.columns[indices[-bottomN:]].values))

### 모델 재학습

In [None]:
# Second forest with the same hyperparameters as `model`, to be trained on the reduced feature set.
model_select = RandomForestRegressor(n_estimators=N_ESTIMATORS, 
                              max_depth=MAX_DEPTH,
                              criterion=CRITERION, # split criterion, set via CRITERION in the Configs cell
                              max_samples=MAX_SAMPLES,
                              warm_start=False, # default
                              random_state=SEED)

In [None]:
# Retrain using only the columns kept in `select`.
model_select.fit(X_train[select], y_train)

In [None]:
# Test-split predictions of the reduced-feature model.
y_pred_select = model_select.predict(X_test[select])
df_pred_select = pd.DataFrame(y_pred_select, columns=['L_pred', 'a_pred', 'b_pred'])

In [None]:
# Training-split predictions of the reduced-feature model.
y_train_pred_select = model_select.predict(X_train[select])
df_train_pred_select = pd.DataFrame(y_train_pred_select, columns=['L_train_pred', 'a_train_pred', 'b_train_pred'])

In [None]:
# Same metrics as for the baseline model, now for the reduced-feature model.
print(f'[Train] R-squared for L,a,b: {round(model_select.score(X_train[select], y_train),6)}')

print(f'\n[Train] MSE for L,a,b: {round(mean_squared_error(y_train, y_train_pred_select), 4)}')
print(f'[Test]  MSE for L,a,b: {round(mean_squared_error(y_test, y_pred_select),4)}') # The train/test MSE gap is fairly large. TODO: reduce overfitting!

Train, Test MSE 격차 미미하게 감소.... 큰 효과는 없다. \
차원 축소의 영향이 없는 건지 vs 차원을 더 많이 축소해야 할지??