In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the project folder on the mounted Drive; relative paths used later
# (e.g. 'rawdata.xlsx') resolve against this directory.
%cd /content/drive/MyDrive/Dyetec

In [None]:
import os
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

# Let pandas render up to 100 rows and 50 columns before truncating the display.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Configs: values a reader may want to tune, collected in one place.
SEED = 42  # seed handed to seed_everything() and to train_test_split()

# Random-forest hyperparameters (named after the RandomForestRegressor arguments).
N_ESTIMATORS = 200
MAX_DEPTH = 20
CRITERION = 'squared_error'  # 'absolute_error'
# MAX_SAMPLES = 0.1  # range (0, 1.0)

In [None]:
def seed_everything(seed=42):
    """Seed the global RNGs of `random` and NumPy with the same value.

    Args:
        seed: Seed forwarded unchanged to `random.seed` and `np.random.seed`
            (default 42).
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Fix the global RNG state once, using the notebook-wide seed.
seed_everything(seed=SEED)

### Load Data

In [None]:
# Read the raw sheet and discard its first column.
df = pd.read_excel('rawdata.xlsx')
df = df.drop(columns=df.columns[0])

# The first two rows hold metadata rather than samples.
n_components = df.iloc[0, :].values  # number of components in each column
colnames = df.iloc[1, :].values      # real column names

# Promote the name row to the header, then drop index labels 1 and 2.
# Label 0 (the per-column component counts) is kept as the first row.
df.columns = colnames
df = df.drop(labels=[1, 2]).reset_index(drop=True)

# Round the count row to whole numbers (float -> int) and keep only the columns
# whose count exceeds 1; per the original note this removes 119 single-valued columns.
df.iloc[0, :] = list(map(round, df.iloc[0, :]))
df = df.loc[:, df.iloc[0, :].gt(1)]
df

### Split data based on number of component values

- `df_2val` : Extract 16 columns with **Two component** values

In [None]:
# Extract columns with two component values
# Keep the columns whose count row (row 0) equals 2, then remove that
# count row itself so only sample rows remain.
df_2val = (
    df.loc[:, df.iloc[0, :].eq(2)]
      .drop(index=0)
      .reset_index(drop=True)
)
# print(df_2val.columns.values)
df_2val

In [None]:
# Print the value distribution of every two-valued column, each followed by a blank line.
for col in df_2val.columns:
    print(df_2val[col].value_counts(), end='\n\n')

### Grouping 2-value features

In [None]:
# Feature grouping: bucket each two-valued column by the first keyword its name contains.
velocity, Dianix, Doros, Syno, others = [], [], [], [], []

# Order matters: a column goes to the first keyword that matches, else to `others`.
_keyword_buckets = (
    ('속도', velocity),
    ('Dianix', Dianix),
    ('Doros', Doros),
    ('Syno', Syno),
)

for col in df_2val.columns:
    _bucket = next((group for keyword, group in _keyword_buckets if keyword in col), others)
    _bucket.append(col)

velocity, Dianix, Doros, Syno, others

In [None]:
# Count how often each combination of values occurs across the velocity columns.
df_2val.value_counts(subset=velocity)

In [None]:
# Count how often each combination of values occurs across the Dianix columns.
df_2val.value_counts(subset=Dianix)

In [None]:
# Count how often each combination of values occurs across the Doros columns.
df_2val.value_counts(subset=Doros)

In [None]:
# Count how often each combination of values occurs across the Syno columns.
df_2val.value_counts(subset=Syno)

In [None]:
# Count how often each combination of values occurs across the remaining columns.
df_2val.value_counts(subset=others)

- 전처리_CH3COOH, 배합_빙초산 일치
- 배합_UVK-200 = 4 이면 항상 전처리_CH3COOH=0.5, 배합_빙초산=0.5, 제직_중량_생지=237, 후처리_NaOH=2.0 (총 54개)
- 제직_중량_생지 = 242 이면 항상 전처리_CH3COOH=1.0, 배합_빙초산=0.2, 배합_UVK-200=0, 후처리_NaOH=2.0 (총 601개)
- 후처리_NaOH=1.5 이면 항상 전처리_CH3COOH=0.5, 배합_빙초산=0.5, 배합_UVK-200=0, 제직_중량_생지=237 (총 198개)

## Last Week

### Train & Test Dataset

In [None]:
# Skip row 0 (the component-count row): the first three columns are the targets,
# everything after them is a feature.
y = df.iloc[1:, :3]
X = df.iloc[1:, 3:]
print(X.shape, y.shape)

In [None]:
# Hold out 20% of the samples for testing; the split is reproducible through SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=SEED,
)

print(*(part.shape for part in (X_train, y_train)))
print(*(part.shape for part in (X_test, y_test)))

타겟변수 3개 -> 일단 하나씩 예측해보자.

In [None]:
# Each single-target fit needs y as a 1-D array, so pull the three target columns apart.
y_train_L, y_train_a, y_train_b = (y_train.iloc[:, k] for k in range(3))
print(*(target.shape for target in (y_train_L, y_train_a, y_train_b)))

# y_test_L, y_test_a, y_test_b = y_test.iloc[:,0], y_test.iloc[:,1], y_test.iloc[:,2]
# print(y_test_L.shape, y_test_a.shape, y_test_b.shape)

### Null check

In [None]:
# Summarize the training features (dtypes and non-null counts per column).
X_train.info()

In [None]:
# Check for missing values: total NaN count in the train and test features.
tuple(part.isna().sum().sum() for part in (X_train, X_test))

### Training

In [None]:
# Shared random-forest regressor. Hyperparameters come from the config cell
# (N_ESTIMATORS, MAX_DEPTH, CRITERION, SEED) instead of repeating their values
# here, so editing the config actually changes the model.
# Note: the cells below refit this same estimator for each target in turn.
model = RandomForestRegressor(n_estimators=N_ESTIMATORS,
                              max_depth=MAX_DEPTH,
                              criterion=CRITERION,
                            #   max_samples=MAX_SAMPLES,
                            #   warm_start=False, # default
                              random_state=SEED)

### L, a, b 따로 학습 및 예측

- 타겟변수 = L

In [None]:
# Fit on the L* target only.
model.fit(X_train, y_train_L)
# This first score is computed on the training data, so it measures fit, not generalization.
print(f'R-squared for L*: {round(model.score(X_train, y_train_L),4)}')
# Held-out score on the test split (target column 0) for an honest estimate.
print(f'R-squared for L* (test): {round(model.score(X_test, y_test.iloc[:,0]),4)}')

In [None]:
# Predict L* on the test set while `model` still holds the L* fit;
# the same estimator is refit for a* in a later cell, so run the cells in order.
L_pred = model.predict(X_test)

- 타겟변수 = a

In [None]:
# Refit the shared estimator on the a* target only (this discards the previous fit).
model.fit(X_train, y_train_a)
# This first score is computed on the training data, so it measures fit, not generalization.
print(f'R-squared for a*: {round(model.score(X_train, y_train_a),4)}')
# Held-out score on the test split (target column 1) for an honest estimate.
print(f'R-squared for a* (test): {round(model.score(X_test, y_test.iloc[:,1]),4)}')

In [None]:
# Predict a* on the test set while `model` still holds the a* fit;
# the same estimator is refit for b* in a later cell, so run the cells in order.
a_pred = model.predict(X_test)

- 타겟변수 = b

In [None]:
# Refit the shared estimator on the b* target only (this discards the previous fit).
model.fit(X_train, y_train_b)
# This first score is computed on the training data, so it measures fit, not generalization.
print(f'R-squared for b*: {round(model.score(X_train, y_train_b),4)}')
# Held-out score on the test split (target column 2) for an honest estimate.
print(f'R-squared for b* (test): {round(model.score(X_test, y_test.iloc[:,2]),4)}')

In [None]:
# Predict b* on the test set while `model` still holds the b* fit;
# the same estimator is refit on all three targets in a later cell, so run the cells in order.
b_pred = model.predict(X_test)

In [None]:
# Collect the three separately-trained predictions side by side, one column per target.
df_pred_sep = pd.DataFrame({
    'L_pred_sep': L_pred,
    'a_pred_sep': a_pred,
    'b_pred_sep': b_pred,
})

df_pred_sep

### L, a, b 한꺼번에 학습 및 예측

In [None]:
# Refit the shared estimator on all three targets at once (multi-output regression).
model.fit(X_train, y_train)
# This first score is computed on the training data, so it measures fit, not generalization.
print(f'R-squared for L*,a*,b*: {round(model.score(X_train, y_train),4)}')
# Held-out score on the test split for an honest estimate.
print(f'R-squared for L*,a*,b* (test): {round(model.score(X_test, y_test),4)}')

In [None]:
# Joint prediction: one row per test sample, one column per target.
y_pred = model.predict(X_test)
df_pred = pd.DataFrame(
    data=y_pred,
    columns=['L_pred', 'a_pred', 'b_pred'],
)
df_pred

### 정답, Lab 따로, Lab 한꺼번에 결과 비교

In [None]:
# Line up ground truth, per-target predictions and joint predictions column-wise.
# y_test's index is reset so it aligns with the 0..n-1 index of the prediction frames.
pd.concat(
    [
        y_test.reset_index(drop=True),
        df_pred_sep,
        df_pred,
    ],
    axis='columns',
)

### 모델 성능 평가
- Metric 뭘로 하지?? L,a,b의 MSE 총합??

In [None]:
# MSE of the joint model; with several targets the per-target MSEs are averaged
# with equal weight (stated explicitly here; it is also the library default).
mean_squared_error(y_test, y_pred, multioutput='uniform_average')

In [None]:
# Test MSE of the separately-trained models, one line each in the order L*, a*, b*.
print(
    *(
        round(mean_squared_error(y_test.iloc[:, k], pred), 4)
        for k, pred in enumerate((L_pred, a_pred, b_pred))
    ),
    sep='\n',
)

### 변수 중요도

In [None]:
# Importances of whatever fit `model` currently holds (the joint L*,a*,b* fit
# when the notebook is run top to bottom).
importances = model.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.stack([tree.feature_importances_ for tree in model.estimators_]).std(axis=0)
# Feature positions ordered from most to least important.
indices = np.flip(np.argsort(importances))

In [None]:
N = 15  # number of top-ranked features to show

# Bar chart of the N largest importances, labelled with the feature names.
fig, ax = plt.subplots()
ax.set_title(f'Feature Importance Top {N}')
# plt.bar(range(N), importances[indices[:N]], yerr=std[indices[:N]], align='center')
ax.bar(range(N), importances[indices[:N]])
ax.set_xticks(range(N))
ax.set_xticklabels(X_train.columns[indices[:N]], rotation=90)
plt.show()

### 컬럼명 바꾸기

In [None]:
# cols = df.columns
# df.rename(columns={cols[0]:'L', cols[1]:'a', cols[2]:'b'}, inplace=True)

In [None]:
#  L,a,b = df.iloc[:,0], df.iloc[:,1], df.iloc[:,2]