In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read
# by later cells. Requires the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the project folder on the mounted Drive so
# that relative paths in later cells (e.g. 'rawdata.xlsx') resolve against it.
%cd /content/drive/MyDrive/Dyetec

In [None]:
import os
import pandas as pd
import numpy as np
import random

# Widen pandas' display limits so large frames are not truncated when shown.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 50)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Configs
SEED = 42  # shared seed: global RNGs, train/test split and the forest's random_state

# RandomForestRegressor hyperparameters
N_ESTIMATORS = 100
MAX_DEPTH = 20
CRITERION = 'squared_error'  # alternative: 'absolute_error'
# MAX_SAMPLES must be defined: the model cell passes max_samples=MAX_SAMPLES,
# so leaving it commented out raises NameError on a fresh kernel.
# None keeps scikit-learn's default (each tree draws X.shape[0] bootstrap
# samples); a float in (0.0, 1.0] draws that fraction instead, e.g. 0.1.
MAX_SAMPLES = None

In [None]:
def seed_everything(seed=42):
    """Seed Python's `random` module and NumPy's legacy global RNG with `seed`."""
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Seed the global RNGs once, using the notebook-wide SEED from the config cell.
seed_everything(SEED)

### Data Loading

In [None]:
# Load the raw sheet and clean it in a single chain (no in-place mutation):
#   - skiprows=2 skips the first two rows of the sheet
#   - the row with index label 0 is dropped and the index renumbered from 0
#   - the '태그' column is removed
#   - only the first 54 remaining columns are kept
df = (
    pd.read_excel('rawdata.xlsx', skiprows=2)
    .drop(index=0)
    .reset_index(drop=True)
    .drop(columns=['태그'])
    .iloc[:, :54]
)
df

### Train & Test Dataset

In [None]:
# The first three columns are the targets (split later into L, a, b);
# every remaining column is a feature.
y = df.iloc[:, :3]
X = df.iloc[:, 3:]
print(X.shape, y.shape)

In [None]:
# Hold out 20% of the rows as the test set; random_state=SEED makes the
# shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Sanity-check the shapes of both partitions.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

타겟변수 3개 -> 일단 하나씩 예측해보자.

In [None]:
# Split the three target columns into 1-D Series so that each one can be
# fitted as its own single-output regression.
y_train_L = y_train.iloc[:, 0]
y_train_a = y_train.iloc[:, 1]
y_train_b = y_train.iloc[:, 2]
print(y_train_L.shape, y_train_a.shape, y_train_b.shape)

# y_test_L, y_test_a, y_test_b = y_test.iloc[:,0], y_test.iloc[:,1], y_test.iloc[:,2]
# print(y_test_L.shape, y_test_a.shape, y_test_b.shape)

### EDA

In [None]:
# Summarize the training features: column dtypes and non-null counts.
X_train.info()

In [None]:
# Check for missing values: total NaN count over all cells of the train and
# test feature frames, displayed as a (train, test) tuple.
X_train.isna().sum().sum(), X_test.isna().sum().sum()

### Training

In [None]:
# One forest configuration driven entirely by the config-cell constants
# (MAX_SAMPLES included, so it must be defined there before this cell runs).
# The same `model` object is refit for each target in the cells below.
model = RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    criterion=CRITERION,
    max_samples=MAX_SAMPLES,
    warm_start=False,  # default: every fit() call trains a fresh forest
    random_state=SEED,
)

### L, a, b 따로 학습 및 예측

- 타겟변수 = L

In [None]:
# Fit the forest on the L* target only.
# NOTE: the printed score is R² on the training data itself (model.score is
# called with X_train), not a held-out estimate.
model.fit(X_train, y_train_L)
print(f'R-squared for L*: {round(model.score(X_train, y_train_L),4)}')

In [None]:
# Predict L* for the test features now: `model` is refit on a different
# target in a later cell, so the predictions must be captured here.
L_pred = model.predict(X_test)

- 타겟변수 = a

In [None]:
# Refit the same forest object on the a* target only.
# NOTE: the printed score is R² on the training data itself (model.score is
# called with X_train), not a held-out estimate.
model.fit(X_train, y_train_a)
print(f'R-squared for a*: {round(model.score(X_train, y_train_a),4)}')

In [None]:
# Predict a* for the test features now: `model` is refit on a different
# target in a later cell, so the predictions must be captured here.
a_pred = model.predict(X_test)

- 타겟변수 = b

In [None]:
# Refit the same forest object on the b* target only.
# NOTE: the printed score is R² on the training data itself (model.score is
# called with X_train), not a held-out estimate.
model.fit(X_train, y_train_b)
print(f'R-squared for b*: {round(model.score(X_train, y_train_b),4)}')

In [None]:
# Predict b* for the test features now: `model` is refit on all three
# targets in a later cell, so the predictions must be captured here.
b_pred = model.predict(X_test)

In [None]:
# Collect the three single-target predictions side by side, one column per
# target, in a frame with a default 0..n-1 index.
df_pred_sep = pd.DataFrame({
    'L_pred_sep': L_pred,
    'a_pred_sep': a_pred,
    'b_pred_sep': b_pred,
})

df_pred_sep

### L, a, b 한꺼번에 학습 및 예측

In [None]:
# Refit the same forest object on all three target columns at once
# (y_train is the 3-column frame, so this is a multi-output regression).
# NOTE: the printed score is R² on the training data itself, not a held-out
# estimate.
model.fit(X_train, y_train)
print(f'R-squared for L*,a*,b*: {round(model.score(X_train, y_train),4)}')

In [None]:
# Predict all three targets jointly for the test features and label the
# output columns in the same order as the target columns (L, a, b).
y_pred = model.predict(X_test)
df_pred = pd.DataFrame(y_pred, columns=['L_pred', 'a_pred', 'b_pred'])
df_pred

### 정답, Lab 따로, Lab 한꺼번에 결과 비교

In [None]:
# Show ground truth, per-target predictions and joint predictions side by side.
# y_test still carries the shuffled row labels from the split, so its index is
# reset to 0..n-1 to line up with the two prediction frames.
pd.concat([y_test.reset_index(drop=True), df_pred_sep, df_pred], axis=1)

### 모델 성능 평가
- Metric 뭘로 하지?? L,a,b의 MSE 총합??

In [None]:
# Test-set MSE of the joint (multi-output) model. With 2-D targets,
# mean_squared_error defaults to multioutput='uniform_average', i.e. the
# unweighted mean of the three per-target MSEs (not their sum).
mean_squared_error(y_test, y_pred)

In [None]:
# Test-set MSE of the three separately trained models, printed in the order
# L*, a*, b* and rounded to 4 decimals.
print(round(mean_squared_error(y_test.iloc[:,0], L_pred),4))
print(round(mean_squared_error(y_test.iloc[:,1], a_pred),4))
print(round(mean_squared_error(y_test.iloc[:,2], b_pred),4))

### 컬럼명 바꾸기

In [None]:
# cols = df.columns
# df.rename(columns={cols[0]:'L', cols[1]:'a', cols[2]:'b'}, inplace=True)

In [None]:
#  L,a,b = df.iloc[:,0], df.iloc[:,1], df.iloc[:,2]