<a href='https://nurilee.com/2020/04/03/lightgbm-definition-parameter-tuning/'>**관련 링크**</a>

### 1. Package Import

In [12]:
import pyarrow
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
import joblib

### 2. 데이터 전처리

##### (1) Importing the dataset

In [13]:
# Load the working dataset from CSV; the first CSV column is used as the
# row index and parsing is delegated to the pyarrow engine.
dataset = pd.read_csv(
    '../res/working_df.csv',
    encoding='utf-8',
    index_col=0,
    engine='pyarrow',
)
dataset

Unnamed: 0,Date,Time,Lot,pH,Temp,Voltage,sep
,,,,,,,
0,2021-09-06,16:29:54,1,2.15,43.15,19.74,0
1,2021-09-06,16:29:59,1,2.08,40.13,18.01,0
2,2021-09-06,16:30:04,1,2.18,43.46,18.73,0
3,2021-09-06,16:30:09,1,1.99,41.72,16.75,0
4,2021-09-06,16:30:14,1,1.85,43.65,18.02,0
...,...,...,...,...,...,...,...
50089,2021-10-27,18:36:03,22,2.05,42.84,15.38,0
50090,2021-10-27,18:36:08,22,1.91,42.64,19.08,0
50091,2021-10-27,18:36:13,22,2.11,44.09,18.14,0


In [14]:
# Feature matrix: pick the columns at positions 3, 4 and 5 and take them as
# a 2-D NumPy array via `.values`.
# NOTE(review): presumably these are pH, Temp and Voltage - confirm against
# the column order of the loaded frame.
feature_positions = [3, 4, 5]
X = dataset.iloc[:, feature_positions].values
X

array([[ 2.15, 43.15, 19.74],
       [ 2.08, 40.13, 18.01],
       [ 2.18, 43.46, 18.73],
       ...,
       [ 2.11, 44.09, 18.14],
       [ 1.92, 43.95, 17.96],
       [ 1.81, 44.11, 19.22]])

In [15]:
# Target vector: the column at position 6, taken as a 1-D NumPy array.
y = dataset.iloc[:, 6].values
y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

##### (2) Splitting the dataset into the Training set and Test set

In [16]:
# sklearn.cross_validation → sklearn.model_selection
from sklearn.model_selection import train_test_split

In [17]:
# Step 1: split the original data into train and test sets (80 % / 20 %),
# stratified on y so both splits keep the same class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [87]:
# Step 2: split the train data further into train and validation sets.
# When the data set is small, this validation split is skipped.
# x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, random_state=64, test_size=0.25)

In [18]:
# Display the training features produced by the split.
x_train

array([[ 1.98, 43.1 , 19.1 ],
       [ 2.1 , 44.5 , 18.4 ],
       [ 1.86, 41.74, 19.57],
       ...,
       [ 2.01, 44.15, 16.41],
       [ 1.99, 41.88, 16.36],
       [ 1.86, 44.85, 17.39]])

In [19]:
# Display the test features produced by the split.
x_test

array([[ 1.85, 40.17, 17.17],
       [ 2.05, 40.68, 18.27],
       [ 2.11, 41.44, 17.89],
       ...,
       [ 1.82, 41.01, 17.  ],
       [ 1.9 , 41.49, 17.58],
       [ 1.99, 42.45, 16.91]])

In [9]:
# Display the training labels.
y_train

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [10]:
# Display the test labels.
y_test

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

##### (3) Feature Scaling

정규화(스케일링)를 적용한 경우와 적용하지 않은 경우를 모두 시도해 본다

In [14]:
# Create a MinMaxScaler with its default settings; it is fitted in a later cell.
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()

In [15]:
# Learn the scaling parameters from the training set only, then apply that
# same fitted scaler to both splits. The test set is only transformed, never
# used for fitting, so no information from it enters the scaler.
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [16]:
# Display the training features after scaling.
x_train

array([[0.        , 0.60084388, 0.7817623 ],
       [0.42647059, 0.87763713, 0.80020492],
       [0.67647059, 0.62700422, 0.21106557],
       ...,
       [0.29411765, 0.82194093, 0.62602459],
       [0.51470588, 0.67257384, 0.85758197],
       [0.26470588, 0.70379747, 0.58811475]])

In [17]:
# Display the test features after scaling.
x_test

array([[0.55882353, 0.94261603, 0.71209016],
       [0.35294118, 0.88607595, 0.67622951],
       [0.20588235, 0.95780591, 0.82889344],
       ...,
       [0.20588235, 0.98396624, 0.94672131],
       [0.44117647, 0.60337553, 0.81659836],
       [0.52941176, 0.64303797, 0.85348361]])

### 3. 모델 빌드 및 트레이닝

- 트레이닝 데이터를 LightGBM에 맞는 데이터 세트 포맷으로 변환해야 한다

- 변환된 데이터세트를 생성한 뒤에는 파라미터 이름과 값으로 구성된 Python 딕셔너리를 생성한다. 모델의 정확도는 이 파라미터 설정에 크게 좌우된다

In [30]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [21]:
# Wrap the training features and labels in LightGBM's Dataset container,
# which is what lgb.train() consumes below.
d_train = lgb.Dataset(x_train, label=y_train)
d_train

<lightgbm.basic.Dataset at 0x1db7c660190>

In [22]:
# scikit-learn style classifier, created with no arguments (library defaults).
lgb_model = lgb.LGBMClassifier()

In [23]:
# Parameter dictionary for training: tune the model by editing these values.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'n_estimators': 100,
    'max_depth': 30,
    'num_leaves': 30,
    'learning_rate': 0.1,
}

In [29]:
# Fit on the training split only, so that x_test / y_test stay unseen for
# evaluation. Fitting on the full X, y would include the held-out test rows
# in training (data leakage) and bypass the split made earlier.
lgb_model.fit(x_train, y_train)

In [28]:
# Train a model with LightGBM's native training API, using the params
# dictionary and the Dataset built from the training split.
# NOTE(review): the AttributeError recorded under this cell suggests `lgb` was
# bound to an LGBMClassifier instance in some earlier session - confirm by
# re-running from a fresh kernel.
clf = lgb.train(params, d_train)

AttributeError: 'LGBMClassifier' object has no attribute 'train'

### 4. test 데이터로 Prediction

In [15]:
# Predict on the test features with the trained model. The recorded output
# shows fractional scores rather than 0/1 labels; they are thresholded later.
y_test_pred = clf.predict(x_test)
y_test_pred

array([0.01183395, 0.01183846, 0.01183846, ..., 0.01183846, 0.01183846,
       0.01183846])

In [16]:
# Smallest and largest predicted score, each computed once with a NumPy
# reduction instead of iterating the array with the built-in min()/max().
pred_min = np.min(y_test_pred)
pred_max = np.max(y_test_pred)
# NOTE: despite its name, pred_avg is the midpoint between the smallest and
# the largest prediction (mid-range), not the arithmetic mean of all scores.
pred_avg = (pred_min + pred_max) / 2

In [17]:
# Report the prediction range and the threshold value. The second label reads
# "average", but pred_avg is computed as (min + max) / 2.
print(f'최소값: {pred_min}, 최대값: {pred_max}')
print(f'평균값: {pred_avg}')

최소값: 0.01079244499608317, 최대값: 0.07324391259452853
평균값: 0.04201817879530585


In [18]:
# Convert the predicted scores into binary labels in one vectorized step:
# 1.0 where the score reaches the pred_avg threshold, 0.0 otherwise.
# The result stays a float array, matching the in-place loop it replaces.
y_test_pred = np.where(y_test_pred >= pred_avg, 1.0, 0.0)
y_test_pred

array([0., 0., 0., ..., 0., 0., 0.])

### 5. 결과

##### (1) confusion matrix 또는 accuracy를 계산해서 결과 확인

In [19]:
# Confusion Matrix of Light GBM model
from sklearn.metrics import confusion_matrix

In [20]:
# Confusion matrix of the true test labels against the binarized predictions.
cm = confusion_matrix(y_test, y_test_pred)
cm

array([[9889,    6],
       [ 123,    1]], dtype=int64)

In [21]:
# Accuracy of Light GBM model
from sklearn.metrics import accuracy_score

In [22]:
# accuracy_score takes (y_true, y_pred); pass the arguments in that order,
# consistent with the confusion_matrix call. The value is unchanged because
# accuracy is symmetric in its two arguments.
accuracy = accuracy_score(y_test, y_test_pred)
accuracy

0.9871244635193133

##### (2) 정확도 직접 체크

In [23]:
# Empty frame with the three columns used for the manual accuracy check.
df = pd.DataFrame(columns=['y_test', 'y_pred', 'check'])
df

Unnamed: 0,y_test,y_pred,check


In [24]:
# Fill the table: true labels, predictions cast to int, and a `check` flag
# initialised to 0 for every row.
df['y_test'] = y_test
df['y_pred'] = y_test_pred.astype(int)
df['check'] = 0
df

Unnamed: 0,y_test,y_pred,check
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
10014,0,0,0
10015,0,0,0
10016,0,0,0
10017,0,0,0


In [25]:
# Boolean mask of the rows where the prediction equals the true label;
# those rows get check = 1, all others keep the initial 0.
is_match = df['y_test'] == df['y_pred']
df.loc[is_match, 'check'] = 1

In [26]:
# Mismatches were left at 0, so this shows only the mispredicted rows.
df[(df['check']==0)]

Unnamed: 0,y_test,y_pred,check
18,1,0,0
60,1,0,0
64,1,0,0
117,1,0,0
198,1,0,0
...,...,...,...
9585,1,0,0
9828,1,0,0
9833,1,0,0
9838,1,0,0


In [27]:
# Fraction of correct predictions.
len(df[(df['check']==1)]) / len(df)

0.9871244635193133

In [28]:
# Fraction of incorrect predictions.
len(df[(df['check']==0)]) / len(df)

0.012875536480686695

In [29]:
# Class balance of the test set: label 0 is normal, any other value is
# counted as defective. Counted with a NumPy reduction instead of a manual
# per-element loop.
cnt1 = int(np.count_nonzero(y_test))   # defective (1)
cnt0 = len(y_test) - cnt1              # normal (0)
print(f'정상: {cnt0}, 불량: {cnt1}')

정상: 9895, 불량: 124


In [30]:
# Side-by-side table of true labels and predictions; `check` starts at 0
# for every row.
tmp_df = pd.DataFrame({'y_test': y_test, 'y_pred': y_test_pred, 'check': 0})

# Mark the rows whose prediction equals the true label with check = 1.
matches = tmp_df['y_test'] == tmp_df['y_pred']
tmp_df.loc[matches, 'check'] = 1

In [31]:
# Normal samples (y_test == 0) that were not predicted as normal.
tmp_df[(tmp_df['y_test'] == 0) & (tmp_df['check'] == 0)]
# tmp_df[(tmp_df['y_test'] == 0) & (tmp_df['check'] == 0)].shape

Unnamed: 0,y_test,y_pred,check
867,0,1.0,0
2640,0,1.0,0
2651,0,1.0,0
3768,0,1.0,0
5549,0,1.0,0
7803,0,1.0,0


In [32]:
# Defective samples (y_test == 1) that were not predicted as defective.
tmp_df[(tmp_df['y_test'] == 1) & (tmp_df['check'] == 0)]

Unnamed: 0,y_test,y_pred,check
18,1,0.0,0
60,1,0.0,0
64,1,0.0,0
117,1,0.0,0
198,1,0.0,0
...,...,...,...
9585,1,0.0,0
9828,1,0.0,0
9833,1,0.0,0
9838,1,0.0,0


### 6. 저장 및 로드 확인

##### (1) 학습한 모델을 pickle 형태로 변수에 저장

In [33]:
# Serialize the trained model into an in-memory bytes object.
lgbm_model = pickle.dumps(clf)

In [34]:
# Restore the model from the pickled bytes created in this session.
# (Only unpickle data from a trusted source.)
clf_from_pickle = pickle.loads(lgbm_model)

In [35]:
# Use the restored model to make predictions on the full feature matrix.
# NOTE(review): X is the unscaled matrix; if the MinMaxScaler step was applied
# to x_train before training, X presumably needs sc.transform() first - confirm.
clf_from_pickle.predict(X)

array([0.01192554, 0.01174872, 0.01192554, ..., 0.01214797, 0.01216159,
       0.01211649])

##### (2) 파일에 저장

In [36]:
# Persist the trained model to a file with joblib.
joblib.dump(clf, 'LGBM_model.pkl')

['LGBM_model.pkl']

In [37]:
# Load the model back from the file written by joblib.dump.
clf_from_joblib = joblib.load('LGBM_model.pkl')

In [38]:
# Predict with the model loaded from disk, on the full feature matrix.
# NOTE(review): X is the unscaled matrix; if the MinMaxScaler step was applied
# to x_train before training, X presumably needs sc.transform() first - confirm.
clf_from_joblib.predict(X)

array([0.01192554, 0.01174872, 0.01192554, ..., 0.01214797, 0.01216159,
       0.01211649])