<a href='https://nurilee.com/2020/04/03/lightgbm-definition-parameter-tuning/'>**관련 링크**</a>

### 1. Package Import

In [1]:
import pyarrow
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
import joblib

### 2. 데이터 전처리

##### (1) Importing the dataset

In [2]:
# Importing the dataset
dataset = pd.read_csv('../res/working_df.csv', encoding='utf-8', index_col=0, engine='pyarrow')
dataset

Unnamed: 0,Date,Time,Lot,pH,Temp,Voltage,sep
,,,,,,,
0,2021-09-06,16:29:54,1,2.15,43.15,19.74,1
1,2021-09-06,16:29:59,1,2.08,40.13,18.01,1
2,2021-09-06,16:30:04,1,2.18,43.46,18.73,1
3,2021-09-06,16:30:09,1,1.99,41.72,16.75,1
4,2021-09-06,16:30:14,1,1.85,43.65,18.02,1
...,...,...,...,...,...,...,...
50089,2021-10-27,18:36:03,22,2.05,42.84,15.38,1
50090,2021-10-27,18:36:08,22,1.91,42.64,19.08,1
50091,2021-10-27,18:36:13,22,2.11,44.09,18.14,1


In [3]:
# df.values : 2차원 numpy 형태의 값을 얻을 수 있다
X = dataset.iloc[:, [3, 4, 5]].values
X

array([[ 2.15, 43.15, 19.74],
       [ 2.08, 40.13, 18.01],
       [ 2.18, 43.46, 18.73],
       ...,
       [ 2.11, 44.09, 18.14],
       [ 1.92, 43.95, 17.96],
       [ 1.81, 44.11, 19.22]])

In [4]:
y = dataset.iloc[:, 6].values
y

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

##### (2) Splitting the dataset into the Training set and Test set

In [5]:
# sklearn.cross_validation → sklearn.model_selection
from sklearn.model_selection import train_test_split

In [6]:
# ① 원 데이터 → train, test로 나누기
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=64, test_size=0.2)

In [7]:
# ② train 데이터 → train, validation으로 나누기
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, random_state=64, test_size=0.25)

In [8]:
x_train

array([[ 2.18, 42.94, 15.79],
       [ 1.97, 41.74, 19.38],
       [ 2.05, 40.91, 18.19],
       ...,
       [ 2.18, 44.86, 17.31],
       [ 1.89, 41.8 , 19.28],
       [ 1.88, 44.27, 17.66]])

In [9]:
x_val

array([[ 2.13, 44.74, 19.44],
       [ 2.12, 41.13, 16.24],
       [ 1.97, 41.84, 19.79],
       ...,
       [ 1.82, 42.72, 15.24],
       [ 1.98, 43.43, 16.69],
       [ 2.02, 43.43, 17.17]])

In [10]:
x_test

array([[ 1.86, 42.6 , 17.09],
       [ 2.1 , 42.  , 15.19],
       [ 1.88, 42.88, 17.42],
       ...,
       [ 2.18, 42.36, 18.45],
       [ 2.08, 40.25, 15.02],
       [ 2.03, 41.65, 16.34]])

In [11]:
y_train

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [12]:
y_val

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [13]:
y_test

array([1, 1, 0, ..., 1, 1, 1], dtype=int64)

##### (3) Feature Scaling

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
sc = StandardScaler()

In [16]:
# train만 fit_transform() 하고 나머지는 transform()
# fit을 입력하면 훈련에 들어가는 느낌이다
x_train = sc.fit_transform(x_train)
x_val = sc.transform(x_val)
x_test = sc.transform(x_test)

In [17]:
x_train

array([[ 1.40555341,  0.33429016, -1.1483352 ],
       [-0.30861335, -0.48173608,  1.32526737],
       [ 0.34440256, -1.04615423,  0.50532669],
       ...,
       [ 1.40555341,  1.63993215, -0.101016  ],
       [-0.96162926, -0.44093477,  1.25636479],
       [-1.04325624,  1.23871924,  0.14014302]])

In [18]:
x_val

array([[ 0.99741846,  1.55832952,  1.36660892],
       [ 0.91579148, -0.89654942, -0.83827359],
       [-0.30861335, -0.41373389,  1.60776794],
       ...,
       [-1.53301817,  0.18468535, -1.52729938],
       [-0.22698636,  0.66750088, -0.52821199],
       [ 0.09952159,  0.66750088, -0.19747961]])

In [19]:
x_test

array([[-1.20651022,  0.10308273, -0.25260168],
       [ 0.7525375 , -0.30493039, -1.56175067],
       [-1.04325624,  0.29348885, -0.02522317],
       ...,
       [ 1.40555341, -0.06012252,  0.68447339],
       [ 0.58928352, -1.49496866, -1.67888505],
       [ 0.18114858, -0.54293805, -0.76937101]])

### 3. 모델 빌드 및 트레이닝

- 트레이닝 데이터를 LightGBM에 맞는 데이터 세트 포맷으로 변환해야 한다

- 변환된 데이터세트 생성 이후에 파라미터와 그 값으로 구성된 Python Dictionary를 생성. 모델의 정확도는 설정된 파라미터 값에 전적으로 달려있다

In [20]:
import lightgbm as lgb

In [21]:
d_train = lgb.Dataset(x_train, label=y_train)
d_train

<lightgbm.basic.Dataset at 0x26b3ccbb7c0>

In [22]:
# 딕셔너리 생성 : 여기서 paramater를 조절한다
params = {}

params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['Task'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 31
params['min_data'] = 50
params['max_depth'] = 20
params['lambda'] = 0.3
params['in_data_in_leaf'] = 10

파라미터에서 봐야할 요소들:

- "binary"가 objective로 사용됨 (classification 문제)

- "binary_logloss"가 metric으로 사용됨 (classification 문제)

- num_leaves는 10 (작은 데이터이기 때문)

- boosting type은 gbdt인데, gradient boosting을 구현하고 있기 때문

In [23]:
clf = lgb.train(params, d_train, 100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 574
[LightGBM] [Info] Number of data points in the train set: 30056, number of used features: 3
[LightGBM] [Info] Start training from score 0.987157


### 4. test 데이터로 Prediction

In [24]:
y_test_pred = clf.predict(x_test)
y_test_pred

array([0.987829  , 0.98668258, 0.987829  , ..., 0.98827409, 0.98650761,
       0.98784945])

In [25]:
# y_pred : convert into binary values
for i in range(len(y_test_pred)):
    if y_test_pred[i] >= .9783:   # 임계값을 0.9783으로 설정
        y_test_pred[i] = 1
    else:
        y_test_pred[i] = 0
y_test_pred

array([1., 1., 1., ..., 1., 1., 1.])

### 5. 결과

##### (1) confusion matrix 또는 accuracy를 계산해서 결과 확인

In [26]:
# Confusion Matrix of Light GBM model
from sklearn.metrics import confusion_matrix

In [27]:
cm = confusion_matrix(y_test, y_test_pred)
cm

array([[   4,  122],
       [  72, 9821]], dtype=int64)

In [28]:
# Accuracy of Light GBM model
from sklearn.metrics import accuracy_score

In [29]:
accuracy = accuracy_score(y_test_pred, y_test)
accuracy

0.9806367900988122

##### (2) validation set으로 검증

In [30]:
y_val_pred = clf.predict(x_val)
y_val_pred

array([0.98702007, 0.98886921, 0.98719884, ..., 0.98706223, 0.9879377 ,
       0.98772107])

In [31]:
min(y_val_pred)

0.9612866718346268

In [32]:
max(y_val_pred)

0.9901871644495334

In [33]:
# convert into binary values
for i in range(len(y_val_pred)):
    if y_val_pred[i] >= .9783:   # 임계값을 0.9783으로 설정
        y_val_pred[i] = 1
    else:
        y_val_pred[i] = 0
y_val_pred

array([1., 1., 1., ..., 1., 1., 1.])

In [34]:
accuracy = accuracy_score(y_val_pred, y_val)
accuracy

0.9819343247829124

##### (3) 정확도 직접 체크

In [35]:
df = pd.DataFrame(columns=['y_test', 'y_pred', 'check'])
df

Unnamed: 0,y_test,y_pred,check


In [36]:
df['y_test'] = y_test
df['y_pred'] = y_test_pred.astype(int)
df['check'] = 0
df

Unnamed: 0,y_test,y_pred,check
0,1,1,0
1,1,1,0
2,0,1,0
3,1,1,0
4,1,1,0
...,...,...,...
10014,1,1,0
10015,1,1,0
10016,1,1,0
10017,1,1,0


In [37]:
# ( df['y_test'] == df['y_pred'] ) : true인 인덱스만 뽑을 수 있음
# true인 인덱스들만 'check'컬럼을 1로 바꾸기
df.loc[( df['y_test'] == df['y_pred'] ), 'check'] = 1

In [38]:
# false는 0으로 그대로 둠 : false인 것들만 표시된다
df[(df['check']==0)]

Unnamed: 0,y_test,y_pred,check
2,0,1,0
34,0,1,0
234,0,1,0
391,1,0,0
449,1,0,0
...,...,...,...
9830,0,1,0
9916,0,1,0
9922,0,1,0
9946,0,1,0


In [39]:
# 맞춘 비율
len(df[(df['check']==1)]) / len(df)

0.9806367900988122

In [40]:
# 틀린 비율
len(df[(df['check']==0)]) / len(df)

0.019363209901187742

In [41]:
# Test Set로 확인
cnt0 = 0
cnt1 = 0
for v in y_test:
    if v == 1 :     # 정상
        cnt1 += 1
    else :          # 불량
        cnt0 += 1
print(f'불량: {cnt0}, 정상: {cnt1}')

불량: 126, 정상: 9893


In [42]:
tmp_df = pd.DataFrame(columns=['y_test', 'y_pred', 'check'])
tmp_df['y_test'] = y_test
tmp_df['y_pred'] = y_test_pred
tmp_df['check'] = 0

# 예측 결과가 같으면 check에 1로 표시
tmp_df.loc[(tmp_df['y_test'] == tmp_df['y_pred']), 'check'] = 1

In [43]:
# 정상인데 정상으로 예측 못하는 것들
tmp_df[(tmp_df['y_test'] == 1) & (tmp_df['check'] == 0)]

Unnamed: 0,y_test,y_pred,check
391,1,0.0,0
449,1,0.0,0
873,1,0.0,0
949,1,0.0,0
1178,1,0.0,0
...,...,...,...
9294,1,0.0,0
9296,1,0.0,0
9645,1,0.0,0
9721,1,0.0,0


In [44]:
# 불량인데 불량으로 예측 못하는 것들
tmp_df[(tmp_df['y_test'] == 0) & (tmp_df['check'] == 0)]

Unnamed: 0,y_test,y_pred,check
2,0,1.0,0
34,0,1.0,0
234,0,1.0,0
481,0,1.0,0
512,0,1.0,0
...,...,...,...
9816,0,1.0,0
9830,0,1.0,0
9916,0,1.0,0
9922,0,1.0,0


In [45]:
# Validation Set로 확인
cnt0 = 0
cnt1 = 0
for v in y_val:
    if v == 1 :     # 정상
        cnt1 += 1
    else :          # 불량
        cnt0 += 1
print(f'불량: {cnt0}, 정상: {cnt1}')

불량: 109, 정상: 9910


In [46]:
tmp_df = pd.DataFrame(columns=['y_val', 'y_val_pred', 'check'])
tmp_df['y_val'] = y_val
tmp_df['y_val_pred'] = y_val_pred
tmp_df['check'] = 0

# 예측 결과가 같으면 check에 1로 표시
tmp_df.loc[(tmp_df['y_val'] == tmp_df['y_val_pred']), 'check'] = 1

In [47]:
# 정상인데 정상으로 예측 못하는 것들
tmp_df[(tmp_df['y_val'] == 1) & (tmp_df['check'] == 0)]

Unnamed: 0,y_val,y_val_pred,check
50,1,0.0,0
129,1,0.0,0
139,1,0.0,0
144,1,0.0,0
163,1,0.0,0
...,...,...,...
9621,1,0.0,0
9665,1,0.0,0
9718,1,0.0,0
9950,1,0.0,0


In [48]:
# 불량인데 불량으로 예측 못하는 것들
tmp_df[(tmp_df['y_val'] == 0) & (tmp_df['check'] == 0)]

Unnamed: 0,y_val,y_val_pred,check
24,0,1.0,0
146,0,1.0,0
149,0,1.0,0
155,0,1.0,0
202,0,1.0,0
...,...,...,...
9446,0,1.0,0
9546,0,1.0,0
9802,0,1.0,0
9919,0,1.0,0


### 6. 저장 및 로드 확인

##### (1) 학습한 모델을 pickle 형태로 변수에 저장

In [49]:
lgbm_model = pickle.dumps(clf)

In [50]:
# Load the pickled model
clf_from_pickle = pickle.loads(lgbm_model)

In [51]:
# Use the loaded pickled model to make predictions
clf_from_pickle.predict(X)

array([0.98232192, 0.98232192, 0.98232192, ..., 0.98232192, 0.98232192,
       0.98232192])

##### (2) 파일에 저장

In [52]:
joblib.dump(clf, 'LGBM_model.pkl')

['LGBM_model.pkl']

In [53]:
clf_from_joblib = joblib.load('LGBM_model.pkl')

In [54]:
clf_from_joblib.predict(X)

array([0.98232192, 0.98232192, 0.98232192, ..., 0.98232192, 0.98232192,
       0.98232192])