In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

%load_ext autoreload
%autoreload 2

os.chdir('/content/drive/MyDrive/AI+X_middle_01/')

#**스마트폰 센서 데이터 기반 모션 분류**
# 단계3 : 단계별 모델링


## 0.미션

단계별로 나눠서 모델링을 수행하고자 합니다.  

* 단계1 : 정적(0), 동적(1) 행동 분류 모델 생성
* 단계2 : 세부 동작에 대한 분류모델 생성
    * 단계1 모델에서 0으로 예측 -> 정적 행동 3가지 분류 모델링
    * 단계1 모델에서 1으로 예측 -> 동적 행동 3가지 분류 모델링
* 모델 통합
    * 두 단계 모델을 통합하고, 새로운 데이터에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
* 성능 비교
    * 기본 모델링의 성능과 비교
    * 모든 모델링은 [다양한 알고리즘 + 성능 튜닝]을 수행해야 합니다.


## 1.환경설정

### (1) 라이브러리 불러오기

* 세부 요구사항
    - 기본적으로 필요한 라이브러리를 import 하도록 코드가 작성되어 있습니다.
    - 필요하다고 판단되는 라이브러리를 추가하세요.

In [3]:
!pip install catboost



In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 필요하다고 판단되는 라이브러리를 추가하세요.
import random
from tqdm import tqdm
from joblib import dump, load

import plotly.express as px

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import *

### (2) 데이터 불러오기

* 주어진 데이터셋
    * data01_train.csv : 학습 및 검증용

 <br/>  

* 세부 요구사항
    - data01_train.csv 를 불러와 'data' 이름으로 저장합니다.
        - data에서 변수 subject는 삭제합니다.
    - data01_test.csv 를 불러와 'new_data' 이름으로 저장합니다.


In [5]:
data = pd.read_csv('data/data01_train.csv')
data.drop('subject', axis = 1, inplace = True)
data.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.487737,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.23782,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.535287,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.004012,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.157832,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS


In [6]:
new_data = pd.read_csv('data/data01_test.csv')
new_data.drop('subject', axis = 1, inplace = True)
new_data.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity
0,0.284379,-0.021981,-0.116683,-0.99249,-0.97964,-0.963321,-0.992563,-0.977304,-0.958142,-0.93885,...,-0.509523,-0.850065,-0.018043,0.092304,0.07422,-0.714534,-0.671943,-0.018351,-0.185733,SITTING
1,0.27744,-0.028086,-0.118412,-0.99662,-0.927676,-0.972294,-0.997346,-0.931405,-0.971788,-0.939837,...,-0.210792,-0.613367,-0.022456,-0.155414,0.247498,-0.112257,-0.826816,0.184489,-0.068699,STANDING
2,0.305833,-0.041023,-0.087303,0.00688,0.1828,-0.237984,0.005642,0.028616,-0.236474,0.016311,...,0.579587,0.394388,-0.362616,0.171069,0.576349,-0.688314,-0.743234,0.272186,0.053101,WALKING
3,0.276053,-0.016487,-0.108381,-0.995379,-0.983978,-0.975854,-0.995877,-0.98528,-0.974907,-0.941425,...,-0.566291,-0.841455,0.289548,0.079801,-0.020033,0.291898,-0.639435,-0.111998,-0.123298,SITTING
4,0.271998,0.016904,-0.078856,-0.973468,-0.702462,-0.86945,-0.97981,-0.711601,-0.856807,-0.92076,...,0.447577,0.214219,0.010111,0.114179,-0.830776,-0.325098,-0.840817,0.116237,-0.096615,STANDING


In [7]:
# LGBM을 위한 JSON 데이터 처리
import re

data.columns = [re.sub(r'[^\w]', '_', col) for col in data.columns]

## 2.데이터 전처리

* 세부 요구사항
    - Label 추가 : data 에 Activity_dynamic 를 추가합니다. Activity_dynamic은 과제1에서 is_dynamic과 동일한 값입니다.
    - x와 y1, y2로 분할하시오.
        * y1 : Activity
        * y2 : Activity_dynamic
    - train : val = 8 : 2 혹은 7 : 3
    - random_state 옵션을 사용하여 다른 모델과 비교를 위해 성능이 재현되도록 합니다.

In [8]:
# 정적:0 / 동적:1
data['Activity_dynamic'] = data['Activity'].map({'LAYING':0, 'STANDING':0, 'SITTING':0, 'WALKING':1, 'WALKING_UPSTAIRS':1, 'WALKING_DOWNSTAIRS':1})
data.head()

Unnamed: 0,tBodyAcc_mean___X,tBodyAcc_mean___Y,tBodyAcc_mean___Z,tBodyAcc_std___X,tBodyAcc_std___Y,tBodyAcc_std___Z,tBodyAcc_mad___X,tBodyAcc_mad___Y,tBodyAcc_mad___Z,tBodyAcc_max___X,...,fBodyBodyGyroJerkMag_kurtosis__,angle_tBodyAccMean_gravity_,angle_tBodyAccJerkMean__gravityMean_,angle_tBodyGyroMean_gravityMean_,angle_tBodyGyroJerkMean_gravityMean_,angle_X_gravityMean_,angle_Y_gravityMean_,angle_Z_gravityMean_,Activity,Activity_dynamic
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING,0
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING,0
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING,0
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING,1
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS,1


In [9]:
X = data.drop(['Activity', 'Activity_dynamic'], axis = 1)
y1 = data['Activity']
y2 = data['Activity_dynamic']

In [10]:
sc = MinMaxScaler()
X = pd.DataFrame(sc.fit_transform(X), columns = X.columns)
X.head()

Unnamed: 0,tBodyAcc_mean___X,tBodyAcc_mean___Y,tBodyAcc_mean___Z,tBodyAcc_std___X,tBodyAcc_std___Y,tBodyAcc_std___Z,tBodyAcc_mad___X,tBodyAcc_mad___Y,tBodyAcc_mad___Z,tBodyAcc_max___X,...,fBodyBodyGyroJerkMag_meanFreq__,fBodyBodyGyroJerkMag_skewness__,fBodyBodyGyroJerkMag_kurtosis__,angle_tBodyAccMean_gravity_,angle_tBodyAccJerkMean__gravityMean_,angle_tBodyGyroMean_gravityMean_,angle_tBodyGyroJerkMean_gravityMean_,angle_X_gravityMean_,angle_Y_gravityMean_,angle_Z_gravityMean_
0,0.526878,0.401033,0.448319,0.005507,0.019335,0.016125,0.0055,0.01872,0.017175,0.035126,...,0.488133,0.249657,0.093564,0.472577,0.477891,0.654361,0.537449,0.201725,0.900647,0.578396
1,0.511749,0.396653,0.450918,0.005224,0.002718,0.006117,0.004906,0.002821,0.006221,0.031332,...,0.375973,0.376595,0.156521,0.462253,0.694229,0.117569,0.887502,0.680309,0.15615,0.420272
2,0.520362,0.397878,0.445641,0.00114,0.009789,0.00283,0.001033,0.00876,0.002492,0.028708,...,0.708258,0.225505,0.087117,0.49421,0.237489,0.054098,0.511768,0.084172,0.813468,0.478444
3,0.527733,0.3854,0.424823,0.384137,0.518471,0.33083,0.363222,0.515313,0.326042,0.504144,...,0.531583,0.495352,0.301955,0.365002,0.806402,0.874258,0.464438,0.153833,0.870783,0.551238
4,0.597564,0.426728,0.545615,0.544245,0.466164,0.305633,0.494765,0.452314,0.326814,0.792066,...,0.497705,0.417223,0.223002,0.471641,0.077366,0.012683,0.056187,0.149176,0.855762,0.564555


In [11]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5881 entries, 0 to 5880
Columns: 561 entries, tBodyAcc_mean___X to angle_Z_gravityMean_
dtypes: float64(561)
memory usage: 25.2 MB


In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y2, test_size = 0.2, random_state = 42, shuffle=False)

In [13]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4704 entries, 0 to 4703
Columns: 561 entries, tBodyAcc_mean___X to angle_Z_gravityMean_
dtypes: float64(561)
memory usage: 20.2 MB


## **3.단계별 모델링**

![](https://github.com/DA4BAM/image/blob/main/step%20by%20step.png?raw=true)

### (1) 단계1 : 정적/동적 행동 분류 모델

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)과 동적 행동(동적 : Walking, Walking-Up, Walking-Down)을 구분하는 모델 생성.
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

#### 1) 알고리즘1 : XGBClassifier

In [14]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

pred = xgb_model.predict(X_val)

#평가
print('accuracy :',accuracy_score(y_val, pred))
print('='*60)
print(confusion_matrix(y_val, pred))
print('='*60)
print(classification_report(y_val, pred))

# F1 스코어 확인 (micro, macro, weighted 등 선택 가능)
f1_xgb = f1_score(y_val, pred, average='weighted')
print('F1 Score (weighted):', f1_xgb)

accuracy : 1.0
[[658   0]
 [  0 519]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       658
           1       1.00      1.00      1.00       519

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177

F1 Score (weighted): 1.0


#### 2) 알고리즘2 : LGBMClassifier

In [15]:
from lightgbm import LGBMClassifier

lgbm_model = LGBMClassifier(random_state=42)
lgbm_model.fit(X_train, y_train)

pred = lgbm_model.predict(X_val)

#평가
print('accuracy :',accuracy_score(y_val, pred))
print('='*60)
print(confusion_matrix(y_val, pred))
print('='*60)
print(classification_report(y_val, pred))

# F1 스코어 확인 (micro, macro, weighted 등 선택 가능)
f1_lgbm = f1_score(y_val, pred, average='weighted')
print('F1 Score (weighted):', f1_lgbm)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Number of positive: 2128, number of negative: 2576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.141232 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140131
[LightGBM] [Info] Number of data points in the train set: 4704, number of used features: 561
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.452381 -> initscore=-0.191055
[LightGBM] [Info] Start training from score -0.191055
accuracy : 1.0
[[658   0]
 [  0 519]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       658
           1       1.00      1.00      1.00       519

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177

F1 Score (weighted): 1.0


* 기본 모델만으로 acc와 f1_score가 robust하게 1.0을 달성하기 때문에 추가적인 tunning을 수행하지 않음.

In [16]:
dump(xgb_model, 'model1_xgb.joblib')

['model1_xgb.joblib']

### (2) 단계2-1 : 정적 동작 세부 분류

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)인 데이터 추출
    * Laying, Sitting, Standing 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [17]:
data_static = data[data['Activity_dynamic'] == 0]
data_static.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3234 entries, 0 to 5880
Columns: 563 entries, tBodyAcc_mean___X to Activity_dynamic
dtypes: float64(561), int64(1), object(1)
memory usage: 13.9+ MB


In [18]:
X_static = data_static.drop(['Activity', 'Activity_dynamic'], axis = 1)
y1_static = data_static['Activity'].map({'LAYING':0, 'STANDING':1, 'SITTING':2})

In [19]:
X_static = pd.DataFrame(sc.transform(X_static), columns = X_static.columns)
X_static.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3234 entries, 0 to 3233
Columns: 561 entries, tBodyAcc_mean___X to angle_Z_gravityMean_
dtypes: float64(561)
memory usage: 13.8 MB


In [20]:
X_train, X_val, y_train, y_val = train_test_split(X_static, y1_static, test_size = 0.2, random_state = 42, shuffle=False)

### XGBClassifier

In [21]:
from xgboost import XGBClassifier

xgb_model_static = XGBClassifier(random_state=42)
xgb_model_static.fit(X_train, y_train)

pred = xgb_model_static.predict(X_val)

#평가
print('accuracy :',accuracy_score(y_val, pred))
print('='*60)
print(confusion_matrix(y_val, pred))
print('='*60)
print(classification_report(y_val, pred))

# F1 스코어 확인 (micro, macro, weighted 등 선택 가능)
f1_xgb = f1_score(y_val, pred, average='weighted')
print('F1 Score (weighted):', f1_xgb)

accuracy : 0.9938176197836167
[[229   0   0]
 [  0 215   2]
 [  0   2 199]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       229
           1       0.99      0.99      0.99       217
           2       0.99      0.99      0.99       201

    accuracy                           0.99       647
   macro avg       0.99      0.99      0.99       647
weighted avg       0.99      0.99      0.99       647

F1 Score (weighted): 0.9938176197836167


* HyperOpt를 이용한 hyperparameter tunning

In [22]:
# HyperOpt 검색 공간 설정
from hyperopt import hp

xgb_search_space = {'max_depth':hp.quniform('max_depth', 5, 20, 1),
                   'min_child_weight':hp.quniform('min_child_weight', 1, 2, 1),
                   'learning_rate':hp.uniform('learning_rate', 0.01, 0.2),
                   'colsample_bytree':hp.uniform('colsample_bytree', 0.5, 1)}

In [23]:
# HyperOpt 목적 함수 설정
from sklearn.model_selection import cross_val_score # 교차검증
from hyperopt import STATUS_OK

def objective_func(search_space):
    xgb_clf = XGBClassifier(
        n_estimators=100,
        max_depth=int(search_space['max_depth']), # 정수형 하이퍼 파라미터 형변환 필요:int형
        min_child_weight=int(search_space['min_child_weight']), # 정수형 하이퍼 파라미터 형변환 필요:int형
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
        eval_metric='logloss')

    # 목적 함수의 반환값은 교차검증 기반의 평균 정확도 사용
    accuracy = cross_val_score(xgb_clf, X_train, y_train, scoring='accuracy', cv=3)

    # accuracy는 cv=3 개수만큼의 결과를 리스트로 가짐. 이를 평균하여 반환하되 -1을 곱함
    return {'loss':-1 * np.mean(accuracy), 'status':STATUS_OK}

In [25]:
# HyperOpt fmin()을 이용해 최적 하이퍼 파라미터 도출
from hyperopt import fmin, tpe, Trials

trial_val = Trials()

best = fmin(fn=objective_func,
           space=xgb_search_space,
           algo=tpe.suggest,
           max_evals=20, # 입력값 시도 횟수 지정
           trials=trial_val,
           # rstate=np.random.default_rng(seed=9)
           )

print('best:', best)

100%|██████████| 20/20 [16:21<00:00, 49.06s/trial, best loss: -0.9733277233055432]
best: {'colsample_bytree': 0.8218466352123244, 'learning_rate': 0.1959387550701951, 'max_depth': 6.0, 'min_child_weight': 2.0}


In [26]:
# 모델 선언
xgb_tunning_static = XGBClassifier(
    n_estimators=400,
    learning_rate=round(best['learning_rate'], 5),
    max_depth=int(best['max_depth']),
    min_child_weight=int(best['min_child_weight']),
    colsample_bytree=round(best['colsample_bytree'], 5)
)

## model train
xgb_tunning_static.fit(
    X_train, y_train,
    verbose=True
)

pred_tunning = xgb_tunning_static.predict(X_val)

#평가
print('accuracy :',accuracy_score(y_val, pred_tunning))
print('='*60)
print(confusion_matrix(y_val, pred_tunning))
print('='*60)
print(classification_report(y_val, pred_tunning))

# F1 스코어 확인 (micro, macro, weighted 등 선택 가능)
f1_xgb = f1_score(y_val, pred_tunning, average='weighted')
print('F1 Score (weighted):', f1_xgb)

accuracy : 0.9938176197836167
[[229   0   0]
 [  0 215   2]
 [  0   2 199]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       229
           1       0.99      0.99      0.99       217
           2       0.99      0.99      0.99       201

    accuracy                           0.99       647
   macro avg       0.99      0.99      0.99       647
weighted avg       0.99      0.99      0.99       647

F1 Score (weighted): 0.9938176197836167


### LGBMClassifier


In [27]:
from lightgbm import LGBMClassifier

lgbm_model_static = LGBMClassifier(random_state=42)
lgbm_model_static.fit(X_train, y_train)

pred = lgbm_model_static.predict(X_val)

#평가
print('accuracy :',accuracy_score(y_val, pred))
print('='*60)
print(confusion_matrix(y_val, pred))
print('='*60)
print(classification_report(y_val, pred))

# F1 스코어 확인 (micro, macro, weighted 등 선택 가능)
f1_lgbm = f1_score(y_val, pred, average='weighted')
print('F1 Score (weighted):', f1_lgbm)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016531 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 139340
[LightGBM] [Info] Number of data points in the train set: 2587, number of used features: 561
[LightGBM] [Info] Start training from score -1.071537
[LightGBM] [Info] Start training from score -1.089761
[LightGBM] [Info] Start training from score -1.135624
accuracy : 0.9891808346213292
[[229   0   0]
 [  0 216   1]
 [  0   6 195]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       229
           1       0.97      1.00      0.98       217
           2       0.99      0.97      0.98       201

    accuracy                           0.99       647
   macro avg       0.99      0.99      0.99       647
weighted avg       0.99      0.99      0.99       647

F1 Score (weighted): 0.9891743164160839


* HyperOpt를 이용한 hyperparameter tunning

In [28]:
# HyperOpt 검색 공간 설정
from hyperopt import hp

lgbm_search_space = {'max_depth':hp.quniform('max_depth', 5, 20, 1),
                   'min_child_weight':hp.quniform('min_child_weight', 1, 2, 1),
                   'learning_rate':hp.uniform('learning_rate', 0.01, 0.2),
                   'colsample_bytree':hp.uniform('colsample_bytree', 0.5, 1)}

In [29]:
# HyperOpt 목적 함수 설정
from sklearn.model_selection import cross_val_score # 교차검증
from hyperopt import STATUS_OK

def objective_func(search_space):
    lgbm_clf = LGBMClassifier(
        n_estimators=100,
        max_depth=int(search_space['max_depth']), # 정수형 하이퍼 파라미터 형변환 필요:int형
        min_child_weight=int(search_space['min_child_weight']), # 정수형 하이퍼 파라미터 형변환 필요:int형
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
        eval_metric='logloss')

    # 목적 함수의 반환값은 교차검증 기반의 평균 정확도 사용
    accuracy = cross_val_score(lgbm_clf, X_train, y_train, scoring='accuracy', cv=3)

    # accuracy는 cv=3 개수만큼의 결과를 리스트로 가짐. 이를 평균하여 반환하되 -1을 곱함
    return {'loss':-1 * np.mean(accuracy), 'status':STATUS_OK}

In [30]:
# HyperOpt fmin()을 이용해 최적 하이퍼 파라미터 도출
from hyperopt import fmin, tpe, Trials

trial_val = Trials()

best = fmin(fn=objective_func,
           space=xgb_search_space,
           algo=tpe.suggest,
           max_evals=15, # 입력값 시도 횟수 지정
           trials=trial_val,
           # rstate=np.random.default_rng(seed=9)
           )

print('best:', best)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019516 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 139083
[LightGBM] [Info] Number of data points in the train set: 1725, number of used features: 561
[LightGBM] [Info] Start training from score -1.071166
[LightGBM] [Info] Start training from score -1.089954
[LightGBM] [Info] Start training from score -1.135818
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 139104
[LightGBM] [Info] Number of data points in the train set: 1724, number of used features: 561
[LightGBM] [Info] Start training from score -1.072280
[LightGBM] [Info] Start training from score -1.089374
[LightGBM] [Info] Start training from score -1.135238
[LightGBM] [Info] Auto-choosing col-wise mul

In [32]:
# 모델 선언
lgbm_tunning_static = LGBMClassifier(
    n_estimators=400,
    learning_rate=round(best['learning_rate'], 5),
    max_depth=int(best['max_depth']),
    min_child_weight=int(best['min_child_weight']),
    colsample_bytree=round(best['colsample_bytree'], 5)
)

## model train
lgbm_tunning_static.fit(X_train, y_train)

pred_tunning = lgbm_tunning_static.predict(X_val)

#평가
print('accuracy :',accuracy_score(y_val, pred_tunning))
print('='*60)
print(confusion_matrix(y_val, pred_tunning))
print('='*60)
print(classification_report(y_val, pred_tunning))

# F1 스코어 확인 (micro, macro, weighted 등 선택 가능)
f1_lgbm = f1_score(y_val, pred_tunning, average='weighted')
print('F1 Score (weighted):', f1_lgbm)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030742 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 139340
[LightGBM] [Info] Number of data points in the train set: 2587, number of used features: 561
[LightGBM] [Info] Start training from score -1.071537
[LightGBM] [Info] Start training from score -1.089761
[LightGBM] [Info] Start training from score -1.135624
accuracy : 0.9876352395672334
[[229   0   0]
 [  0 214   3]
 [  0   5 196]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       229
           1       0.98      0.99      0.98       217
           2       0.98      0.98      0.98       201

    accuracy                           0.99       647
   macro avg       0.99      0.99      0.99       647
weighted avg       0.99      0.99      0.99       647

F1 Score (weighted): 0.9876326872084285


In [33]:
dump(xgb_tunning_static, 'model2_1_xgb.joblib')

['model2_1_xgb.joblib']

### (3) 단계2-2 : 동적 동작 세부 분류

* 세부 요구사항
    * 동동적 행동(Walking, Walking Upstairs, Walking Downstairs)인 데이터 추출
    * Walking, Walking Upstairs, Walking Downstairs 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [55]:
data_dynamic = data[data['Activity_dynamic'] == 1]
data_dynamic.head()

Unnamed: 0,tBodyAcc_mean___X,tBodyAcc_mean___Y,tBodyAcc_mean___Z,tBodyAcc_std___X,tBodyAcc_std___Y,tBodyAcc_std___Z,tBodyAcc_mad___X,tBodyAcc_mad___Y,tBodyAcc_mad___Z,tBodyAcc_max___X,...,fBodyBodyGyroJerkMag_kurtosis__,angle_tBodyAccMean_gravity_,angle_tBodyAccJerkMean__gravityMean_,angle_tBodyGyroMean_gravityMean_,angle_tBodyGyroJerkMean_gravityMean_,angle_X_gravityMean_,angle_Y_gravityMean_,angle_Z_gravityMean_,Activity,Activity_dynamic
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING,1
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS,1
5,0.330708,0.007561,-0.061371,-0.21576,0.101075,0.072949,-0.269857,0.06006,0.101298,-0.019263,...,-0.887024,-0.030645,-0.852091,-0.500195,0.306091,-0.552729,0.253885,0.291256,WALKING_UPSTAIRS,1
6,0.121465,-0.031902,-0.005196,-0.152198,-0.113104,-0.239423,-0.202401,-0.164698,-0.247099,0.114668,...,-0.775779,0.445206,-0.003487,-0.940185,0.041387,-0.886603,0.173338,-0.005627,WALKING,1
12,0.303885,0.002768,-0.038613,-0.168656,0.190336,-0.140473,-0.205134,0.101144,-0.120572,-0.000818,...,-0.329728,-0.04003,0.257252,0.076091,-0.123425,-0.752882,0.266729,0.045692,WALKING,1


In [56]:
X_dynamic = data_dynamic.drop(['Activity', 'Activity_dynamic'], axis = 1)
y1_dynamic = data_dynamic['Activity'].map({'WALKING':0, 'WALKING_UPSTAIRS':1, 'WALKING_DOWNSTAIRS':2})

In [57]:
X_dynamic = pd.DataFrame(sc.transform(X_dynamic), columns = X_dynamic.columns)
y1_dynamic.head()

Unnamed: 0,Activity
3,0
4,2
5,1
6,0
12,0


In [58]:
X_train, X_val, y_train, y_val = train_test_split(X_dynamic, y1_dynamic, test_size = 0.2, random_state = 42, shuffle=False)

### XGBClassifier

In [59]:
from xgboost import XGBClassifier

xgb_model_dynamic = XGBClassifier(random_state=42)
xgb_model_dynamic.fit(X_train, y_train)

pred = xgb_model_dynamic.predict(X_val)

#평가
print('accuracy :',accuracy_score(y_val, pred))
print('='*60)
print(confusion_matrix(y_val, pred))
print('='*60)
print(classification_report(y_val, pred))

# F1 스코어 확인 (micro, macro, weighted 등 선택 가능)
f1_xgb = f1_score(y_val, pred, average='weighted')
print('F1 Score (weighted):', f1_xgb)

accuracy : 0.9943396226415094
[[191   2   0]
 [  1 178   0]
 [  0   0 158]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       193
           1       0.99      0.99      0.99       179
           2       1.00      1.00      1.00       158

    accuracy                           0.99       530
   macro avg       0.99      0.99      0.99       530
weighted avg       0.99      0.99      0.99       530

F1 Score (weighted): 0.9943401550360083


* HyperOpt를 이용한 hyperparameter tunning

In [60]:
# HyperOpt 검색 공간 설정
from hyperopt import hp

xgb_search_space = {'max_depth':hp.quniform('max_depth', 5, 20, 1),
                   'min_child_weight':hp.quniform('min_child_weight', 1, 2, 1),
                   'learning_rate':hp.uniform('learning_rate', 0.01, 0.2),
                   'colsample_bytree':hp.uniform('colsample_bytree', 0.5, 1)}

In [61]:
# HyperOpt 목적 함수 설정
from sklearn.model_selection import cross_val_score # 교차검증
from hyperopt import STATUS_OK

def objective_func(search_space):
    xgb_clf = XGBClassifier(
        n_estimators=100,
        max_depth=int(search_space['max_depth']), # 정수형 하이퍼 파라미터 형변환 필요:int형
        min_child_weight=int(search_space['min_child_weight']), # 정수형 하이퍼 파라미터 형변환 필요:int형
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
        eval_metric='logloss')

    # 목적 함수의 반환값은 교차검증 기반의 평균 정확도 사용
    accuracy = cross_val_score(xgb_clf, X_train, y_train, scoring='accuracy', cv=3)

    # accuracy는 cv=3 개수만큼의 결과를 리스트로 가짐. 이를 평균하여 반환하되 -1을 곱함
    return {'loss':-1 * np.mean(accuracy), 'status':STATUS_OK}

In [62]:
# HyperOpt fmin()을 이용해 최적 하이퍼 파라미터 도출
from hyperopt import fmin, tpe, Trials

trial_val = Trials()

best = fmin(fn=objective_func,
           space=xgb_search_space,
           algo=tpe.suggest,
           max_evals=20, # 입력값 시도 횟수 지정
           trials=trial_val,
           # rstate=np.random.default_rng(seed=9)
           )

print('best:', best)

100%|██████████| 20/20 [17:13<00:00, 51.68s/trial, best loss: -0.9900816373000088]
best: {'colsample_bytree': 0.5508392428953727, 'learning_rate': 0.1218390807600367, 'max_depth': 6.0, 'min_child_weight': 2.0}


In [63]:
# 모델 선언
xgb_tunning_dynamic = XGBClassifier(
    n_estimators=400,
    learning_rate=round(best['learning_rate'], 5),
    max_depth=int(best['max_depth']),
    min_child_weight=int(best['min_child_weight']),
    colsample_bytree=round(best['colsample_bytree'], 5)
)

## model train
xgb_tunning_dynamic.fit(
    X_train, y_train,
    verbose=True
)

pred_tunning = xgb_tunning_dynamic.predict(X_val)

#평가
print('accuracy :',accuracy_score(y_val, pred_tunning))
print('='*60)
print(confusion_matrix(y_val, pred_tunning))
print('='*60)
print(classification_report(y_val, pred_tunning))

# F1 스코어 확인 (micro, macro, weighted 등 선택 가능)
f1_xgb = f1_score(y_val, pred_tunning, average='weighted')
print('F1 Score (weighted):', f1_xgb)

accuracy : 0.9981132075471698
[[192   1   0]
 [  0 179   0]
 [  0   0 158]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       193
           1       0.99      1.00      1.00       179
           2       1.00      1.00      1.00       158

    accuracy                           1.00       530
   macro avg       1.00      1.00      1.00       530
weighted avg       1.00      1.00      1.00       530

F1 Score (weighted): 0.9981133850120029


### LGBMClassifier

In [64]:
from lightgbm import LGBMClassifier

lgbm_model_dynamic = LGBMClassifier(random_state=42)
lgbm_model_dynamic.fit(X_train, y_train)

pred = lgbm_model_dynamic.predict(X_val)

#평가
print('accuracy :',accuracy_score(y_val, pred))
print('='*60)
print(confusion_matrix(y_val, pred))
print('='*60)
print(classification_report(y_val, pred))

# F1 스코어 확인 (micro, macro, weighted 등 선택 가능)
f1_lgbm = f1_score(y_val, pred, average='weighted')
print('F1 Score (weighted):', f1_lgbm)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 139783
[LightGBM] [Info] Number of data points in the train set: 2117, number of used features: 561
[LightGBM] [Info] Start training from score -0.966913
[LightGBM] [Info] Start training from score -1.137134
[LightGBM] [Info] Start training from score -1.207285
accuracy : 1.0
[[193   0   0]
 [  0 179   0]
 [  0   0 158]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       193
           1       1.00      1.00      1.00       179
           2       1.00      1.00      1.00       158

    accuracy                           1.00       530
   macro avg       1.00      1.00      1.00       530
weighted avg       1.00      1.00      1.00       530

F1 Score (weighted): 1.0


* HyperOpt를 이용한 hyperparameter tunning

In [65]:
# HyperOpt 검색 공간 설정
from hyperopt import hp

lgbm_search_space = {'max_depth':hp.quniform('max_depth', 5, 20, 1),
                   'min_child_weight':hp.quniform('min_child_weight', 1, 2, 1),
                   'learning_rate':hp.uniform('learning_rate', 0.01, 0.2),
                   'colsample_bytree':hp.uniform('colsample_bytree', 0.5, 1)}

In [66]:
# HyperOpt 목적 함수 설정
from sklearn.model_selection import cross_val_score # 교차검증
from hyperopt import STATUS_OK

def objective_func(search_space):
    lgbm_clf = LGBMClassifier(
        n_estimators=100,
        max_depth=int(search_space['max_depth']), # 정수형 하이퍼 파라미터 형변환 필요:int형
        min_child_weight=int(search_space['min_child_weight']), # 정수형 하이퍼 파라미터 형변환 필요:int형
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
        eval_metric='logloss')

    # 목적 함수의 반환값은 교차검증 기반의 평균 정확도 사용
    accuracy = cross_val_score(lgbm_clf, X_train, y_train, scoring='accuracy', cv=3)

    # accuracy는 cv=3 개수만큼의 결과를 리스트로 가짐. 이를 평균하여 반환하되 -1을 곱함
    return {'loss':-1 * np.mean(accuracy), 'status':STATUS_OK}

In [67]:
# HyperOpt fmin()을 이용해 최적 하이퍼 파라미터 도출
from hyperopt import fmin, tpe, Trials

trial_val = Trials()

best = fmin(fn=objective_func,
           space=xgb_search_space,
           algo=tpe.suggest,
           max_evals=15, # 입력값 시도 횟수 지정
           trials=trial_val,
           # rstate=np.random.default_rng(seed=9)
           )

print('best:', best)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015596 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 139676
[LightGBM] [Info] Number of data points in the train set: 1412, number of used features: 561
[LightGBM] [Info] Start training from score -0.966764
[LightGBM] [Info] Start training from score -1.136870
[LightGBM] [Info] Start training from score -1.207757
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 139684
[LightGBM] [Info] Number of data points in the train set: 1411, number of used features: 561
[LightGBM] [Info] Start training from score -0.967920
[LightGBM] [Info] Start training from score -1.136162
[LightGBM] [Info] Start training from score -1.207049
[LightGBM] [Info] Auto-choosing col-wise mul

In [68]:
# 모델 선언
lgbm_tunning_dynamic = LGBMClassifier(
    n_estimators=400,
    learning_rate=round(best['learning_rate'], 5),
    max_depth=int(best['max_depth']),
    min_child_weight=int(best['min_child_weight']),
    colsample_bytree=round(best['colsample_bytree'], 5)
)

## model train
lgbm_tunning_dynamic.fit(X_train, y_train)

pred_tunning = lgbm_tunning_dynamic.predict(X_val)

#평가
print('accuracy :',accuracy_score(y_val, pred_tunning))
print('='*60)
print(confusion_matrix(y_val, pred_tunning))
print('='*60)
print(classification_report(y_val, pred_tunning))

# F1 스코어 확인 (micro, macro, weighted 등 선택 가능)
f1_lgbm = f1_score(y_val, pred_tunning, average='weighted')
print('F1 Score (weighted):', f1_lgbm)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.024662 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 139783
[LightGBM] [Info] Number of data points in the train set: 2117, number of used features: 561
[LightGBM] [Info] Start training from score -0.966913
[LightGBM] [Info] Start training from score -1.137134
[LightGBM] [Info] Start training from score -1.207285
accuracy : 0.9962264150943396
[[191   2   0]
 [  0 179   0]
 [  0   0 158]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       193
           1       0.99      1.00      0.99       179
           2       1.00      1.00      1.00       158

    accuracy                           1.00       530
   macro avg       1.00      1.00      1.00       530
weighted avg       1.00      1.00      1.00       530

F1 Score (weighted): 0.996227070230608


In [70]:
dump(lgbm_model_dynamic, 'model2_2_lgbm.joblib')

['model2_2_lgbm.joblib']

### (4) 분류 모델 합치기


* 세부 요구사항
    * 두 단계 모델을 통합하고, 새로운 데이터(test)에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
    * 데이터 파이프라인 구축 : test데이터가 로딩되어 전처리 과정을 거치고, 예측 및 성능 평가 수행

![](https://github.com/DA4BAM/image/blob/main/pipeline%20function.png?raw=true)

#### 1) 함수 만들기

In [71]:
new_data = pd.read_csv('data/data01_test.csv')
new_data.drop('subject', axis = 1, inplace = True)
new_data.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity
0,0.284379,-0.021981,-0.116683,-0.99249,-0.97964,-0.963321,-0.992563,-0.977304,-0.958142,-0.93885,...,-0.509523,-0.850065,-0.018043,0.092304,0.07422,-0.714534,-0.671943,-0.018351,-0.185733,SITTING
1,0.27744,-0.028086,-0.118412,-0.99662,-0.927676,-0.972294,-0.997346,-0.931405,-0.971788,-0.939837,...,-0.210792,-0.613367,-0.022456,-0.155414,0.247498,-0.112257,-0.826816,0.184489,-0.068699,STANDING
2,0.305833,-0.041023,-0.087303,0.00688,0.1828,-0.237984,0.005642,0.028616,-0.236474,0.016311,...,0.579587,0.394388,-0.362616,0.171069,0.576349,-0.688314,-0.743234,0.272186,0.053101,WALKING
3,0.276053,-0.016487,-0.108381,-0.995379,-0.983978,-0.975854,-0.995877,-0.98528,-0.974907,-0.941425,...,-0.566291,-0.841455,0.289548,0.079801,-0.020033,0.291898,-0.639435,-0.111998,-0.123298,SITTING
4,0.271998,0.016904,-0.078856,-0.973468,-0.702462,-0.86945,-0.97981,-0.711601,-0.856807,-0.92076,...,0.447577,0.214219,0.010111,0.114179,-0.830776,-0.325098,-0.840817,0.116237,-0.096615,STANDING


In [93]:
def data_pipeline(new_data, model1, model2_1, model2_2, sc):
    """
    두 단계 모델 예측 및 평가 파이프라인 함수

    Parameters:
        X_test (array-like): 테스트 데이터
        model1: 첫 번째 분류 모델
        model2_1: 두 번째 모델 (Laying, Sitting, Standing, Walking 예측)
        model2_2: 두 번째 모델 (Walking_Up, Walking_Dw 예측)
        scaler: 데이터 스케일러 (훈련 데이터로 학습된 Scaler)

    Returns:
        dict: accuracy, classification_report 포함한 평가 결과
    """
    new_data.columns = [re.sub(r'[^\w]', '_', col) for col in new_data.columns]

    X_test = new_data.drop('Activity', axis = 1)
    y_test = new_data['Activity']

    # 전처리 (스케일링)
    X_test_scaled = sc.transform(X_test)

    # 모델1 예측
    model1_pred = model1.predict(X_test_scaled)

    # 모델2 예측
    final_preds = []
    for i, pred in enumerate(model1_pred):
        if pred == 0:  # 모델 2-1을 사용할 경우 ('LAYING', 'STANDING', 'SITTING')
            sub_pred = model2_1.predict([X_test_scaled[i]])[0]
            activity_mapping = {0: 'LAYING', 1: 'STANDING', 2: 'SITTING'}
            final_preds.append(activity_mapping[sub_pred])
        else:  # 모델 2-2를 사용할 경우 ('WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS')
            sub_pred = model2_2.predict([X_test_scaled[i]])[0]
            activity_mapping = {0: 'WALKING', 1: 'WALKING_UPSTAIRS', 2: 'WALKING_DOWNSTAIRS'}
            final_preds.append(activity_mapping[sub_pred])

    # 최종 성능 평가
    results = {
        'accuracy': accuracy_score(y_test, final_preds),
        'classification_report': classification_report(y_test, final_preds, output_dict=True)
    }

    # 결과 출력
    print("Accuracy:", results['accuracy'])
    print("Classification Report:")
    print(classification_report(y_test, final_preds))

    f1 = f1_score(y_test, final_preds, average='weighted')
    print('F1 Score (weighted):', f1)

    return results

In [94]:
model1 = load('model1_xgb.joblib')
model2_1 = load('model2_1_xgb.joblib')
model2_2 = load('model2_2_lgbm.joblib')

In [95]:
results = data_pipeline(new_data, model1, model2_1, model2_2, sc)

Accuracy: 0.9857239972807614
Classification Report:
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       292
           SITTING       0.96      0.96      0.96       254
          STANDING       0.97      0.97      0.97       287
           WALKING       1.00      0.99      1.00       228
WALKING_DOWNSTAIRS       1.00      1.00      1.00       195
  WALKING_UPSTAIRS       0.99      1.00      1.00       215

          accuracy                           0.99      1471
         macro avg       0.99      0.99      0.99      1471
      weighted avg       0.99      0.99      0.99      1471

F1 Score (weighted): 0.9857214611008823
