# **스마트폰 센서 데이터 기반 모션 분류**
# 단계3 : 단계별 모델링


## 0.미션

단계별로 나눠서 모델링을 수행하고자 합니다.  

* 단계1 : 정적(0), 동적(1) 행동 분류 모델 생성
* 단계2 : 세부 동작에 대한 분류모델 생성
    * 단계1 모델에서 0으로 예측 -> 정적 행동 3가지 분류 모델링
    * 단계1 모델에서 1로 예측 -> 동적 행동 3가지 분류 모델링 
* 모델 통합
    * 두 단계 모델을 통합하고, 새로운 데이터에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
* 성능 비교
    * 기본 모델링의 성능과 비교
    * 모든 모델링은 [다양한 알고리즘 + 성능 튜닝]을 수행해야 합니다.


## 1.환경설정

### (1) 라이브러리 불러오기

* 세부 요구사항
    - 기본적으로 필요한 라이브러리를 import 하도록 코드가 작성되어 있습니다.
    - 필요하다고 판단되는 라이브러리를 추가하세요.

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
# 필요하다고 판단되는 라이브러리를 추가하세요.
import warnings
warnings.filterwarnings('ignore')
import re
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import *
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [167]:
# 변수의 특성 중요도 계산하기
def plot_feature_importance(importance, names, result_only = False, topn = 'all'):
    """Rank features by importance and optionally draw them as a horizontal bar chart.

    Parameters
    ----------
    importance : array-like
        Importance score for each feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    result_only : bool, default False
        When true, skip plotting and only return the ranked table.
    topn : int or 'all', default 'all'
        Number of top-ranked rows to keep; 'all' keeps every feature.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature_name' and 'feature_importance', sorted by importance
        in descending order with a fresh 0..n-1 index.
    """
    ranked = (
        pd.DataFrame({'feature_name': np.array(names),
                      'feature_importance': np.array(importance)})
        .sort_values(by=['feature_importance'], ascending=False)
        .reset_index(drop=True)
    )

    if topn == 'all':
        fi_df = ranked
    else:
        # Copy the slice so both branches hand back an independent frame
        # (the 'all' branch never returned a view of a temporary).
        fi_df = ranked.iloc[:topn].copy()

    # Draw the bar chart unless the caller only wants the table.
    if not result_only:
        plt.figure(figsize=(10,20))
        sns.barplot(x='feature_importance', y='feature_name', data = fi_df)

        plt.xlabel('importance')
        plt.ylabel('feature name')
        plt.grid()

    return fi_df

### (2) 데이터 불러오기

* 주어진 데이터셋
    * data01_train.csv : 학습 및 검증용
    * data01_test.csv : 테스트용

 <br/>  

* 세부 요구사항
    - data01_train.csv 를 불러와 'data' 이름으로 저장합니다.
        - data에서 변수 subject는 삭제합니다.
    - data01_test.csv 를 불러와 'new_data' 이름으로 저장합니다.


In [2]:
# Load the train/validation CSV and the test CSV from the working directory.
# NOTE(review): new_data is not referenced again in the visible cells;
# predict_new_data() re-reads the test file from its path argument.
data = pd.read_csv('data01_train.csv')
new_data = pd.read_csv('data01_test.csv')

In [3]:
# Remove the 'subject' column (required by the task) and preview the frame.
data = data.drop('subject', axis = 1)
data.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.487737,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.23782,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.535287,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.004012,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.157832,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS


## 2.데이터 전처리

* 세부 요구사항
    - Label 추가 : data 에 Activity_dynamic 를 추가합니다. Activity_dynamic은 과제1에서 is_dynamic과 동일한 값입니다.
    - x와 y1, y2로 분할하시오.
        * y1 : Activity
        * y2 : Activity_dynamic
    - train : val = 8 : 2 혹은 7 : 3
    - random_state 옵션을 사용하여 다른 모델과 비교를 위해 성능이 재현되도록 합니다.

In [5]:
# Stage-1 binary label: 0 when Activity is STANDING/SITTING/LAYING (static),
# 1 for every other Activity value (dynamic).
data['Activity_dynamic'] = np.where(data['Activity'].isin(['STANDING', 'SITTING', 'LAYING']), 0, 1)
data.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity,Activity_dynamic
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING,0
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING,0
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING,0
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING,1
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS,1


In [6]:
# target1 is the six-class activity label, target2 the static/dynamic flag.
target1, target2 = 'Activity', 'Activity_dynamic'
# Features are every column except the two targets.
X = data.drop([target1, target2], axis = 1)
y1 = data.loc[:, target1]
y2 = data.loc[:, target2]

In [8]:
# 80/20 train/validation splits. Both calls share X, test_size and random_state,
# so the row partition should be the same for the two targets.
X_train, X_val, y_train, y_val = train_test_split(X, y1, test_size = 0.2, random_state = 42) # all six Activity labels
X_train2, X_val2, y_train2, y_val2 = train_test_split(X, y2, test_size = 0.2, random_state = 42) # static vs. dynamic

## **3.단계별 모델링**

![](https://github.com/DA4BAM/image/blob/main/step%20by%20step.png?raw=true)

### (1) 단계1 : 정적/동적 행동 분류 모델

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)과 동적 행동(Walking, Walking-Up, Walking-Down)을 구분하는 모델 생성.
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

#### 1) 알고리즘1 : Random Forest

In [18]:
# Sanity-check the validation features by printing only the first few rows;
# iterating the whole frame would dump every validation row into the output.
for index, row in X_val2.head(3).iterrows():
    print(row)

tBodyAcc-mean()-X                       0.070536
tBodyAcc-mean()-Y                       0.005890
tBodyAcc-mean()-Z                      -0.198887
tBodyAcc-std()-X                       -0.944412
tBodyAcc-std()-Y                       -0.885860
                                          ...   
angle(tBodyGyroMean,gravityMean)       -0.661972
angle(tBodyGyroJerkMean,gravityMean)   -0.027780
angle(X,gravityMean)                    0.485030
angle(Y,gravityMean)                   -0.838496
angle(Z,gravityMean)                   -0.136590
Name: 5371, Length: 561, dtype: float64
tBodyAcc-mean()-X                       0.314412
tBodyAcc-mean()-Y                      -0.027694
tBodyAcc-mean()-Z                      -0.108811
tBodyAcc-std()-X                       -0.263084
tBodyAcc-std()-Y                       -0.071844
                                          ...   
angle(tBodyGyroMean,gravityMean)       -0.735873
angle(tBodyGyroJerkMean,gravityMean)   -0.557859
angle(X,gravityMean)         

In [34]:
# Stage 1: random forest that separates static (0) from dynamic (1) rows.
model_rf = RandomForestClassifier(random_state = 42)
model_rf.fit(X_train2, y_train2)
p_rf = model_rf.predict(X_val2)

# Report accuracy, confusion matrix and per-class metrics on the validation split.
print(accuracy_score(y_val2, p_rf))
print(confusion_matrix(y_val2, p_rf))
print(classification_report(y_val2, p_rf))

1.0
[[657   0]
 [  0 520]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       657
           1       1.00      1.00      1.00       520

    accuracy                           1.00      1177
   macro avg       1.00      1.00      1.00      1177
weighted avg       1.00      1.00      1.00      1177



In [36]:
# Persist the stage-1 classifier; predict_new_data() loads it as 'model1.pkl'.
joblib.dump(model_rf, 'model1.pkl')

['model1.pkl']

In [37]:
# Round-trip check: reload the saved stage-1 model and predict the validation split again.
model1 = joblib.load('model1.pkl')
p_rf = model1.predict(X_val2)
p_rf

array([0, 1, 0, ..., 0, 1, 0])

- 모델 1 : Random Forest

### (2) 단계2-1 : 정적 동작 세부 분류

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)인 데이터 추출
    * Laying, Sitting, Standing 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [168]:
# Keep only the static rows (Activity_dynamic == 0) for the stage 2-1 models.
data_nd = data.loc[data[target2] == 0]
data_nd.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity,Activity_dynamic
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING,0
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING,0
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING,0
7,0.272026,-0.001329,-0.125491,-0.992068,-0.912985,-0.972451,-0.994752,-0.943141,-0.976428,-0.925446,...,-0.704995,-0.024442,0.076332,0.741277,0.729812,-0.817201,0.037746,0.136129,STANDING,0
8,0.284338,0.021956,-0.006925,-0.980153,-0.838394,-0.782357,-0.983683,-0.816199,-0.743923,-0.914011,...,-0.400197,0.021212,-0.009465,-0.282762,0.563343,-0.782072,0.242834,-0.025285,STANDING,0


In [169]:
# Default feature matrix / string labels for the static subset.
X3 = data_nd.drop([target1, target2], axis = 1) # baseline
y3 = data_nd.loc[:, target1]

# Same data with every character outside [A-Za-z0-9_] removed from the column
# names; this variant is the one fed to LightGBM.
data_nd2 = data_nd.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x)) # for LightGBM
X3_2 = data_nd2.drop([target1, target2], axis = 1)
y3_2 = data_nd2.loc[:, target1]

# Integer-encoded labels (0, 1, 2); this variant is the one fed to XGBoost.
y3_3 = y3.map({'LAYING' : 0,  'SITTING' : 1, 'STANDING' : 2}) # for XGBoost

In [170]:
# Three 80/20 splits of the static subset, one per input variant, all with the same seed.
X_train3, X_val3, y_train3, y_val3 = train_test_split(X3, y3, test_size = 0.2, random_state = 42) # static subset, baseline
X_train3_2, X_val3_2, y_train3_2, y_val3_2 = train_test_split(X3_2, y3_2, test_size = 0.2, random_state = 42) # static subset, special characters stripped from column names
X_train3_3, X_val3_3, y_train3_3, y_val3_3 = train_test_split(X3, y3_3, test_size = 0.2, random_state = 42) # static subset, labels encoded as integers 0-2

#### 1) 알고리즘1 : Random Forest

In [55]:
# Stage 2-1 candidate: random forest on the static subset.
# Rebinds model_rf / p_rf, which earlier held the stage-1 model and predictions.
model_rf = RandomForestClassifier(random_state = 42)
model_rf.fit(X_train3, y_train3)
p_rf = model_rf.predict(X_val3)

print(accuracy_score(y_val3, p_rf))
print(confusion_matrix(y_val3, p_rf))
print(classification_report(y_val3, p_rf))

0.9752704791344667
[[221   0   0]
 [  0 199   6]
 [  0  10 211]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       221
     SITTING       0.95      0.97      0.96       205
    STANDING       0.97      0.95      0.96       221

    accuracy                           0.98       647
   macro avg       0.97      0.98      0.97       647
weighted avg       0.98      0.98      0.98       647



#### 2) 알고리즘2 : XGBoost

In [56]:
# Stage 2-1 candidate: XGBoost on the integer-encoded static labels.
# device = 'gpu' is requested explicitly, so this cell presumably needs a GPU-enabled setup.
model_xgb = XGBClassifier(random_state = 42, device = 'gpu')
model_xgb.fit(X_train3_3, y_train3_3)
p_xgb = model_xgb.predict(X_val3_3)

print(accuracy_score(y_val3_3, p_xgb))
print(confusion_matrix(y_val3_3, p_xgb))
print(classification_report(y_val3_3, p_xgb))

0.9860896445131375
[[221   0   0]
 [  0 202   3]
 [  0   6 215]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       221
           1       0.97      0.99      0.98       205
           2       0.99      0.97      0.98       221

    accuracy                           0.99       647
   macro avg       0.99      0.99      0.99       647
weighted avg       0.99      0.99      0.99       647



#### 3) 알고리즘3 : LightGBM

In [57]:
# Stage 2-1 candidate: LightGBM on the static subset with sanitized column names.
# device = 'gpu' is requested explicitly, so this cell presumably needs a GPU-enabled setup.
model_lgb = LGBMClassifier(random_state = 42, verbose = -1, device = 'gpu')
model_lgb.fit(X_train3_2, y_train3_2)
p_lgb = model_lgb.predict(X_val3_2)

print(accuracy_score(y_val3_2, p_lgb))
print(confusion_matrix(y_val3_2, p_lgb))
print(classification_report(y_val3_2, p_lgb))

0.9799072642967542
[[221   0   0]
 [  0 200   5]
 [  0   8 213]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       221
     SITTING       0.96      0.98      0.97       205
    STANDING       0.98      0.96      0.97       221

    accuracy                           0.98       647
   macro avg       0.98      0.98      0.98       647
weighted avg       0.98      0.98      0.98       647



#### 4) 알고리즘4 : CatBoost

In [171]:
# Stage 2-1 candidate: CatBoost on the static subset using all feature columns.
# task_type = 'GPU' is requested explicitly, so this cell presumably needs a GPU-enabled setup.
model_cat = CatBoostClassifier(random_state = 42, task_type = 'GPU', verbose = 0)
model_cat.fit(X_train3, y_train3)
p_cat = model_cat.predict(X_val3)

print(accuracy_score(y_val3, p_cat))
print(confusion_matrix(y_val3, p_cat))
print(classification_report(y_val3, p_cat))

0.9860896445131375
[[221   0   0]
 [  0 202   3]
 [  0   6 215]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       221
     SITTING       0.97      0.99      0.98       205
    STANDING       0.99      0.97      0.98       221

    accuracy                           0.99       647
   macro avg       0.99      0.99      0.99       647
weighted avg       0.99      0.99      0.99       647



In [172]:
# Rank the static-subset features by CatBoost importance (table only, no plot).
# The returned frame has a 0..n-1 index, with row 0 the most important feature.
cat = plot_feature_importance(model_cat.feature_importances_, list(X_train3), True)
cat.head()

Unnamed: 0,feature_name,feature_importance
0,"angle(X,gravityMean)",16.333189
1,tGravityAcc-max()-Y,10.945205
2,tGravityAcc-min()-X,7.889462
3,"angle(Y,gravityMean)",7.081946
4,tGravityAcc-mean()-Y,5.515881


In [173]:
# Forward search over the importance ranking: refit CatBoost on a growing
# prefix of the ranked features and record validation accuracy for each prefix.
acc = []
for i in range(200) :   # there are 561 features in total, but about 200 is considered enough
    # .loc slicing is label-inclusive, so this selects rows 0..i, i.e. i + 1 features.
    top_n_feature = cat.loc[:i, 'feature_name']
    X_train_n = X_train3[top_n_feature]
    X_val_n = X_val3[top_n_feature]
    # NOTE: this refits the shared model_cat object in place, replacing the
    # all-feature fit from the previous cell.
    model_cat.fit(X_train_n, y_train3)
    pred = model_cat.predict(X_val_n)
    acc.append(accuracy_score(y_val3, pred))
    # Progress output, with a line break after every 20th index.
    print(i, end = ' ')
    if i % 20 == 0:
        print()

0 
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 

In [174]:
# Best validation accuracy and the position in acc where it first occurs.
# NOTE: acc[i] was measured with rows 0..i of the ranking (i + 1 features),
# so the value printed as 'best_num_features' is one less than the feature count.
print('Max accuracy : ', max(acc))
print('best_num_features : ', acc.index(max(acc)))

Max accuracy :  0.9922720247295209
best_num_features :  25


In [177]:
# Refit on the best-scoring prefix of the ranking.
# NOTE: .loc[:25] is label-inclusive, so despite the name this holds 26 feature names.
top_25_feature = cat.loc[:25, 'feature_name']
X_train_25 = X_train3[top_25_feature]
X_val_25 = X_val3[top_25_feature]

# model_cat is refit in place; from here on it is the reduced-feature model.
model_cat.fit(X_train_25, y_train3)
p_cat = model_cat.predict(X_val_25)

print('accuracy :',accuracy_score(y_val3, p_cat))
print('='*60)
print(confusion_matrix(y_val3, p_cat))
print('='*60)
print(classification_report(y_val3, p_cat))

accuracy : 0.9922720247295209
[[221   0   0]
 [  0 204   1]
 [  0   4 217]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       221
     SITTING       0.98      1.00      0.99       205
    STANDING       1.00      0.98      0.99       221

    accuracy                           0.99       647
   macro avg       0.99      0.99      0.99       647
weighted avg       0.99      0.99      0.99       647



In [178]:
# Persist the reduced-feature static classifier; predict_new_data() loads this file.
joblib.dump(model_cat, 'model2_1_25.pkl') # model trained on the top-ranked feature subset

['model2_1_25.pkl']

In [179]:
# Persist the selected feature names so inference can pick the same columns.
joblib.dump(top_25_feature, 'cat_top_25_features.pkl')

['cat_top_25_features.pkl']

In [59]:
# Train a dedicated CatBoost model on ALL static-subset features before saving it.
# `model_cat` must not be dumped here directly: when the notebook runs top to
# bottom it was last refit on the reduced feature subset (feature-count search
# and top-feature cell), so model2_1.pkl would silently hold the reduced model.
model_cat_all = CatBoostClassifier(random_state = 42, task_type = 'GPU', verbose = 0)
model_cat_all.fit(X_train3, y_train3)
joblib.dump(model_cat_all, 'model2_1.pkl') # model trained on all features

['model2_1.pkl']

- 모델 2_1 : CatBoost 선택

### (3) 단계2-2 : 동적 동작 세부 분류

* 세부 요구사항
    * 동적 행동(Walking, Walking Upstairs, Walking Downstairs)인 데이터 추출
    * Walking, Walking Upstairs, Walking Downstairs 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [68]:
# Keep only the dynamic rows (Activity_dynamic == 1) for the stage 2-2 models.
data_d = data.loc[data[target2] == 1]
data_d.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity,Activity_dynamic
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING,1
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS,1
5,0.330708,0.007561,-0.061371,-0.21576,0.101075,0.072949,-0.269857,0.06006,0.101298,-0.019263,...,-0.887024,-0.030645,-0.852091,-0.500195,0.306091,-0.552729,0.253885,0.291256,WALKING_UPSTAIRS,1
6,0.121465,-0.031902,-0.005196,-0.152198,-0.113104,-0.239423,-0.202401,-0.164698,-0.247099,0.114668,...,-0.775779,0.445206,-0.003487,-0.940185,0.041387,-0.886603,0.173338,-0.005627,WALKING,1
12,0.303885,0.002768,-0.038613,-0.168656,0.190336,-0.140473,-0.205134,0.101144,-0.120572,-0.000818,...,-0.329728,-0.04003,0.257252,0.076091,-0.123425,-0.752882,0.266729,0.045692,WALKING,1


In [69]:
# Default feature matrix / string labels for the dynamic subset.
X4 = data_d.drop([target1, target2], axis = 1) # baseline
y4 = data_d.loc[:, target1]

# Same data with every character outside [A-Za-z0-9_] removed from the column
# names; this variant is the one fed to LightGBM.
data_d2 = data_d.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x)) # for LightGBM
X4_2 = data_d2.drop([target1, target2], axis = 1)
y4_2 = data_d2.loc[:, target1]

# Integer-encoded labels (0, 1, 2) for XGBoost.
# NOTE: this order (WALKING, UPSTAIRS, DOWNSTAIRS) is not alphabetical, so the
# XGBoost confusion-matrix rows are ordered differently from the string-label reports.
y4_3 = y4.map({'WALKING' : 0,  'WALKING_UPSTAIRS' : 1, 'WALKING_DOWNSTAIRS' : 2}) # for XGBoost

In [70]:
# Three 80/20 splits of the dynamic subset, one per input variant, all with the same seed.
X_train4, X_val4, y_train4, y_val4 = train_test_split(X4, y4, test_size = 0.2, random_state = 42) # dynamic subset, baseline
X_train4_2, X_val4_2, y_train4_2, y_val4_2 = train_test_split(X4_2, y4_2, test_size = 0.2, random_state = 42) # dynamic subset, special characters stripped from column names
X_train4_3, X_val4_3, y_train4_3, y_val4_3 = train_test_split(X4, y4_3, test_size = 0.2, random_state = 42) # dynamic subset, labels encoded as integers 0-2

#### 1) 알고리즘1 : Random Forest

In [71]:
# Stage 2-2 candidate: random forest on the dynamic subset (rebinds model_rf / p_rf).
model_rf = RandomForestClassifier(random_state = 42)
model_rf.fit(X_train4, y_train4)
p_rf = model_rf.predict(X_val4)

print(accuracy_score(y_val4, p_rf))
print(confusion_matrix(y_val4, p_rf))
print(classification_report(y_val4, p_rf))

0.9886792452830189
[[196   0   0]
 [  2 159   2]
 [  1   1 169]]
                    precision    recall  f1-score   support

           WALKING       0.98      1.00      0.99       196
WALKING_DOWNSTAIRS       0.99      0.98      0.98       163
  WALKING_UPSTAIRS       0.99      0.99      0.99       171

          accuracy                           0.99       530
         macro avg       0.99      0.99      0.99       530
      weighted avg       0.99      0.99      0.99       530



#### 2) 알고리즘2 : XGBoost

In [72]:
# Stage 2-2 candidate: XGBoost on the integer-encoded dynamic labels.
# device = 'gpu' is requested explicitly, so this cell presumably needs a GPU-enabled setup.
model_xgb = XGBClassifier(random_state = 42, device = 'gpu')
model_xgb.fit(X_train4_3, y_train4_3)
p_xgb = model_xgb.predict(X_val4_3)

print(accuracy_score(y_val4_3, p_xgb))
print(confusion_matrix(y_val4_3, p_xgb))
print(classification_report(y_val4_3, p_xgb))

0.9943396226415094
[[195   0   1]
 [  1 170   0]
 [  0   1 162]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       196
           1       0.99      0.99      0.99       171
           2       0.99      0.99      0.99       163

    accuracy                           0.99       530
   macro avg       0.99      0.99      0.99       530
weighted avg       0.99      0.99      0.99       530



#### 3) 알고리즘3 : LightGBM

In [73]:
# Stage 2-2 candidate: LightGBM on the dynamic subset with sanitized column names.
# device = 'gpu' is requested explicitly, so this cell presumably needs a GPU-enabled setup.
model_lgb = LGBMClassifier(random_state = 42, verbose = -1, device = 'gpu')
model_lgb.fit(X_train4_2, y_train4_2)
p_lgb = model_lgb.predict(X_val4_2)

print(accuracy_score(y_val4_2, p_lgb))
print(confusion_matrix(y_val4_2, p_lgb))
print(classification_report(y_val4_2, p_lgb))

0.9924528301886792
[[195   1   0]
 [  0 162   1]
 [  2   0 169]]
                    precision    recall  f1-score   support

           WALKING       0.99      0.99      0.99       196
WALKING_DOWNSTAIRS       0.99      0.99      0.99       163
  WALKING_UPSTAIRS       0.99      0.99      0.99       171

          accuracy                           0.99       530
         macro avg       0.99      0.99      0.99       530
      weighted avg       0.99      0.99      0.99       530



#### 4) 알고리즘4 : CatBoost

In [74]:
# Stage 2-2 candidate: CatBoost on the dynamic subset using all feature columns.
# A new model_cat object is created here, so the static-subset model is no longer bound to that name.
model_cat = CatBoostClassifier(random_state = 42, task_type = 'GPU', verbose = 0)
model_cat.fit(X_train4, y_train4)
p_cat = model_cat.predict(X_val4)

print(accuracy_score(y_val4, p_cat))
print(confusion_matrix(y_val4, p_cat))
print(classification_report(y_val4, p_cat))

1.0
[[196   0   0]
 [  0 163   0]
 [  0   0 171]]
                    precision    recall  f1-score   support

           WALKING       1.00      1.00      1.00       196
WALKING_DOWNSTAIRS       1.00      1.00      1.00       163
  WALKING_UPSTAIRS       1.00      1.00      1.00       171

          accuracy                           1.00       530
         macro avg       1.00      1.00      1.00       530
      weighted avg       1.00      1.00      1.00       530



In [75]:
# Persist the dynamic-subset CatBoost classifier; predict_new_data() loads it as 'model2_2.pkl'.
joblib.dump(model_cat, 'model2_2.pkl')

['model2_2.pkl']

- 모델 2_2 : CatBoost 모델 선택

### [선택사항] (4) 분류 모델 합치기


* 세부 요구사항
    * 두 단계 모델을 통합하고, 새로운 데이터(test)에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
    * 데이터 파이프라인 구축 : test데이터가 로딩되어 전처리 과정을 거치고, 예측 및 성능 평가 수행

![](https://github.com/DA4BAM/image/blob/main/pipeline%20function.png?raw=true)

#### 1) 함수 만들어서 분류 모델 합치기

In [193]:
def predict_new_data(path):
    """Run the two-stage activity classifier on a CSV file and print its evaluation.

    Stage 1 ('model1.pkl') labels every row as static (0) or dynamic (1).
    Rows predicted static are classified by the static sub-models -- the
    all-feature model ('model2_1.pkl') and the reduced-feature model
    ('model2_1_25.pkl', using the columns stored in
    'cat_top_25_features.pkl') -- and rows predicted dynamic are classified by
    'model2_2.pkl'. Accuracy, confusion matrix and classification report are
    printed once per static sub-model variant.

    Parameters
    ----------
    path : str
        CSV file holding the feature columns plus 'subject' and 'Activity'.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # 1. Load the data and drop the column that is not a feature.
    new_data = pd.read_csv(path)
    new_data = new_data.drop('subject', axis=1)

    # 2. Separate features from the ground-truth label.
    target = 'Activity'
    X_test = new_data.drop(target, axis = 1)
    y_test = new_data.loc[:, target]

    # 3. Load the persisted models and the selected feature names.
    # NOTE: joblib.load unpickles its input; only load files produced by this notebook.
    model1 = joblib.load('model1.pkl')
    model2_1 = joblib.load('model2_1.pkl')
    model2_1_25 = joblib.load('model2_1_25.pkl')
    model2_2 = joblib.load('model2_2.pkl')
    top_25_feature = joblib.load('cat_top_25_features.pkl')

    # 4. Stage 1: static vs. dynamic. Boolean masks select rows by position,
    #    so the routing does not depend on the frame having a default index.
    pred1 = np.ravel(model1.predict(X_test))
    is_static = pred1 == 0
    is_dynamic = pred1 == 1

    # 5./6. Stage 2: classify each group and write the labels straight back
    #       into their original row positions.
    pred_total = np.empty(len(X_test), dtype=object)
    pred_total_25 = np.empty(len(X_test), dtype=object)

    # Skip a sub-model entirely when stage 1 routed no rows to it.
    if is_static.any():
        X_test_nd = X_test.loc[is_static]
        X_test_nd_25 = X_test_nd[list(top_25_feature)]
        # np.ravel flattens predictions that come back as a column vector.
        pred_total[is_static] = np.ravel(model2_1.predict(X_test_nd))
        pred_total_25[is_static] = np.ravel(model2_1_25.predict(X_test_nd_25))

    if is_dynamic.any():
        X_test_d = X_test.loc[is_dynamic]
        pred2_2 = np.ravel(model2_2.predict(X_test_d))
        pred_total[is_dynamic] = pred2_2
        pred_total_25[is_dynamic] = pred2_2

    # 7. Evaluate both variants against the ground truth.
    print('<model2_1 all features>\n')
    print(accuracy_score(y_test, pred_total))
    print(confusion_matrix(y_test, pred_total))
    print(classification_report(y_test, pred_total))
    print('=' * 60)
    print('<model2_1 top 25 features>\n')
    print(accuracy_score(y_test, pred_total_25))
    print(confusion_matrix(y_test, pred_total_25))
    print(classification_report(y_test, pred_total_25))

In [194]:
# Evaluate the combined two-stage pipeline on the test file.
path = 'data01_test.csv'
predict_new_data(path) # prints results for both model 2-1 variants: all features and the top-ranked subset

<model2_1 all features>

0.9877634262406526
[[290   2   0   0   0   0]
 [  0 246   8   0   0   0]
 [  0   6 281   0   0   0]
 [  0   0   0 226   0   2]
 [  0   0   0   0 195   0]
 [  0   0   0   0   0 215]]
                    precision    recall  f1-score   support

            LAYING       1.00      0.99      1.00       292
           SITTING       0.97      0.97      0.97       254
          STANDING       0.97      0.98      0.98       287
           WALKING       1.00      0.99      1.00       228
WALKING_DOWNSTAIRS       1.00      1.00      1.00       195
  WALKING_UPSTAIRS       0.99      1.00      1.00       215

          accuracy                           0.99      1471
         macro avg       0.99      0.99      0.99      1471
      weighted avg       0.99      0.99      0.99      1471

<model2_1 top 25 features>

0.9898028552005439
[[290   2   0   0   0   0]
 [  0 247   7   0   0   0]
 [  0   4 283   0   0   0]
 [  0   0   0 226   0   2]
 [  0   0   0   0 195   0]
 [  0   