## 第零部分:初始化

### 1.匯入函式庫

In [1]:
import glob #搜尋檔案
import os #create the model output directory

import numpy as np #數學運算
import pandas as pd #數據檢視

from sklearn.linear_model import LogisticRegression #邏輯斯迴歸
from sklearn.svm import SVC #支持向量機
from sklearn.ensemble import RandomForestClassifier #隨機森林
from xgboost import XGBClassifier #Extreme Gradient Boosting
from lightgbm import LGBMClassifier #Light Gradient Boosting

from sklearn.model_selection import cross_validate #交叉驗證
from sklearn.metrics import accuracy_score #計算精確度
from joblib import dump #儲存模型

### 2.指定資料位置

In [2]:
data_path = "./train/" # location of the training data
save_model_path = "./model/" # location the trained models are saved to
# joblib.dump() does not create missing folders, so make sure the output
# folder exists before any training cell tries to write into it.
os.makedirs(save_model_path, exist_ok=True)

## 第一部分:單筆資料處理及觀察

### 1.讀取資料

In [3]:
# Load one example recording as a pandas DataFrame. The first three lines of
# the file are skipped and only the first seven columns are read.
# The path is built from data_path so the configured data location is honoured
# (with the default "./train/" it is the same file as before).
sample_file = data_path + "01原地走/HIMU-2020-07-24_16-53-19.csv"
data = pd.read_csv(sample_file, skiprows=np.arange(0,3), usecols=np.arange(0,7))
# Make timestamps relative to the first row and divide by 1000
# (milliseconds -> seconds, per the original author's note).
data.timestamp = (data.timestamp - data.timestamp.iloc[0])/1000

### 2.資料取樣

* 降低取樣率**(可調)**:下方程式碼目前並未進行降取樣,僅保留為可調整的選項
* 若要實作降取樣,需另外 import math

* 降低精度, float64轉float16**(可調)**

In [4]:
# Remove the time axis, then store the remaining sensor columns as float16
# (the precision is an adjustable choice).
sensor_columns = data.drop(columns='timestamp')
sample_data = sensor_columns.astype('float16')

### 3.觀察資料
* 刪除程式碼結尾的分號 `;` 即可顯示統計資料

In [5]:
# Summary statistics of the sampled data. The trailing ';' suppresses the
# cell output; remove it to display the table.
sample_data.describe();

### 4.特徵萃取
#### 加速度與角速度六軸
* 最大值
* 最小值
* 平均
* 標準差

In [6]:
# Bind short names to the column labels by position:
# columns 0-2 -> wx, wy, wz and columns 3-5 -> ax, ay, az.
wx, wy, wz, ax, ay, az = (sample_data.columns[i] for i in range(6))

# The four statistics taken for every axis, in the order they appear in X.
_STATS = (np.min, np.max, np.mean, np.std)

# Feature values: (min, max, mean, std) for each of the six axes.
min_ax, max_ax, mean_ax, std_ax = (stat(sample_data[ax]) for stat in _STATS)
min_ay, max_ay, mean_ay, std_ay = (stat(sample_data[ay]) for stat in _STATS)
min_az, max_az, mean_az, std_az = (stat(sample_data[az]) for stat in _STATS)
min_wx, max_wx, mean_wx, std_wx = (stat(sample_data[wx]) for stat in _STATS)
min_wy, max_wy, mean_wy, std_wy = (stat(sample_data[wy]) for stat in _STATS)
min_wz, max_wz, mean_wz, std_wz = (stat(sample_data[wz]) for stat in _STATS)

# Collect everything into one flat list: ax, ay, az first, then wx, wy, wz.
X = [
    min_ax, max_ax, mean_ax, std_ax,
    min_ay, max_ay, mean_ay, std_ay,
    min_az, max_az, mean_az, std_az,
    min_wx, max_wx, mean_wx, std_wx,
    min_wy, max_wy, mean_wy, std_wy,
    min_wz, max_wz, mean_wz, std_wz,
]

### 5.轉成函式

In [7]:
def read(filename):
    """Load one recording from a CSV file.

    The first three lines of the file are skipped and only the first seven
    columns are read. The ``timestamp`` column is then made relative to its
    first row and divided by 1000.

    Parameters
    ----------
    filename : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame with the seven columns, ``timestamp`` starting at 0.
    """
    leading_lines = list(range(3))   # rows 0, 1, 2 are skipped
    kept_columns = list(range(7))    # columns 0..6 are read
    frame = pd.read_csv(filename, skiprows=leading_lines, usecols=kept_columns)
    first_stamp = frame.timestamp.iloc[0]
    frame.timestamp = (frame.timestamp - first_stamp) / 1000
    return frame
def sample(data):
    """Return the sensor columns of ``data`` as float16.

    The ``timestamp`` column is dropped and every remaining column is cast to
    float16. The input frame is left unchanged.

    NOTE(review): float16 keeps only about three significant decimal digits
    and has a small value range; confirm this is precise enough for the
    statistics computed downstream.
    """
    without_time = data.drop(columns='timestamp')
    return without_time.astype('float16')
def feature_extraction(sample_data):
    """Compute min / max / mean / std for each of the six sensor columns.

    Columns are taken by position from ``sample_data`` itself, so the function
    no longer depends on the notebook-level names ``ax`` .. ``wz`` having been
    defined by an earlier exploratory cell.

    Parameters
    ----------
    sample_data : pandas.DataFrame whose first six columns are, in order,
        wx, wy, wz, ax, ay, az (the same positional mapping the exploratory
        cell uses).

    Returns
    -------
    list of 24 values ordered as
    ``[min, max, mean, std]`` of ax, then ay, az, wx, wy, wz.

    Raises
    ------
    IndexError if ``sample_data`` has fewer than six columns.
    """
    # Output order is ax, ay, az (columns 3-5) followed by wx, wy, wz (columns 0-2).
    column_positions = (3, 4, 5, 0, 1, 2)
    statistics = (np.min, np.max, np.mean, np.std)

    X = []
    for position in column_positions:
        axis_values = sample_data.iloc[:, position]
        X.extend(stat(axis_values) for stat in statistics)
    return X

## 第二部分:多筆資料處理及訓練模型

### 1.類別標籤編碼(整數編碼,並非獨熱編碼)

In [8]:
def onehot(filename):
    """Return the integer class label for ``filename``.

    Despite the name, this produces an integer label (0, 1, 2, ...), not a
    one-hot vector. The class key is the first two characters of the file's
    parent folder name (e.g. ``"01"`` for ``".../01xxx/file.csv"``).

    Labels are assigned in call order: the first call returns the current
    counter, and the counter is incremented every time the key differs from
    the key of the previous call. Files of the same class must therefore be
    passed consecutively.

    State is kept in the module-level globals ``type_tmp``, ``type_cnt`` and
    ``init``; they must be initialised (``''``, ``0``, ``False``) before a new
    pass over the data.
    """
    # Use the module-level state
    global type_tmp
    global type_cnt
    global init

    # Take the key from the parent folder rather than a fixed character
    # offset, so it keeps working when the data root has a different length.
    # Backslashes are normalised because glob may return them on Windows.
    parts = filename.replace("\\", "/").split("/")
    if len(parts) >= 2:
        key = parts[-2][:2]
    else:
        # No folder component: fall back to the legacy fixed-offset slice.
        key = filename[8:10]

    if not init:
        # First file: just remember its class key
        type_tmp = key
        init = True
    elif key != type_tmp:
        # A new class key starts: move on to the next label
        type_tmp = key
        type_cnt = type_cnt + 1
    return type_cnt

In [9]:
# Reset the state used by onehot() so re-running this cell starts again at label 0.
type_tmp = ''
type_cnt = 0
init = False
X = []
y = []
# glob returns paths in arbitrary order. Sorting makes the label numbering
# deterministic across machines and guarantees that files of the same class
# folder are visited consecutively, which onehot() relies on.
csv_files = sorted(glob.glob(data_path + "*/*.csv"))
if not csv_files:
    # Fail here with a clear message instead of deep inside a training cell.
    raise FileNotFoundError(f"No CSV files found under {data_path!r}")
for filename in csv_files:
    # Process one recording: load, sample, extract features, assign label
    data = read(filename)
    sample_data = sample(data)
    X.append(feature_extraction(sample_data))
    y.append(onehot(filename))
X_df = pd.DataFrame(X) # list -> DataFrame; per the original note, the xgboost cell needs this format

### 4.訓練模型

#### 1.邏輯斯回歸

In [10]:
# Logistic regression, evaluated with 5-fold cross-validation.
# return_estimator=True keeps the fitted estimator of every fold.
lr = LogisticRegression()
lr_result = cross_validate(lr, X, y, cv=5, return_estimator=True)
lr_scores = lr_result['test_score']
# Keep the fold estimator with the highest test score (the first one on ties).
# NOTE(review): this estimator was fitted on the training part of one fold
# only — consider refitting on all data before saving.
lr_model = lr_result['estimator'][lr_scores.argmax()]
# Print the mean score over the folds as a percentage
test_mean = lr_scores.mean() * 100
print(f"Accuracy: {test_mean:.2f}%")
dump(lr_model, save_model_path + 'lr.joblib'); # save the model

Accuracy: 100.00%


#### 2.支持向量機

In [11]:
# Support vector classifier, evaluated with 5-fold cross-validation.
# return_estimator=True keeps the fitted estimator of every fold.
svc = SVC()
svc_result = cross_validate(svc, X, y, cv=5, return_estimator=True)
svc_scores = svc_result['test_score']
# Keep the fold estimator with the highest test score (the first one on ties).
# NOTE(review): this estimator was fitted on the training part of one fold
# only — consider refitting on all data before saving.
svc_model = svc_result['estimator'][svc_scores.argmax()]
# Print the mean score over the folds as a percentage
test_mean = svc_scores.mean() * 100
print(f"Accuracy: {test_mean:.2f}%")
dump(svc_model, save_model_path + 'svc.joblib'); # save the model

Accuracy: 100.00%


#### 3.隨機森林

In [12]:
# Random forest, evaluated with 5-fold cross-validation.
# A fixed random_state makes the forest (and therefore the printed score and
# the saved model) reproducible across runs; without it every run differs.
rf = RandomForestClassifier(random_state=42)
rf_result = cross_validate(rf, X, y, cv=5, return_estimator = True) # cross-validation
# Keep the fold estimator with the highest test score (the first one on ties).
# NOTE(review): this estimator was fitted on the training part of one fold
# only — consider refitting on all data before saving.
rf_model = rf_result['estimator'][rf_result['test_score'].argmax()]
# Print the mean score over the folds as a percentage
test_mean = np.mean(rf_result['test_score'])*100
print(f"Accuracy: {test_mean:.2f}%")
dump(rf_model, save_model_path + 'rf.joblib'); # save the model

Accuracy: 100.00%


#### 4.Extreme Gradient Boosting

In [13]:
# Extreme Gradient Boosting, evaluated with 5-fold cross-validation.
# This cell is fed the DataFrame version of the features (X_df).
xgb = XGBClassifier()
xgb_result = cross_validate(xgb, X_df, y, cv=5, return_estimator=True)
xgb_scores = xgb_result['test_score']
# Keep the fold estimator with the highest test score (the first one on ties).
# NOTE(review): this estimator was fitted on the training part of one fold
# only — consider refitting on all data before saving.
xgb_model = xgb_result['estimator'][xgb_scores.argmax()]
# Print the mean score over the folds as a percentage
test_mean = xgb_scores.mean() * 100
print(f"Accuracy: {test_mean:.2f}%")
dump(xgb_model, save_model_path + 'xgb.joblib'); # save the model

Accuracy: 96.67%


#### 5.Light Gradient Boosting

In [14]:
# Light Gradient Boosting, evaluated with 5-fold cross-validation.
# return_estimator=True keeps the fitted estimator of every fold.
lgb = LGBMClassifier()
lgb_result = cross_validate(lgb, X, y, cv=5, return_estimator=True)
lgb_scores = lgb_result['test_score']
# Keep the fold estimator with the highest test score (the first one on ties).
# NOTE(review): this estimator was fitted on the training part of one fold
# only — consider refitting on all data before saving.
lgb_model = lgb_result['estimator'][lgb_scores.argmax()]
# Print the mean score over the folds as a percentage
test_mean = lgb_scores.mean() * 100
print(f"Accuracy: {test_mean:.2f}%")
dump(lgb_model, save_model_path + 'lgb.joblib'); # save the model

Accuracy: 100.00%
