# 作品背景阐释

智能手机和以智能手机为基础的生物识别使用日常生活活动是非常具有价值与意义的。而在众多的户外运动中，登山运动深受广大户外运动爱好者追捧，克服重重困难登上山顶的那一刻，兴奋与刺激溢于言表。然而近年来，登山队迷路、队员走失、山内信号微弱导致无法对外求援等问题带来的意外伤害事故时有发生。如果可以帮助登山爱好者在登山前做好智能追踪等安全保障工作，户外运动就会更加无忧、快乐——智能追踪手环、遇险自动报警装置、偏离路线智能提醒、装备不合格智能预警……

# 作品说明
本项目利用智能手机陀螺仪传感器的数据，对常见的两种传感器数据组织形式进行了建模，并搭建了高效准确、自动化、端到端的机器学习模型，能够有效的对人类活动进行识别与预测，保障登山爱好者的安全，对路径进行智能追踪与安全保障，让我们成为登山爱好者的后盾

## 创新性
本项目使用非常前沿的Permutation特征重要性技术对海量特征进行筛选，并构建多模型进一步确保了算法稳定性。在神经网络中，我们使用了Lookahead技术，能够有效找到更加光滑的梯度鞍点，并使用mish激活函数，使网络学习的更深更好

## 完成度
本项目将现有可能的数据输入形式做了汇总，并构建了端到端的易用准确模型，对人类活动数据进行了非常完善的建模与评估，完成了从自动化的特征工程到自动化的特征筛选，并可持久化存储了模型权重，方便任意时刻对类似的数据输入进行推理

## 实用性
本项目采取一键式训练方案，即无需人工进行特征工程构造，我们使用大量前沿有效的算法，将学术界优雅的解决方案带到工业界，减少先验知识的构造，更加专业的去分析用户轨迹

# 商业前景
本项目在`B端`，不仅可以利用在登山爱好者的安全保障，更可以使用在各式各样的推荐场景，对用户行为的判别可以使得推荐更准确，更符合用户需求，能够有效为推荐模型提供更为丰富的决策依据。在`C端`，可以绑定在AMS的服务器上，以服务（外设）的形式出售给登山爱好者，为其提供安全保障。

##### 环境依赖
```
tensorboard==2.2.2
tensorboard-plugin-wit==1.6.0.post3
tensorflow-addons==0.11.2
tensorflow-estimator==2.2.0
tensorflow-gpu==2.2.0
tensorflow-probability==0.10.0
scikit-learn==0.23.1
scipy==1.4.1
numpy==1.18.5
pandas==1.0.4
eli5==0.10.1
lightgbm==3.0.0
tqdm==4.46.1
```

In [1]:
# 包引入
import warnings
import os
import gc
import eli5
from tqdm import tqdm
from keras.utils.np_utils import to_categorical
from eli5.sklearn import PermutationImportance
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import *
from tensorflow.keras.layers import *
from scipy.signal import resample
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from tensorflow_addons.activations import mish, gelu
from tensorflow_addons.callbacks import TQDMProgressBar
from tensorflow_addons.metrics import F1Score
from tensorflow_addons.layers import WeightNormalization
from tensorflow_addons.optimizers import Lookahead, AdamW, RectifiedAdam
import tensorflow as tf
import tensorflow_addons as tfa
tfa.options.TF_ADDONS_PY_OPS = True

warnings.filterwarnings("ignore")
os.environ['CUDA_VISIBLE_DEVICES'] = "2"

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)



## 两种建模方式

#### 1. 针对已经提取好的特征维表，我们使用LightGBM、DNN模型，并测试特征筛选、模型集成的效果

In [2]:
train = pd.read_csv("./zhin1768/train.csv")
test = pd.read_csv("./zhin1768/test.csv")
train.shape, test.shape

df = pd.concat([train, test], ignore_index=True)
df.shape, df['subject'].nunique()  # 拼接训练集测试集、并输出总人数

((10299, 563), 30)

In [3]:
# 将label映射成int

label_hash = LabelEncoder()
df['Activity'] = label_hash.fit_transform(df['Activity'])

# 将数值特征标准化
feature_name = [i for i in df.columns if i not in ['subject', 'Activity']]
df[feature_name] = StandardScaler().fit_transform(df[feature_name].fillna(0)) # 缺失值按0填充

target = 'Activity'

In [4]:
# 传入处理后的训练集与测试集
train = df[:len(train)]
test = df[len(train):].reset_index(drop=True)
train.shape, test.shape

((7352, 563), (2947, 563))

In [5]:
# 我们使用大名鼎鼎的Permutation Importance做特征筛选

max_features = 500
show_featuers = 30
lr = LogisticRegression(C=0.1, n_jobs=20)
lr.fit(train[feature_name], train[target])

perm = PermutationImportance(lr, scoring='f1_macro', random_state=2020).\
                             fit(train[feature_name], train[target])

fi = pd.DataFrame()
fi['feature_name'] = feature_name
fi['score'] = perm.feature_importances_
select_feature_name = fi.sort_values(by=['score'],ascending=False)[:max_features]['feature_name'].values

eli5.show_weights(perm, feature_names = feature_name, top=show_featuers)

Weight,Feature
0.0379  ± 0.0022,tBodyGyroJerk-entropy()-X
0.0154  ± 0.0021,fBodyGyro-entropy()-X
0.0060  ± 0.0022,tGravityAcc-max()-Y
0.0058  ± 0.0014,"tBodyGyroJerk-arCoeff()-X,2"
0.0058  ± 0.0004,tGravityAcc-mean()-Y
0.0057  ± 0.0010,"tBodyGyroJerk-arCoeff()-X,1"
0.0044  ± 0.0021,tGravityAcc-min()-Y
0.0042  ± 0.0014,tBodyGyroJerk-entropy()-Y
0.0037  ± 0.0016,"tBodyGyro-arCoeff()-X,1"
0.0035  ± 0.0012,"tBodyGyroJerk-arCoeff()-X,3"


In [6]:
def lgb_model(train, target, test, kfolds=5, drop_col=['label']):
    feats = [f for f in train.columns if f not in drop_col]
    print('Current num of features:', len(feats))
    folds = StratifiedKFold(n_splits=kfolds, shuffle=True, random_state=2020)
    oof_probs = np.zeros((train.shape[0], len(label_hash.classes_)))
    output_preds = np.zeros((test.shape[0], len(label_hash.classes_)))
    offline_score = []
    feature_importance_df = pd.DataFrame()
    parameters = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class' : len(label_hash.classes_),
        'metric': 'multi_error',
        'num_leaves': 128,
        'feature_fraction': 0.3,
        'bagging_fraction': 0.3,
        'min_data_in_leaf': 10,
        'verbose': -1,
        'nthread': 30,
    }

    BOOST_ROUND = 100000
    ES = 300
    VE = 300
    
    for i, (train_index, test_index) in enumerate(folds.split(train, target)):
        train_y, test_y = target[train_index], target[test_index]
        train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]

        dtrain = lgb.Dataset(train_X.values,
                             label=train_y)
        dval = lgb.Dataset(test_X.values,
                           label=test_y)
        lgb_model = lgb.train(
                parameters,
                dtrain,
                num_boost_round=BOOST_ROUND,
                valid_sets=[dval],
                early_stopping_rounds=ES,
                verbose_eval=VE,
        )
        oof_probs[test_index] = lgb_model.predict(test_X[feats].values, num_iteration=lgb_model.best_iteration)
        offline_score.append(lgb_model.best_score['valid_0']['multi_error'])
        output_preds += lgb_model.predict(test[feats].values, num_iteration=lgb_model.best_iteration) / folds.n_splits
        print(offline_score)
        # feature importance
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = lgb_model.feature_importance(importance_type='gain')
        fold_importance_df["fold"] = i + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('OOF-MEAN-ACC:%.6f, OOF-STD-ACC:%.6f' % (1 - np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    print(feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head(30))

    return output_preds, oof_probs, np.mean(offline_score), feature_importance_df

In [7]:
def build_model(len_fi):
    inp = Input(len_fi, )
    x = Dropout(0.5)(inp)
    x = BatchNormalization()(x)
    x = WeightNormalization(Dense(2048, activation=mish))(x)
    
    x = Dropout(0.4)(inp)
    x = BatchNormalization()(x)
    x = WeightNormalization(Dense(1024, activation=mish))(x)
    
    x = Dropout(0.3)(inp)
    x = BatchNormalization()(x)
    x = WeightNormalization(Dense(512, activation=mish))(x)
    
    output = WeightNormalization(Dense(len(label_hash.classes_), activation='softmax'))(x)
    
    model = Model([inp], output)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=Lookahead(AdamW(weight_decay=1e-5, beta_1=0.9, beta_2=0.999, learning_rate=1e-3),
                            sync_period=5),
        metrics=[F1Score(num_classes=len(label_hash.classes_), average='macro', threshold=0.5),
                 'acc']
    )
    return model

def dnn_model(train, target, test, kfolds=5, drop_col=['label']):
    feats = [f for f in train.columns if f not in drop_col]
    folds = KFold(n_splits=kfolds, shuffle=True, random_state=2020)
    oof_probs = np.zeros((train.shape[0], len(label_hash.classes_)))
    output_preds = np.zeros((test.shape[0], len(label_hash.classes_)))
    
    try:
        os.system("mkdir ./dnn_weights")
    except:
        print("MKDIR FINISHED...")
        
    ES = 20
    EPOCHS = 2000
    BATCH_SIZE = 128
    target = to_categorical(target, num_classes=len(label_hash.classes_))
    
    for i, (train_index, test_index) in enumerate(folds.split(train, target)):
        train_y, test_y = target[train_index], target[test_index]
        train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]

        model = build_model(len(feats))
        early_stopping = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=ES)
        plateau = ReduceLROnPlateau(monitor="val_acc", verbose=0, mode='max', factor=0.1, patience=ES//2)
        checkpoint = ModelCheckpoint('./dnn_weights/' + str(i) + '.hdf5', monitor='val_acc', 
                                         verbose=0, save_best_only=True, mode='max',save_weights_only=True)
        tqdmbar = TQDMProgressBar(show_epoch_progress=False)
        
        model.fit(
            train_X, train_y,
            batch_size = BATCH_SIZE,
            epochs = EPOCHS,
            verbose=0,
            validation_data = (test_X, test_y),
            shuffle = True,
            callbacks=[early_stopping, plateau, checkpoint, tqdmbar],
        )
        
        model.load_weights('./dnn_weights/' + str(i) + '.hdf5')
        oof_probs[test_index] = model.predict(test_X[feats].values, batch_size=BATCH_SIZE*4)
        output_preds += model.predict(test[feats].values, batch_size=BATCH_SIZE*4) / folds.n_splits
        
        del model; gc.collect()
        K.clear_session()
        
    return output_preds, oof_probs

In [8]:
lgb_preds, lgb_oof, lgb_score, fi = lgb_model(train=train, 
                                              target=train[target].values, 
                                              test=test, 
                                              kfolds=5, drop_col=['subject','Activity'])

dnn_preds, dnn_oof = dnn_model(train=train, 
                               target=train[target].values, 
                               test=test, 
                               kfolds=5, 
                               drop_col=['subject','Activity'])

Current num of features: 561
Training until validation scores don't improve for 300 rounds
[300]	valid_0's multi_error: 0.00747791
Early stopping, best iteration is:
[203]	valid_0's multi_error: 0.00747791
[0.007477906186267845]
Training until validation scores don't improve for 300 rounds
[300]	valid_0's multi_error: 0.0067981
Early stopping, best iteration is:
[216]	valid_0's multi_error: 0.0067981
[0.007477906186267845, 0.006798096532970768]
Training until validation scores don't improve for 300 rounds
[300]	valid_0's multi_error: 0.00408163
Early stopping, best iteration is:
[129]	valid_0's multi_error: 0.00408163
[0.007477906186267845, 0.006798096532970768, 0.004081632653061225]
Training until validation scores don't improve for 300 rounds
[300]	valid_0's multi_error: 0.00816327
Early stopping, best iteration is:
[204]	valid_0's multi_error: 0.00816327
[0.007477906186267845, 0.006798096532970768, 0.004081632653061225, 0.00816326530612245]
Training until validation scores don't imp

HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=2000.0, style=Pro…

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Epoch 00064: early stopping



HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=2000.0, style=Pro…

Epoch 00085: early stopping



HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=2000.0, style=Pro…

Epoch 00048: early stopping



HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=2000.0, style=Pro…

Epoch 00071: early stopping



HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=2000.0, style=Pro…

Epoch 00092: early stopping



In [9]:
# 我们使用简单的线性加权来查看分数

average = accuracy_score(train[target], np.argmax(dnn_oof+lgb_oof, axis=1))
lgb_score = accuracy_score(train[target], np.argmax(lgb_oof, axis=1))
dnn_score = accuracy_score(train[target], np.argmax(dnn_oof, axis=1))

print("Average {} \nLGB {} \nDNN {}".format(average, lgb_score, dnn_score))

Average 0.9940152339499456 
LGB 0.9930631120783461 
DNN 0.9883025027203483


#### 2. 根据原始陀螺仪数据进行自动化的建模

In [10]:
train = pd.read_csv("./xinwang/sensor_train.csv")
test = pd.read_csv("./xinwang/sensor_test.csv")
test['fragment_id'] += 10000 

df = pd.concat([train, test], ignore_index=True)
label = train[['fragment_id','behavior_id']].drop_duplicates().reset_index(drop=True)

feature_name = [i for i in df.columns if i not in ['fragment_id','behavior_id','time_point']]

df[feature_name] = StandardScaler().fit_transform(df[feature_name].fillna(0).replace([np.inf, -np.inf], 0))
train = df[df['fragment_id'].isin(train['fragment_id'])].reset_index(drop=True)
test = df[df['fragment_id'].isin(test['fragment_id'])].reset_index(drop=True)

In [11]:
maxlen = 60

x = np.zeros((train['fragment_id'].nunique(), maxlen, len(feature_name)))
t = np.zeros((test['fragment_id'].nunique(), maxlen, len(feature_name)))
for i in tqdm(range(train['fragment_id'].nunique())):
    tmp = train[train.fragment_id == i][:maxlen]
    x[i,:,:] = resample(tmp[feature_name], maxlen, np.array(tmp.time_point))[0]
for i in tqdm(range(test['fragment_id'].nunique())):
    tmp = test[test.fragment_id == i + 10000][:maxlen]
    t[i,:,:] = resample(tmp[feature_name], maxlen, np.array(tmp.time_point))[0]
    
y = train.groupby('fragment_id')['behavior_id'].min()
class_num = train['behavior_id'].nunique()

100%|██████████| 7292/7292 [00:11<00:00, 613.45it/s]
100%|██████████| 7500/7500 [00:11<00:00, 630.86it/s]


In [12]:
def build_model():
    input = Input(shape=(maxlen, len(feature_name)))

    x1 = Conv1D(256, kernel_size=4, activation=mish, padding='same')(input)
    x2 = Conv1D(256, kernel_size=3, activation=mish, padding='same')(x1)
    x2 = Add()([x1, x2])
    x3 = Conv1D(128, kernel_size=2, activation=mish, padding='same')(x2)
    x4 = Conv1D(128, kernel_size=2, activation=mish, padding='same')(x3)
    x4 = Add()([x3, x4])
    x5 = Conv1D(128, kernel_size=2, activation=mish, padding='same')(x4)
    x5 = Add()([x4, x5])
    x6 = Conv1D(128, kernel_size=2, activation=mish, padding='same')(x5)
    x6 = Add()([x5, x6])
    x7 = Conv1D(128, kernel_size=2, activation=mish, padding='same')(x6)
    x7 = Add()([x6, x7])
    x8 = Conv1D(128, kernel_size=2, activation=mish, padding='same')(x7)
    x8 = Add()([x7, x8])

    Y1 = Lambda(lambda temp: K.max(temp, axis=1))(x1)
    Y2 = Lambda(lambda temp: K.max(temp, axis=1))(x2)
    Y3 = Lambda(lambda temp: K.max(temp, axis=1))(x3)
    Y4 = Lambda(lambda temp: K.max(temp, axis=1))(x4)
    Y5 = Lambda(lambda temp: K.max(temp, axis=1))(x5)
    Y6 = Lambda(lambda temp: K.max(temp, axis=1))(x6)
    Y7 = Lambda(lambda temp: K.max(temp, axis=1))(x7)
    Y8 = Lambda(lambda temp: K.max(temp, axis=1))(x8)

    X1 = concatenate([Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8], axis=-1)

    Y1 = Lambda(lambda temp: K.mean(temp, axis=1))(x1)
    Y2 = Lambda(lambda temp: K.mean(temp, axis=1))(x2)
    Y3 = Lambda(lambda temp: K.mean(temp, axis=1))(x3)
    Y4 = Lambda(lambda temp: K.mean(temp, axis=1))(x4)
    Y5 = Lambda(lambda temp: K.mean(temp, axis=1))(x5)
    Y6 = Lambda(lambda temp: K.mean(temp, axis=1))(x6)
    Y7 = Lambda(lambda temp: K.mean(temp, axis=1))(x7)
    Y8 = Lambda(lambda temp: K.mean(temp, axis=1))(x8)

    X2 = concatenate([Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8], axis=-1)
    X = Concatenate()([X1, X2])

    X = Dropout(0.3)(X)
    X = WeightNormalization(Dense(class_num, activation='softmax'))(X)

    return Model([input], X)


proba_t = np.zeros((test['fragment_id'].nunique(), class_num))
oof_x = np.zeros((train['fragment_id'].nunique(), class_num))
kfold = StratifiedKFold(5, shuffle=True, random_state=2020)

for fold, (train_index, valid_index) in enumerate(kfold.split(x, y)):
    K.clear_session()
    try:
        os.system("mkdir ./dnn_weights")
    except:
        print("MKDIR FINISHED...")
    y_ = to_categorical(y, num_classes=class_num)

    model = build_model()
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(1e-3),
                  metrics=['acc'])
    plateau = ReduceLROnPlateau(monitor="val_acc",
                                verbose=0,
                                mode='max',
                                factor=0.1,
                                patience=6)
    early_stopping = EarlyStopping(monitor='val_acc',
                                   verbose=0,
                                   mode='max',
                                   patience=10)
    checkpoint = ModelCheckpoint(f'./dnn_weights/big-fold{fold}.h5',
                                 monitor='val_acc',
                                 verbose=0,
                                 mode='max',
                                 save_best_only=True)
    tqdmbar = TQDMProgressBar(show_epoch_progress=False)

    model.fit(x[train_index], y_[train_index],
              epochs=300,
              batch_size=128,
              verbose=0,
              shuffle=True,
              validation_data=(x[valid_index], y_[valid_index]),
              callbacks=[plateau, early_stopping, checkpoint, tqdmbar])
    model.load_weights(f'./dnn_weights/big-fold{fold}.h5')
    oof_x[valid_index] += model.predict(x[valid_index],
                                        verbose=0, batch_size=256) / 5.
    proba_t += model.predict(t, verbose=0, batch_size=256) / 5.

HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=300.0, style=Prog…




HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=300.0, style=Prog…




HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=300.0, style=Prog…




HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=300.0, style=Prog…




HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=300.0, style=Prog…




In [13]:
def acc_combo(y, y_pred):
    # 数值ID与行为编码的对应关系
    mapping = {0: 'A_0', 1: 'A_1', 2: 'A_2', 3: 'A_3', 
        4: 'D_4', 5: 'A_5', 6: 'B_1',7: 'B_5', 
        8: 'B_2', 9: 'B_3', 10: 'B_0', 11: 'A_6', 
        12: 'C_1', 13: 'C_3', 14: 'C_0', 15: 'B_6', 
        16: 'C_2', 17: 'C_5', 18: 'C_6'}
    # 将行为ID转为编码
    code_y, code_y_pred = mapping[y], mapping[y_pred]
    if code_y == code_y_pred: #编码完全相同得分1.0
        return 1.0
    elif code_y.split("_")[0] == code_y_pred.split("_")[0]: #编码仅字母部分相同得分1.0/7
        return 1.0/7
    elif code_y.split("_")[1] == code_y_pred.split("_")[1]: #编码仅数字部分相同得分1.0/3
        return 1.0/3
    else:
        return 0.0
    
def acc_metric(y, y_pred):
    score = 0
    for i in range(len(y)):
        score += acc_combo(y[i], y_pred[i]) / len(y)
    return score

target = 'behavior_id'

oof_pred = np.argmax(oof_x, axis=1)
scores = acc_metric(label[target].values, oof_pred)
print("NN Offline {}".format(scores))

NN Offline 0.744253323929743
