# LGBM baseline

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score

pd.set_option('display.max_columns', 500)

In [2]:
DATA_DIR = '../dataset/raw/'

# features.txt holds one feature name per line.
with open(DATA_DIR + 'features.txt') as f:
    features_txt = f.readlines()

# Strip surrounding whitespace, then replace every non-alphanumeric character
# with "_" so that column names consist only of letters, digits and underscores
# (presumably because LightGBM rejects special characters in feature names).
features_name = [
    "".join(ch if ch.isalnum() else "_" for ch in str(raw_name.strip()))
    for raw_name in features_txt
]

# The CSV files are read with explicitly supplied column names.
X_train = pd.read_csv(DATA_DIR + 'X_train.csv', names=features_name)
X_test = pd.read_csv(DATA_DIR + 'X_test.csv', names=features_name)
y_train = pd.read_csv(DATA_DIR + 'y_train.csv', names=['activity_label'])
subject_train = pd.read_csv(DATA_DIR + 'subject_train.csv', names=['subject_id'])

# Shift the labels by -1 so that they start from 0.
y_train['activity_label'] = y_train['activity_label'].sub(1)

# Single training frame: features, subject id and label side by side.
all_df_train = pd.concat([X_train, subject_train, y_train], axis=1)

In [3]:
def lgb_trainer_with_group_kfold(df_train, df_test, features, target, group, num_folds=15, params=None):
    """Train one multiclass LightGBM model per GroupKFold fold and predict the test set.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame containing the feature, target and group columns.
    df_test : pandas.DataFrame
        Frame to predict; must contain the ``features`` columns.
    features : list-like of str
        Names of the columns used as model inputs.
    target : str or single-element list of str
        Column holding the class label. Predictions are decoded with argmax,
        so labels are expected to be 0-based integers.
    group : str or single-element list of str
        Column whose values define the groups kept together by GroupKFold.
    num_folds : int, default 15
        Number of GroupKFold splits (one model is trained per split).
    params : dict or None
        LightGBM parameters for a multiclass objective. The dict is copied,
        never modified. When ``num_class`` is absent it is inferred as
        ``max(label) + 1``. When ``None`` a default multiclass configuration
        is used.

    Returns
    -------
    test_preds : numpy.ndarray
        Array of shape ``(num_folds, len(df_test), num_class)`` with the
        per-fold class scores predicted for ``df_test``.
    importance_df : pandas.DataFrame
        One column ``'Fold k'`` per fold, listing the feature names ordered
        from the highest to the lowest ``model.feature_importance()`` value.

    Raises
    ------
    ValueError
        If ``target`` or ``group`` does not select exactly one column.
    """
    # data setup
    X_train = df_train[features]
    X_test = df_test[features]

    # ``target`` / ``group`` may arrive as one-element lists, which makes the
    # selection a one-column DataFrame. Flatten both to 1-D arrays so that
    # LightGBM, accuracy_score and GroupKFold all receive plain vectors.
    y_all = np.asarray(df_train[target]).ravel()
    groups = np.asarray(df_train[group]).ravel()
    if len(y_all) != len(X_train) or len(groups) != len(X_train):
        raise ValueError('target and group must each select exactly one column')

    # Resolve the parameters once, outside the fold loop, on a private copy.
    if params is None:
        params = {
            'learning_rate': 0.1,
            'metric': 'multi_logloss',
            'objective': 'multiclass',
            'feature_fraction': 0.85,
            'subsample': 0.85,
            'n_jobs': -1,
            'seed': 1234,
            'max_depth': -1
        }
    else:
        params = dict(params)

    # The class count drives the shape of the prediction buffers; take it from
    # the parameters when given, otherwise derive it from the labels.
    num_class = params.get('num_class', params.get('num_classes'))
    if num_class is None:
        num_class = int(y_all.max()) + 1
        params['num_class'] = num_class
    num_class = int(num_class)

    # initialize
    valid_preds = np.zeros((len(X_train), num_class))
    test_preds = np.zeros((num_folds, len(X_test), num_class))
    kf = GroupKFold(n_splits=num_folds)
    importance_df = pd.DataFrame()

    for fold, (train_index, valid_index) in enumerate(kf.split(X=X_train, groups=groups)):
        print('Fold {}'.format(fold + 1))
        x_trn, x_val = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_trn, y_val = y_all[train_index], y_all[valid_index]
        train_set = lgb.Dataset(x_trn, label=y_trn)
        # Validation data references the training Dataset, as the LightGBM
        # documentation recommends for validation sets.
        val_set = lgb.Dataset(x_val, label=y_val, reference=train_set)

        # Early stopping / logging: LightGBM 4.0 removed the
        # ``early_stopping_rounds`` and ``verbose_eval`` arguments of
        # ``lgb.train`` in favour of callbacks, so use callbacks when the
        # installed version provides them and the legacy keywords otherwise.
        # The callbacks are rebuilt for every fold so that no early-stopping
        # state is carried over from the previous fold.
        if hasattr(lgb, 'log_evaluation'):
            train_kwargs = {
                'callbacks': [
                    lgb.early_stopping(stopping_rounds=10),
                    lgb.log_evaluation(period=10),
                ]
            }
        else:
            train_kwargs = {'early_stopping_rounds': 10, 'verbose_eval': 10}

        # train the model with early stopping
        model = lgb.train(params, train_set, num_boost_round=10000,
                          valid_sets=[train_set, val_set], **train_kwargs)
        fold_preds = model.predict(x_val)
        valid_preds[valid_index] = fold_preds

        # calc score
        score = accuracy_score(y_val, np.argmax(fold_preds, axis=1))
        print('Fold {} Score : {}'.format(fold + 1, score))

        # get feature importance: feature names sorted by descending importance
        feature_imp = pd.DataFrame(
            sorted(zip(model.feature_importance(), features), reverse=True),
            columns=['Value', 'Feature']
        )
        importance_df['Fold {}'.format(fold + 1)] = feature_imp['Feature']

        # get test predict
        test_preds[fold] = model.predict(X_test)

    # Every training row is validated exactly once, so the out-of-fold
    # predictions give an overall cross-validation accuracy.
    cv_score = accuracy_score(y_all, np.argmax(valid_preds, axis=1))
    print('CV Score : {}'.format(cv_score))

    return test_preds, importance_df

In [4]:
# Every column of X_train is used as a model input.
features = X_train.columns
target = ['activity_label']
group = ['subject_id']

# LightGBM settings for the 6-class classification task.
params = dict(
    learning_rate=0.1,
    objective='multiclass',
    num_class=6,
    n_jobs=-1,
    seed=1234,
)

# Train with 15 group-wise folds; keeps the per-fold test predictions and the
# per-fold feature ranking.
test_preds, importantce_df = lgb_trainer_with_group_kfold(
    df_train=all_df_train,
    df_test=X_test,
    features=features,
    target=target,
    group=group,
    num_folds=15,
    params=params,
)

Fold 1
Training until validation scores don't improve for 10 rounds
[10]	training's multi_logloss: 0.538108	valid_1's multi_logloss: 0.56559
[20]	training's multi_logloss: 0.204447	valid_1's multi_logloss: 0.236203
[30]	training's multi_logloss: 0.0813578	valid_1's multi_logloss: 0.106996
[40]	training's multi_logloss: 0.0330009	valid_1's multi_logloss: 0.0522653
[50]	training's multi_logloss: 0.0135288	valid_1's multi_logloss: 0.0256936
[60]	training's multi_logloss: 0.00566862	valid_1's multi_logloss: 0.013603
[70]	training's multi_logloss: 0.00240061	valid_1's multi_logloss: 0.00757135
[80]	training's multi_logloss: 0.00103427	valid_1's multi_logloss: 0.00491703
[90]	training's multi_logloss: 0.000451169	valid_1's multi_logloss: 0.00326127
[100]	training's multi_logloss: 0.000195308	valid_1's multi_logloss: 0.00242068
[110]	training's multi_logloss: 8.57059e-05	valid_1's multi_logloss: 0.00189962
[120]	training's multi_logloss: 3.83693e-05	valid_1's multi_logloss: 0.00145767
[130]	t

In [5]:
# Show the first ten rows of the per-fold feature ranking.
importantce_df.iloc[:10]

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Fold 6,Fold 7,Fold 8,Fold 9,Fold 10,Fold 11,Fold 12,Fold 13,Fold 14,Fold 15
0,53_tGravityAcc_min___X,53_tGravityAcc_min___X,38_tBodyAcc_correlation___X_Y,53_tGravityAcc_min___X,53_tGravityAcc_min___X,38_tBodyAcc_correlation___X_Y,38_tBodyAcc_correlation___X_Y,53_tGravityAcc_min___X,58_tGravityAcc_energy___Y,53_tGravityAcc_min___X,53_tGravityAcc_min___X,53_tGravityAcc_min___X,53_tGravityAcc_min___X,58_tGravityAcc_energy___Y,38_tBodyAcc_correlation___X_Y
1,58_tGravityAcc_energy___Y,38_tBodyAcc_correlation___X_Y,58_tGravityAcc_energy___Y,38_tBodyAcc_correlation___X_Y,58_tGravityAcc_energy___Y,58_tGravityAcc_energy___Y,58_tGravityAcc_energy___Y,58_tGravityAcc_energy___Y,38_tBodyAcc_correlation___X_Y,38_tBodyAcc_correlation___X_Y,38_tBodyAcc_correlation___X_Y,54_tGravityAcc_min___Y,38_tBodyAcc_correlation___X_Y,38_tBodyAcc_correlation___X_Y,58_tGravityAcc_energy___Y
2,38_tBodyAcc_correlation___X_Y,51_tGravityAcc_max___Y,53_tGravityAcc_min___X,58_tGravityAcc_energy___Y,275_fBodyAcc_max___X,53_tGravityAcc_min___X,53_tGravityAcc_min___X,51_tGravityAcc_max___Y,53_tGravityAcc_min___X,160_tBodyGyro_correlation___Y_Z,42_tGravityAcc_mean___Y,38_tBodyAcc_correlation___X_Y,160_tBodyGyro_correlation___Y_Z,51_tGravityAcc_max___Y,53_tGravityAcc_min___X
3,51_tGravityAcc_max___Y,54_tGravityAcc_min___Y,51_tGravityAcc_max___Y,51_tGravityAcc_max___Y,160_tBodyGyro_correlation___Y_Z,54_tGravityAcc_min___Y,451_fBodyGyro_maxInds_Z,38_tBodyAcc_correlation___X_Y,51_tGravityAcc_max___Y,42_tGravityAcc_mean___Y,58_tGravityAcc_energy___Y,51_tGravityAcc_max___Y,559_angle_X_gravityMean_,53_tGravityAcc_min___X,54_tGravityAcc_min___Y
4,451_fBodyGyro_maxInds_Z,42_tGravityAcc_mean___Y,54_tGravityAcc_min___Y,42_tGravityAcc_mean___Y,451_fBodyGyro_maxInds_Z,51_tGravityAcc_max___Y,51_tGravityAcc_max___Y,451_fBodyGyro_maxInds_Z,160_tBodyGyro_correlation___Y_Z,451_fBodyGyro_maxInds_Z,51_tGravityAcc_max___Y,58_tGravityAcc_energy___Y,58_tGravityAcc_energy___Y,54_tGravityAcc_min___Y,504_fBodyAccMag_std__
5,140_tBodyGyro_iqr___X,451_fBodyGyro_maxInds_Z,160_tBodyGyro_correlation___Y_Z,130_tBodyGyro_max___X,559_angle_X_gravityMean_,160_tBodyGyro_correlation___Y_Z,160_tBodyGyro_correlation___Y_Z,160_tBodyGyro_correlation___Y_Z,42_tGravityAcc_mean___Y,130_tBodyGyro_max___X,140_tBodyGyro_iqr___X,504_fBodyAccMag_std__,451_fBodyGyro_maxInds_Z,451_fBodyGyro_maxInds_Z,160_tBodyGyro_correlation___Y_Z
6,452_fBodyGyro_meanFreq___X,23_tBodyAcc_entropy___X,140_tBodyGyro_iqr___X,59_tGravityAcc_energy___Z,130_tBodyGyro_max___X,504_fBodyAccMag_std__,434_fBodyGyro_max___Y,54_tGravityAcc_min___Y,452_fBodyGyro_meanFreq___X,58_tGravityAcc_energy___Y,130_tBodyGyro_max___X,559_angle_X_gravityMean_,51_tGravityAcc_max___Y,160_tBodyGyro_correlation___Y_Z,451_fBodyGyro_maxInds_Z
7,130_tBodyGyro_max___X,56_tGravityAcc_sma__,452_fBodyGyro_meanFreq___X,504_fBodyAccMag_std__,54_tGravityAcc_min___Y,451_fBodyGyro_maxInds_Z,54_tGravityAcc_min___Y,130_tBodyGyro_max___X,504_fBodyAccMag_std__,434_fBodyGyro_max___Y,451_fBodyGyro_maxInds_Z,42_tGravityAcc_mean___Y,504_fBodyAccMag_std__,42_tGravityAcc_mean___Y,42_tGravityAcc_mean___Y
8,42_tGravityAcc_mean___Y,275_fBodyAcc_max___X,103_tBodyAccJerk_entropy___X,452_fBodyGyro_meanFreq___X,23_tBodyAcc_entropy___X,130_tBodyGyro_max___X,504_fBodyAccMag_std__,10_tBodyAcc_max___X,54_tGravityAcc_min___Y,140_tBodyGyro_iqr___X,54_tGravityAcc_min___Y,103_tBodyAccJerk_entropy___X,42_tGravityAcc_mean___Y,559_angle_X_gravityMean_,51_tGravityAcc_max___Y
9,434_fBodyGyro_max___Y,58_tGravityAcc_energy___Y,130_tBodyGyro_max___X,451_fBodyGyro_maxInds_Z,504_fBodyAccMag_std__,56_tGravityAcc_sma__,506_fBodyAccMag_max__,506_fBodyAccMag_max__,130_tBodyGyro_max___X,10_tBodyAcc_max___X,452_fBodyGyro_meanFreq___X,451_fBodyGyro_maxInds_Z,275_fBodyAcc_max___X,130_tBodyGyro_max___X,10_tBodyAcc_max___X


In [6]:
# Average the class scores over the folds, take the most likely class, and
# add 1 to map the 0-based prediction back to the original label values.
submit = np.argmax(np.mean(test_preds, axis=0), axis=1) + 1

# Labels are integers: write them as such instead of np.savetxt's default
# '%.18e' scientific-notation float format.
np.savetxt('baseline.txt', submit, fmt='%d')