In [16]:
import pandas as pd
import warnings
import missingno as msno

import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [17]:
import json
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from scipy.stats import kurtosis, iqr
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

from detecta import detect_peaks
from changepy import pelt
from changepy.costs import normal_mean, normal_var, normal_meanvar

In [18]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print(train.columns)
print(test.columns)

Index(['layer_1', 'layer_2', 'layer_3', 'layer_4', '0', '1', '2', '3', '4',
       '5',
       ...
       '216', '217', '218', '219', '220', '221', '222', '223', '224', '225'],
      dtype='object', length=230)
Index(['id', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '216', '217', '218', '219', '220', '221', '222', '223', '224', '225'],
      dtype='object', length=227)


In [19]:
#독립변수와 종속변수를 분리합니다.
train_X = train.iloc[:,4:]
train_Y = train.iloc[:,0:4]
test_X = test.iloc[:,1:]

In [20]:
# rms, rss 정의
def rms(x):
    return np.sqrt(np.mean(x**2))

def rss(x):
    return rms(x)*len(x)

def skewness(x):
    return (sum((x-np.mean(x))**3)/len(x))/(sum((x-np.mean(x))**2)/len(x))**(3/2)

In [21]:
import numpy as np

from scipy.stats import kurtosis, iqr

function_list = ['mean', 'min', 'max', 'std', skewness, rss]


In [22]:
train_X.iloc[1:10,:].aggregate(function_list, axis=1)

Unnamed: 0,mean,min,max,std,skewness,rss
1,0.211725,0.06086,0.653231,0.144346,1.391504,57.871515
2,0.259477,0.034894,0.750391,0.217637,0.805334,76.468513
3,0.310657,0.027712,0.805305,0.260747,0.472784,91.577755
4,0.361292,0.030385,0.819105,0.27885,0.19902,103.05828
5,0.403067,0.027361,0.79003,0.276715,-0.061827,110.415481
6,0.43163,0.027519,0.734198,0.258159,-0.328845,113.598502
7,0.433948,0.030484,0.695156,0.219381,-0.593736,109.842879
8,0.411927,0.033531,0.677778,0.180273,-0.535266,101.584003
9,0.37902,0.05913,0.617094,0.160536,-0.205367,92.994025


In [23]:
train_summary = train_X.aggregate(function_list,axis=1)
test_summary = test_X.aggregate(function_list,axis=1)

In [24]:
feature_cols = list(train_X)
feature_cols[0:5]

['0', '1', '2', '3', '4']

In [25]:
from changepy import pelt
from changepy.costs import normal_mean, normal_var, normal_meanvar

train_peak = []

for i in range(len(train_X)):
    cp1 = len(pelt(normal_mean(train_X.iloc[i,][feature_cols], np.var(train_X.iloc[i,][feature_cols])), len(train_X.iloc[i,][feature_cols]))) - 1
    cp2 = len(pelt(normal_mean(train_X.iloc[i,][feature_cols], np.var(train_X.iloc[i,][feature_cols])), len(train_X.iloc[i,][feature_cols]))) - 1
    cp3 = len(pelt(normal_var(train_X.iloc[i,][feature_cols], np.mean(train_X.iloc[i,][feature_cols])), len(train_X.iloc[i,][feature_cols]))) - 1
    cp4 = len(pelt(normal_var(train_X.iloc[i,][feature_cols], np.mean(train_X.iloc[i,][feature_cols])), len(train_X.iloc[i,][feature_cols]))) - 1
    cp5 = len(pelt(normal_meanvar(train_X.iloc[i,][feature_cols]), len(train_X.iloc[i,][feature_cols]))) - 1
    cp6 = len(pelt(normal_meanvar(train_X.iloc[i,][feature_cols]), len(train_X.iloc[i,][feature_cols]))) - 1
    train_peak.append(pd.DataFrame({'d':[i], 'cp1':[cp1], 'cp2':[cp2], 'cp3':[cp3], 'cp4':[cp4], 'cp5':[cp5], 'cp6':[cp6]}))

train_peak = pd.concat(train_peak)



In [26]:
test_peak = []

for i in range(len(test_X)):
    cp1 = len(pelt(normal_mean(test_X.iloc[i,][feature_cols], np.var(test_X.iloc[i,][feature_cols])), len(test_X.iloc[i,][feature_cols]))) - 1
    cp2 = len(pelt(normal_mean(test_X.iloc[i,][feature_cols], np.var(test_X.iloc[i,][feature_cols])), len(test_X.iloc[i,][feature_cols]))) - 1
    cp3 = len(pelt(normal_var(test_X.iloc[i,][feature_cols], np.mean(test_X.iloc[i,][feature_cols])), len(test_X.iloc[i,][feature_cols]))) - 1
    cp4 = len(pelt(normal_var(test_X.iloc[i,][feature_cols], np.mean(test_X.iloc[i,][feature_cols])), len(test_X.iloc[i,][feature_cols]))) - 1
    cp5 = len(pelt(normal_meanvar(test_X.iloc[i,][feature_cols]), len(test_X.iloc[i,][feature_cols]))) - 1
    cp6 = len(pelt(normal_meanvar(test_X.iloc[i,][feature_cols]), len(test_X.iloc[i,][feature_cols]))) - 1
    test_peak.append(pd.DataFrame({'d':[i], 'cp1':[cp1], 'cp2':[cp2], 'cp3':[cp3], 'cp4':[cp4], 'cp5':[cp5], 'cp6':[cp6]}))

test_peak = pd.concat(test_peak)


In [27]:
train_peak_rslt = pd.DataFrame()

for i in range(len(train_X)):
    p = detect_peaks(train_X.iloc[i,][feature_cols], mph=4)
    f_n = len(p)
    p_interval = np.mean(np.diff(p)) if f_n > 2 else 0
    p_interval_std = np.std(np.diff(p)) if f_n > 2 else 0
    p_mean = np.mean(train_X.iloc[i,][feature_cols][p]) if f_n > 0 else 0
    p_max = np.max(train_X.iloc[i,][feature_cols][p]) if f_n > 0 else 0
    p_min = np.min(train_X.iloc[i,][feature_cols][p]) if f_n > 0 else 0
    p_std = np.std(train_X.iloc[i,][feature_cols][p]) if f_n > 0 else 0
    row_peak = pd.DataFrame({'d': i,
                             'f_n':[f_n],
                             'p_interval':[p_interval],
                             'p_interval_std':[p_interval_std],
                             'p_mean':[p_mean],
                             'p_max':[p_max],
                             'p_min':[p_min],
                             'p_std':[p_std]})
    train_peak_rslt = pd.concat([train_peak_rslt, row_peak])
    
    

In [28]:

test_peak_rslt = pd.DataFrame()

for i in range(len(test_X)):
    p = detect_peaks(test_X.iloc[i,][feature_cols], mph=4)
    f_n = len(p)
    p_interval = np.mean(np.diff(p)) if f_n > 2 else 0
    p_interval_std = np.std(np.diff(p)) if f_n > 2 else 0
    p_mean = np.mean(test_X.iloc[i,][feature_cols][p]) if f_n > 0 else 0
    p_max = np.max(test_X.iloc[i,][feature_cols][p]) if f_n > 0 else 0
    p_min = np.min(test_X.iloc[i,][feature_cols][p]) if f_n > 0 else 0
    p_std = np.std(test_X.iloc[i,][feature_cols][p]) if f_n > 0 else 0
    row_peak = pd.DataFrame({'d': i,
                         'f_n':[f_n],
                         'p_interval':[p_interval],
                         'p_interval_std':[p_interval_std],
                         'p_mean':[p_mean],
                         'p_max':[p_max],
                         'p_min':[p_min],
                         'p_std':[p_std]})
    test_peak_rslt = pd.concat([test_peak_rslt, row_peak])

In [29]:
# crest factor 추출

def crest(x):
    return np.max(x)/rms(x)

crests_train = pd.DataFrame()

for i in range(len(train_X)):
    cfR = crest(train_X.iloc[i,][feature_cols])
    cfA = crest(train_X.iloc[i,][feature_cols])
    row_crest = pd.DataFrame({'d': i, 'cfR': [cfR], 'cfA': [cfA]})
    crests_train = pd.concat([crests_train, row_crest])

crests_test = pd.DataFrame()

for i in range(len(test_X)):
    cfR = crest(test_X.iloc[i,][feature_cols])
    cfA = crest(test_X.iloc[i,][feature_cols])
    row_crest = pd.DataFrame({'d': i, 'cfR': [cfR], 'cfA': [cfA]})
    crests_test = pd.concat([crests_test, row_crest])


In [32]:
train_peak.to_pickle('train_peak.pkl')
test_peak.to_pickle('test_peak.pkl')
train_peak_rslt.to_pickle('train_peak_rslt.pkl')
test_peak_rslt.to_pickle('test_peak_rslt.pkl')
crests_train.to_pickle('crests_train.pkl')
crests_test.to_pickle('crests_test.pkl')
train_summary.to_pickle('train_summary.pkl')
test_summary.to_pickle('test_summary.pkl')

In [34]:
# peak_final
peak_final_train = pd.merge(train_peak_rslt, crests_train, on='d')
peak_final_train = pd.merge(peak_final_train, train_peak, on='d')

TypeError: expected string or bytes-like object

In [None]:
# peak_final
peak_final_test = pd.merge(test_peak_rslt, crests_test, on='d')
peak_final_test = pd.merge(peak_final_test, test_peak, on='d')

In [None]:

def id_f(d):
    numbers = re.findall('\d+', d)
    return pd.DataFrame({'exp_no': [numbers[0]], 'id':[numbers[1]], 'activity': [d.split('_')[0]]})

# train적용
temp_train = pd.DataFrame()

for i in range(len(train_X)):
    temp_train = pd.concat([temp_train, id_f(i)])

peak_final_train2 = pd.concat([peak_final_train, temp_train.reset_index()], axis=1)

           
# test 적용 
peak_final_test = pd.merge(test_peak_rslt, crests_train, on='d')
peak_final_test = pd.merge(peak_final_test, train_peak, on='d')             
               
temp_test = pd.DataFrame()
               
for i in range(len(test_X)):
    temp_test = pd.concat([temp_test, id_f(i)])

peak_final_test2 = pd.concat([peak_final_test, temp_test.reset_index()], axis=1)


In [None]:
peak_final_test2.to_pickle('final+data.pkl')

In [None]:
# 통계량 + peak으로 예측

peak_final3 = pd.concat([peak_final2, HAR_train_ext], axis=1)

activity_all = peak_final3.drop(['d', 'index', 'exp_no', 'id', 'activity'], axis=1)

rf = RandomForestClassifier(random_state=123456)
accuracy = cross_val_score(rf, activity_all, peak_final3['activity'], cv=10)
precision = cross_val_score(rf, activity_all, peak_final3['activity'], cv=10, scoring='precision_macro')
recall = cross_val_score(rf, activity_all, peak_final3['activity'], cv=10, scoring='recall_macro')
score = {'accuracy': np.mean(accuracy),
         'recall': np.mean(recall),
         'precision': np.mean(precision)}
print(json.dumps(score, indent=2))