In [32]:
# page6

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

def get_human_dataset():
  feature_name_df = pd.read_csv('human_activity/features.txt', sep='\s+', header=None, names=['column_index', 'column_name'])

  feature_name = feature_name_df.iloc[:,1].values.tolist()

  dupchk = []
  for name in feature_name:
    if name in dupchk:
      idx = 1
      while name+'.'+str(idx) in dupchk:
        idx += 1
      dupchk.append(name+'.'+str(idx))
    else:
      dupchk.append(name)
  print(len(feature_name), len(dupchk), dupchk)

  feature_name = dupchk

  X_train = pd.read_csv('human_activity/train/X_train.txt', sep='\s+', names=feature_name)
  X_test = pd.read_csv('human_activity/test/X_test.txt', sep='\s+', names=feature_name)

  y_train = pd.read_csv('human_activity/train/y_train.txt', sep='\s+', names=['action'])
  y_test = pd.read_csv('human_activity/test/y_test.txt', sep='\s+', names=['action'])

  return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = get_human_dataset()

print("X_train=",X_train.shape)
print("X_test=",X_test.shape)
print("y_train=",y_train.shape)
print("y_test=",y_test.shape)

561 561 ['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z', 'tBodyAcc-max()-X', 'tBodyAcc-max()-Y', 'tBodyAcc-max()-Z', 'tBodyAcc-min()-X', 'tBodyAcc-min()-Y', 'tBodyAcc-min()-Z', 'tBodyAcc-sma()', 'tBodyAcc-energy()-X', 'tBodyAcc-energy()-Y', 'tBodyAcc-energy()-Z', 'tBodyAcc-iqr()-X', 'tBodyAcc-iqr()-Y', 'tBodyAcc-iqr()-Z', 'tBodyAcc-entropy()-X', 'tBodyAcc-entropy()-Y', 'tBodyAcc-entropy()-Z', 'tBodyAcc-arCoeff()-X,1', 'tBodyAcc-arCoeff()-X,2', 'tBodyAcc-arCoeff()-X,3', 'tBodyAcc-arCoeff()-X,4', 'tBodyAcc-arCoeff()-Y,1', 'tBodyAcc-arCoeff()-Y,2', 'tBodyAcc-arCoeff()-Y,3', 'tBodyAcc-arCoeff()-Y,4', 'tBodyAcc-arCoeff()-Z,1', 'tBodyAcc-arCoeff()-Z,2', 'tBodyAcc-arCoeff()-Z,3', 'tBodyAcc-arCoeff()-Z,4', 'tBodyAcc-correlation()-X,Y', 'tBodyAcc-correlation()-X,Z', 'tBodyAcc-correlation()-Y,Z', 'tGravityAcc-mean()-X', 'tGravityAcc-mean()-Y', 'tGravityAcc-mean()

In [34]:
# page7 - AdaBoost
# 트레이닝 데이터에 결측치도 있고 행의 개수가 맞지 않아 정확도가 낮게 나온걸로 보입니다

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# 훈련 데이터에서 NaN이 있는 행 제거
X_train_cleaned = X_train.dropna()
y_train_cleaned = y_train.loc[X_train_cleaned.index]

# 테스트 데이터에서 NaN이 있는 행 제거 (테스트 데이터도 처리해야 함)
#X_test_cleaned = X_test.dropna()
#y_test_cleaned = y_test.loc[X_test_cleaned.index]

clf = AdaBoostClassifier(n_estimators=30, random_state=10, learning_rate=0.1)
clf.fit(X_train_cleaned, y_train_cleaned)
pred = clf.predict(X_test)
print("AdaBoost 정확도: {:.4f}".format(accuracy_score(y_test, pred)))

AdaBoost 정확도: 0.5310


In [37]:
# page10

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time

start_time = time.time()

gb_clf = GradientBoostingClassifier(random_state=0, verbose=True)
gb_clf.fit(X_train_cleaned, y_train_cleaned)
gb_pred = gb_clf.predict(X_test)
print("GBM 정확도: {:.4f}".format(accuracy_score(y_test, gb_pred)))
print("GBM 수행시간: {:.1f}초".format(time.time()-start_time))

      Iter       Train Loss   Remaining Time 
         1           1.4036           18.09m
         2           1.1544           17.47m
         3           0.9739           17.23m
         4           0.8356           17.03m
         5           0.7240           16.83m
         6           0.6334           16.64m
         7           0.5579           16.36m
         8           0.4939           16.15m
         9           0.4417           15.98m
        10           0.3948           15.97m
        20           0.1553           14.07m
        30           0.0795           12.28m
        40           0.0487           10.53m
        50           0.0337            8.77m
        60           0.0252            7.10m
        70           0.0195            5.32m
        80           0.0154            3.54m
        90           0.0123            1.77m
       100           0.0102            0.00s
GBM 정확도: 0.9294
GBM 수행시간: 1061.4초


In [44]:
# page13

from sklearn.model_selection import GridSearchCV

param = {
    'n_estimators' : [10,100],
    'learning_rate' : [0.05, 0.1]
}

grid_cv = GridSearchCV(gb_clf, param_grid=param, cv=2, verbose=1, n_jobs=-1)
grid_cv.fit(X_train_cleaned, y_train_cleaned)

print("최적 하이퍼 파라미터:\n", grid_cv.best_params_)
print("최고 예측 정확도: {:.4f}초".format(grid_cv.best_score_))

Fitting 2 folds for each of 4 candidates, totalling 8 fits


KeyboardInterrupt: 

In [None]:
# page14

gb_pred = grid_cv.best_estimator_.predict(X_test)
print("GBM + CV 정확도: {:.4f}".format(accuracy_score(y_test, gb_pred)))