In [173]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [174]:
train_features = pd.read_csv('../../Data/KAERI/train_features.csv')
train_target = pd.read_csv('../../Data/KAERI/train_target.csv', index_col = 'id')
test_features = pd.read_csv('../../Data/KAERI/test_features.csv')

In [175]:
def preprocessing_KAERI(data) :
    '''
    data: train_features.csv or test_features.csv
    
    return: Random Forest 모델 입력용 데이터
    '''
    
    # 충돌체 별로 0.000116 초 까지의 가속도 데이터만 활용해보기 
    _data = data.groupby('id').head(30)
    
    # string 형태로 변환
    _data['Time'] = _data['Time'].astype('str')
    
    # Random Forest 모델에 입력 할 수 있는 1차원 형태로 가속도 데이터 변환
    _data = _data.pivot_table(index = 'id', columns = 'Time', values = ['S1', 'S2', 'S3', 'S4'])
    
    # column 명 변환
    _data.columns = ['_'.join(col) for col in _data.columns.values]
    
    return _data

In [176]:
train_features = preprocessing_KAERI(train_features)
test_features = preprocessing_KAERI(test_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [177]:
print(f'train_features {train_features.shape}')
print(f'train_target {train_target.shape}')
print(f'test_features {test_features.shape}')

train_features (2800, 120)
train_target (2800, 4)
test_features (700, 120)


In [179]:
train_features.columns = np.arange(0,120, 1)
test_features.columns = np.arange(0,120, 1)

In [181]:
data = pd.concat([train_features, train_target], axis=1)

In [182]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [183]:
def XY(df):
    X = df.iloc[:,:-4]
    y_x = df.iloc[:,-4]
    y_y = df.iloc[:,-3]
    y_m = df.iloc[:,-2]
    y_v = df.iloc[:,-1]
    
    return X, y_x, y_y, y_m, y_v

In [184]:
def XG_Boost(df, ch):
    X, y_x, y_y, y_m, y_v = XY(df)
    
    # Split data
    X1_train, X1_test, y1_train, y1_test = train_test_split(X, y_x, test_size=0.2, shuffle=False)
    X2_train, X2_test, y2_train, y2_test = train_test_split(X, y_y, test_size=0.2, shuffle=False)
    X3_train, X3_test, y3_train, y3_test = train_test_split(X, y_m, test_size=0.2, shuffle=False)
    X4_train, X4_test, y4_train, y4_test = train_test_split(X, y_v, test_size=0.2, shuffle=1932)
    
    d1_train = xgb.DMatrix(X1_train, label=y1_train)
    d1_test = xgb.DMatrix(X1_test, label=y1_test)
    
    d2_train = xgb.DMatrix(X2_train, label=y2_train)
    d2_test = xgb.DMatrix(X2_test, label=y2_test)
    
    d3_train = xgb.DMatrix(X3_train, label=y3_train)
    d3_test = xgb.DMatrix(X3_test, label=y3_test)
    
    d4_train = xgb.DMatrix(X4_train, label=y4_train)
    d4_test = xgb.DMatrix(X4_test, label=y4_test)
    
    params = {}
    params['eta'] = 1
    params['objective'] = 'reg:squarederror'
    params['eval_metric'] = 'logloss'
    params['max_depth'] = 4
    params['silent'] = 1
    
    watchlist1 = [(d1_train, 'train'), (d1_test, 'test')]
    watchlist2 = [(d2_train, 'train'), (d2_test, 'test')]
    watchlist3 = [(d3_train, 'train'), (d3_test, 'test')]
    watchlist4 = [(d4_train, 'train'), (d4_test, 'test')]
    
    clf_x = xgb.train(params, d1_train, 10000, watchlist1, early_stopping_rounds=500, verbose_eval=10)
    clf_y = xgb.train(params, d2_train, 10000, watchlist2, early_stopping_rounds=500, verbose_eval=10)
    clf_m = xgb.train(params, d3_train, 10000, watchlist3, early_stopping_rounds=500, verbose_eval=10)
    clf_v = xgb.train(params, d4_train, 10000, watchlist4, early_stopping_rounds=500, verbose_eval=10)

    preds_x = clf_x.predict(xgb.DMatrix(X1_test))
    preds_y = clf_y.predict(xgb.DMatrix(X2_test))
    preds_m = clf_m.predict(xgb.DMatrix(X3_test))
    preds_v = clf_v.predict(xgb.DMatrix(X4_test))
    
    rmse_x = np.sqrt(mean_squared_error(y1_test, preds_x))
    rmse_y = np.sqrt(mean_squared_error(y2_test, preds_y))
    rmse_m = np.sqrt(mean_squared_error(y3_test, preds_m))
    rmse_v = np.sqrt(mean_squared_error(y4_test, preds_v))

    print(ch)
    print("RMSE - x  : %f" % (rmse_x))
    print("RMSE - y : %f" % (rmse_y))
    print("RMSE - m   : %f" % (rmse_m))
    print("RMSE - v   : %f" % (rmse_v))
    
    
    return clf_x, clf_y, clf_m, clf_v

In [185]:
x, y, m, v = XG_Boost(data, "baseline")

[0]	train-logloss:-8143.37	test-logloss:-8018.52
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 500 rounds.
[10]	train-logloss:-8291.36	test-logloss:-8179.82
[20]	train-logloss:-8294.58	test-logloss:-8166.73
[30]	train-logloss:-8294.81	test-logloss:-8153.77
[40]	train-logloss:-8294.99	test-logloss:-8153.78
[50]	train-logloss:-8295.38	test-logloss:-8153.83
[60]	train-logloss:-8295.37	test-logloss:-8153.89
[70]	train-logloss:-8295.46	test-logloss:-8153.89
[80]	train-logloss:-8295.49	test-logloss:-8153.9
[90]	train-logloss:-8295.57	test-logloss:-8153.84
[100]	train-logloss:-8295.6	test-logloss:-8153.84
[110]	train-logloss:-8295.62	test-logloss:-8153.9
[120]	train-logloss:-8295.68	test-logloss:-8153.9
[130]	train-logloss:-8295.69	test-logloss:-8153.9
[140]	train-logloss:-8295.69	test-logloss:-8153.91
[150]	train-logloss:-8295.69	test-logloss:-8153.91
[160]	train-logloss:-8295.69	test-logloss:-8153.91

[490]	train-logloss:-3628.38	test-logloss:-3706.77
[500]	train-logloss:-3628.38	test-logloss:-3706.77
Stopping. Best iteration:
[1]	train-logloss:-3628.38	test-logloss:-3722.95

[0]	train-logloss:0.621064	test-logloss:0.631068
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 500 rounds.
[10]	train-logloss:0.49636	test-logloss:0.514267
[20]	train-logloss:0.472889	test-logloss:0.493145
[30]	train-logloss:0.470095	test-logloss:0.490366
[40]	train-logloss:0.468884	test-logloss:0.489244
[50]	train-logloss:0.468357	test-logloss:0.489024
[60]	train-logloss:0.468017	test-logloss:0.488751
[70]	train-logloss:0.467783	test-logloss:0.488476
[80]	train-logloss:0.46763	test-logloss:0.488384
[90]	train-logloss:0.467505	test-logloss:0.488368
[100]	train-logloss:0.46744	test-logloss:0.488352
[110]	train-logloss:0.467374	test-logloss:0.488295
[120]	train-logloss:0.46731	test-logloss:0.488265
[130]	train-logloss:0.46

In [190]:
test = xgb.DMatrix(test_features)

In [193]:
y_x = x.predict(test)
y_y = y.predict(test)
y_m = m.predict(test)
y_v = v.predict(test)

In [201]:
num = np.arange(2800, 3500, 1)

In [202]:
y_pred = pd.DataFrame({'id': num, 'X': y_x, 'Y': y_y, 'M': y_m, 'V': y_v})

In [204]:
y_pred.to_csv('submission/01.csv', index = False)