**빛을 응용한 뇌 내 성분 검사**를 이용하면 피부에 상처를 내지 않고도 다양한 성분 정보를 얻을 수 있습니다. 이 검사는 뇌 활동 연구에 필요한 신경영상을 얻는 방법으로, 바늘(전극)을 찔러 넣는 뇌전도 검사를 대체할 방안으로 각광받고 있습니다.

제공된 데이터로 뇌 내 성분 분석 알고리즘을 만들어 주세요.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the training table, then replace every missing value with 0.
data = pd.read_csv(filepath_or_buffer="../Data/Biometric Data Analysis/train.csv")
data = data.fillna(value=0)

In [3]:
# Look up the distinct rho values (the result is not stored or displayed,
# because this is not the last expression of the cell).
data["rho"].unique()
# Remove the `id` column in place so it is not part of the training frame.
data.drop(columns="id", inplace=True)

In [4]:
# One training subset per rho value (25, 20, 15, 10), selected with a boolean mask.

data_rho_25 = data.loc[data["rho"].eq(25)]
data_rho_20 = data.loc[data["rho"].eq(20)]
data_rho_15 = data.loc[data["rho"].eq(15)]
data_rho_10 = data.loc[data["rho"].eq(10)]

### XGBoost Regressor

In [5]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [6]:
def XY(df):
    """Separate a frame into its feature columns and its last four columns.

    Args:
        df: DataFrame whose final four columns are the targets; callers
            unpack them, in order, as hhb, hbo2, ca and na.

    Returns:
        A 5-tuple: the frame without its last four columns, followed by
        each of those four columns as a Series, in their original order.
    """
    n_targets = 4
    features = df.iloc[:, :-n_targets]
    # Negative positions -4 .. -1 pick the trailing columns left to right.
    targets = [df.iloc[:, position] for position in range(-n_targets, 0)]

    return (features, *targets)

In [7]:
def XG_Boost(df, num):
    """Fit one XGBoost regressor per target on ``df`` and print hold-out RMSEs.

    The frame is split once into 80% training and 20% test rows with a fixed
    seed, so every target is evaluated on the same rows and repeated runs
    give the same split.

    Args:
        df: Frame accepted by ``XY``: feature columns followed by the four
            target columns.
        num: Label printed ahead of the scores, followed by " mm"
            (callers pass the rho value of the subset).

    Returns:
        Tuple ``(xg_reg_hbb, xg_reg_hbo2, xg_reg_ca, xg_reg_na)`` of fitted
        ``xgb.XGBRegressor`` models, each trained on the training part only.
    """
    X, y_hhb, y_hhbo2, y_ca, y_na = XY(df)

    # Split data once for all targets. `shuffle` is only an on/off flag, so
    # the seed must be given through `random_state`; passing 1234 as
    # `shuffle` left every split unseeded and different on each call.
    (X_train, X_test,
     y1_train, y1_test,
     y2_train, y2_test,
     y3_train, y3_test,
     y4_train, y4_test) = train_test_split(
        X, y_hhb, y_hhbo2, y_ca, y_na, test_size=0.2, random_state=1234)

    # Per-target hyper-parameters: ca uses a lower learning rate with more
    # trees, na uses depth-1 trees; hbb and hbo2 share the same settings.
    xg_reg_hbb = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.5, learning_rate = 0.2,
                max_depth = 10, alpha = 15, n_estimators = 50)
    xg_reg_hbo2 = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.5, learning_rate = 0.2,
                max_depth = 10, alpha = 15, n_estimators = 50)
    xg_reg_ca = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.5, learning_rate = 0.15,
                max_depth = 10, alpha = 15, n_estimators = 70)
    xg_reg_na = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.5, learning_rate = 0.2,
                max_depth = 1, alpha = 15, n_estimators = 50)

    # Fit each model on the training rows and predict the held-out rows.
    xg_reg_hbb.fit(X_train, y1_train)
    preds_hbb = xg_reg_hbb.predict(X_test)

    xg_reg_hbo2.fit(X_train, y2_train)
    preds_hbo2 = xg_reg_hbo2.predict(X_test)

    xg_reg_ca.fit(X_train, y3_train)
    preds_ca = xg_reg_ca.predict(X_test)

    xg_reg_na.fit(X_train, y4_train)
    preds_na = xg_reg_na.predict(X_test)

    # Root-mean-squared error on the held-out rows, one value per target.
    rmse_hbb = np.sqrt(mean_squared_error(y1_test, preds_hbb))
    rmse_hbo2 = np.sqrt(mean_squared_error(y2_test, preds_hbo2))
    rmse_ca = np.sqrt(mean_squared_error(y3_test, preds_ca))
    rmse_na = np.sqrt(mean_squared_error(y4_test, preds_na))

    print(num, " mm")
    print("RMSE - hbb  : %f" % (rmse_hbb))
    print("RMSE - hbo2 : %f" % (rmse_hbo2))
    print("RMSE - ca   : %f" % (rmse_ca))
    print("RMSE - na   : %f" % (rmse_na))

    return xg_reg_hbb, xg_reg_hbo2, xg_reg_ca, xg_reg_na

In [11]:
# Features and targets of the rho == 25 training subset.
X, y_hhb, y_hhbo2, y_ca, y_na = XY(data_rho_25)
# Hold out 25% of the rows to validate a model of the `ca` target. The seed
# goes in `random_state`; `shuffle` is only a boolean flag, so `shuffle=1234`
# shuffled without any seed and produced a different split on every run.
X_train, X_valid, y_train, y_valid = train_test_split(X, y_ca, test_size=0.25, random_state=1234)

In [12]:
# Wrap the training and validation splits in DMatrix objects for xgb.train.
d_train = xgb.DMatrix(data=X_train, label=y_train)
d_valid = xgb.DMatrix(data=X_valid, label=y_valid)

In [13]:
# Booster parameters passed to xgb.train.
params = {
    'eta': 0.02,
    # 'reg:linear' is a deprecated alias of squared-error regression; use the
    # same objective name as the XGBRegressor models in XG_Boost.
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'max_depth': 4,
    # `silent` has been superseded by `verbosity`; 0 keeps the library quiet.
    'verbosity': 0,
}

In [15]:
# Datasets whose metric is reported during training; early stopping follows
# the last entry ('valid').
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
# Pass the round count and the evaluation list by keyword: newer XGBoost
# releases warn about (or reject) `evals` given positionally.
# NOTE(review): a patience of 1000 rounds is large relative to eta=0.02;
# consider a smaller value if the validation MAE plateaus early.
clf = xgb.train(
    params,
    d_train,
    num_boost_round=10000,
    evals=watchlist,
    early_stopping_rounds=1000,
    verbose_eval=10,
)

[0]	train-mae:8.40136	valid-mae:8.39094
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 1000 rounds.
[10]	train-mae:6.87939	valid-mae:6.87643
[20]	train-mae:5.64813	valid-mae:5.65667
[30]	train-mae:4.66732	valid-mae:4.68638
[40]	train-mae:3.90274	valid-mae:3.93026
[50]	train-mae:3.32244	valid-mae:3.35832
[60]	train-mae:2.88431	valid-mae:2.95269
[70]	train-mae:2.55895	valid-mae:2.66554
[80]	train-mae:2.32046	valid-mae:2.46707
[90]	train-mae:2.14688	valid-mae:2.33366
[100]	train-mae:2.01498	valid-mae:2.242
[110]	train-mae:1.9203	valid-mae:2.18355
[120]	train-mae:1.84987	valid-mae:2.1446
[130]	train-mae:1.79529	valid-mae:2.11809
[140]	train-mae:1.75289	valid-mae:2.10221
[150]	train-mae:1.71868	valid-mae:2.09193
[160]	train-mae:1.68959	valid-mae:2.0857
[170]	train-mae:1.66691	valid-mae:2.08172
[180]	train-mae:1.64743	valid-mae:2.07881
[190]	train-mae:1.63119	valid-mae:2.07694
[200]	train-mae:1.6146	valid-ma

[1890]	train-mae:0.47743	valid-mae:2.03814
[1900]	train-mae:0.474818	valid-mae:2.03831
[1910]	train-mae:0.472199	valid-mae:2.03812
[1920]	train-mae:0.469413	valid-mae:2.03826
[1930]	train-mae:0.466579	valid-mae:2.03817
[1940]	train-mae:0.463791	valid-mae:2.03846
[1950]	train-mae:0.460761	valid-mae:2.03872
[1960]	train-mae:0.458267	valid-mae:2.03869
[1970]	train-mae:0.454752	valid-mae:2.03886
[1980]	train-mae:0.451961	valid-mae:2.0386
[1990]	train-mae:0.448681	valid-mae:2.03894
[2000]	train-mae:0.445158	valid-mae:2.03925
[2010]	train-mae:0.442729	valid-mae:2.0393
[2020]	train-mae:0.439887	valid-mae:2.03926
[2030]	train-mae:0.437752	valid-mae:2.03909
[2040]	train-mae:0.435882	valid-mae:2.03913
[2050]	train-mae:0.433118	valid-mae:2.03951
[2060]	train-mae:0.430806	valid-mae:2.03953
[2070]	train-mae:0.42832	valid-mae:2.03975
[2080]	train-mae:0.426008	valid-mae:2.03985
[2090]	train-mae:0.423288	valid-mae:2.04019
[2100]	train-mae:0.420153	valid-mae:2.04029
[2110]	train-mae:0.417502	valid-mae:

In [None]:
# Bare attribute reference: this only evaluates to the bound method and does
# not call it, so no prediction is produced here.
clf.predict

In [27]:
# NOTE(review): `alg`, `X1_train`, `y1_train`, `X1_test` and `y1_test` are not
# defined at module level anywhere in the visible notebook (the X1_*/y1_*
# names exist only as locals inside XG_Boost) — presumably left over from a
# deleted cell; confirm before re-running.
alg.fit(X1_train, y1_train)

# Despite the `_ca` suffix, these predictions are scored against `y1_test`
# and reported under the "hbb" label below.
preds_ca = alg.predict(X1_test)

# Root-mean-squared error of the predictions on the held-out rows.
rmse_hbb = np.sqrt(mean_squared_error(y1_test, preds_ca))
print("RMSE - hbb  : %f" % (rmse_hbb))

RMSE - hbb  : 2.689136


In [None]:
# Display the rho == 25 training subset.
data_rho_25

In [58]:
# Train the four target models separately on each rho subset
# (data_rho_25, data_rho_20, data_rho_15, data_rho_10); every call also
# prints the hold-out RMSE of its models.

hbb_25, hbo2_25, ca_25, na_25 = XG_Boost(df=data_rho_25, num=25)

hbb_20, hbo2_20, ca_20, na_20 = XG_Boost(df=data_rho_20, num=20)

hbb_15, hbo2_15, ca_15, na_15 = XG_Boost(df=data_rho_15, num=15)

hbb_10, hbo2_10, ca_10, na_10 = XG_Boost(df=data_rho_10, num=10)

25  mm
RMSE - hbb  : 1.496399
RMSE - hbo2 : 0.824311
RMSE - ca   : 2.570684
RMSE - na   : 1.840316
20  mm
RMSE - hbb  : 1.402867
RMSE - hbo2 : 0.844671
RMSE - ca   : 2.744939
RMSE - na   : 1.861205
15  mm
RMSE - hbb  : 1.380185
RMSE - hbo2 : 0.918510
RMSE - ca   : 2.595230
RMSE - na   : 1.788560
10  mm
RMSE - hbb  : 1.420756
RMSE - hbo2 : 0.853461
RMSE - ca   : 2.457673
RMSE - na   : 1.609588


In [59]:
# Load the test table (missing values are filled in a later cell).
test = pd.read_csv(filepath_or_buffer="../Data/Biometric Data Analysis/test.csv")

In [60]:
# Preview the first five rows of the raw test table.
test.head()

Unnamed: 0,id,rho,650_src,660_src,670_src,680_src,690_src,700_src,710_src,720_src,...,900_dst,910_dst,920_dst,930_dst,940_dst,950_dst,960_dst,970_dst,980_dst,990_dst
0,10000,15,0.15406,0.23275,0.30977,0.42949,0.51264,0.62558,0.7434,0.85418,...,0.0,0.0,1.432248e-14,0.0,0.0,6.332117e-15,1.429966e-14,0.0,,7.320236e-14
1,10001,15,0.48552,0.56939,0.67575,0.79089,0.85114,0.92581,0.98071,0.98177,...,1.036013e-13,,2.830975e-14,1.114337e-13,4.825731e-14,,2.282485e-14,7.348414e-14,1.259055e-13,2.349874e-13
2,10002,10,0.46883,0.56085,0.62442,0.73172,0.81724,0.91517,0.94801,0.99108,...,,,,,1.569208e-11,6.242378e-12,,1.21901e-11,,
3,10003,10,0.06905,0.07517,0.10226,0.14905,0.16182,0.19659,0.26085,0.36753,...,1.651177e-11,7.282747e-12,5.010879e-12,,1.571023e-11,0.0,0.0,3.304247e-12,4.106134e-11,
4,10004,25,0.00253,0.00757,0.01649,0.00128,0.0,0.0,0.00105,0.01975,...,0.0,0.0,0.0,0.0,3.7320570000000004e-17,4.1106050000000006e-17,0.0,0.0,1.910775e-16,2.215673e-15


In [61]:
# Replace missing values with 0, the same fill applied to the training table.
test = test.fillna(value=0)

In [63]:
# One test subset per rho value, mirroring the training subsets.
test_rho_25 = test.loc[test["rho"].eq(25)]
test_rho_20 = test.loc[test["rho"].eq(20)]
test_rho_15 = test.loc[test["rho"].eq(15)]
test_rho_10 = test.loc[test["rho"].eq(10)]

In [65]:
# Preview the first five rows of the rho == 25 test subset.
test_rho_25.head()

Unnamed: 0,id,rho,650_src,660_src,670_src,680_src,690_src,700_src,710_src,720_src,...,900_dst,910_dst,920_dst,930_dst,940_dst,950_dst,960_dst,970_dst,980_dst,990_dst
4,10004,25,0.00253,0.00757,0.01649,0.00128,0.0,0.0,0.00105,0.01975,...,0.0,0.0,0.0,0.0,3.7320570000000004e-17,4.1106050000000006e-17,0.0,0.0,1.910775e-16,2.215673e-15
7,10007,25,0.02086,0.01274,0.00564,0.02591,0.02021,0.03552,0.04008,0.08058,...,0.0,0.0,1.1795640000000001e-18,3.9536309999999997e-19,3.85985e-20,8.408863999999999e-19,1.776008e-20,2.3243279999999997e-19,7.958847e-19,1.578751e-17
9,10009,25,0.02078,0.02109,0.03051,0.03936,0.04283,0.03698,0.04585,0.03145,...,1.916856e-18,0.0,5.892738e-18,3.10021e-18,2.826076e-18,1.850818e-18,2.24685e-18,7.996433999999999e-19,8.171193e-18,5.0915640000000006e-17
20,10020,25,0.01464,0.04451,0.08738,0.13294,0.20736,0.29773,0.41921,0.58363,...,0.0,3.1241239999999996e-19,0.0,6.87719e-20,0.0,0.0,0.0,1.8739629999999998e-19,0.0,0.0
23,10023,25,0.04235,0.04854,0.06521,0.11027,0.12912,0.21326,0.25468,0.34638,...,4.755580999999999e-19,0.0,5.863937e-19,5.275918e-19,0.0,0.0,8.404945e-19,0.0,0.0,4.558269e-19


In [40]:
# features: iloc[:, 1:] (every column after `id`)

In [66]:
# Preview the subset with its first column (`id`) sliced off.
test_rho_25.iloc[:, 1:].head()

Unnamed: 0,rho,650_src,660_src,670_src,680_src,690_src,700_src,710_src,720_src,730_src,...,900_dst,910_dst,920_dst,930_dst,940_dst,950_dst,960_dst,970_dst,980_dst,990_dst
4,25,0.00253,0.00757,0.01649,0.00128,0.0,0.0,0.00105,0.01975,0.0,...,0.0,0.0,0.0,0.0,3.7320570000000004e-17,4.1106050000000006e-17,0.0,0.0,1.910775e-16,2.215673e-15
7,25,0.02086,0.01274,0.00564,0.02591,0.02021,0.03552,0.04008,0.08058,0.08955,...,0.0,0.0,1.1795640000000001e-18,3.9536309999999997e-19,3.85985e-20,8.408863999999999e-19,1.776008e-20,2.3243279999999997e-19,7.958847e-19,1.578751e-17
9,25,0.02078,0.02109,0.03051,0.03936,0.04283,0.03698,0.04585,0.03145,0.06261,...,1.916856e-18,0.0,5.892738e-18,3.10021e-18,2.826076e-18,1.850818e-18,2.24685e-18,7.996433999999999e-19,8.171193e-18,5.0915640000000006e-17
20,25,0.01464,0.04451,0.08738,0.13294,0.20736,0.29773,0.41921,0.58363,0.76227,...,0.0,3.1241239999999996e-19,0.0,6.87719e-20,0.0,0.0,0.0,1.8739629999999998e-19,0.0,0.0
23,25,0.04235,0.04854,0.06521,0.11027,0.12912,0.21326,0.25468,0.34638,0.45072,...,4.755580999999999e-19,0.0,5.863937e-19,5.275918e-19,0.0,0.0,8.404945e-19,0.0,0.0,4.558269e-19


In [67]:
def prediction(hbb, hbo2, ca, na, df):
    """Append the four models' predictions to a copy of ``df``.

    Args:
        hbb, hbo2, ca, na: Objects exposing ``predict``; each is called once,
            in this order, on ``df`` without its first column.
        df: Frame to score. Its first column is skipped when building the
            model input (callers pass frames whose first column is ``id``).

    Returns:
        A copy of ``df`` with the columns 'hbb', 'hbo2', 'ca' and 'na'
        set to the corresponding predictions; ``df`` itself is not modified.
    """
    features = df.iloc[:, 1:]

    fitted = (("hbb", hbb), ("hbo2", hbo2), ("ca", ca), ("na", na))
    outputs = {label: model.predict(features) for label, model in fitted}

    scored = df.copy()
    for label, values in outputs.items():
        scored[label] = values

    return scored

In [68]:
# Score each rho subset of the test set with the models trained on the
# training subset of the same rho:
#   test_rho_25 -> hbb_25, hbo2_25, ca_25, na_25
#   test_rho_20 -> hbb_20, hbo2_20, ca_20, na_20
#   test_rho_15 -> hbb_15, hbo2_15, ca_15, na_15
#   test_rho_10 -> hbb_10, hbo2_10, ca_10, na_10

rho_25 = prediction(hbb=hbb_25, hbo2=hbo2_25, ca=ca_25, na=na_25, df=test_rho_25)
rho_20 = prediction(hbb=hbb_20, hbo2=hbo2_20, ca=ca_20, na=na_20, df=test_rho_20)
rho_15 = prediction(hbb=hbb_15, hbo2=hbo2_15, ca=ca_15, na=na_15, df=test_rho_15)
rho_10 = prediction(hbb=hbb_10, hbo2=hbo2_10, ca=ca_10, na=na_10, df=test_rho_10)

In [81]:
# Stack the four scored subsets and order the rows by `id`.
temp = pd.concat([rho_25, rho_20, rho_15, rho_10])
temp = temp.sort_values(by='id')
# Keep the id column aside. NOTE: this name shadows the builtin `id`, but the
# next cell reads it, so it is kept as is.
id = temp['id']
# The last four columns are the predictions appended by `prediction`.
temp = temp.iloc[:, -4:]

In [89]:
# Put the id column next to the four prediction columns.
submission = pd.concat([id, temp], axis=1)
# index=False: without it the row index is written as an extra unnamed
# leading column, so the file would not be just id + the four predictions.
# NOTE(review): the prediction columns are named 'hbb', 'hbo2', 'ca', 'na' —
# confirm these match the header the submission format expects.
submission.to_csv("submission/01.csv", index=False)

In [86]:
# Preview the first five rows of the submission frame.
submission.head()

Unnamed: 0,id,hbb,hbo2,ca,na
0,10000,8.917688,4.704448,9.297256,3.256692
1,10001,6.046529,3.966025,6.13257,2.932524
2,10002,9.733712,4.523483,11.805521,3.398612
3,10003,9.044765,4.596517,9.095199,3.514396
4,10004,7.068999,3.793895,9.127597,3.072155


10000