In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the training set, then replace every missing value with 0.
data = pd.read_csv("../Data/Biometric Data Analysis/train.csv")
data = data.fillna(0)

In [3]:
# Look up the distinct rho values. The result is not displayed because this is
# not the last expression of the cell.
data.rho.unique()
# Remove the `id` column from the training frame in place.
# NOTE: re-running this cell raises KeyError, since `id` is already gone.
data.drop(columns="id", inplace=True)

In [4]:
# Split the training frame into one subset per rho value.

data_rho_25 = data.loc[data['rho'].eq(25)]
data_rho_20 = data.loc[data['rho'].eq(20)]
data_rho_15 = data.loc[data['rho'].eq(15)]
data_rho_10 = data.loc[data['rho'].eq(10)]

### XGBoost

In [5]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [6]:
def XY(df):
    """Separate a frame into its feature columns and four target columns.

    The split is purely positional: the first column is skipped, the last
    four columns are the targets, and everything in between is the feature
    matrix. (In this notebook the skipped column is presumably `rho`, since
    `id` is dropped before this is called; confirm against train.csv.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame laid out as described above.

    Returns
    -------
    tuple
        ``(X, y_hhb, y_hhbo2, y_ca, y_na)``, where ``X`` is a DataFrame and
        each ``y_*`` is the Series at positions -4, -3, -2 and -1.
    """
    X = df.iloc[:, 1:-4]
    # Targets are the trailing four columns, taken left to right.
    y_hhb, y_hhbo2, y_ca, y_na = (df.iloc[:, -offset] for offset in (4, 3, 2, 1))
    return X, y_hhb, y_hhbo2, y_ca, y_na

In [34]:
def XG_Boost(df, num, test_size=0.4, num_boost_round=10000,
             early_stopping_rounds=100, verbose_eval=10):
    """Train one XGBoost regressor per target and print hold-out RMSE.

    For each of the four targets returned by ``XY(df)`` this function makes
    its own seeded train/hold-out split, trains a booster with early stopping
    monitored on the hold-out MAE, and scores it with RMSE on that hold-out.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows in the layout expected by ``XY``.
    num
        Label printed ahead of the RMSE summary (callers pass the rho value).
    test_size : float, default 0.4
        Fraction of rows held out for early stopping and RMSE reporting.
    num_boost_round : int, default 10000
        Upper bound on boosting rounds.
    early_stopping_rounds : int, default 100
        Rounds without hold-out improvement before training stops.
    verbose_eval : int, default 10
        Print the evaluation metric every this many rounds.

    Returns
    -------
    tuple
        ``(clf_hhb, clf_hbo2, clf_ca, clf_na)``: the trained boosters, in
        target order.
    """
    X, y_hhb, y_hhbo2, y_ca, y_na = XY(df)

    # NOTE(review): 'reg:linear' and 'silent' are older parameter spellings;
    # newer XGBoost releases use 'reg:squarederror' and 'verbosity'. Confirm
    # the installed version before changing them.
    params = {
        'eta': 0.05,
        'objective': 'reg:linear',
        'eval_metric': 'mae',
        'max_depth': 7,
        'silent': 1,
    }

    # Each target gets its own split seed. These values used to be passed as
    # `shuffle=`, which is only an on/off flag, so the splits were unseeded
    # and changed on every run. `random_state` is the seed parameter.
    seeded_targets = [(y_hhb, 101), (y_hhbo2, 122), (y_ca, 157), (y_na, 1932)]

    models = []
    rmses = []
    for y, seed in seeded_targets:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed)

        d_train = xgb.DMatrix(X_train, label=y_train)
        d_test = xgb.DMatrix(X_test, label=y_test)
        watchlist = [(d_train, 'train'), (d_test, 'test')]

        clf = xgb.train(params, d_train, num_boost_round, watchlist,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=verbose_eval)

        # NOTE(review): depending on the XGBoost version, predict() on an
        # early-stopped booster may use every trained round rather than the
        # best one. Confirm if the reported RMSE matters.
        preds = clf.predict(d_test)

        models.append(clf)
        rmses.append(np.sqrt(mean_squared_error(y_test, preds)))

    clf_hhb, clf_hbo2, clf_ca, clf_na = models
    rmse_hhb, rmse_hbo2, rmse_ca, rmse_na = rmses

    print(num, " mm")
    print("RMSE - hbb  : %f" % (rmse_hhb))
    print("RMSE - hbo2 : %f" % (rmse_hbo2))
    print("RMSE - ca   : %f" % (rmse_ca))
    print("RMSE - na   : %f" % (rmse_na))

    return clf_hhb, clf_hbo2, clf_ca, clf_na

In [35]:
# First of the four per-rho runs (data_rho_25, data_rho_20, data_rho_15, data_rho_10):
# fit the four target models on the rho == 25 subset.

hbb_25, hbo2_25, ca_25, na_25 = XG_Boost(df=data_rho_25, num=25)

[0]	train-mae:7.06799	test-mae:7.1583
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 100 rounds.
[10]	train-mae:4.27488	test-mae:4.36538
[20]	train-mae:2.59695	test-mae:2.75595
[30]	train-mae:1.59114	test-mae:1.86942
[40]	train-mae:1.00867	test-mae:1.43613
[50]	train-mae:0.685051	test-mae:1.24694
[60]	train-mae:0.504058	test-mae:1.1662
[70]	train-mae:0.388688	test-mae:1.12876
[80]	train-mae:0.319808	test-mae:1.11292
[90]	train-mae:0.270066	test-mae:1.10548
[100]	train-mae:0.235858	test-mae:1.09991
[110]	train-mae:0.210562	test-mae:1.09486
[120]	train-mae:0.190006	test-mae:1.09169
[130]	train-mae:0.16873	test-mae:1.08809
[140]	train-mae:0.153365	test-mae:1.08576
[150]	train-mae:0.143356	test-mae:1.0852
[160]	train-mae:0.130361	test-mae:1.085
[170]	train-mae:0.120965	test-mae:1.08461
[180]	train-mae:0.111401	test-mae:1.08364
[190]	train-mae:0.102207	test-mae:1.08234
[200]	train-mae:0.091024	test-mae:1.0819

[10]	train-mae:1.76575	test-mae:1.92222
[20]	train-mae:1.36947	test-mae:1.66594
[30]	train-mae:1.15753	test-mae:1.55894
[40]	train-mae:1.03778	test-mae:1.52017
[50]	train-mae:0.959722	test-mae:1.50849
[60]	train-mae:0.889239	test-mae:1.5032
[70]	train-mae:0.820611	test-mae:1.50385
[80]	train-mae:0.752205	test-mae:1.50691
[90]	train-mae:0.697752	test-mae:1.50836
[100]	train-mae:0.642138	test-mae:1.51046
[110]	train-mae:0.59007	test-mae:1.51374
[120]	train-mae:0.546007	test-mae:1.51672
[130]	train-mae:0.509275	test-mae:1.51928
[140]	train-mae:0.470242	test-mae:1.5207
[150]	train-mae:0.433425	test-mae:1.52206
Stopping. Best iteration:
[59]	train-mae:0.894283	test-mae:1.50284

25  mm
RMSE - hbb  : 1.421774
RMSE - hbo2 : 0.811627
RMSE - ca   : 2.596924
RMSE - na   : 1.867480


In [36]:
# Fit the four target models on the rho == 20 subset.
hbb_20, hbo2_20, ca_20, na_20 = XG_Boost(df=data_rho_20, num=20)

[0]	train-mae:7.07557	test-mae:7.17014
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 100 rounds.
[10]	train-mae:4.27815	test-mae:4.3814
[20]	train-mae:2.59855	test-mae:2.75281
[30]	train-mae:1.58835	test-mae:1.86443
[40]	train-mae:1.00665	test-mae:1.42706
[50]	train-mae:0.688102	test-mae:1.22441
[60]	train-mae:0.51053	test-mae:1.13015
[70]	train-mae:0.401756	test-mae:1.08481
[80]	train-mae:0.332605	test-mae:1.0608
[90]	train-mae:0.282957	test-mae:1.04649
[100]	train-mae:0.241794	test-mae:1.0395
[110]	train-mae:0.210111	test-mae:1.03441
[120]	train-mae:0.185876	test-mae:1.03111
[130]	train-mae:0.162163	test-mae:1.0275
[140]	train-mae:0.14301	test-mae:1.02525
[150]	train-mae:0.126017	test-mae:1.02289
[160]	train-mae:0.111955	test-mae:1.02198
[170]	train-mae:0.098363	test-mae:1.0203
[180]	train-mae:0.087549	test-mae:1.01905
[190]	train-mae:0.078916	test-mae:1.01816
[200]	train-mae:0.070488	test-mae:1.01713

In [37]:
# Fit the four target models on the rho == 15 subset.
hbb_15, hbo2_15, ca_15, na_15 = XG_Boost(df=data_rho_15, num=15)

[0]	train-mae:7.13998	test-mae:7.17489
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 100 rounds.
[10]	train-mae:4.31763	test-mae:4.32447
[20]	train-mae:2.62602	test-mae:2.6749
[30]	train-mae:1.62052	test-mae:1.79868
[40]	train-mae:1.05301	test-mae:1.38535
[50]	train-mae:0.745647	test-mae:1.20101
[60]	train-mae:0.570038	test-mae:1.12883
[70]	train-mae:0.466409	test-mae:1.09759
[80]	train-mae:0.401837	test-mae:1.08632
[90]	train-mae:0.357184	test-mae:1.07986
[100]	train-mae:0.326962	test-mae:1.077
[110]	train-mae:0.295919	test-mae:1.07612
[120]	train-mae:0.269731	test-mae:1.0761
[130]	train-mae:0.242359	test-mae:1.07489
[140]	train-mae:0.218141	test-mae:1.07306
[150]	train-mae:0.193556	test-mae:1.07195
[160]	train-mae:0.17724	test-mae:1.07159
[170]	train-mae:0.160223	test-mae:1.07077
[180]	train-mae:0.143319	test-mae:1.06869
[190]	train-mae:0.128648	test-mae:1.06774
[200]	train-mae:0.113198	test-mae:1.066

[210]	train-mae:0.127322	test-mae:1.30555
[220]	train-mae:0.118031	test-mae:1.30513
[230]	train-mae:0.108267	test-mae:1.3049
[240]	train-mae:0.099568	test-mae:1.30521
[250]	train-mae:0.091798	test-mae:1.30539
Stopping. Best iteration:
[154]	train-mae:0.206461	test-mae:1.30411

15  mm
RMSE - hbb  : 1.374183
RMSE - hbo2 : 0.882608
RMSE - ca   : 2.552061
RMSE - na   : 1.635817


In [20]:
# Fit the four target models on the rho == 10 subset.
hbb_10, hbo2_10, ca_10, na_10 = XG_Boost(df=data_rho_10, num=10)

[0]	train-mae:7.14054	test-mae:7.13831
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 200 rounds.
[10]	train-mae:4.31215	test-mae:4.34417
[20]	train-mae:2.62111	test-mae:2.70774
[30]	train-mae:1.62864	test-mae:1.84583
[40]	train-mae:1.07062	test-mae:1.43152
[50]	train-mae:0.760226	test-mae:1.25601
[60]	train-mae:0.579102	test-mae:1.18426
[70]	train-mae:0.470363	test-mae:1.15298
[80]	train-mae:0.396397	test-mae:1.13773
[90]	train-mae:0.349119	test-mae:1.13095
[100]	train-mae:0.315303	test-mae:1.12584
[110]	train-mae:0.285391	test-mae:1.12092
[120]	train-mae:0.256223	test-mae:1.11851
[130]	train-mae:0.231408	test-mae:1.11725
[140]	train-mae:0.207483	test-mae:1.11529
[150]	train-mae:0.189926	test-mae:1.11454
[160]	train-mae:0.171364	test-mae:1.11375
[170]	train-mae:0.156135	test-mae:1.11293
[180]	train-mae:0.143413	test-mae:1.11175
[190]	train-mae:0.131723	test-mae:1.11062
[200]	train-mae:0.122713	test-mae:

[370]	train-mae:0.080305	test-mae:1.81879
[380]	train-mae:0.07429	test-mae:1.81861
[390]	train-mae:0.069389	test-mae:1.81877
[400]	train-mae:0.066018	test-mae:1.81852
[410]	train-mae:0.061554	test-mae:1.81832
[420]	train-mae:0.057364	test-mae:1.81806
[430]	train-mae:0.053067	test-mae:1.8179
[440]	train-mae:0.049774	test-mae:1.81786
[450]	train-mae:0.04711	test-mae:1.81801
[460]	train-mae:0.044439	test-mae:1.81796
[470]	train-mae:0.041608	test-mae:1.818
[480]	train-mae:0.039	test-mae:1.81821
[490]	train-mae:0.036668	test-mae:1.81826
[500]	train-mae:0.03441	test-mae:1.81827
[510]	train-mae:0.03259	test-mae:1.81826
[520]	train-mae:0.030267	test-mae:1.81827
[530]	train-mae:0.028304	test-mae:1.81821
[540]	train-mae:0.026844	test-mae:1.81826
[550]	train-mae:0.025467	test-mae:1.81818
[560]	train-mae:0.024028	test-mae:1.81818
[570]	train-mae:0.022841	test-mae:1.81812
[580]	train-mae:0.021585	test-mae:1.81814
[590]	train-mae:0.020093	test-mae:1.81822
[600]	train-mae:0.018786	test-mae:1.81827
[6

In [38]:
# Read the test set, then replace every missing value with 0.
test = pd.read_csv("../Data/Biometric Data Analysis/test.csv")
test = test.fillna(0)

In [39]:
# Split the test rows into one subset per rho value.
test_rho_25 = test.loc[test['rho'].eq(25)]
test_rho_20 = test.loc[test['rho'].eq(20)]
test_rho_15 = test.loc[test['rho'].eq(15)]
test_rho_10 = test.loc[test['rho'].eq(10)]

In [40]:
def prediction(hbb, hbo2, ca, na, df):
    """Predict the four targets for `df` and return a copy with them attached.

    Parameters
    ----------
    hbb, hbo2, ca, na
        Trained models exposing ``predict(xgb.DMatrix)``. Their outputs fill
        the 'hhb', 'hbo2', 'ca' and 'na' columns respectively.
    df : pandas.DataFrame
        Rows to score. The first two columns are skipped as non-features
        (presumably `id` and `rho`; confirm against test.csv).

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the columns 'hhb', 'hbo2', 'ca' and 'na' set to
        the predictions. `df` itself is not modified.
    """
    # Every model scores the same features, so build the DMatrix once rather
    # than once per model.
    features = xgb.DMatrix(df.iloc[:, 2:])

    scored = df.copy()
    for column, model in (('hhb', hbb), ('hbo2', hbo2), ('ca', ca), ('na', na)):
        scored[column] = model.predict(features)

    return scored

In [41]:
# Score each rho-specific test subset with the four models trained on the
# matching rho subset:
#   test_rho_25 -> hbb_25, hbo2_25, ca_25, na_25
#   test_rho_20 -> hbb_20, hbo2_20, ca_20, na_20
#   test_rho_15 -> hbb_15, hbo2_15, ca_15, na_15
#   test_rho_10 -> hbb_10, hbo2_10, ca_10, na_10

rho_25 = prediction(hbb=hbb_25, hbo2=hbo2_25, ca=ca_25, na=na_25, df=test_rho_25)
rho_20 = prediction(hbb=hbb_20, hbo2=hbo2_20, ca=ca_20, na=na_20, df=test_rho_20)
rho_15 = prediction(hbb=hbb_15, hbo2=hbo2_15, ca=ca_15, na=na_15, df=test_rho_15)
rho_10 = prediction(hbb=hbb_10, hbo2=hbo2_10, ca=ca_10, na=na_10, df=test_rho_10)

In [42]:
# Stack the per-rho prediction frames and order the rows by `id`.
temp = pd.concat([rho_25, rho_20, rho_15, rho_10])
temp = temp.sort_values('id')
# NOTE: `id` shadows the Python builtin from here on; the next cell reads it.
id = temp['id']
# Keep only the last four columns (the predictions added by `prediction`).
temp = temp.iloc[:, -4:]

In [44]:
# Assemble the submission frame: `id` followed by the four predicted columns.
submission = pd.concat([id, temp], axis=1)
# index=False keeps the DataFrame index out of the file. Without it, to_csv
# writes the index as an extra unnamed first column ahead of `id`.
submission.to_csv("submission/linear2.csv", index=False)