In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Read the training table, then replace every missing value with 0.
data = pd.read_csv("../Data/Biometric Data Analysis/train.csv")
data = data.fillna(0)

In [4]:
# Drop the identifier column by reassignment instead of in place;
# errors="ignore" keeps this cell re-runnable once "id" is already gone
# (the in-place drop raised KeyError on a second run).
data = data.drop(columns="id", errors="ignore")

# Distinct rho values in the training data. Kept as the last expression so the
# notebook actually displays it (it was silently discarded when it came first).
data.rho.unique()

In [5]:
# Split the training rows into one subset per rho value (25, 20, 15, 10).

data_rho_25 = data.loc[data['rho'].eq(25)]
data_rho_20 = data.loc[data['rho'].eq(20)]
data_rho_15 = data.loc[data['rho'].eq(15)]
data_rho_10 = data.loc[data['rho'].eq(10)]

### XGBoost

In [6]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [7]:
def XY(df):
    """Split a frame positionally into a feature block and four target columns.

    The features are the columns from position 36 up to (but excluding) the
    last four; the targets are the last four columns, returned one by one in
    their original order.

    Returns:
        A 5-tuple ``(X, y_hhb, y_hhbo2, y_ca, y_na)``: one DataFrame followed
        by four Series.
    """
    n_targets = 4
    features = df.iloc[:, 36:-n_targets]
    # Negative positions -4, -3, -2, -1 pick the trailing target columns.
    targets = [df.iloc[:, pos] for pos in range(-n_targets, 0)]
    return (features, *targets)

In [28]:
def XG_Boost(df, num):
    """Train one XGBoost regressor per target on a single subset of the data.

    For each of the four targets returned by ``XY(df)`` the rows are split
    75/25 into a training and a hold-out part, a booster is trained with
    early stopping on the hold-out MAE, and the hold-out RMSE is printed.

    Args:
        df: Frame accepted by ``XY`` (features plus four trailing targets).
        num: Label printed in the report header (printed as ``num  mm``).

    Returns:
        Tuple ``(clf_hhb, clf_hbo2, clf_ca, clf_na)`` of trained boosters, in
        the same order as the targets returned by ``XY``.
    """
    X, y_hhb, y_hhbo2, y_ca, y_na = XY(df)

    params = {
        'eta': 0.02,
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'max_depth': 4,
        # 'silent' is deprecated in XGBoost; 'verbosity': 0 is the quiet mode.
        'verbosity': 0,
    }

    # (target values, split seed) in the order the boosters are returned.
    # The seeds were previously passed as `shuffle=`, which is only a boolean
    # flag, so the splits were unseeded and changed on every run.
    # `random_state=` makes them reproducible.
    targets = [
        (y_hhb, 101),
        (y_hhbo2, 122),
        (y_ca, 157),
        (y_na, 1932),
    ]

    models = []
    rmses = []
    for y, seed in targets:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=seed
        )

        d_train = xgb.DMatrix(X_train, label=y_train)
        d_test = xgb.DMatrix(X_test, label=y_test)

        # The last watchlist entry ('test') is the one early stopping monitors.
        watchlist = [(d_train, 'train'), (d_test, 'test')]
        model = xgb.train(
            params,
            d_train,
            10000,
            watchlist,
            early_stopping_rounds=500,
            verbose_eval=10,
        )

        # NOTE(review): this RMSE is computed on the same hold-out split that
        # drives early stopping, so it is an optimistic estimate. Depending on
        # the xgboost version, predict() may also use every trained round
        # rather than the best iteration -- confirm against the installed
        # version.
        preds = model.predict(d_test)
        rmses.append(np.sqrt(mean_squared_error(y_test, preds)))
        models.append(model)

    rmse_hhb, rmse_hbo2, rmse_ca, rmse_na = rmses

    print(num, " mm")
    print("RMSE - hbb  : %f" % (rmse_hhb))
    print("RMSE - hbo2 : %f" % (rmse_hbo2))
    print("RMSE - ca   : %f" % (rmse_ca))
    print("RMSE - na   : %f" % (rmse_na))

    return tuple(models)

In [29]:
# Subsets still to train on: data_rho_25, data_rho_20, data_rho_15, data_rho_10

# Fit the four boosters for the rho = 25 subset.
(hbb_25, hbo2_25, ca_25, na_25) = XG_Boost(df=data_rho_25, num=25)

[0]	train-mae:7.33873	test-mae:7.26932
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 500 rounds.
[10]	train-mae:6.00539	test-mae:5.94584
[20]	train-mae:4.9176	test-mae:4.87843
[30]	train-mae:4.03533	test-mae:4.02106
[40]	train-mae:3.33168	test-mae:3.34778
[50]	train-mae:2.77893	test-mae:2.81257
[60]	train-mae:2.34555	test-mae:2.40398
[70]	train-mae:2.01861	test-mae:2.09339
[80]	train-mae:1.77538	test-mae:1.86039
[90]	train-mae:1.59725	test-mae:1.69074
[100]	train-mae:1.47073	test-mae:1.56821
[110]	train-mae:1.37796	test-mae:1.48253
[120]	train-mae:1.3082	test-mae:1.41877
[130]	train-mae:1.25582	test-mae:1.37348
[140]	train-mae:1.21595	test-mae:1.34088
[150]	train-mae:1.18512	test-mae:1.31868
[160]	train-mae:1.16166	test-mae:1.30251
[170]	train-mae:1.14224	test-mae:1.29025
[180]	train-mae:1.1272	test-mae:1.28096
[190]	train-mae:1.11324	test-mae:1.27378
[200]	train-mae:1.10138	test-mae:1.2692
[210]	train-

[530]	train-mae:0.417259	test-mae:0.673294
[540]	train-mae:0.414278	test-mae:0.673392
[550]	train-mae:0.411438	test-mae:0.673682
[560]	train-mae:0.408447	test-mae:0.673683
[570]	train-mae:0.405287	test-mae:0.673657
[580]	train-mae:0.402342	test-mae:0.673516
[590]	train-mae:0.399606	test-mae:0.673641
[600]	train-mae:0.396593	test-mae:0.673412
[610]	train-mae:0.393747	test-mae:0.673574
[620]	train-mae:0.39105	test-mae:0.673899
[630]	train-mae:0.388163	test-mae:0.673934
[640]	train-mae:0.385664	test-mae:0.674135
[650]	train-mae:0.382987	test-mae:0.674447
[660]	train-mae:0.380233	test-mae:0.674785
[670]	train-mae:0.377897	test-mae:0.6749
[680]	train-mae:0.375699	test-mae:0.674985
[690]	train-mae:0.372977	test-mae:0.674462
[700]	train-mae:0.370718	test-mae:0.674714
[710]	train-mae:0.368322	test-mae:0.674307
[720]	train-mae:0.365906	test-mae:0.674495
[730]	train-mae:0.363643	test-mae:0.674326
[740]	train-mae:0.361594	test-mae:0.674181
[750]	train-mae:0.359406	test-mae:0.67394
[760]	train-mae

[70]	train-mae:1.46857	test-mae:1.56834
[80]	train-mae:1.4279	test-mae:1.54489
[90]	train-mae:1.39476	test-mae:1.5285
[100]	train-mae:1.36764	test-mae:1.51774
[110]	train-mae:1.34631	test-mae:1.51059
[120]	train-mae:1.32912	test-mae:1.50537
[130]	train-mae:1.31385	test-mae:1.50144
[140]	train-mae:1.29918	test-mae:1.49812
[150]	train-mae:1.28498	test-mae:1.49633
[160]	train-mae:1.27248	test-mae:1.49443
[170]	train-mae:1.26095	test-mae:1.49465
[180]	train-mae:1.24971	test-mae:1.49553
[190]	train-mae:1.23858	test-mae:1.49632
[200]	train-mae:1.22742	test-mae:1.49697
[210]	train-mae:1.21647	test-mae:1.49664
[220]	train-mae:1.20596	test-mae:1.49739
[230]	train-mae:1.19602	test-mae:1.49786
[240]	train-mae:1.18664	test-mae:1.49851
[250]	train-mae:1.17747	test-mae:1.499
[260]	train-mae:1.16776	test-mae:1.49971
[270]	train-mae:1.15797	test-mae:1.50132
[280]	train-mae:1.14949	test-mae:1.5035
[290]	train-mae:1.14155	test-mae:1.50538
[300]	train-mae:1.13282	test-mae:1.507
[310]	train-mae:1.12445	te

In [30]:
# Fit the four boosters for the rho = 20 subset.
(hbb_20, hbo2_20, ca_20, na_20) = XG_Boost(df=data_rho_20, num=20)

[0]	train-mae:7.2857	test-mae:7.47297
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 500 rounds.
[10]	train-mae:5.9641	test-mae:6.14188
[20]	train-mae:4.88483	test-mae:5.05057
[30]	train-mae:4.00854	test-mae:4.16325
[40]	train-mae:3.31007	test-mae:3.46366
[50]	train-mae:2.76245	test-mae:2.92109
[60]	train-mae:2.33953	test-mae:2.51076
[70]	train-mae:2.01531	test-mae:2.20909
[80]	train-mae:1.77049	test-mae:1.98064
[90]	train-mae:1.58867	test-mae:1.80844
[100]	train-mae:1.45281	test-mae:1.67876
[110]	train-mae:1.35322	test-mae:1.58137
[120]	train-mae:1.28046	test-mae:1.51091
[130]	train-mae:1.22453	test-mae:1.46064
[140]	train-mae:1.18204	test-mae:1.42328
[150]	train-mae:1.14834	test-mae:1.39586
[160]	train-mae:1.12095	test-mae:1.37396
[170]	train-mae:1.09895	test-mae:1.35553
[180]	train-mae:1.08105	test-mae:1.34234
[190]	train-mae:1.06536	test-mae:1.3322
[200]	train-mae:1.05037	test-mae:1.32365
[210]	train

[600]	train-mae:0.398295	test-mae:0.666264
[610]	train-mae:0.395156	test-mae:0.666048
[620]	train-mae:0.392086	test-mae:0.666085
[630]	train-mae:0.389009	test-mae:0.666022
[640]	train-mae:0.385927	test-mae:0.666048
[650]	train-mae:0.382949	test-mae:0.666465
[660]	train-mae:0.379959	test-mae:0.666707
[670]	train-mae:0.376981	test-mae:0.667026
[680]	train-mae:0.373634	test-mae:0.66727
[690]	train-mae:0.371074	test-mae:0.6673
[700]	train-mae:0.368203	test-mae:0.667455
[710]	train-mae:0.36536	test-mae:0.667421
[720]	train-mae:0.363008	test-mae:0.66719
[730]	train-mae:0.360268	test-mae:0.667068
[740]	train-mae:0.357782	test-mae:0.667422
[750]	train-mae:0.355236	test-mae:0.667434
[760]	train-mae:0.35292	test-mae:0.667401
[770]	train-mae:0.35033	test-mae:0.667405
[780]	train-mae:0.347733	test-mae:0.667395
[790]	train-mae:0.345423	test-mae:0.667212
[800]	train-mae:0.342867	test-mae:0.667447
[810]	train-mae:0.340355	test-mae:0.667699
[820]	train-mae:0.338013	test-mae:0.667832
[830]	train-mae:0.

[680]	train-mae:0.804301	test-mae:1.42773
[690]	train-mae:0.797678	test-mae:1.4289
[700]	train-mae:0.792862	test-mae:1.42984
[710]	train-mae:0.786215	test-mae:1.42977
[720]	train-mae:0.779843	test-mae:1.4291
[730]	train-mae:0.773311	test-mae:1.42999
[740]	train-mae:0.766767	test-mae:1.43119
[750]	train-mae:0.762233	test-mae:1.43099
[760]	train-mae:0.756687	test-mae:1.43073
[770]	train-mae:0.751722	test-mae:1.43101
[780]	train-mae:0.74616	test-mae:1.4312
[790]	train-mae:0.74099	test-mae:1.43193
[800]	train-mae:0.736785	test-mae:1.43222
[810]	train-mae:0.73126	test-mae:1.43241
Stopping. Best iteration:
[317]	train-mae:1.0778	test-mae:1.41361

20  mm
RMSE - hbb  : 1.639596
RMSE - hbo2 : 0.834693
RMSE - ca   : 2.524156
RMSE - na   : 1.784035


In [31]:
# Fit the four boosters for the rho = 15 subset.
(hbb_15, hbo2_15, ca_15, na_15) = XG_Boost(df=data_rho_15, num=15)

[0]	train-mae:7.29204	test-mae:7.63212
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 500 rounds.
[10]	train-mae:5.96819	test-mae:6.26624
[20]	train-mae:4.88891	test-mae:5.1517
[30]	train-mae:4.01255	test-mae:4.2487
[40]	train-mae:3.31113	test-mae:3.52673
[50]	train-mae:2.75809	test-mae:2.95518
[60]	train-mae:2.32918	test-mae:2.5184
[70]	train-mae:2.0021	test-mae:2.19049
[80]	train-mae:1.75539	test-mae:1.94559
[90]	train-mae:1.57729	test-mae:1.75427
[100]	train-mae:1.44685	test-mae:1.61797
[110]	train-mae:1.35089	test-mae:1.52482
[120]	train-mae:1.28171	test-mae:1.46067
[130]	train-mae:1.2305	test-mae:1.4151
[140]	train-mae:1.19194	test-mae:1.38446
[150]	train-mae:1.15978	test-mae:1.36031
[160]	train-mae:1.13244	test-mae:1.34297
[170]	train-mae:1.10997	test-mae:1.32687
[180]	train-mae:1.09104	test-mae:1.31376
[190]	train-mae:1.07233	test-mae:1.30486
[200]	train-mae:1.05686	test-mae:1.29785
[210]	train-ma

[480]	train-mae:0.466776	test-mae:0.688218
[490]	train-mae:0.463597	test-mae:0.687827
[500]	train-mae:0.460609	test-mae:0.687726
[510]	train-mae:0.457521	test-mae:0.68715
[520]	train-mae:0.453927	test-mae:0.686491
[530]	train-mae:0.450556	test-mae:0.686844
[540]	train-mae:0.447455	test-mae:0.686415
[550]	train-mae:0.444284	test-mae:0.685868
[560]	train-mae:0.441067	test-mae:0.685312
[570]	train-mae:0.438099	test-mae:0.685013
[580]	train-mae:0.435008	test-mae:0.684992
[590]	train-mae:0.431927	test-mae:0.684511
[600]	train-mae:0.428607	test-mae:0.684425
[610]	train-mae:0.425164	test-mae:0.684065
[620]	train-mae:0.422229	test-mae:0.68403
[630]	train-mae:0.419375	test-mae:0.683892
[640]	train-mae:0.416416	test-mae:0.683694
[650]	train-mae:0.413349	test-mae:0.683702
[660]	train-mae:0.410676	test-mae:0.683842
[670]	train-mae:0.40776	test-mae:0.683976
[680]	train-mae:0.405103	test-mae:0.684024
[690]	train-mae:0.402429	test-mae:0.683976
[700]	train-mae:0.399662	test-mae:0.684317
[710]	train-ma

[30]	train-mae:1.75282	test-mae:1.69122
[40]	train-mae:1.60179	test-mae:1.57363
[50]	train-mae:1.49089	test-mae:1.49314
[60]	train-mae:1.41186	test-mae:1.43884
[70]	train-mae:1.35251	test-mae:1.4032
[80]	train-mae:1.30717	test-mae:1.38345
[90]	train-mae:1.27044	test-mae:1.36811
[100]	train-mae:1.24166	test-mae:1.35799
[110]	train-mae:1.21792	test-mae:1.35162
[120]	train-mae:1.19706	test-mae:1.34692
[130]	train-mae:1.17972	test-mae:1.34318
[140]	train-mae:1.16247	test-mae:1.33879
[150]	train-mae:1.14616	test-mae:1.33675
[160]	train-mae:1.13225	test-mae:1.33495
[170]	train-mae:1.11878	test-mae:1.33331
[180]	train-mae:1.10619	test-mae:1.33312
[190]	train-mae:1.09427	test-mae:1.33214
[200]	train-mae:1.08249	test-mae:1.33009
[210]	train-mae:1.07086	test-mae:1.32762
[220]	train-mae:1.06046	test-mae:1.32574
[230]	train-mae:1.05015	test-mae:1.32473
[240]	train-mae:1.03878	test-mae:1.32281
[250]	train-mae:1.02993	test-mae:1.32129
[260]	train-mae:1.02076	test-mae:1.31967
[270]	train-mae:1.01168	

In [32]:
# Fit the four boosters for the rho = 10 subset.
(hbb_10, hbo2_10, ca_10, na_10) = XG_Boost(df=data_rho_10, num=10)

[0]	train-mae:7.33277	test-mae:7.44406
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 500 rounds.
[10]	train-mae:6.00404	test-mae:6.11583
[20]	train-mae:4.92176	test-mae:5.03384
[30]	train-mae:4.04497	test-mae:4.16521
[40]	train-mae:3.33964	test-mae:3.46589
[50]	train-mae:2.77913	test-mae:2.91173
[60]	train-mae:2.34805	test-mae:2.4855
[70]	train-mae:2.01972	test-mae:2.16008
[80]	train-mae:1.77342	test-mae:1.91956
[90]	train-mae:1.5877	test-mae:1.74568
[100]	train-mae:1.44937	test-mae:1.61887
[110]	train-mae:1.34855	test-mae:1.52882
[120]	train-mae:1.27604	test-mae:1.46509
[130]	train-mae:1.22155	test-mae:1.42084
[140]	train-mae:1.17919	test-mae:1.38859
[150]	train-mae:1.14447	test-mae:1.36139
[160]	train-mae:1.11695	test-mae:1.34256
[170]	train-mae:1.0932	test-mae:1.32761
[180]	train-mae:1.07046	test-mae:1.31359
[190]	train-mae:1.0515	test-mae:1.30389
[200]	train-mae:1.035	test-mae:1.29516
[210]	train-ma

[710]	train-mae:0.39788	test-mae:0.747587
[720]	train-mae:0.395273	test-mae:0.747787
[730]	train-mae:0.39229	test-mae:0.748053
[740]	train-mae:0.389528	test-mae:0.748502
[750]	train-mae:0.386845	test-mae:0.748616
Stopping. Best iteration:
[259]	train-mae:0.560283	test-mae:0.744506

[0]	train-mae:8.35607	test-mae:8.34421
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 500 rounds.
[10]	train-mae:6.83979	test-mae:6.82815
[20]	train-mae:5.61058	test-mae:5.60969
[30]	train-mae:4.63407	test-mae:4.65146
[40]	train-mae:3.87649	test-mae:3.89608
[50]	train-mae:3.29872	test-mae:3.33213
[60]	train-mae:2.86699	test-mae:2.92499
[70]	train-mae:2.55096	test-mae:2.63075
[80]	train-mae:2.31738	test-mae:2.42374
[90]	train-mae:2.15163	test-mae:2.28855
[100]	train-mae:2.03245	test-mae:2.19513
[110]	train-mae:1.94204	test-mae:2.13585
[120]	train-mae:1.87633	test-mae:2.09523
[130]	train-mae:1.8243	test-mae:2.06686
[140]	train-m

[1860]	train-mae:0.490996	test-mae:1.956
[1870]	train-mae:0.488012	test-mae:1.95622
[1880]	train-mae:0.48514	test-mae:1.95612
[1890]	train-mae:0.482365	test-mae:1.95585
[1900]	train-mae:0.480289	test-mae:1.95587
[1910]	train-mae:0.477329	test-mae:1.9557
[1920]	train-mae:0.474708	test-mae:1.9559
[1930]	train-mae:0.472005	test-mae:1.95606
[1940]	train-mae:0.468972	test-mae:1.95605
Stopping. Best iteration:
[1440]	train-mae:0.642232	test-mae:1.94955

[0]	train-mae:2.56283	test-mae:2.58839
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 500 rounds.
[10]	train-mae:2.19449	test-mae:2.21237
[20]	train-mae:1.91746	test-mae:1.93776
[30]	train-mae:1.71029	test-mae:1.74249
[40]	train-mae:1.56077	test-mae:1.60547
[50]	train-mae:1.4494	test-mae:1.50766
[60]	train-mae:1.36536	test-mae:1.43811
[70]	train-mae:1.30297	test-mae:1.39085
[80]	train-mae:1.25544	test-mae:1.35522
[90]	train-mae:1.21888	test-mae:1.33108
[100]	tr

[1810]	train-mae:0.302867	test-mae:1.11648
[1820]	train-mae:0.301272	test-mae:1.11604
[1830]	train-mae:0.299674	test-mae:1.11561
[1840]	train-mae:0.297937	test-mae:1.11513
[1850]	train-mae:0.296452	test-mae:1.11471
[1860]	train-mae:0.294602	test-mae:1.11439
[1870]	train-mae:0.292257	test-mae:1.11427
[1880]	train-mae:0.290443	test-mae:1.11455
[1890]	train-mae:0.288897	test-mae:1.11443
[1900]	train-mae:0.28725	test-mae:1.11461
[1910]	train-mae:0.28563	test-mae:1.11459
[1920]	train-mae:0.284172	test-mae:1.11437
[1930]	train-mae:0.282782	test-mae:1.11395
[1940]	train-mae:0.281027	test-mae:1.11353
[1950]	train-mae:0.279136	test-mae:1.11329
[1960]	train-mae:0.277323	test-mae:1.11337
[1970]	train-mae:0.275649	test-mae:1.11329
[1980]	train-mae:0.27384	test-mae:1.11321
[1990]	train-mae:0.272018	test-mae:1.11304
[2000]	train-mae:0.270563	test-mae:1.11318
[2010]	train-mae:0.269034	test-mae:1.11314
[2020]	train-mae:0.267261	test-mae:1.11308
[2030]	train-mae:0.265731	test-mae:1.11324
[2040]	train-m

In [35]:
# Read the test table, then replace every missing value with 0
# (same treatment as the training table).
test = pd.read_csv("../Data/Biometric Data Analysis/test.csv")
test = test.fillna(0)

In [36]:
# Split the test rows into one subset per rho value, mirroring the training split.
test_rho_25 = test.loc[test['rho'].eq(25)]
test_rho_20 = test.loc[test['rho'].eq(20)]
test_rho_15 = test.loc[test['rho'].eq(15)]
test_rho_10 = test.loc[test['rho'].eq(10)]

In [37]:
def prediction(hbb, hbo2, ca, na, df):
    """Predict the four targets for ``df`` and return them as new columns.

    Args:
        hbb: Trained booster whose predictions fill the ``'hhb'`` column.
        hbo2: Trained booster whose predictions fill the ``'hbo2'`` column.
        ca: Trained booster whose predictions fill the ``'ca'`` column.
        na: Trained booster whose predictions fill the ``'na'`` column.
        df: Frame to predict on; it is not modified.

    Returns:
        A copy of ``df`` with ``'hhb'``, ``'hbo2'``, ``'ca'`` and ``'na'``
        appended in that order.
    """
    # Features are everything from column position 37 onward.
    # NOTE(review): presumably this is one position later than the 36 used in
    # XY because the test frame still carries its "id" column -- confirm the
    # two slices select the same feature columns.
    features = xgb.DMatrix(df.iloc[:, 37:])

    # Build the DMatrix once and reuse it for all four boosters instead of
    # converting the same frame four times.
    df_1 = df.copy()
    df_1['hhb'] = hbb.predict(features)
    df_1['hbo2'] = hbo2.predict(features)
    df_1['ca'] = ca.predict(features)
    df_1['na'] = na.predict(features)

    return df_1

In [38]:
# Each test subset is scored with the boosters trained on the matching rho:
#   test_rho_25 -> hbb_25, hbo2_25, ca_25, na_25
#   test_rho_20 -> hbb_20, hbo2_20, ca_20, na_20
#   test_rho_15 -> hbb_15, hbo2_15, ca_15, na_15
#   test_rho_10 -> hbb_10, hbo2_10, ca_10, na_10

rho_25 = prediction(hbb=hbb_25, hbo2=hbo2_25, ca=ca_25, na=na_25, df=test_rho_25)
rho_20 = prediction(hbb=hbb_20, hbo2=hbo2_20, ca=ca_20, na=na_20, df=test_rho_20)
rho_15 = prediction(hbb=hbb_15, hbo2=hbo2_15, ca=ca_15, na=na_15, df=test_rho_15)
rho_10 = prediction(hbb=hbb_10, hbo2=hbo2_10, ca=ca_10, na=na_10, df=test_rho_10)

In [39]:
# Stack the per-rho predictions back together and order the rows by id.
temp = pd.concat([rho_25, rho_20, rho_15, rho_10])
temp = temp.sort_values(by='id')

# NOTE: `id` shadows the Python builtin, but the following cell reads this
# name, so it is kept as is.
id = temp['id']

# Keep only the last four columns (the predictions appended by `prediction`).
temp = temp.iloc[:, -4:]

In [40]:
# Put the id column in front of the four prediction columns.
submission = pd.concat([id, temp], axis=1)

# index=False: without it to_csv also writes the DataFrame index as an extra
# unnamed first column, which is not part of the id/hhb/hbo2/ca/na table.
submission.to_csv("submission/08.csv", index=False)

In [29]:
# Preview the first five rows of the submission table.
submission.head(n=5)

Unnamed: 0,id,hhb,hbo2,ca,na
0,10000,8.773984,5.076634,11.538292,1.604496
1,10001,5.408206,3.835741,6.606059,2.034667
2,10002,8.965022,4.644113,11.243861,3.110879
3,10003,8.475914,4.690307,9.270923,4.198197
4,10004,7.311491,2.663191,8.116082,3.273913


10000