In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the training CSV, then replace every missing value with 0.
data = pd.read_csv("../Data/Biometric Data Analysis/train.csv")
data = data.fillna(0)

In [3]:
# Remove the "id" column: XY() slices columns by position, so the training
# frame must not carry it. errors="ignore" keeps this cell re-runnable -- the
# previous in-place drop raised KeyError on a second execution because "id"
# was already gone.
data = data.drop(columns="id", errors="ignore")

# Distinct rho values present in the training data. Placed last so the cell
# actually displays it (previously the result was computed and discarded).
data.rho.unique()

In [4]:
# One training subset per rho value (25, 20, 15, 10); a separate set of
# models is fitted on each subset further down.
data_rho_25, data_rho_20, data_rho_15, data_rho_10 = (
    data.loc[data['rho'].eq(rho_value)] for rho_value in (25, 20, 15, 10)
)

### XGBoost

In [5]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [13]:
def XY(df):
    """Split a frame into its feature matrix and four target series.

    Columns are selected purely by position: the first column is skipped,
    the last four columns are the targets, and everything in between is
    the feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last four columns are the targets, in the order
        (hhb, hbo2, ca, na) expected by the callers.

    Returns
    -------
    tuple
        ``(X, y_hhb, y_hhbo2, y_ca, y_na)`` -- a DataFrame followed by
        four Series.
    """
    n_targets = 4
    features = df.iloc[:, 1:-n_targets]
    # Positions -4 .. -1, i.e. the trailing target columns in file order.
    targets = tuple(df.iloc[:, pos] for pos in range(-n_targets, 0))
    return (features, *targets)

In [41]:
def XG_Boost(df, num):
    """Train one XGBoost regressor per target on `df` and print hold-out RMSE.

    For each of the four targets returned by ``XY(df)`` the data is split
    60/40 into train and hold-out parts, a booster is trained for up to
    10000 rounds with early stopping (100 rounds) on the hold-out MAE, and
    the RMSE of the booster on that same hold-out part is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame in the layout expected by ``XY``.
    num : object
        Label printed in front of the RMSE summary (callers pass the rho
        value of the subset).

    Returns
    -------
    tuple
        ``(clf_hhb, clf_hbo2, clf_ca, clf_na)`` -- the four trained boosters.
    """
    X, y_hhb, y_hhbo2, y_ca, y_na = XY(df)

    # NOTE(review): `silent` is the legacy quiet flag; newer xgboost releases
    # use `verbosity` instead -- confirm against the installed version.
    params = {
        'eta': 0.02,
        'objective': 'reg:tweedie',
        'eval_metric': 'mae',
        'max_depth': 4,
        'silent': 1,
    }

    # (label used in the report, target series, split seed)
    targets = (
        ('hbb', y_hhb, 101),
        ('hbo2', y_hhbo2, 122),
        ('ca', y_ca, 157),
        ('na', y_na, 1932),
    )

    boosters = []
    scores = []
    for label, y, seed in targets:
        # The seeds were previously passed as `shuffle=<seed>`. `shuffle` is a
        # boolean switch, so the split was shuffled but never seeded and the
        # results were not reproducible. `random_state` is the seed argument.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.4, random_state=seed
        )

        d_train = xgb.DMatrix(X_train, label=y_train)
        d_test = xgb.DMatrix(X_test, label=y_test)

        # The last watchlist entry ('test') is the one early stopping monitors.
        watchlist = [(d_train, 'train'), (d_test, 'test')]
        booster = xgb.train(params, d_train, 10000, watchlist,
                            early_stopping_rounds=100, verbose_eval=10)

        # The hold-out part also drove early stopping, so this RMSE is an
        # optimistic estimate rather than a clean test score.
        preds = booster.predict(xgb.DMatrix(X_test))
        scores.append((label, np.sqrt(mean_squared_error(y_test, preds))))
        boosters.append(booster)

    print(num, " mm")
    for label, rmse in scores:
        # %-4s pads the label so the output lines match the original layout.
        print("RMSE - %-4s : %f" % (label, rmse))

    return tuple(boosters)

In [42]:
# Fit the four target models (hhb, hbo2, ca, na) on the rho == 25 subset.
hbb_25, hbo2_25, ca_25, na_25 = XG_Boost(df=data_rho_25, num=25)

[0]	train-mae:7.4568	test-mae:7.44821
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 100 rounds.
[10]	train-mae:7.24958	test-mae:7.24115
[20]	train-mae:6.97988	test-mae:6.97089
[30]	train-mae:6.64159	test-mae:6.63234
[40]	train-mae:6.23545	test-mae:6.22535
[50]	train-mae:5.77	test-mae:5.75862
[60]	train-mae:5.26213	test-mae:5.24915
[70]	train-mae:4.73073	test-mae:4.71925
[80]	train-mae:4.19789	test-mae:4.19078
[90]	train-mae:3.68653	test-mae:3.68601
[100]	train-mae:3.21075	test-mae:3.22085
[110]	train-mae:2.78176	test-mae:2.80521
[120]	train-mae:2.40477	test-mae:2.44117
[130]	train-mae:2.08347	test-mae:2.13184
[140]	train-mae:1.81436	test-mae:1.88042
[150]	train-mae:1.5917	test-mae:1.6816
[160]	train-mae:1.41314	test-mae:1.5292
[170]	train-mae:1.26709	test-mae:1.41658
[180]	train-mae:1.15239	test-mae:1.3337
[190]	train-mae:1.06282	test-mae:1.27189
[200]	train-mae:0.991834	test-mae:1.22842
[210]	train-mae

[50]	train-mae:2.32943	test-mae:2.30616
[60]	train-mae:2.05756	test-mae:2.04002
[70]	train-mae:1.79908	test-mae:1.78954
[80]	train-mae:1.56196	test-mae:1.56239
[90]	train-mae:1.35219	test-mae:1.36714
[100]	train-mae:1.17264	test-mae:1.20292
[110]	train-mae:1.02194	test-mae:1.06987
[120]	train-mae:0.899741	test-mae:0.964548
[130]	train-mae:0.803121	test-mae:0.884156
[140]	train-mae:0.727529	test-mae:0.824241
[150]	train-mae:0.671139	test-mae:0.778742
[160]	train-mae:0.628318	test-mae:0.745289
[170]	train-mae:0.594464	test-mae:0.720828
[180]	train-mae:0.568267	test-mae:0.704348
[190]	train-mae:0.546567	test-mae:0.692628
[200]	train-mae:0.528686	test-mae:0.684441
[210]	train-mae:0.514231	test-mae:0.677495
[220]	train-mae:0.500751	test-mae:0.672516
[230]	train-mae:0.490094	test-mae:0.669896
[240]	train-mae:0.480686	test-mae:0.666808
[250]	train-mae:0.471514	test-mae:0.664659
[260]	train-mae:0.463224	test-mae:0.662845
[270]	train-mae:0.454909	test-mae:0.6612
[280]	train-mae:0.447115	test-ma

In [43]:
# Fit the four target models (hhb, hbo2, ca, na) on the rho == 20 subset.
hbb_20, hbo2_20, ca_20, na_20 = XG_Boost(df=data_rho_20, num=20)

[0]	train-mae:7.33507	test-mae:7.59013
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 100 rounds.
[10]	train-mae:7.12906	test-mae:7.38406
[20]	train-mae:6.86125	test-mae:7.11574
[30]	train-mae:6.52604	test-mae:6.77977
[40]	train-mae:6.12374	test-mae:6.37637
[50]	train-mae:5.66332	test-mae:5.91387
[60]	train-mae:5.16016	test-mae:5.41072
[70]	train-mae:4.63451	test-mae:4.88903
[80]	train-mae:4.10867	test-mae:4.36732
[90]	train-mae:3.60171	test-mae:3.86433
[100]	train-mae:3.13098	test-mae:3.39561
[110]	train-mae:2.70496	test-mae:2.96975
[120]	train-mae:2.33235	test-mae:2.59979
[130]	train-mae:2.01381	test-mae:2.28763
[140]	train-mae:1.75146	test-mae:2.03139
[150]	train-mae:1.53743	test-mae:1.82793
[160]	train-mae:1.36545	test-mae:1.66929
[170]	train-mae:1.23017	test-mae:1.54363
[180]	train-mae:1.12499	test-mae:1.44578
[190]	train-mae:1.04387	test-mae:1.37008
[200]	train-mae:0.980194	test-mae:1.31288
[210]	t

[1920]	train-mae:0.216784	test-mae:1.05639
Stopping. Best iteration:
[1822]	train-mae:0.223472	test-mae:1.05632

[0]	train-mae:3.49748	test-mae:3.50436
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 100 rounds.
[10]	train-mae:3.32011	test-mae:3.32701
[20]	train-mae:3.10768	test-mae:3.11536
[30]	train-mae:2.86545	test-mae:2.87358
[40]	train-mae:2.60281	test-mae:2.61182
[50]	train-mae:2.33011	test-mae:2.3417
[60]	train-mae:2.05935	test-mae:2.07537
[70]	train-mae:1.80089	test-mae:1.82435
[80]	train-mae:1.56368	test-mae:1.59435
[90]	train-mae:1.35385	test-mae:1.3945
[100]	train-mae:1.1761	test-mae:1.22698
[110]	train-mae:1.029	test-mae:1.08975
[120]	train-mae:0.909264	test-mae:0.982726
[130]	train-mae:0.81419	test-mae:0.900296
[140]	train-mae:0.738723	test-mae:0.839274
[150]	train-mae:0.681034	test-mae:0.794844
[160]	train-mae:0.637035	test-mae:0.763337
[170]	train-mae:0.602286	test-mae:0.739502
[180]	train-

In [44]:
# Fit the four target models (hhb, hbo2, ca, na) on the rho == 15 subset.
hbb_15, hbo2_15, ca_15, na_15 = XG_Boost(df=data_rho_15, num=15)

[0]	train-mae:7.44181	test-mae:7.57582
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 100 rounds.
[10]	train-mae:7.23484	test-mae:7.37054
[20]	train-mae:6.96513	test-mae:7.10322
[30]	train-mae:6.62729	test-mae:6.7657
[40]	train-mae:6.22143	test-mae:6.36049
[50]	train-mae:5.75626	test-mae:5.89584
[60]	train-mae:5.24725	test-mae:5.38681
[70]	train-mae:4.71544	test-mae:4.8564
[80]	train-mae:4.18177	test-mae:4.32582
[90]	train-mae:3.66577	test-mae:3.81456
[100]	train-mae:3.18331	test-mae:3.34203
[110]	train-mae:2.7483	test-mae:2.91874
[120]	train-mae:2.36468	test-mae:2.55703
[130]	train-mae:2.03825	test-mae:2.25149
[140]	train-mae:1.7641	test-mae:2.00291
[150]	train-mae:1.54266	test-mae:1.80495
[160]	train-mae:1.36567	test-mae:1.6491
[170]	train-mae:1.22273	test-mae:1.52756
[180]	train-mae:1.11123	test-mae:1.43715
[190]	train-mae:1.02601	test-mae:1.36834
[200]	train-mae:0.960873	test-mae:1.31769
[210]	train-

[170]	train-mae:0.626848	test-mae:0.796395
[180]	train-mae:0.601132	test-mae:0.779776
[190]	train-mae:0.581965	test-mae:0.769633
[200]	train-mae:0.56468	test-mae:0.760938
[210]	train-mae:0.551078	test-mae:0.754299
[220]	train-mae:0.538338	test-mae:0.74924
[230]	train-mae:0.52819	test-mae:0.745446
[240]	train-mae:0.518452	test-mae:0.742667
[250]	train-mae:0.510321	test-mae:0.74093
[260]	train-mae:0.501283	test-mae:0.739944
[270]	train-mae:0.49505	test-mae:0.739014
[280]	train-mae:0.487444	test-mae:0.737676
[290]	train-mae:0.480215	test-mae:0.736219
[300]	train-mae:0.473367	test-mae:0.735672
[310]	train-mae:0.466299	test-mae:0.734997
[320]	train-mae:0.460238	test-mae:0.734966
[330]	train-mae:0.45337	test-mae:0.733932
[340]	train-mae:0.446356	test-mae:0.733161
[350]	train-mae:0.439409	test-mae:0.732854
[360]	train-mae:0.433936	test-mae:0.732426
[370]	train-mae:0.427155	test-mae:0.731578
[380]	train-mae:0.42191	test-mae:0.731191
[390]	train-mae:0.416707	test-mae:0.730929
[400]	train-mae:0.

In [18]:
# Fit the four target models (hhb, hbo2, ca, na) on the rho == 10 subset.
# NOTE(review): this cell's execution count (In [18]) is lower than that of
# the XG_Boost definition (In [41]), and its saved log reports 500-round early
# stopping while the visible XG_Boost uses 100. The saved output appears to
# come from an earlier version of the function -- re-run the notebook top to
# bottom so all four rho groups use the same training setup.
hbb_10, hbo2_10, ca_10, na_10 = XG_Boost(df=data_rho_10, num=10)

[0]	train-mae:7.48152	test-mae:7.52185
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 500 rounds.
[10]	train-mae:7.27495	test-mae:7.31521
[20]	train-mae:7.00532	test-mae:7.04596
[30]	train-mae:6.66628	test-mae:6.70702
[40]	train-mae:6.2598	test-mae:6.29778
[50]	train-mae:5.79345	test-mae:5.8278
[60]	train-mae:5.28349	test-mae:5.315
[70]	train-mae:4.75054	test-mae:4.7825
[80]	train-mae:4.21842	test-mae:4.25722
[90]	train-mae:3.70435	test-mae:3.75447
[100]	train-mae:3.22585	test-mae:3.2864
[110]	train-mae:2.79748	test-mae:2.86822
[120]	train-mae:2.42425	test-mae:2.51212
[130]	train-mae:2.10579	test-mae:2.21674
[140]	train-mae:1.8411	test-mae:1.9756
[150]	train-mae:1.62434	test-mae:1.78421
[160]	train-mae:1.45173	test-mae:1.63664
[170]	train-mae:1.31563	test-mae:1.52388
[180]	train-mae:1.20938	test-mae:1.43878
[190]	train-mae:1.12773	test-mae:1.37305
[200]	train-mae:1.06428	test-mae:1.32192
[210]	train-mae:

[1920]	train-mae:0.298885	test-mae:1.07135
[1930]	train-mae:0.297606	test-mae:1.07126
[1940]	train-mae:0.295949	test-mae:1.07124
[1950]	train-mae:0.294609	test-mae:1.07121
[1960]	train-mae:0.294037	test-mae:1.07133
[1970]	train-mae:0.292921	test-mae:1.07133
[1980]	train-mae:0.291998	test-mae:1.07114
[1990]	train-mae:0.291325	test-mae:1.07117
[2000]	train-mae:0.290314	test-mae:1.07108
[2010]	train-mae:0.288695	test-mae:1.07062
[2020]	train-mae:0.287654	test-mae:1.07059
[2030]	train-mae:0.286752	test-mae:1.07067
[2040]	train-mae:0.285697	test-mae:1.07046
[2050]	train-mae:0.284805	test-mae:1.0705
[2060]	train-mae:0.284221	test-mae:1.07039
[2070]	train-mae:0.283419	test-mae:1.0705
[2080]	train-mae:0.281605	test-mae:1.0701
[2090]	train-mae:0.280889	test-mae:1.07017
[2100]	train-mae:0.279787	test-mae:1.07028
[2110]	train-mae:0.278793	test-mae:1.07017
[2120]	train-mae:0.278377	test-mae:1.07015
[2130]	train-mae:0.277037	test-mae:1.07004
[2140]	train-mae:0.275948	test-mae:1.06989
[2150]	train-m

[3840]	train-mae:0.176391	test-mae:1.06107
[3850]	train-mae:0.175705	test-mae:1.06116
[3860]	train-mae:0.174947	test-mae:1.06114
[3870]	train-mae:0.174734	test-mae:1.06111
[3880]	train-mae:0.174352	test-mae:1.06116
[3890]	train-mae:0.17383	test-mae:1.06105
[3900]	train-mae:0.173357	test-mae:1.06114
[3910]	train-mae:0.172719	test-mae:1.06113
[3920]	train-mae:0.172254	test-mae:1.06112
[3930]	train-mae:0.171925	test-mae:1.06112
[3940]	train-mae:0.171682	test-mae:1.06113
[3950]	train-mae:0.171369	test-mae:1.06115
[3960]	train-mae:0.170709	test-mae:1.06117
[3970]	train-mae:0.170628	test-mae:1.06115
[3980]	train-mae:0.170176	test-mae:1.06116
[3990]	train-mae:0.169797	test-mae:1.06119
[4000]	train-mae:0.169176	test-mae:1.06124
[4010]	train-mae:0.16867	test-mae:1.0613
[4020]	train-mae:0.168627	test-mae:1.06126
[4030]	train-mae:0.168236	test-mae:1.06127
[4040]	train-mae:0.16785	test-mae:1.06123
[4050]	train-mae:0.167348	test-mae:1.0613
[4060]	train-mae:0.167003	test-mae:1.06127
[4070]	train-mae

[930]	train-mae:0.303709	test-mae:0.713992
[940]	train-mae:0.30118	test-mae:0.713707
[950]	train-mae:0.299534	test-mae:0.713782
[960]	train-mae:0.29746	test-mae:0.713784
[970]	train-mae:0.294972	test-mae:0.713647
[980]	train-mae:0.292941	test-mae:0.713432
[990]	train-mae:0.291249	test-mae:0.713261
[1000]	train-mae:0.289755	test-mae:0.713621
[1010]	train-mae:0.286931	test-mae:0.713614
[1020]	train-mae:0.285243	test-mae:0.713691
[1030]	train-mae:0.282795	test-mae:0.713856
[1040]	train-mae:0.280726	test-mae:0.7139
[1050]	train-mae:0.278678	test-mae:0.714153
[1060]	train-mae:0.277309	test-mae:0.714141
[1070]	train-mae:0.275828	test-mae:0.714365
[1080]	train-mae:0.273821	test-mae:0.71447
[1090]	train-mae:0.271778	test-mae:0.714443
[1100]	train-mae:0.269663	test-mae:0.714217
[1110]	train-mae:0.268184	test-mae:0.714161
[1120]	train-mae:0.265589	test-mae:0.713957
[1130]	train-mae:0.263804	test-mae:0.713763
[1140]	train-mae:0.261941	test-mae:0.713507
[1150]	train-mae:0.260112	test-mae:0.713362


[2810]	train-mae:0.087053	test-mae:0.704605
[2820]	train-mae:0.086596	test-mae:0.704623
[2830]	train-mae:0.086131	test-mae:0.70452
[2840]	train-mae:0.085735	test-mae:0.704492
[2850]	train-mae:0.085267	test-mae:0.70461
[2860]	train-mae:0.084634	test-mae:0.704656
[2870]	train-mae:0.084092	test-mae:0.704696
[2880]	train-mae:0.083551	test-mae:0.704727
[2890]	train-mae:0.083133	test-mae:0.704642
[2900]	train-mae:0.082567	test-mae:0.704734
[2910]	train-mae:0.082148	test-mae:0.704814
[2920]	train-mae:0.081628	test-mae:0.704843
[2930]	train-mae:0.081199	test-mae:0.704857
[2940]	train-mae:0.080707	test-mae:0.704945
[2950]	train-mae:0.080171	test-mae:0.704961
[2960]	train-mae:0.079662	test-mae:0.704934
[2970]	train-mae:0.07913	test-mae:0.704972
[2980]	train-mae:0.07855	test-mae:0.704966
[2990]	train-mae:0.07805	test-mae:0.704893
[3000]	train-mae:0.077657	test-mae:0.704889
[3010]	train-mae:0.077271	test-mae:0.704961
[3020]	train-mae:0.076777	test-mae:0.705077
[3030]	train-mae:0.076225	test-mae:0.

[340]	train-mae:0.885025	test-mae:1.16256
[350]	train-mae:0.876405	test-mae:1.16141
[360]	train-mae:0.867961	test-mae:1.16096
[370]	train-mae:0.856895	test-mae:1.16089
[380]	train-mae:0.847842	test-mae:1.15927
[390]	train-mae:0.837856	test-mae:1.15729
[400]	train-mae:0.830091	test-mae:1.15627
[410]	train-mae:0.822911	test-mae:1.15533
[420]	train-mae:0.814321	test-mae:1.15538
[430]	train-mae:0.806538	test-mae:1.15583
[440]	train-mae:0.800059	test-mae:1.15586
[450]	train-mae:0.793355	test-mae:1.15579
[460]	train-mae:0.786495	test-mae:1.15577
[470]	train-mae:0.779632	test-mae:1.15457
[480]	train-mae:0.771869	test-mae:1.15409
[490]	train-mae:0.763309	test-mae:1.1539
[500]	train-mae:0.756202	test-mae:1.15334
[510]	train-mae:0.749917	test-mae:1.15303
[520]	train-mae:0.743446	test-mae:1.15248
[530]	train-mae:0.736687	test-mae:1.15233
[540]	train-mae:0.731268	test-mae:1.15261
[550]	train-mae:0.723243	test-mae:1.15209
[560]	train-mae:0.715757	test-mae:1.15208
[570]	train-mae:0.710182	test-mae:1

In [30]:
# Read the test CSV, then replace every missing value with 0 (same treatment
# as the training data).
test = pd.read_csv("../Data/Biometric Data Analysis/test.csv")
test = test.fillna(0)

In [31]:
# One test subset per rho value, so each can be scored by the models that
# were trained on the matching rho group.
test_rho_25, test_rho_20, test_rho_15, test_rho_10 = (
    test.loc[test['rho'].eq(rho_value)] for rho_value in (25, 20, 15, 10)
)

In [35]:
def prediction(hbb, hbo2, ca, na, df):
    """Score `df` with the four target models and return a copy with the results.

    Parameters
    ----------
    hbb, hbo2, ca, na : trained models
        Each must expose ``predict(xgb.DMatrix)``; callers pass the boosters
        returned by ``XG_Boost``.
    df : pandas.DataFrame
        Rows to score. The first two columns are skipped (presumably ``id``
        and ``rho`` -- verify against the test file); all remaining columns
        are used as features.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the columns ``'hhb'``, ``'hbo2'``, ``'ca'`` and
        ``'na'`` set to the respective predictions. `df` itself is not modified.
    """
    # Build the feature matrix once and reuse it for all four models; the
    # original wrapped the same frame in a new DMatrix for every prediction.
    features = xgb.DMatrix(df.iloc[:, 2:])

    # Work on a copy so a filtered slice of the test frame can be passed
    # without mutating it.
    scored = df.copy()
    for column, model in (('hhb', hbb), ('hbo2', hbo2), ('ca', ca), ('na', na)):
        scored[column] = model.predict(features)

    return scored

In [36]:
# Score each rho-specific test subset with the models trained on the same
# rho group.
rho_25 = prediction(hbb=hbb_25, hbo2=hbo2_25, ca=ca_25, na=na_25, df=test_rho_25)
rho_20 = prediction(hbb=hbb_20, hbo2=hbo2_20, ca=ca_20, na=na_20, df=test_rho_20)
rho_15 = prediction(hbb=hbb_15, hbo2=hbo2_15, ca=ca_15, na=na_15, df=test_rho_15)
rho_10 = prediction(hbb=hbb_10, hbo2=hbo2_10, ca=ca_10, na=na_10, df=test_rho_10)

In [37]:
# Stack the four scored subsets and restore the original row order by id.
temp = pd.concat((rho_25, rho_20, rho_15, rho_10)).sort_values(by='id')

# Keep the ids separately; the next cell reads this name.
# NOTE(review): `id` shadows the Python builtin of the same name.
id = temp['id']

# The last four columns are the predictions appended by prediction().
temp = temp.iloc[:, -4:]

In [38]:
# Put the id column in front of the four prediction columns.
submission = pd.concat([id, temp], axis=1)

# index=False keeps the DataFrame's row index out of the file; without it
# to_csv writes the index as an extra unnamed first column ahead of "id".
submission.to_csv("submission/tweedie.csv", index=False)

In [39]:
# Preview the first rows of the submission frame as a sanity check.
submission.head()

Unnamed: 0,id,hhb,hbo2,ca,na
0,10000,8.835473,4.485795,10.227639,2.759007
1,10001,6.017116,3.917904,8.52475,2.39406
2,10002,9.950055,5.21766,11.239717,3.120166
3,10003,8.282282,4.193402,9.230623,4.488401
4,10004,4.986564,3.290931,7.356716,3.101167


10000