In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the training set; the path is relative to the notebook's working directory.
data = pd.read_csv("../Data/Biometric Data Analysis/train.csv")

In [3]:
# Clean the columns whose names end in "_dst":
#   1. treat zeros as missing,
#   2. fill gaps by linear interpolation along each row (axis=1),
#   3. mark whatever is still missing with the sentinel -999,
# then write the cleaned values back into `data`.
# The keyword is `method` (the original `methods` was a misspelling), and
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
data_dst = (
    data.filter(regex='_dst$', axis=1)
    .replace(0, np.nan)
    .interpolate(method='linear', axis=1)
    .fillna(-999)
)
data.update(data_dst)

In [4]:
# Look up the distinct rho values (not displayed, since this is not the cell's
# last expression), then remove the id column from the training frame in place.
data['rho'].unique()
data.drop(columns="id", inplace=True)

In [5]:
# Partition the training frame into one subset per rho value.

data_rho_25, data_rho_20, data_rho_15, data_rho_10 = (
    data[data['rho'] == rho_value] for rho_value in (25, 20, 15, 10)
)

### XG Boost

In [6]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [7]:
def XY(df):
    """Split a frame into a feature block and four target columns by position.

    The first column is excluded, the last four columns are returned one by
    one as the targets, and everything in between forms the feature frame.

    Returns
    -------
    tuple
        ``(X, y_hhb, y_hhbo2, y_ca, y_na)`` where the four targets are the
        4th-, 3rd-, 2nd- and last-from-the-end columns respectively.
    """
    features = df.iloc[:, 1:-4]
    # Walk the last four column positions (-4, -3, -2, -1) in order.
    hhb, hhbo2, ca, na = (df.iloc[:, position] for position in range(-4, 0))
    return features, hhb, hhbo2, ca, na

In [28]:
def _fit_target(X, y, params, seed, patience):
    """Fit one booster for a single target and score it on a hold-out split.

    Parameters
    ----------
    X : feature frame.
    y : target series aligned with ``X``.
    params : dict of XGBoost training parameters.
    seed : int used as ``random_state`` so the 60/40 split is repeatable.
    patience : int passed to ``xgb.train`` as ``early_stopping_rounds``.

    Returns
    -------
    tuple
        ``(booster, rmse)`` - the trained booster and its RMSE on the
        hold-out rows.
    """
    # The seed must go to `random_state`; `shuffle` is only an on/off switch.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=seed
    )

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_test = xgb.DMatrix(X_test, label=y_test)
    # The last entry of the watchlist ('test') drives early stopping.
    watchlist = [(d_train, 'train'), (d_test, 'test')]

    booster = xgb.train(
        params, d_train, 10000, watchlist,
        early_stopping_rounds=patience, verbose_eval=10,
    )

    # NOTE(review): whether predict() uses the best or the last iteration
    # after early stopping depends on the installed xgboost version - confirm.
    preds = booster.predict(xgb.DMatrix(X_test))
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return booster, rmse


def XG_Boost(df, num):
    """Train one XGBoost regressor per target and report hold-out RMSE.

    The frame is split into features and four targets with ``XY``. Each target
    gets its own 60/40 train/hold-out split (with its own fixed seed) and its
    own early-stopping patience.

    Parameters
    ----------
    df : frame laid out as ``XY`` expects (features followed by four targets).
    num : value printed as the "<num> mm" header of the RMSE report.

    Returns
    -------
    tuple
        ``(clf_hhb, clf_hbo2, clf_ca, clf_na)`` - the four trained boosters.
    """
    X, y_hhb, y_hhbo2, y_ca, y_na = XY(df)

    params = {
        'eta': 0.007,
        # 'reg:squarederror' is the current name of the deprecated 'reg:linear'.
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'max_depth': 8,
        # 'verbosity': 0 replaces the deprecated/removed 'silent': 1.
        'verbosity': 0,
    }

    clf_hhb, rmse_hhb = _fit_target(X, y_hhb, params, seed=101, patience=800)
    clf_hbo2, rmse_hbo2 = _fit_target(X, y_hhbo2, params, seed=122, patience=100)
    clf_ca, rmse_ca = _fit_target(X, y_ca, params, seed=157, patience=1000)
    clf_na, rmse_na = _fit_target(X, y_na, params, seed=1932, patience=1000)

    print(num, " mm")
    print("RMSE - hbb  : %f" % (rmse_hhb))
    print("RMSE - hbo2 : %f" % (rmse_hbo2))
    print("RMSE - ca   : %f" % (rmse_ca))
    print("RMSE - na   : %f" % (rmse_na))

    return clf_hhb, clf_hbo2, clf_ca, clf_na

In [29]:
# Train the four target models (hhb, hbo2, ca, na) on the rho == 25 subset.
# The other subsets (data_rho_20, data_rho_15, data_rho_10) are trained in the cells below.

hbb_25, hbo2_25, ca_25, na_25 = XG_Boost(data_rho_25, 25)

[0]	train-mae:7.36819	test-mae:7.4946
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 800 rounds.
[10]	train-mae:6.8753	test-mae:6.99923
[20]	train-mae:6.41583	test-mae:6.53634
[30]	train-mae:5.98738	test-mae:6.10433
[40]	train-mae:5.58789	test-mae:5.70404
[50]	train-mae:5.21565	test-mae:5.32978
[60]	train-mae:4.86851	test-mae:4.98278
[70]	train-mae:4.54564	test-mae:4.66105
[80]	train-mae:4.24453	test-mae:4.36375
[90]	train-mae:3.96384	test-mae:4.0871
[100]	train-mae:3.7022	test-mae:3.82994
[110]	train-mae:3.45842	test-mae:3.59212
[120]	train-mae:3.23122	test-mae:3.37193
[130]	train-mae:3.01942	test-mae:3.16792
[140]	train-mae:2.82219	test-mae:2.97883
[150]	train-mae:2.63786	test-mae:2.80297
[160]	train-mae:2.46587	test-mae:2.64144
[170]	train-mae:2.30546	test-mae:2.49144
[180]	train-mae:2.1557	test-mae:2.3529
[190]	train-mae:2.01584	test-mae:2.22441
[200]	train-mae:1.88546	test-mae:2.10856
[210]	train-ma

[1890]	train-mae:0.02665	test-mae:0.985385
[1900]	train-mae:0.026308	test-mae:0.985365
[1910]	train-mae:0.025985	test-mae:0.98534
[1920]	train-mae:0.025633	test-mae:0.985309
[1930]	train-mae:0.025307	test-mae:0.985279
[1940]	train-mae:0.02501	test-mae:0.985258
[1950]	train-mae:0.024705	test-mae:0.985226
[1960]	train-mae:0.024412	test-mae:0.985211
[1970]	train-mae:0.024155	test-mae:0.985193
[1980]	train-mae:0.023894	test-mae:0.985162
[1990]	train-mae:0.02366	test-mae:0.985137
[2000]	train-mae:0.023399	test-mae:0.985104
[2010]	train-mae:0.023162	test-mae:0.985081
[2020]	train-mae:0.02293	test-mae:0.985071
[2030]	train-mae:0.02269	test-mae:0.985063
[2040]	train-mae:0.022464	test-mae:0.985053
[2050]	train-mae:0.022239	test-mae:0.985047
[2060]	train-mae:0.02202	test-mae:0.985047
[2070]	train-mae:0.021796	test-mae:0.985045
[2080]	train-mae:0.0216	test-mae:0.985045
[2090]	train-mae:0.0214	test-mae:0.985031
[2100]	train-mae:0.021156	test-mae:0.98501
[2110]	train-mae:0.020957	test-mae:0.984987


[3770]	train-mae:0.003825	test-mae:0.984143
[3780]	train-mae:0.00378	test-mae:0.984139
[3790]	train-mae:0.003737	test-mae:0.984136
[3800]	train-mae:0.003694	test-mae:0.984132
[3810]	train-mae:0.003647	test-mae:0.984129
[3820]	train-mae:0.003604	test-mae:0.984128
[3830]	train-mae:0.003569	test-mae:0.984128
[3840]	train-mae:0.003535	test-mae:0.984125
[3850]	train-mae:0.003501	test-mae:0.984121
[3860]	train-mae:0.003466	test-mae:0.984118
[3870]	train-mae:0.003428	test-mae:0.984113
[3880]	train-mae:0.003393	test-mae:0.98411
[3890]	train-mae:0.003353	test-mae:0.984109
[3900]	train-mae:0.003309	test-mae:0.984106
[3910]	train-mae:0.003274	test-mae:0.984105
[3920]	train-mae:0.003235	test-mae:0.984103
[3930]	train-mae:0.0032	test-mae:0.984103
[3940]	train-mae:0.00317	test-mae:0.984103
[3950]	train-mae:0.003139	test-mae:0.984103
[3960]	train-mae:0.003104	test-mae:0.984102
[3970]	train-mae:0.003065	test-mae:0.984103
[3980]	train-mae:0.003027	test-mae:0.984101
[3990]	train-mae:0.002996	test-mae:0.

[5640]	train-mae:0.000655	test-mae:0.984011
[5650]	train-mae:0.000655	test-mae:0.984011
[5660]	train-mae:0.000655	test-mae:0.984011
[5670]	train-mae:0.000654	test-mae:0.984011
[5680]	train-mae:0.000654	test-mae:0.984011
[5690]	train-mae:0.000654	test-mae:0.984011
[5700]	train-mae:0.000654	test-mae:0.984011
[5710]	train-mae:0.000654	test-mae:0.984011
[5720]	train-mae:0.000654	test-mae:0.984011
[5730]	train-mae:0.000654	test-mae:0.984011
[5740]	train-mae:0.000654	test-mae:0.984011
[5750]	train-mae:0.000654	test-mae:0.984011
[5760]	train-mae:0.000654	test-mae:0.984011
[5770]	train-mae:0.000654	test-mae:0.984011
[5780]	train-mae:0.000654	test-mae:0.984011
[5790]	train-mae:0.000654	test-mae:0.984011
[5800]	train-mae:0.000654	test-mae:0.984011
[5810]	train-mae:0.000654	test-mae:0.984011
[5820]	train-mae:0.000654	test-mae:0.984011
[5830]	train-mae:0.000654	test-mae:0.984011
[5840]	train-mae:0.000654	test-mae:0.984011
[5850]	train-mae:0.000654	test-mae:0.984011
[5860]	train-mae:0.000654	test-m

[1290]	train-mae:0.068996	test-mae:0.60963
[1300]	train-mae:0.068427	test-mae:0.609574
[1310]	train-mae:0.067803	test-mae:0.609548
[1320]	train-mae:0.06714	test-mae:0.609534
[1330]	train-mae:0.066464	test-mae:0.609496
[1340]	train-mae:0.065892	test-mae:0.60944
[1350]	train-mae:0.065268	test-mae:0.609403
[1360]	train-mae:0.064592	test-mae:0.609363
[1370]	train-mae:0.064004	test-mae:0.60931
[1380]	train-mae:0.063372	test-mae:0.609285
[1390]	train-mae:0.062797	test-mae:0.609212
[1400]	train-mae:0.062256	test-mae:0.609151
[1410]	train-mae:0.061652	test-mae:0.609087
[1420]	train-mae:0.0611	test-mae:0.609032
[1430]	train-mae:0.060538	test-mae:0.608975
[1440]	train-mae:0.059908	test-mae:0.608917
[1450]	train-mae:0.059372	test-mae:0.608902
[1460]	train-mae:0.058934	test-mae:0.608849
[1470]	train-mae:0.058482	test-mae:0.608805
[1480]	train-mae:0.057963	test-mae:0.608756
[1490]	train-mae:0.057458	test-mae:0.608754
[1500]	train-mae:0.056909	test-mae:0.60873
[1510]	train-mae:0.056337	test-mae:0.60

[610]	train-mae:0.603049	test-mae:2.05518
[620]	train-mae:0.596052	test-mae:2.05533
[630]	train-mae:0.589919	test-mae:2.05572
[640]	train-mae:0.583157	test-mae:2.05606
[650]	train-mae:0.573824	test-mae:2.05649
[660]	train-mae:0.564893	test-mae:2.05655
[670]	train-mae:0.558319	test-mae:2.05716
[680]	train-mae:0.552294	test-mae:2.0576
[690]	train-mae:0.547166	test-mae:2.05789
[700]	train-mae:0.541573	test-mae:2.05796
[710]	train-mae:0.536435	test-mae:2.05826
[720]	train-mae:0.532115	test-mae:2.05861
[730]	train-mae:0.527009	test-mae:2.05904
[740]	train-mae:0.521176	test-mae:2.05934
[750]	train-mae:0.515656	test-mae:2.05959
[760]	train-mae:0.5097	test-mae:2.05944
[770]	train-mae:0.504032	test-mae:2.05961
[780]	train-mae:0.496811	test-mae:2.05959
[790]	train-mae:0.490342	test-mae:2.05993
[800]	train-mae:0.483833	test-mae:2.06018
[810]	train-mae:0.477892	test-mae:2.06049
[820]	train-mae:0.471538	test-mae:2.0606
[830]	train-mae:0.466546	test-mae:2.0607
[840]	train-mae:0.461035	test-mae:2.061

[920]	train-mae:0.47825	test-mae:1.54981
[930]	train-mae:0.471421	test-mae:1.54979
[940]	train-mae:0.465263	test-mae:1.55002
[950]	train-mae:0.458631	test-mae:1.55028
[960]	train-mae:0.452585	test-mae:1.55068
[970]	train-mae:0.44754	test-mae:1.55069
[980]	train-mae:0.442696	test-mae:1.5509
[990]	train-mae:0.436909	test-mae:1.55114
[1000]	train-mae:0.432076	test-mae:1.55143
[1010]	train-mae:0.42695	test-mae:1.55171
[1020]	train-mae:0.421996	test-mae:1.55182
[1030]	train-mae:0.417753	test-mae:1.55199
[1040]	train-mae:0.413275	test-mae:1.55208
[1050]	train-mae:0.408347	test-mae:1.55225
[1060]	train-mae:0.403665	test-mae:1.5524
[1070]	train-mae:0.399302	test-mae:1.55239
[1080]	train-mae:0.395918	test-mae:1.5524
[1090]	train-mae:0.391558	test-mae:1.55237
[1100]	train-mae:0.387422	test-mae:1.55235
[1110]	train-mae:0.382804	test-mae:1.55234
[1120]	train-mae:0.378408	test-mae:1.55248
[1130]	train-mae:0.373846	test-mae:1.55257
[1140]	train-mae:0.369033	test-mae:1.55276
[1150]	train-mae:0.364981

In [30]:
# Train the four target models on the rho == 20 subset.
# NOTE(review): the saved output of this cell ends with KeyboardInterrupt, so the
# recorded run did not finish; re-run to completion before relying on these models.
hbb_20, hbo2_20, ca_20, na_20 = XG_Boost(data_rho_20, 20)

[0]	train-mae:7.44565	test-mae:7.40238
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 800 rounds.
[10]	train-mae:6.94837	test-mae:6.90461
[20]	train-mae:6.48465	test-mae:6.44181
[30]	train-mae:6.05202	test-mae:6.01126
[40]	train-mae:5.64901	test-mae:5.61175
[50]	train-mae:5.27358	test-mae:5.2409
[60]	train-mae:4.92346	test-mae:4.89581
[70]	train-mae:4.59682	test-mae:4.57328
[80]	train-mae:4.29227	test-mae:4.27491
[90]	train-mae:4.00839	test-mae:3.99768
[100]	train-mae:3.74383	test-mae:3.74102
[110]	train-mae:3.49709	test-mae:3.5027
[120]	train-mae:3.26704	test-mae:3.28005
[130]	train-mae:3.05248	test-mae:3.07415
[140]	train-mae:2.85274	test-mae:2.88379
[150]	train-mae:2.66669	test-mae:2.70724
[160]	train-mae:2.49255	test-mae:2.54627
[170]	train-mae:2.33035	test-mae:2.39579
[180]	train-mae:2.17909	test-mae:2.25653
[190]	train-mae:2.03805	test-mae:2.12897
[200]	train-mae:1.9063	test-mae:2.01084
[210]	train

KeyboardInterrupt: 

In [None]:
# Train the four target models on the rho == 15 subset.
# NOTE(review): this cell has no execution count in the saved notebook, yet the
# prediction cell further down uses its results - run it before predicting.
hbb_15, hbo2_15, ca_15, na_15 = XG_Boost(data_rho_15, 15)

In [None]:
# Train the four target models on the rho == 10 subset.
# NOTE(review): this cell has no execution count in the saved notebook, yet the
# prediction cell further down uses its results - run it before predicting.
hbb_10, hbo2_10, ca_10, na_10 = XG_Boost(data_rho_10, 10)

In [14]:
# Load the test set; the path is relative to the notebook's working directory.
test = pd.read_csv("../Data/Biometric Data Analysis/test.csv")

In [15]:
# Apply the same cleaning to the test "_dst" columns as was applied to the
# training frame: zeros become missing, gaps are filled by linear interpolation
# along each row (axis=1), and anything still missing gets the sentinel -999.
# The keyword is `method` (the original `methods` was a misspelling), and
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
test_dst = (
    test.filter(regex='_dst$', axis=1)
    .replace(0, np.nan)
    .interpolate(method='linear', axis=1)
    .fillna(-999)
)

test.update(test_dst)

In [16]:
# Partition the test frame into one subset per rho value.
test_rho_25, test_rho_20, test_rho_15, test_rho_10 = (
    test[test['rho'] == rho_value] for rho_value in (25, 20, 15, 10)
)

In [17]:
def prediction(hbb, hbo2, ca, na, df):
    """Predict the four targets for ``df`` and return them attached to a copy.

    Parameters
    ----------
    hbb, hbo2, ca, na : trained boosters, one per target.
    df : frame whose columns from position 2 onward are the model features
        (the first two columns are skipped - presumably id and rho; verify
        against the caller).

    Returns
    -------
    DataFrame
        A new frame equal to ``df`` with ``hhb``, ``hbo2``, ``ca`` and ``na``
        set to the corresponding predictions; ``df`` itself is not modified.
    """
    # Build the DMatrix once and share it across the four boosters instead of
    # rebuilding it per target. The name avoids shadowing the notebook-level
    # `test` frame.
    features = xgb.DMatrix(df.iloc[:, 2:])

    return df.assign(
        hhb=hbb.predict(features),
        hbo2=hbo2.predict(features),
        ca=ca.predict(features),
        na=na.predict(features),
    )

In [18]:
# Score each rho-specific test subset with the boosters trained on the
# training subset of the same rho:
# test_rho_25 - hbb_25, hbo2_25, ca_25, na_25
# test_rho_20 - hbb_20, hbo2_20, ca_20, na_20
# test_rho_15 - hbb_15, hbo2_15, ca_15, na_15
# test_rho_10 - hbb_10, hbo2_10, ca_10, na_10

rho_25 = prediction(hbb_25, hbo2_25, ca_25, na_25, test_rho_25)
rho_20 = prediction(hbb_20, hbo2_20, ca_20, na_20, test_rho_20)
rho_15 = prediction(hbb_15, hbo2_15, ca_15, na_15, test_rho_15)
rho_10 = prediction(hbb_10, hbo2_10, ca_10, na_10, test_rho_10)

In [19]:
# Stack the per-rho prediction frames and sort the rows by id.
temp = pd.concat([rho_25, rho_20, rho_15, rho_10]).sort_values('id')
# NOTE(review): `id` shadows the Python builtin of the same name; a later cell reads it.
id = temp.id
# Keep only the last four columns - presumably the hhb/hbo2/ca/na predictions
# added by prediction(); verify the column order if the test schema changes.
temp = temp.iloc[:, -4:]

In [20]:
# Put the id column next to the four prediction columns and write the result.
submission = pd.concat([id, temp], axis=1)
# index=False keeps the row index out of the file; without it the CSV gains an
# extra unnamed first column ahead of id.
submission.to_csv("submission/tweedie04.csv", index=False)

In [39]:
# Preview the first rows of the submission frame as a sanity check.
submission.head()

Unnamed: 0,id,hhb,hbo2,ca,na
0,10000,8.835473,4.485795,10.227639,2.759007
1,10001,6.017116,3.917904,8.52475,2.39406
2,10002,9.950055,5.21766,11.239717,3.120166
3,10003,8.282282,4.193402,9.230623,4.488401
4,10004,4.986564,3.290931,7.356716,3.101167


10000