In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the training set; every missing value is replaced with the sentinel -999.
data = pd.read_csv("../Data/Biometric Data Analysis/train.csv").fillna(-999)

In [3]:
# Remove the row identifier so it is not picked up as a feature column.
# Rebinding `data` (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError on the already-missing "id" column.
data = data.drop(columns="id", errors="ignore")

# Show the distinct rho values present in the training data. This is the
# cell's last expression so that the notebook actually displays the result.
data.rho.unique()

In [4]:
# Partition the training rows by rho: one frame per rho value.

data_rho_25 = data.loc[data['rho'].eq(25)]
data_rho_20 = data.loc[data['rho'].eq(20)]
data_rho_15 = data.loc[data['rho'].eq(15)]
data_rho_10 = data.loc[data['rho'].eq(10)]

### XGBoost

In [5]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [6]:
def XY(df):
    """Split a frame into its feature block and its four trailing columns.

    Everything except the last four columns is returned as the feature
    frame; the last four columns are returned individually, left to right.

    Returns
    -------
    tuple
        (X, y_hhb, y_hhbo2, y_ca, y_na)
    """
    features = df.iloc[:, :-4]
    # Negative offsets 4..1 select the four trailing columns in file order.
    targets = [df.iloc[:, -offset] for offset in (4, 3, 2, 1)]
    return (features, *targets)

In [7]:
def XG_Boost(df, num):
    """Train one XGBoost regressor per target on `df` and print hold-out RMSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame in the layout expected by `XY`: feature columns followed by the
        four target columns.
    num : int
        Label printed in the report header, followed by " mm" (callers pass
        the rho value of the subset).

    Returns
    -------
    tuple
        The four fitted regressors in the order (hhb, hbo2, ca, na).
    """
    X, y_hhb, y_hhbo2, y_ca, y_na = XY(df)

    # A single seeded 80/20 split shared by all four targets.
    # The original passed the seed as `shuffle=1234`; `shuffle` is only a
    # boolean flag, so the splits were never seeded (and newer scikit-learn
    # versions reject the value). The seed belongs in `random_state`.
    (X_train, X_test,
     y_hhb_train, y_hhb_test,
     y_hhbo2_train, y_hhbo2_test,
     y_ca_train, y_ca_test,
     y_na_train, y_na_test) = train_test_split(
        X, y_hhb, y_hhbo2, y_ca, y_na,
        test_size=0.2, shuffle=True, random_state=1234)

    # Identical hyper-parameters for every target.
    # NOTE(review): `silent` and `nthread` are kept for compatibility with the
    # XGBoost version this notebook was written against; recent releases use
    # `verbosity` and `n_jobs` instead -- confirm against the installed version.
    params = dict(colsample_bytree=0.4603, gamma=0.0468,
                  learning_rate=0.05, max_depth=7,
                  min_child_weight=1.7817, n_estimators=2200,
                  reg_alpha=0.4640, reg_lambda=0.8571,
                  subsample=0.5213, silent=1,
                  random_state=7, nthread=-1)

    # (report label, training target, hold-out target), in return order.
    targets = (
        ("hbb", y_hhb_train, y_hhb_test),
        ("hbo2", y_hhbo2_train, y_hhbo2_test),
        ("ca", y_ca_train, y_ca_test),
        ("na", y_na_train, y_na_test),
    )

    models = []
    scores = []
    for label, y_train, y_test in targets:
        model = xgb.XGBRegressor(**params)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        scores.append((label, np.sqrt(mean_squared_error(y_test, preds))))
        models.append(model)

    print(num, " mm")
    for label, rmse in scores:
        # Pad the label to five characters so the colons line up.
        print("RMSE - %-5s: %f" % (label, rmse))

    return tuple(models)

In [8]:
# Train a separate set of four regressors (hbb, hbo2, ca, na) for each rho
# subset: data_rho_25, data_rho_20, data_rho_15, data_rho_10.
# Each call also prints the hold-out RMSE per target for that subset.

hbb_25, hbo2_25, ca_25, na_25 = XG_Boost(data_rho_25, 25)

hbb_20, hbo2_20, ca_20, na_20 = XG_Boost(data_rho_20, 20)

hbb_15, hbo2_15, ca_15, na_15 = XG_Boost(data_rho_15, 15)

hbb_10, hbo2_10, ca_10, na_10 = XG_Boost(data_rho_10, 10)

25  mm
RMSE - hbb  : 1.260773
RMSE - hbo2 : 0.809058
RMSE - ca   : 2.511896
RMSE - na   : 1.968474
20  mm
RMSE - hbb  : 1.155488
RMSE - hbo2 : 0.797178
RMSE - ca   : 2.524110
RMSE - na   : 1.773526
15  mm
RMSE - hbb  : 1.373566
RMSE - hbo2 : 0.885574
RMSE - ca   : 2.484292
RMSE - na   : 1.580293
10  mm
RMSE - hbb  : 1.315086
RMSE - hbo2 : 0.900857
RMSE - ca   : 2.354888
RMSE - na   : 1.431465


In [9]:
# Load the test set with the same -999 missing-value sentinel used for training.
test = pd.read_csv("../Data/Biometric Data Analysis/test.csv").fillna(-999)

In [10]:
# Partition the test rows by rho, mirroring the split of the training data.
test_rho_25 = test.loc[test['rho'].eq(25)]
test_rho_20 = test.loc[test['rho'].eq(20)]
test_rho_15 = test.loc[test['rho'].eq(15)]
test_rho_10 = test.loc[test['rho'].eq(10)]

In [11]:
def prediction(hbb, hbo2, ca, na, df):
    """Return a copy of `df` extended with one prediction column per model.

    The first column of `df` is skipped when building the model input; all
    remaining columns are passed to each model's `predict`. The results are
    stored under 'hhb', 'hbo2', 'ca' and 'na'. `df` itself is not modified.
    """
    features = df.iloc[:, 1:]

    # Output column name -> fitted model, in the order the columns are added.
    estimators = (('hhb', hbb), ('hbo2', hbo2), ('ca', ca), ('na', na))
    outputs = [(column, model.predict(features)) for column, model in estimators]

    result = df.copy()
    for column, values in outputs:
        result[column] = values

    return result

In [12]:
# Score each rho subset of the test data with the models trained on the
# matching rho subset of the training data:
# test_rho_25 - hbb_25, hbo2_25, ca_25, na_25
# test_rho_20 - hbb_20, hbo2_20, ca_20, na_20
# test_rho_15 - hbb_15, hbo2_15, ca_15, na_15
# test_rho_10 - hbb_10, hbo2_10, ca_10, na_10

rho_25 = prediction(hbb_25, hbo2_25, ca_25, na_25, test_rho_25)
rho_20 = prediction(hbb_20, hbo2_20, ca_20, na_20, test_rho_20)
rho_15 = prediction(hbb_15, hbo2_15, ca_15, na_15, test_rho_15)
rho_10 = prediction(hbb_10, hbo2_10, ca_10, na_10, test_rho_10)

In [13]:
# Stack the per-rho prediction frames and order the rows by id.
temp = pd.concat([rho_25, rho_20, rho_15, rho_10]).sort_values(by='id')
# NOTE: `id` shadows the builtin, but the following cell reads this name.
id = temp['id']
# Keep only the last four columns of the stacked frame.
temp = temp.iloc[:, -4:]

In [14]:
# Re-attach the id column to the four prediction columns.
submission = pd.concat([id, temp], axis=1)
# index=False: without it the row index is written as an extra, unnamed first
# column, so the file would not start with the id column.
submission.to_csv("submission/13.csv", index=False)

In [52]:
# Preview the first rows of the submission frame as a sanity check.
submission.head()

Unnamed: 0,id,hhb,hbo2,ca,na
0,10000,8.202074,5.025829,10.310868,4.162852
1,10001,6.38477,4.184011,8.38625,2.911784
2,10002,10.268466,4.178093,10.943738,2.251129
3,10003,7.956439,4.358584,9.547226,3.150662
4,10004,4.031321,3.824872,8.401194,3.201443


10000