# Loading Data

In [None]:
import numpy
import xgboost

# Training weights are multiplied by test_size / (number of training rows).
# NOTE(review): presumably 550000 is the number of events in test.csv — confirm.
test_size = 550000

DATA_DIR = '/content/drive/My Drive/Higgs Boson classifier'


def _label_to_int(raw):
    """Convert the raw label field to 1 for 's' and 0 for anything else.

    numpy.loadtxt hands converters either ``bytes`` or ``str`` depending on
    the NumPy version and the ``encoding`` argument, so both are accepted.
    Comparing only against ``b's'`` would silently label every row 0 when
    the field arrives as ``str``.
    """
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    return int(raw == 's')


# Column 32 holds the textual label; it is converted to a number so the
# whole file loads as one float array.
train_raw = numpy.loadtxt(DATA_DIR + '/training.csv', delimiter=',', skiprows=1,
                          converters={32: _label_to_int})

y_train = train_raw[:, -1]        # last column: converted label
X_train = train_raw[:, 1:31]      # columns 1..30: features
weight_train = train_raw[:, 31] * float(test_size) / len(y_train)

dtest = numpy.loadtxt(DATA_DIR + '/test.csv', delimiter=',', skiprows=1)

X_test = dtest[:, 1:31]
EventID = dtest[:, 0]

# -999.0 is treated as the missing-value marker by the DMatrix.
dtrain = xgboost.DMatrix(X_train, label=y_train, missing=-999.0, weight=weight_train)

  "because it will generate extra copies and increase memory consumption")


# Cross-Validation

In [None]:
# Booster settings passed to both xgboost.cv and xgboost.train below.
# NOTE(review): newer XGBoost releases replaced `silent` with `verbosity`;
# confirm the installed version still honours it.
param = {
    'objective': 'binary:logitraw',
    'eta': 0.1,
    'max_depth': 8,
    'silent': 1,
}

# Number of boosting rounds requested from cross-validation.
num_round = 1000

In [None]:
def fpreproc(dtrain, dtest, param):
    """Per-fold preprocessing hook for ``xgboost.cv``.

    Sets ``param['scale_pos_weight']`` to the ratio of label-0 to label-1
    rows in the training fold, then rescales the weights of each fold so
    that each fold's weights sum to the combined total of both folds.

    Args:
        dtrain: training fold; must provide ``get_label``, ``get_weight``
            and ``set_weight``.
        dtest: validation fold; must provide ``get_weight`` and
            ``set_weight``.
        param: parameter dict; updated in place and returned.

    Returns:
        The tuple ``(dtrain, dtest, param)``.

    Raises:
        ValueError: if either fold's weights do not sum to a positive
            value, in which case the rescaling factor is undefined.
    """
    label = dtrain.get_label()
    n_neg = numpy.sum(label == 0)
    n_pos = numpy.sum(label == 1)
    param['scale_pos_weight'] = float(n_neg) / n_pos

    wtrain = dtrain.get_weight()
    wtest = dtest.get_weight()

    # numpy.sum with a float64 accumulator replaces the builtin sum(), which
    # iterates element by element in Python and accumulates in the array's
    # own (single) precision. Each total is computed once.
    train_total = float(numpy.sum(wtrain, dtype=numpy.float64))
    test_total = float(numpy.sum(wtest, dtype=numpy.float64))
    if train_total <= 0 or test_total <= 0:
        raise ValueError(
            'fpreproc requires a positive total weight in both folds '
            '(train=%r, test=%r)' % (train_total, test_total))
    sum_weight = train_total + test_total

    # Plain Python floats are used as scale factors so the weight arrays
    # keep their original dtype.
    dtrain.set_weight(wtrain * (sum_weight / train_total))
    dtest.set_weight(wtest * (sum_weight / test_total))
    return (dtrain, dtest, param)

# The metrics are given as an ordered list rather than a set: a set of
# strings has no stable iteration order across interpreter runs, and with
# early stopping enabled the order decides which metric controls stopping.
# NOTE(review): per the XGBoost docs the last metric drives early stopping,
# which is 'auc' with this ordering — swap the entries if AMS was intended.
cvresult = xgboost.cv(
    param,
    dtrain,
    num_round,
    nfold=2,
    metrics=['ams@0.15', 'auc'],
    early_stopping_rounds=2,
    seed=0,
    fpreproc=fpreproc,
)

# Training the Model

In [None]:
# The row count of the cross-validation result is used as the number of
# boosting rounds for the final fit on the full training DMatrix.
# NOTE(review): fpreproc sets 'scale_pos_weight' on the dict it is handed;
# if xgboost.cv passes it a copy, `param` here does not carry that setting
# and this fit differs from the cross-validated configuration — confirm.
n_estimators = len(cvresult)
bst = xgboost.train(param, dtrain, num_boost_round=n_estimators)

# Making Predictions

In [None]:
# Wrap the test features in a DMatrix, marking -999.0 as missing, and score
# every test row with the trained booster.
xgmat = xgboost.DMatrix(data=X_test, missing=-999.0)
ypred = bst.predict(xgmat)

# Preparing Data for Scoring

In [None]:
# Fraction of the highest-scoring events that are labelled as signal ('s').
threshold_ratio = 0.15
outfile = '/content/drive/My Drive/Higgs Boson classifier/submission_xgboost_3.csv'

# (event id, score) pairs in the original test-file order.
res = [(int(event_id), score) for event_id, score in zip(EventID, ypred)]

# rorder maps event id -> 1-based position when sorted by descending score
# (1 = highest score). sorted() is stable, so ties keep file order.
rorder = {}
for position, (k, _score) in enumerate(sorted(res, key=lambda x: -x[1]), start=1):
    rorder[k] = position

ntop = int(threshold_ratio * len(rorder))
nhit = 0
ntot = 0
# The context manager guarantees the file is flushed and closed even if a
# write fails part-way through.
with open(outfile, 'w') as fo:
    fo.write('EventId,RankOrder,Class\n')
    for k, v in res:
        if rorder[k] <= ntop:
            lb = 's'
            nhit += 1
        else:
            lb = 'b'
        # The written RankOrder is inverted: the highest-scoring event gets
        # the largest value (len(rorder)) and the lowest-scoring gets 1.
        fo.write('%s,%d,%s\n' % (k, len(rorder) + 1 - rorder[k], lb))
        ntot += 1