In [4]:
def gini(solution, submission):
    """Return the unnormalized Gini score of `submission` against `solution`.

    The (solution, submission) pairs are ranked by predicted value, highest
    first. At each rank the cumulative share of the total solution value
    captured so far is compared with the share of rows seen so far, and the
    differences are summed.

    Tied predictions keep their input order. They are deliberately NOT ordered
    by the true value: doing so leaks the answer into the ranking and lets a
    constant prediction score exactly as well as a perfect one.

    Parameters
    ----------
    solution : iterable of numbers
        True values.
    submission : iterable of numbers
        Predicted values, paired positionally with `solution` via ``zip``
        (surplus items of the longer input are ignored).

    Returns
    -------
    float
        Sum over ranks of (cumulative captured share - share of rows seen).

    Raises
    ------
    IndexError
        If there are no pairs to rank.
    ZeroDivisionError
        If the solution values sum to zero.
    """
    # sorted() is stable, including with reverse=True, so pairs with equal
    # predictions stay in the order they were supplied.
    ranked = sorted(zip(solution, submission), key=lambda pair: pair[1], reverse=True)
    n_rows = len(ranked)
    total = float(sum([pair[0] for pair in ranked]))

    # Running total of the true values, walking down the ranking.
    running = ranked[0][0]
    captured = [running]
    for actual, _ in ranked[1:]:
        running = running + actual
        captured.append(running)

    return sum(
        float(found) / total - float(rank + 1) / float(n_rows)
        for rank, found in enumerate(captured)
    )

def normalized_gini(solution, submission):
    """Return gini(solution, submission) divided by gini(solution, solution).

    Both inputs are converted to numpy arrays first. The denominator is the
    score obtained when the true values are used as their own prediction.
    """
    actual = np.array(solution)
    predicted = np.array(submission)
    achieved = gini(actual, predicted)
    reference = gini(actual, actual)
    return achieved / reference

In [5]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

from sklearn import preprocessing
import xgboost as xgb

%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [None]:
# Load the train and test sets; the first CSV column becomes the row index.
train  = pd.read_csv('../data/train.csv', index_col=0)
test  = pd.read_csv('../data/test.csv', index_col=0)

# Separate the target from the features (non-mutating drop rather than inplace=True).
labels = train.Hazard
train = train.drop('Hazard', axis=1)

# Keep the feature names and the test index before the frames become bare arrays.
columns = train.columns
test_ind = test.index

train = np.array(train)
test = np.array(test)

# Label-encode the categorical variables. Each encoder is fitted on the train
# and test values together so both arrays share one code mapping.
for i in range(train.shape[1]):
    # A column is treated as categorical when its value in row 1 is a str.
    if type(train[1,i]) is str:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[:,i]) + list(test[:,i]))
        train[:,i] = lbl.transform(train[:,i])
        test[:,i] = lbl.transform(test[:,i])

train = train.astype(float)
test = test.astype(float)

# Booster parameters.
# NOTE(review): newer XGBoost releases rename "reg:linear" to
# "reg:squarederror" and replace "silent" with "verbosity" - confirm against
# the installed version before changing them.
params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.005
params["min_child_weight"] = 6
params["colsample_bytree"] = 0.7
params["subsample"] = 0.7
params["scale_pos_weight"] = 1.0
params["silent"] = 1
params["max_depth"] = 9

plst = list(params.items())

# Number of leading rows held out as the early-stopping validation set.
offset = 4000

num_rounds = 5000
xgtest = xgb.DMatrix(test)

# Create the train and validation DMatrices: the first `offset` rows validate,
# the remaining rows train.
xgtrain = xgb.DMatrix(train[offset:,:], label=labels[offset:])
xgval = xgb.DMatrix(train[:offset,:], label=labels[:offset])

# Train using early stopping and predict.
# NOTE(review): depending on the XGBoost version, predict() after early
# stopping may use every trained round rather than the best one - confirm.
watchlist = [(xgtrain, 'train'),(xgval, 'val')]
model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=5)
preds1 = model.predict(xgtest)
# The score is computed over every row of `train`, held-out rows included.
print("gini score on training set: " + str(normalized_gini(labels, model.predict(xgb.DMatrix(train)))))

# Reverse train and labels so that a different block of `offset` rows (the
# last rows of the original order) is held out for early stopping.
# This adds very little to the score but it is an option if you are concerned
# about using all the data.
train = train[::-1,:]
labels = labels[::-1]

xgtrain = xgb.DMatrix(train[offset:,:], label=labels[offset:])
xgval = xgb.DMatrix(train[:offset,:], label=labels[:offset])

watchlist = [(xgtrain, 'train'),(xgval, 'val')]
model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=5)
preds2 = model.predict(xgtest)
print("gini score on training set: " + str(normalized_gini(labels, model.predict(xgb.DMatrix(train)))))

# Combine predictions.
# Since the metric only cares about relative rank we don't need to average.
preds = preds1 + preds2

# Generate the submission file, indexed by the test set's Id.
preds = pd.DataFrame({"Id": test_ind, "Hazard": preds})
preds = preds.set_index('Id')
preds.to_csv('xgboost_benchmark.csv')

Will train until val error hasn't decreased in 5 rounds.
[0]	train-rmse:5.336764	val-rmse:5.284347
[1]	train-rmse:5.323686	val-rmse:5.271910
[2]	train-rmse:5.310789	val-rmse:5.259579
[3]	train-rmse:5.297752	val-rmse:5.247142
[4]	train-rmse:5.285065	val-rmse:5.235039
[5]	train-rmse:5.272676	val-rmse:5.223176
[6]	train-rmse:5.259944	val-rmse:5.211122
[7]	train-rmse:5.247583	val-rmse:5.199399
[8]	train-rmse:5.235036	val-rmse:5.187624
[9]	train-rmse:5.222717	val-rmse:5.175957
[10]	train-rmse:5.210148	val-rmse:5.164411
[11]	train-rmse:5.197776	val-rmse:5.152729
[12]	train-rmse:5.185497	val-rmse:5.141187
[13]	train-rmse:5.173228	val-rmse:5.129803
[14]	train-rmse:5.160954	val-rmse:5.118273
[15]	train-rmse:5.148979	val-rmse:5.107006
[16]	train-rmse:5.137039	val-rmse:5.095809
[17]	train-rmse:5.125519	val-rmse:5.084814
[18]	train-rmse:5.114144	val-rmse:5.074005
[19]	train-rmse:5.102350	val-rmse:5.063134
[20]	train-rmse:5.090423	val-rmse:5.052016
[21]	train-rmse:5.079354	val-rmse:5.041385
[22]	tr

In [None]:
# Plot the true and predicted hazard ratings for every record in `train`.

# Plot the bare label values rather than the pandas Series itself: given a
# Series, matplotlib uses its index as the x coordinates, whereas the
# predictions below are plotted against row position, so the two curves would
# not line up record-for-record.
plt.plot(np.asarray(labels), color="green", label="True value")
plt.plot(model.predict(xgb.DMatrix(train)), label="Predicted value")

plt.xlabel("Record number", fontsize=14)
plt.ylabel("Hazard rating", fontsize=14)
plt.legend(frameon=True, shadow=True, framealpha=0.7, loc=0, prop={"size": 14})