forked from shelfwise/Mars-Express-Challenge
-
Notifications
You must be signed in to change notification settings - Fork 0
/
xgboost_models.py
114 lines (96 loc) · 4.2 KB
/
xgboost_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# -*- coding: utf-8 -*-
"""
@author: fornax
"""
from __future__ import print_function, division
import numpy as np
import pandas as pd
import xgboost
import utils
from preprocessing import dmop_analysis
# Seed numpy's global RNG so the random draws made below are repeatable.
np.random.seed(5)

# Fallback hyper-parameters used when a caller does not supply its own lists.
N_ESTIMATORS, MAX_DEPTH, LAMBDA = 200, 3, 1

# Fraction of the feature columns drawn at random for every bagged model.
FEATS_PERC = 0.7

# Target columns whose predictions get the median correction.
cols_to_correct = [
    u'NPWD2401',
    u'NPWD2402',
    u'NPWD2481',
    u'NPWD2482',
    u'NPWD2501',
    u'NPWD2531',
    u'NPWD2691',
    u'NPWD2722',
    u'NPWD2771',
    u'NPWD2801',
]
def create_model(X_train, Y_train, X_test,
                 num_models=1,
                 n_estimators=(N_ESTIMATORS,), max_depths=(MAX_DEPTH,), lambdas=(LAMBDA,)):
    """
    Builds a (bagged) xgboost model for a single power line and creates predictions
    on the test data.

    For every bag one value of max_depth, n_estimators and lambda is drawn at
    random from the supplied sequences. When more than one model is requested,
    each model is additionally trained on a random subset (FEATS_PERC of the
    columns, sampled without replacement) of the features. The final prediction
    is the mean over all bags.

    :param X_train: features to train on (indexed by column labels, i.e. must
                    expose ``.columns`` and ``.shape``)
    :param Y_train: training target
    :param X_test: features from which to create predictions
    :param num_models: number of bagging models to train (must be >= 1)
    :param n_estimators: a sequence with numbers of estimators from which to draw
                         in each bag
    :param max_depths: as above, but for max_depth
    :param lambdas: as above, but for lambda
    :return: averaged prediction after utils.correct_min_max(Y_train, pred)
             (intended to clamp it into the min-max range of the training target)
    :raises ValueError: if num_models is smaller than 1
    """
    if num_models < 1:
        # Without this guard np.vstack fails later with an unrelated-looking message.
        raise ValueError('num_models must be >= 1, got %r' % (num_models,))

    preds = []
    for _ in range(num_models):
        # NOTE: the order of the random draws (depth, trees, lambda, features)
        # is deliberate -- changing it changes results under a fixed numpy seed.
        max_depth = np.random.choice(max_depths)
        trees = np.random.choice(n_estimators)
        reg_lambda = np.random.choice(lambdas)
        mdl = xgboost.XGBRegressor(max_depth=max_depth, n_estimators=trees,
                                   reg_lambda=reg_lambda)

        if num_models > 1:
            # Random feature subspace per bag; never let rounding produce an
            # empty feature set for very narrow inputs.
            n_feats = max(1, int(FEATS_PERC * X_train.shape[1]))
            feats = np.random.choice(X_train.columns, size=n_feats, replace=False)
        else:
            feats = X_train.columns

        mdl.fit(X_train[feats], Y_train)
        preds.append(mdl.predict(X_test[feats]))

    # One column per bag -> average across bags for every test row.
    pred = np.mean(np.vstack(preds).T, axis=1)
    pred = utils.correct_min_max(Y_train, pred)
    return pred
###############################################################################
############################# LOAD DATA #######################################
###############################################################################
# Load the two prepared datasets. Each call yields a data frame plus a second
# object that is looked up by the 'aux_time' key below.
# NOTE(review): presumably `features` maps group names to lists of column
# names -- confirm against utils.load_data.
df, features = utils.load_data('dataset1')
df2, features2 = utils.load_data('dataset2')
# Separate feature table used only for the NPWD2881 power line, derived from
# the dataset1 frame.
aooo = dmop_analysis.get_npwd2881_features(df)
# Names of the target columns (one model is trained per entry).
p_cols = utils.p_cols
# Targets are always taken from dataset1.
Y = df[p_cols]
# Feature matrices: everything except the targets and the 'aux_time' columns.
X = df.drop(p_cols + features['aux_time'], axis=1)
X2 = df2.drop(p_cols + features2['aux_time'], axis=1)
###############################################################################
############################ SUBMISSION #######################################
###############################################################################
# Train on rows with m_year 1-2 and predict the rows with m_year 3.
trainset = (df.m_year <= 2) & (df.m_year > 0)
testset = df.m_year == 3

X_train, Y_train = X[trainset], Y[trainset]
X_test, Y_test = X[testset], Y[testset]
# NOTE(review): the masks are derived from df (dataset1) but applied to X2
# (dataset2); this presumes both frames share the same index -- confirm.
X_train2 = X2[trainset]
X_test2 = X2[testset]

# One model (or bag of models) per target column; predictions are collected
# in the order of p_cols.
Y_test_hat = []
for p_col in p_cols:
    print('%s...' % p_col)
    if p_col == 'NPWD2881':
        # Dedicated feature table; the masks are passed as plain boolean
        # arrays (.values), so rows are selected by position.
        pred = create_model(aooo[trainset.values], Y_train[p_col], aooo[testset.values])
    elif p_col in ('NPWD2451', 'NPWD2532'):
        # These lines use the dataset2 features with 200 bagged models.
        pred = create_model(X_train2, Y_train[p_col], X_test2, num_models=200)
    elif p_col == 'NPWD2851':
        pred = create_model(X_train, Y_train[p_col], X_test, num_models=200)
    elif p_col == 'NPWD2551':
        # Bagging with randomised hyper-parameters drawn per model.
        pred = create_model(X_train, Y_train[p_col], X_test, num_models=200,
                            max_depths=[3, 5, 7, 9], n_estimators=[200, 400, 600],
                            lambdas=[1, 2])
    else:
        pred = create_model(X_train, Y_train[p_col], X_test)
    Y_test_hat.append(pred)
# rows = test samples, columns = targets
Y_test_hat = np.vstack(Y_test_hat).T

# median correction
Y_test_hat = pd.DataFrame(Y_test_hat, columns=p_cols)
Y_test_hat_corrected = utils.correct_cols_with_median(Y_train, Y_test_hat, cols_to_correct)
Y_test_hat = Y_test_hat_corrected.values

# Preparing the submission file:
Y_test_hat = pd.DataFrame(Y_test_hat, index=X_test.index, columns=p_cols)
Y_test_hat.index = pd.to_datetime(Y_test_hat.index)
# Convert the datetime index to integer milliseconds. Like the original code
# this assumes the int64 view of the index is in nanoseconds. Integer floor
# division is exact, whereas multiplying by the float 1e-6 and truncating can
# land one millisecond low, and astype(int) is only 32 bits wide on some
# platforms.
Y_test_hat['ut_ms'] = Y_test_hat.index.astype(np.int64) // 10**6
Y_test_hat[['ut_ms'] + p_cols].to_csv('results_4y/xgb.csv', index=False)