# Imports

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

---

# Carga de datos

In [1]:
features = pd.read_pickle("../../features/auctions_train_ventana_1.pkl")
targets = pd.read_pickle("../../targets/targets_ventana_1_auctions.pkl")
train = features.merge(targets, left_index=True, right_index=True)

NameError: name 'pd' is not defined

---

# Preparación del set de datos para XGBoost

In [64]:
X, y = train.iloc[:,:-1], train.iloc[:,-1]
data_dmatrix = xgb.DMatrix(data=X,label=y)
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [65]:
xg_reg = xgb.XGBRegressor(colsample_bytree = 0.7, learning_rate = 1,
                n_estimators = 100, n_jobs=6, max_depth=7, objective="reg:linear")

In [66]:
xg_reg.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=1, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=6, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [67]:
preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 93545.206149


---

# Pruebas

In [72]:
features_auctions = pd.read_pickle("../../features/auctions_train_ventana_4.pkl")
features_clicks = pd.read_pickle("../../features/clicks_train_ventana_4.pkl")

In [50]:
features_combinados_imputados = features_auctions.merge(features_clicks, how="left", right_index=True, left_index=True, suffixes=('_auctions', '_clicks')).fillna(0)

In [47]:
features_auctions = pd.read_pickle("../../features/auctions_train_ventana_3.pkl")
targets = pd.read_pickle("../../targets/targets_ventana_3_auctions.pkl")
train = features_auctions.merge(targets, left_index=True, right_index=True)

X, y = train.iloc[:,:-1], train.iloc[:,-1]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

param = {'max_depth':6,
         'eta':0.3,
         'silent':1,
         'objective':'reg:linear',
         'eval_metric' : 'rmse',
         'obj' : 'reg:squarederror',
         'colsample_bytree' : 1,
         'min_child_weight' : 0.5,
         'random_state' : 0,
         'reg_alpha' : 0,
         'reg_lambda' : 1,
         'scale_pos_weight' : 1,
         'learning_rate' : 0.1}


num_round = 5000
stopping = 20
bst = xgb.train(param, dtrain, num_round, evals=[(dtrain, "train"), (dtest, "test")], early_stopping_rounds=stopping)
# make prediction
#preds = bst.predict(dtest)
#Stopping. Best iteration:
#[360]	train-rmse:67651	test-rmse:69665.3a

[0]	train-rmse:150595	test-rmse:150714
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 20 rounds.
[1]	train-rmse:141099	test-rmse:141208
[2]	train-rmse:132886	test-rmse:132989
[3]	train-rmse:125832	test-rmse:125928
[4]	train-rmse:119802	test-rmse:119892
[5]	train-rmse:114669	test-rmse:114757
[6]	train-rmse:110320	test-rmse:110409
[7]	train-rmse:106662	test-rmse:106757
[8]	train-rmse:103594	test-rmse:103688
[9]	train-rmse:101024	test-rmse:101124
[10]	train-rmse:98887.3	test-rmse:98989.8
[11]	train-rmse:97098.2	test-rmse:97209.3
[12]	train-rmse:95627.9	test-rmse:95747.6
[13]	train-rmse:94405	test-rmse:94532.3
[14]	train-rmse:93400.2	test-rmse:93535.7
[15]	train-rmse:92565.4	test-rmse:92711.4
[16]	train-rmse:91874.7	test-rmse:92036.8
[17]	train-rmse:91309.1	test-rmse:91482.4
[18]	train-rmse:90837.1	test-rmse:91022.1
[19]	train-rmse:90451.3	test-rmse:90649
[20]	train-rmse:90129.2	test-rmse:90336.5
[21]	trai

---

In [8]:
import lightgbm as lgb
#lgb_model = lgb.LGBMRegressor(boosting_type='dart', metric='rmse', n_estimators=50, colsample_bytree=0.9)
lgb_model = lgb.LGBMRegressor(boosting_type='dart', num_leaves=7, n_estimators=500, metric='rmse',
                               learning_rate=0.05, colsample_bytree=0.8)
lgb_model.fit(X_train, y_train)
preds = lgb_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 88652.271971


In [16]:
preds_xgb = bst.predict(dtest)
preds_lgbm = lgb_model.predict(X_test)

In [17]:
p1 = 0.8
p2 = 0.2

preds = p1*preds_xgb + p2*preds_lgbm
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 88700.212349


In [30]:
diferencias = abs((y_test.values - preds))

In [36]:
74000 /3600

20.555555555555557

---