In [1]:
import pandas as pd
import numpy as np
import warnings
seed = np.random.seed(22)
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce
from preprocessing import *
import xgboost as xgb
from bayes_opt import BayesianOptimization

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error #no se pide este error pero tiene las mismas unidades que shares

#filtrado de warnings
warnings.filterwarnings('ignore')

In [2]:
noticias_online_train_df = pd.read_parquet('train').set_index('url')
noticias_online_test_df = pd.read_parquet('test').set_index('url')

In [3]:
noticias_online_train_orden_df = noticias_online_train_df.sort_values('timedelta', ascending=False)
noticias_online_target_orden_df = noticias_online_train_orden_df[['popular', 'shares']]
noticias_online_train_orden_df = noticias_online_train_orden_df.drop(columns=['popular', 'shares'])

noticias_online_target_test_df = noticias_online_test_df.shares
noticias_online_test_df = noticias_online_test_df.drop(columns=['popular', 'shares'])

In [4]:
X_train, X_val, Y_train, Y_val = train_test_split(noticias_online_train_orden_df, noticias_online_target_orden_df, 0.4)

# Mean imputer, robust scaler, one hot encoding, mean encoding con smoothing y TF-IDF encoding

## Encoders y preprocesado de X_train, X_val, Y_train, Y_val y train

In [5]:
noticias_online_train_tf_idf, noticias_online_val_tf_idf, noticias_online_test_tf_idf = preprocessing_mean_imputer_robust_escaler_one_hot_encoding_mean_encoding_smooth_tf_idf_vectorizer(
    X_train,
    X_val,
    Y_train['shares'],
    noticias_online_test_df,
    True
    )

noticias_online_train_target_enc = Y_train['shares']
noticias_online_val_target_enc = Y_val['shares']
noticias_online_target_test_enc = noticias_online_target_test_df

## Búsqueda de hiperparámetros

In [6]:
dtrain = xgb.DMatrix(noticias_online_train_tf_idf, label=noticias_online_train_target_enc)

In [7]:
def bo_tune_xgb(max_depth, gamma, reg_alpha, n_estimators, learning_rate, subsample, colsample_bytree):
  #params = {'max_depth': int(max_depth),
  #         'gamma': gamma,
  #          'n_estimators': int(n_estimators),
  #          'learning_rate':learning_rate,
  #          'subsample': 0.9,
  #          'metric': 'accuracy'}
  params = {'max_depth': int(max_depth),
            'gamma': gamma,
            #'min_child_weight': min_child_weight,
            #'objetive': objetive,
            'reg_alpha': reg_alpha,
            #'reg_lambda': reg_lambda,
            #'max_delta_step': max_delta_step,
            'n_estimators': int(n_estimators),
            'learning_rate':learning_rate,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            #'eta': 0.1,
            'eval_metric': 'rmse'}
  #Cross validating with the specified parameters in 5 folds and 70 iterations
  cv_result = xgb.cv(params, dtrain, num_boost_round=70, nfold=5)
  #Return the negative RMSE
  return -1.0 * cv_result['test-rmse-mean'].iloc[-1]
  #return xgb.cv(params, dtrain, num_boost_round=70, nfold=5, early_stopping_rounds=100)

In [8]:
xgb_bo = BayesianOptimization(bo_tune_xgb, {#'max_depth':(5, 12.99),
    #'gamma':(0.0, 1.0),
    'max_depth':(3, 10),
    'gamma':(0.5, 0.7),
    'reg_alpha':(1e-5, 1e-2),
    #'reg_lambda':(1e-5, 1e-2),
    'n_estimators':(50, 70),
    'learning_rate':(0.01, 0.1),
    'subsample':(0.6, 0.8),
    'colsample_bytree':(0.8, 1),
    #'learning_rate':(0.0, 1.0)
                            })
xgb_bo.maximize(n_iter=8, init_points=8, acq='ei')
#xgb_bo.maximize(acq='poi')

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | n_esti... | reg_alpha | subsample |
-------------------------------------------------------------------------------------------------------------
Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
 

In [9]:
params = xgb_bo.max['params']
print(params)

{'colsample_bytree': 0.8009085090997561, 'gamma': 0.6307008935897852, 'learning_rate': 0.012885682297654681, 'max_depth': 4.218275458555351, 'n_estimators': 63.21386179105179, 'reg_alpha': 0.0011973595363997127, 'subsample': 0.6361003212348948}


In [10]:
params['max_depth'] = int(params['max_depth'])
params['n_estimators'] = int(params['n_estimators'])

## Entreno y veo cómo me va (uso train)

In [11]:
modelo = xgb.XGBRegressor(**params,random_state=22)
modelo.fit(noticias_online_train_tf_idf, noticias_online_train_target_enc)

In [12]:
prediccion = modelo.predict(noticias_online_train_tf_idf)

In [13]:
round(mean_squared_error(noticias_online_train_target_enc, prediccion), 2)

162837912.45

In [14]:
round(mean_absolute_error(noticias_online_train_target_enc, prediccion), 2)

2624.61

## Accuracy sobre `popular` luego de una regresión que se pasa a binaria por medio del percentil 80 de `shares`

In [15]:
prediccion = modelo.predict(noticias_online_val_tf_idf)
percentil_80 = np.percentile(prediccion, 80)
prediccion_binaria = np.where(prediccion > percentil_80, 1, 0)
noticias_online_val_popular_enc = Y_val['popular'].map({False: 0, True: 1})
round(accuracy_score(noticias_online_val_popular_enc, prediccion_binaria), 4)

0.7537

## Predicciones (uso test)

In [16]:
prediccion = modelo.predict(noticias_online_test_tf_idf)

In [17]:
round(mean_squared_error(noticias_online_target_test_enc, prediccion), 2)

62499465.81

In [18]:
round(mean_absolute_error(noticias_online_target_test_enc, prediccion), 2)

2044.56