In [1]:
import pandas as pd
import numpy as np
import warnings
# np.random.seed() returns None, so store the seed value itself in `seed`
# and seed NumPy's global random generator in a separate call.
seed = 22
np.random.seed(seed)
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce
from preprocessing import *

from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

# Warning filter: ignore every warning raised from this point on.
warnings.filterwarnings('ignore')

In [2]:
# Read the 'train' and 'test' parquet datasets (in that order) and index
# each resulting frame by its 'url' column.
noticias_online_train_df, noticias_online_test_df = (
    pd.read_parquet(ruta).set_index('url') for ruta in ('train', 'test')
)

In [3]:
# Order the training rows by 'timedelta' (largest first), then separate the
# 'popular' target from the features; 'shares' is dropped together with it.
noticias_online_train_orden_df = noticias_online_train_df.sort_values(
    by='timedelta', ascending=False
)
noticias_online_target_orden_df = noticias_online_train_orden_df['popular']
noticias_online_train_orden_df = noticias_online_train_orden_df.drop(
    ['popular', 'shares'], axis=1
)

# Same target/feature separation for the test set.
# NOTE(review): this rebinds noticias_online_test_df, so re-running the cell
# without reloading the data fails because 'popular' has already been dropped.
noticias_online_target_test_df = noticias_online_test_df['popular']
noticias_online_test_df = noticias_online_test_df.drop(
    ['popular', 'shares'], axis=1
)

In [4]:
# Split the time-ordered features and target into train and validation parts.
# NOTE(review): train_test_split is not imported explicitly in this notebook;
# it presumably comes from `from preprocessing import *`, and 0.4 is presumably
# the validation fraction — confirm against the preprocessing module.
X_train, X_val, Y_train, Y_val = train_test_split(noticias_online_train_orden_df, noticias_online_target_orden_df, 0.4)

# Knn imputer, standard scaler, one hot encoding, mean encoding y count vectorizer encoding

## Encoders y preprocesado de X_train, X_val, Y_train, Y_val y test

In [5]:
# Run the preprocessing helper on the train, validation and test features.
# NOTE(review): the helper is presumably provided by `from preprocessing import *`;
# the meaning of the trailing True flag is not visible here — confirm there.
(
    noticias_online_train_count_vec,
    noticias_online_val_count_vec,
    noticias_online_test_count_vec,
) = preprocessing_knn_imputer_standar_escaler_one_hot_encoding_mean_encoding_count_vectorizer(
    X_train, X_val, Y_train, noticias_online_test_df, True
)

# Map the target values of each split: False -> 0, True -> 1.
(
    noticias_online_train_target_enc,
    noticias_online_val_target_enc,
    noticias_online_target_test_enc,
) = (
    objetivo.map({False: 0, True: 1})
    for objetivo in (Y_train, Y_val, noticias_online_target_test_df)
)

## Entreno y veo cómo me va (uso validación)

`parametros_xgb` y `parametros_knn` fueron tomados de la sección
`Knn imputer, standard scaler, one hot encoding, mean encoding y count vectorizer encoding` de los
notebooks de XGBoost y KNN, respectivamente.

In [6]:
# Hyperparameters for the XGBoost member of the ensemble.
parametros_xgb = dict(
    colsample_bytree=0.94,
    gamma=0.64,
    learning_rate=0.09,
    max_depth=3,
    n_estimators=68,
    subsample=0.74,
)
# Fixed random_state so the boosted model is built the same way on every run.
xgb_classi = XGBClassifier(random_state=22, **parametros_xgb)

In [7]:
# Hyperparameters for the k-nearest-neighbours member of the ensemble.
parametros_knn = dict(
    leaf_size=1,
    metric='manhattan',
    n_neighbors=300,
    p=1,
    weights='distance',
)
knn_clasi = KNeighborsClassifier(**parametros_knn)

In [8]:
# Soft-voting ensemble that combines both classifiers with equal weights.
ensamble = VotingClassifier(
    estimators=[
        ('xgb', xgb_classi),
        ('knn', knn_clasi),
    ],
    voting='soft',
    weights=[1, 1],
)

In [9]:
# Fit the voting ensemble on the preprocessed training features and the
# 0/1-encoded training target.
ensamble.fit(noticias_online_train_count_vec, noticias_online_train_target_enc)

In [10]:
# Class probabilities for the validation set; column 1 is kept as the score
# (targets were encoded as 0/1, so this is the probability of class 1).
probabilidades_val = ensamble.predict_proba(noticias_online_val_count_vec)
prediccion = probabilidades_val[:, 1]

In [11]:
# ROC AUC on the validation set, rounded to 4 decimals. The `average` argument
# is omitted because scikit-learn ignores it when y_true is binary (the targets
# here are encoded as 0/1), so passing 'micro' had no effect on the score.
round(roc_auc_score(noticias_online_val_target_enc, prediccion), 4)

0.7205

## Predicciones (uso test)

In [12]:
# Class probabilities for the test set; column 1 is kept as the score
# (targets were encoded as 0/1, so this is the probability of class 1).
probabilidades_test = ensamble.predict_proba(noticias_online_test_count_vec)
prediccion = probabilidades_test[:, 1]

In [13]:
# ROC AUC on the test set, rounded to 4 decimals. The `average` argument is
# omitted because scikit-learn ignores it when y_true is binary (the targets
# here are encoded as 0/1), so passing 'micro' had no effect on the score.
round(roc_auc_score(noticias_online_target_test_enc, prediccion), 4)

0.7237