In [1]:
import pandas as pd
import numpy as np
import warnings
# np.random.seed() returns None, so assigning its result left `seed` bound to
# None. Keep the seed value in its own variable and seed the legacy global
# NumPy RNG as a separate step.
seed = 22
np.random.seed(seed)
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce
from preprocessing import *

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

# Warning filtering: silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
# Read the train and test parquet datasets and index each frame by the 'url' column.
noticias_online_train_df = pd.read_parquet(path='train').set_index(keys='url')
noticias_online_test_df = pd.read_parquet(path='test').set_index(keys='url')

In [3]:
# Training data: sort by 'timedelta' in descending order, keep 'popular' as the
# target and remove both 'popular' and 'shares' from the feature frame.
# NOTE(review): 'shares' is presumably dropped because the target is derived
# from it — confirm against the dataset description.
noticias_online_train_orden_df = noticias_online_train_df.sort_values(by='timedelta', ascending=False)
noticias_online_target_orden_df = noticias_online_train_orden_df['popular']
noticias_online_train_orden_df = noticias_online_train_orden_df.drop(['popular', 'shares'], axis='columns')

# Test data: same target/feature separation (row order is left untouched).
noticias_online_target_test_df = noticias_online_test_df['popular']
noticias_online_test_df = noticias_online_test_df.drop(['popular', 'shares'], axis='columns')

In [4]:
# Split the ordered training data into train and validation parts.
# NOTE(review): train_test_split is not imported from sklearn in this notebook,
# so it presumably comes from the star import of `preprocessing`; the meaning
# of the positional 0.4 argument should be confirmed there.
X_train, X_val, Y_train, Y_val = train_test_split(
    noticias_online_train_orden_df,
    noticias_online_target_orden_df,
    0.4,
)

# Knn imputer, standard scaler, one hot encoding, mean encoding y count vectorizer encoding

## Encoders y preprocesado de X_train, X_val, Y_train e Y_val

In [5]:
# Run the preprocessing pipeline named by the helper below on the train and
# validation features. The helper returns three values; the third (`ppp`) is
# not used in the cells shown here.
(
    noticias_online_train_count_vec,
    noticias_online_val_count_vec,
    ppp,
) = preprocessing_knn_imputer_standar_escaler_one_hot_encoding_mean_encoding_count_vectorizer(X_train, X_val, Y_train)

# Encode the boolean targets as integers: False -> 0, True -> 1.
noticias_online_train_target_enc = Y_train.map({False: 0, True: 1})
noticias_online_val_target_enc = Y_val.map({False: 0, True: 1})

Columnas a usar, sacadas de XGBoost:

In [6]:
# Feature subset used for the KNN model on the count-vectorizer pipeline.
# The order of the entries is preserved exactly as originally listed.
columnas_a_usar = [
    'kw_avg_avg',
    'data_channel_is_socmed',
    'is_weekend',
    'data_channel_is_entertainment',
    'self_reference_avg_sharess',
    'data_channel_is_tech',
    'x0_sports',
    'kw_max_avg',
    'weekday_is_sunday',
    'self_reference_min_shares',
    'kw_max_max',
    'weekday_is_saturday',
    'data_channel_is_bus',
    'surprise2',
    'num_imgs',
    'num_videos',
    'num_hrefs',
    'kw_min_max',
    'credit',
    'kw_min_avg',
    'weekday_is_tuesday',
    'kw_avg_max',
    'LDA_02',
    'just',
    'x0_science/tech',
    'title_sentiment_polarity',
    'n_unique_tokens',
    'n_non_stop_unique_tokens',
    'data_channel_is_lifestyle',
    'self_reference_max_shares',
    'timedelta',
    'abs_title_sentiment_polarity',
    'x0_business',
    'rate_positive_words',
    'time',
    'twitter',
    'avg_positive_polarity',
    'global_subjectivity',
    'LDA_00',
    'rate_negative_words',
    'n_tokens_content',
    'global_rate_positive_words',
    'title_subjectivity',
    'LDA_03',
    'LDA_04',
]

In [7]:
# Keep only the selected feature columns of the preprocessed training frame.
df_reducido = noticias_online_train_count_vec.loc[:, columnas_a_usar]

## Búsqueda de hiperparámetros

In [8]:
# Base estimator whose hyperparameters are explored by the grid search.
knn_model = KNeighborsClassifier()

# Candidate values for each hyperparameter.
# NOTE(review): per the scikit-learn docs, `p` only applies to the minkowski
# metric and `leaf_size` only affects tree construction/query speed, so with
# the explicit metrics below several grid combinations are likely duplicates —
# confirm and consider trimming the grid.
weights = ["uniform", "distance"]
n_neighbors = [150, 300, 450]
leaf_size = [1, 15]
p = [1, 2]
metric = ["euclidean", "manhattan", "chebyshev"]

knn_hyperparameters = {
    "weights": weights,
    "n_neighbors": n_neighbors,
    "leaf_size": leaf_size,
    "p": p,
    "metric": metric,
}

In [9]:
# Exhaustive 5-fold search over the grid, scored by ROC AUC, using all cores.
gsearch = GridSearchCV(
    estimator=knn_model,
    param_grid=knn_hyperparameters,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=4,
)
gsearch.fit(df_reducido, noticias_online_train_target_enc)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [10]:
# Keep a handle on the best hyperparameter combination found by the grid
# search and display it as the cell output.
parametros = gsearch.best_params_
parametros

{'leaf_size': 1,
 'metric': 'manhattan',
 'n_neighbors': 300,
 'p': 1,
 'weights': 'distance'}

## Entreno y veo cómo me va (uso validación)

In [11]:
# Build a fresh KNN classifier from the selected hyperparameters and fit it on
# the reduced training features with the 0/1-encoded target.
modelo = KNeighborsClassifier(**parametros)
modelo.fit(df_reducido, noticias_online_train_target_enc)

In [12]:
# Score the validation frame on the same feature subset; keep only the second
# column of predict_proba (the probability of the second class, i.e. label 1
# under the 0/1 encoding used for the target).
prediccion = modelo.predict_proba(noticias_online_val_count_vec[columnas_a_usar])[:,1]

In [13]:
# ROC AUC of the predicted probabilities on the validation split, rounded to
# 4 decimals for display.
round(roc_auc_score(noticias_online_val_target_enc, prediccion, average='micro'), 4)

0.7005

# Mean imputer, robust scaler, one hot encoding, mean encoding con smoothing y TF-IDF encoding

## Encoders y preprocesado de X_train, X_val, Y_train e Y_val

In [14]:
# Run the preprocessing pipeline named by the helper below on the train and
# validation features. The helper returns three values; the third (`ppp`) is
# not used in the cells shown here.
(
    noticias_online_train_tf_idf,
    noticias_online_val_tf_idf,
    ppp,
) = preprocessing_mean_imputer_robust_escaler_one_hot_encoding_mean_encoding_smooth_tf_idf_vectorizer(X_train, X_val, Y_train)

# Encode the boolean targets as integers: False -> 0, True -> 1.
noticias_online_train_target_enc = Y_train.map({False: 0, True: 1})
noticias_online_val_target_enc = Y_val.map({False: 0, True: 1})

Columnas a usar, sacadas de XGBoost:

In [15]:
# Feature subset used for the KNN model on the TF-IDF pipeline.
# The order of the entries is preserved exactly as originally listed.
columnas_a_usar = [
    'data_channel_is_entertainment',
    'kw_avg_avg',
    'data_channel_is_bus',
    'kw_max_max',
    'self_reference_avg_sharess',
    'data_channel_is_socmed',
    'is_weekend',
    'weekday_is_saturday',
    'data_channel_is_tech',
    'kw_max_avg',
    'weekday_is_friday',
    'kw_min_min',
    'num_videos',
    'num_imgs',
    'self_reference_min_shares',
    'credit',
    'x0_science/tech',
    'n_tokens_content',
    'n_unique_tokens',
    'LDA_02',
    'kw_min_avg',
    'x0_the world',
    'num_hrefs',
    'x0_sports',
    'num_self_hrefs',
    'kw_min_max',
    'n_non_stop_words',
    'data_channel_is_lifestyle',
    'num_keywords',
    'timedelta',
    'kw_avg_max',
    'said',
    'LDA_00',
    'LDA_01',
    'global_rate_positive_words',
    'rate_positive_words',
    'n_non_stop_unique_tokens',
    'like',
    'global_sentiment_polarity',
    'global_subjectivity',
    'max_negative_polarity',
    'weekday_is_tuesday',
    'surprise2',
    'time',
    'abs_title_sentiment_polarity',
]

In [16]:
# Keep only the selected feature columns of the preprocessed training frame.
df_reducido = noticias_online_train_tf_idf.loc[:, columnas_a_usar]

## Búsqueda de hiperparámetros

In [17]:
# Base estimator whose hyperparameters are explored by the grid search.
knn_model = KNeighborsClassifier()

# Candidate values for each hyperparameter.
# NOTE(review): per the scikit-learn docs, `p` only applies to the minkowski
# metric and `leaf_size` only affects tree construction/query speed, so with
# the explicit metrics below several grid combinations are likely duplicates —
# confirm and consider trimming the grid.
weights = ["uniform", "distance"]
n_neighbors = [200, 500, 900]
leaf_size = [1, 15]
p = [1, 2]
metric = ["euclidean", "manhattan", "chebyshev"]

knn_hyperparameters = {
    "weights": weights,
    "n_neighbors": n_neighbors,
    "leaf_size": leaf_size,
    "p": p,
    "metric": metric,
}

In [18]:
# Exhaustive 5-fold search over the grid, scored by ROC AUC, using all cores.
gsearch = GridSearchCV(
    estimator=knn_model,
    param_grid=knn_hyperparameters,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=4,
)
gsearch.fit(df_reducido, noticias_online_train_target_enc)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [19]:
# Keep a handle on the best hyperparameter combination found by the grid
# search and display it as the cell output.
parametros = gsearch.best_params_
parametros

{'leaf_size': 1,
 'metric': 'manhattan',
 'n_neighbors': 900,
 'p': 1,
 'weights': 'uniform'}

## Entreno y veo cómo me va (uso validación)

In [23]:
# Override n_neighbors with 1000: the search picked 900, the largest value in
# the grid, so a value beyond the grid edge is tried manually.
# Build a NEW dict instead of aliasing `parametros`: that name refers to the
# same dict object as gsearch.best_params_, so mutating it in place would
# silently rewrite the recorded grid-search result.
p = {**parametros, 'n_neighbors': 1000}

In [24]:
# Build a fresh KNN classifier from the manually adjusted hyperparameters in
# `p` and fit it on the reduced training features with the 0/1-encoded target.
modelo = KNeighborsClassifier(**p)
modelo.fit(df_reducido, noticias_online_train_target_enc)

In [25]:
# Score the validation frame on the same feature subset; keep only the second
# column of predict_proba (the probability of the second class, i.e. label 1
# under the 0/1 encoding used for the target).
prediccion = modelo.predict_proba(noticias_online_val_tf_idf[columnas_a_usar])[:,1]

In [26]:
# ROC AUC of the predicted probabilities on the validation split, rounded to
# 4 decimals for display.
round(roc_auc_score(noticias_online_val_target_enc, prediccion, average='micro'), 4)

0.7002