# Otros intentos

- _Hyperparameter tuning_ en CatBoost.

In [1]:
import pandas as pd
import numpy as np
import time
from itertools import product
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.base import clone
import catboost as catb

# código común
import common.preprocessing as pre
from common.vars import SEED

# Show every column when a DataFrame is rendered (no truncation of wide frames).
pd.set_option('display.max_columns', None)

In [2]:
# Load the three competition CSVs, each indexed by `respondent_id`.
train_features, train_labels, test_features = (
  pd.read_csv(csv_path, index_col='respondent_id')
  for csv_path in (
    '../data/flu_training_set_features.csv',
    '../data/flu_training_set_labels.csv',
    '../data/flu_test_set_features.csv',
  )
)

In [3]:
def preprocess(df_train, df_test):
  """Impute missing values and cast non-object columns to int64.

  Works on copies of both frames, so the arguments themselves are not
  reassigned. Column groups are always chosen from the *training* frame's
  dtypes and then applied to both frames.

  Steps:
    1. object-dtype (categorical) columns -> `pre.impute(..., strategy='mode')`
    2. remaining columns                  -> `pre.impute(..., strategy='median')`
    3. remaining columns are cast to `np.int64` for CatBoost.

  NOTE(review): `pre.impute` is a project helper not visible here; whether the
  test frame is imputed with statistics from train or from itself depends on
  that helper -- confirm.

  Returns:
    (train, test) tuple of preprocessed DataFrames.
  """
  train_out, test_out = df_train.copy(), df_test.copy()

  # Categorical attributes first: fill gaps with the majority class (mode).
  categorical = train_out.select_dtypes(include=object)
  train_out = pre.impute(train_out, strategy='mode', attrs=categorical)
  test_out = pre.impute(test_out, strategy='mode', attrs=categorical)

  # Every other attribute: fill gaps with the median.
  numeric = train_out.select_dtypes(exclude=object)
  train_out = pre.impute(train_out, strategy='median', attrs=numeric)
  test_out = pre.impute(test_out, strategy='median', attrs=numeric)

  # Cast to integers for CatBoost (there are no float-valued attributes).
  for column in numeric.columns:
    train_out[column] = train_out[column].astype(np.int64)
    test_out[column] = test_out[column].astype(np.int64)

  return train_out, test_out

In [4]:
# Replaces the raw frames with their preprocessed versions; re-running this
# cell feeds already-preprocessed frames back into `preprocess`.
train_features, test_features = preprocess(
  df_train=train_features,
  df_test=test_features,
)

In [5]:
# Shared 5-fold stratified, shuffled splitter; seeded so folds are reproducible.
kf = StratifiedKFold(
  n_splits=5,
  random_state=SEED,
  shuffle=True,
)

def cross_validation_mono(model, X, y, cv, attr):
  """Cross-validate `model` on one target column and report ROC AUC per fold.

  Args:
    model: estimator exposing `fit(pool)` and `predict_proba(pool)`; the same
      object is re-fitted on every fold, so it is mutated by this call.
    X: feature DataFrame; object-dtype columns are passed to CatBoost as
      categorical features.
    y: label DataFrame containing the column `attr`.
    cv: splitter exposing `split(X, y)` that yields positional index arrays.
    attr: name of the target column in `y`.

  Returns:
    dict with the model, the target name, the per-fold AUC list and its mean.
    `y_preds`, `pool_train` and `pool_test` come from the LAST fold only.
  """
  target = y[attr]
  # Row slicing keeps dtypes, so the categorical column list is the same in every fold.
  cat_columns = X.select_dtypes(include=object).columns.tolist()

  fold_scores = []
  started = time.time()
  print(f"[CV] {attr}")

  for train_idx, test_idx in cv.split(X, target):
    X_train, X_test = X.iloc[train_idx, :], X.iloc[test_idx, :]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]
    pool_train = catb.Pool(X_train, y_train, cat_features=cat_columns)
    pool_test = catb.Pool(X_test, cat_features=cat_columns)

    # Only the fit is timed, not pool construction or scoring.
    fit_started = time.time()
    fitted = model.fit(pool_train)
    fit_seconds = time.time() - fit_started

    # Column 1 of predict_proba is the positive-class probability.
    positive_proba = fitted.predict_proba(pool_test)[:, 1]
    y_preds = pd.DataFrame({attr: positive_proba}, index=y_test.index)

    fold_auc = roc_auc_score(y_test, y_preds)
    print(f"[CV] AUC score: {fold_auc:.4f}. Time: {fit_seconds:6.2f}")
    fold_scores.append(fold_auc)

  elapsed = time.time() - started
  mean_auc = np.mean(fold_scores)
  print(f"[CV] Average AUC: {mean_auc:.4f}. Total time: {elapsed:6.2f}")

  return {
    'model':        model,
    'attr':         attr,
    'y_preds':      y_preds,
    'measures':     fold_scores,
    'avg_measure':  mean_auc,
    'pool_train':   pool_train,
    'pool_test':    pool_test
  }

def preds_to_df(preds, features):
  """Build a two-column frame of positive-class probabilities.

  Args:
    preds: indexable with two entries (h1n1 first, seasonal second), each a
      2-D array whose column 1 is taken as the positive-class probability.
    features: object whose `.index` becomes the index of the result.

  Returns:
    DataFrame with columns `h1n1_vaccine` and `seasonal_vaccine`.
  """
  h1n1_proba = preds[0][:, 1]
  seasonal_proba = preds[1][:, 1]
  columns = {
    'h1n1_vaccine': h1n1_proba,
    'seasonal_vaccine': seasonal_proba,
  }
  return pd.DataFrame(columns, index=features.index)

### _Holdout_ como proxy del ROC AUC en test

In [6]:
def holdout_proxy(model, X, y, test_size=0.2):
  """Estimate test ROC AUC of `model` on a held-out slice of the training data.

  The data is split once (seeded with `SEED`). For every target column in `y`:
    1. a clone of `model` is cross-validated on the training part with the
       module-level splitter `kf` (this prints the per-fold report);
    2. that clone is re-fitted on the whole training part;
    3. positive-class probabilities are predicted for the held-out part.

  Args:
    model: unfitted estimator; it is cloned per target and not fitted itself.
    X: feature DataFrame; object-dtype columns are treated as categorical.
    y: label DataFrame, one column per target.
    test_size: fraction of rows held out for scoring.

  Returns:
    ROC AUC from `roc_auc_score` between the held-out labels and the predicted
    probabilities, with score columns in the same order as `y.columns`.
  """
  # Split the training data in two: fit part and held-out scoring part.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=SEED)

  results = {}
  for attr in y.columns:
    _model = clone(model)
    results[attr] = cross_validation_mono(_model, X_train, y_train, kf, attr)

  # Loop-invariant: both parts come from the same frame, so they share dtypes,
  # and the held-out pool does not depend on the target being fitted.
  cat_columns = X_train.select_dtypes(include=object).columns.tolist()
  pool_test = catb.Pool(X_test, cat_features=cat_columns)

  positive_proba = {}
  for attr in y.columns:
    # Fit on every example of the training part (not just one CV fold).
    pool_train = catb.Pool(X_train, y_train[attr], cat_features=cat_columns)
    results[attr]['model'] = results[attr]['model'].fit(pool_train)
    # Column 1 of predict_proba is the positive-class probability.
    positive_proba[attr] = results[attr]['model'].predict_proba(pool_test)[:, 1]

  # roc_auc_score pairs label and score columns by position, so build the
  # score frame in y's own column order instead of hard-coding target names.
  y_scores = pd.DataFrame(
    {attr: positive_proba[attr] for attr in y.columns},
    index=X_test.index
  )
  return roc_auc_score(y_test, y_scores)

In [7]:
import random
def proxy_search(search_space, model_lambda, X, y, test_size=0.2, limit=20, seed=None):
  """Random search over a hyperparameter grid, scored with `holdout_proxy`.

  The full grid is the Cartesian product of the value lists in `search_space`.
  Up to `limit` DISTINCT configurations are drawn from it (sampling without
  replacement), so no configuration is evaluated twice; if the grid has fewer
  than `limit` points, every point is evaluated once.

  Args:
    search_space: dict mapping parameter name -> list of candidate values.
    model_lambda: callable taking a params dict and returning an unfitted model.
    X, y: features and labels forwarded to `holdout_proxy`.
    test_size: holdout fraction forwarded to `holdout_proxy`.
    limit: maximum number of configurations to evaluate.
    seed: seed for the sampling order; `None` falls back to the notebook-wide
      `SEED`, making the search reproducible across kernel restarts.

  Returns:
    list of `{'params': dict, 'roc_auc': score}` in evaluation order.
  """
  names = list(search_space.keys())
  space = [dict(zip(names, values)) for values in product(*search_space.values())]

  # Never ask for more samples than the grid holds (also covers an empty grid
  # and non-positive limits, which simply yield no trials).
  n_trials = max(0, min(limit, len(space)))

  # Local generator: reproducible, and leaves the global `random` state untouched.
  rng = random.Random(SEED if seed is None else seed)

  results = []
  for i, params in enumerate(rng.sample(space, n_trials), start=1):
    print(f"[Proxy Search] ({i} of {n_trials}) Searching with params {params}")
    model = model_lambda(params)
    roc_auc = holdout_proxy(model, X, y, test_size=test_size)
    results.append({
      'params': params,
      'roc_auc': roc_auc
    })

  return results

In [8]:
def catb_multi_lambda(params):
  """Return a silent, `SEED`-seeded CatBoostClassifier built from `params`."""
  return catb.CatBoostClassifier(random_seed=SEED, silent=True, **params)

In [9]:
# Hyperparameter grid: 7 depths x 4 learning rates x 10 iteration counts.
search_space = {
  'depth': list(range(4, 11)),
  'learning_rate': [0.01, 0.02, 0.03, 0.04],
  'iterations': list(range(10, 101, 10)),
}

In [10]:
# Run the random search on the preprocessed training data (default budget and holdout size).
search = proxy_search(
  search_space=search_space,
  model_lambda=catb_multi_lambda,
  X=train_features,
  y=train_labels,
)

[Proxy Search] (1 of 20) Searching with params {'depth': 9, 'learning_rate': 0.03, 'iterations': 50}
[CV] h1n1_vaccine
[CV] AUC score: 0.8316. Time:   5.47
[CV] AUC score: 0.8173. Time:   4.99
[CV] AUC score: 0.8338. Time:   4.70
[CV] AUC score: 0.8317. Time:   4.21
[CV] AUC score: 0.8226. Time:   4.18
[CV] Average AUC: 0.8274. Total time:  24.19
[CV] seasonal_vaccine
[CV] AUC score: 0.8501. Time:   4.58
[CV] AUC score: 0.8403. Time:   4.78
[CV] AUC score: 0.8510. Time:   4.52
[CV] AUC score: 0.8494. Time:   4.31
[CV] AUC score: 0.8533. Time:   4.53
[CV] Average AUC: 0.8488. Total time:  23.36
[Proxy Search] (2 of 20) Searching with params {'depth': 9, 'learning_rate': 0.02, 'iterations': 10}
[CV] h1n1_vaccine
[CV] AUC score: 0.8266. Time:   1.10
[CV] AUC score: 0.8105. Time:   1.03
[CV] AUC score: 0.8290. Time:   1.08
[CV] AUC score: 0.8246. Time:   0.96
[CV] AUC score: 0.8148. Time:   1.18
[CV] Average AUC: 0.8211. Total time:   5.97
[CV] seasonal_vaccine
[CV] AUC score: 0.8320. Time

In [11]:
# Best trial = highest holdout ROC AUC (first one wins on ties).
search_max = max(
  search,
  key=lambda trial: trial['roc_auc'],
)
search_max

{'params': {'depth': 6, 'learning_rate': 0.04, 'iterations': 100},
 'roc_auc': 0.8518087063929591}