# Intento 7

| Score | Estimación _holdout_ | Timestamp |
| --- | --- | --- |
| **0.8608** | 0.8682 | 2022-01-03 11:56:43 UTC |

- _Hyperparameter tuning_ en XGBoost.

In [1]:
import pandas as pd
import numpy as np
import time
from itertools import product
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import xgboost as xgb

# código común
import common.preprocessing as pre
from common.vars import SEED

# display all columns
pd.options.display.max_columns = None

In [2]:
# Load the training features, training labels and test features; each frame is indexed by respondent_id.
train_features = pd.read_csv('../data/flu_training_set_features.csv', index_col='respondent_id')
train_labels = pd.read_csv('../data/flu_training_set_labels.csv', index_col='respondent_id')
test_features = pd.read_csv('../data/flu_test_set_features.csv', index_col='respondent_id')

In [3]:
def preprocess(train_features, test_features):
  """Impute and label-encode the train and test feature frames.

  Both frames are cast to strings and passed through a most-frequent
  ``SimpleImputer`` that is fitted on the training frame only. Every column
  is then encoded with a ``LabelEncoder`` fitted on the string values of
  train and test together, so both frames share one encoding.

  Returns:
    A ``(train, test)`` tuple of new DataFrames that keep the original
    indexes and column names.
  """
  # NOTE(review): the cast to str happens before imputation; if that turns NaN
  # into the literal string 'nan', the imputer has nothing to fill -- confirm.
  imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

  def _rebuild(frame, values):
    # the imputer output carries no column names, so put them back
    rebuilt = pd.DataFrame(values, index=frame.index)
    rebuilt.columns = frame.columns
    return rebuilt

  train_out = _rebuild(train_features, imputer.fit_transform(train_features.astype(str)))
  test_out = _rebuild(test_features, imputer.transform(test_features.astype(str)))

  # learn one encoder per column from the values seen in train and test
  combined = pd.concat([train_features, test_features])
  encoders = {
    col: LabelEncoder().fit(combined[col].astype(str))
    for col in combined.columns
  }

  # apply the same encoding to both frames
  for frame in (train_out, test_out):
    for col in frame.columns:
      frame[col] = encoders[col].transform(frame[col])

  return train_out, test_out

In [4]:
# Replace the raw frames with the preprocessed versions returned by preprocess().
train_features, test_features = preprocess(train_features, test_features)

In [5]:
# Shuffled 5-fold splitter with a fixed seed, reused by every cross-validation run in this notebook.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

def cross_validation(model, X, y, cv):
  """Cross-validate ``model`` and print the ROC AUC and fit time of each fold.

  Args:
    model: estimator exposing ``fit`` and ``predict_proba``; the result of
      ``predict_proba`` is indexed as ``[target][:, 1]`` for two targets.
    X: feature DataFrame.
    y: label DataFrame.
    cv: splitter exposing ``split(X, y)``.

  Returns:
    A tuple ``(model, y_preds)``: the same ``model`` object that was passed
    in, and the DataFrame of predicted probabilities for the LAST fold only
    (it is overwritten on every iteration).
  """
  measures = []
  started_at = time.time()

  for fit_idx, eval_idx in cv.split(X, y):
    X_fit, X_eval = X.iloc[fit_idx, :].values, X.iloc[eval_idx, :].values
    y_fit, y_eval = y.iloc[fit_idx, :].values, y.iloc[eval_idx, :].values

    # only the fit call is timed
    fold_started_at = time.time()
    fitted = model.fit(X_fit, y_fit)
    time_iter = time.time() - fold_started_at

    probs = fitted.predict_proba(X_eval)
    # keep the probability of the positive class for each target
    y_preds = pd.DataFrame(
      {'h1n1_vaccine': probs[0][:, 1], 'seasonal_vaccine': probs[1][:, 1]},
      index=y.iloc[eval_idx, :].index,
    )

    measure = roc_auc_score(y_eval, y_preds)
    print(f"[CV] AUC score: {measure:.4f}. Time: {time_iter:6.2f}")
    measures.append(measure)

  time_total = time.time() - started_at
  print(f"[CV] Average AUC: {np.mean(measures):.4f}. Total time: {time_total:6.2f}")
  return model, y_preds

def preds_to_df(preds, features):
  """Turn per-target probability arrays into a two-column DataFrame.

  Args:
    preds: sequence whose first two items are 2-D arrays; column 1 of each
      is used (item 0 for ``h1n1_vaccine``, item 1 for ``seasonal_vaccine``).
    features: object whose ``index`` becomes the index of the result.

  Returns:
    DataFrame with columns ``h1n1_vaccine`` and ``seasonal_vaccine``.
  """
  h1n1_probs = preds[0]
  seasonal_probs = preds[1]
  columns = {
    'h1n1_vaccine': h1n1_probs[:, 1],
    'seasonal_vaccine': seasonal_probs[:, 1],
  }
  return pd.DataFrame(columns, index=features.index)

### _Holdout_ como proxy de ROC en test

In [6]:
def holdout_proxy(model, X, y, test_size=0.2):
  """Estimate the test ROC AUC of ``model`` on a held-out slice of (X, y).

  The data is split with ``train_test_split`` (seeded with ``SEED``). The
  model is cross-validated on the larger part with the module-level ``kf``
  splitter, refitted on that whole part, and scored on the held-out part.

  Args:
    model: estimator accepted by ``cross_validation``.
    X: feature DataFrame.
    y: label DataFrame.
    test_size: fraction passed to ``train_test_split``.

  Returns:
    The value of ``roc_auc_score`` on the held-out part.
  """
  X_fit, X_hold, y_fit, y_hold = train_test_split(
    X, y, test_size=test_size, random_state=SEED
  )

  # cross-validation is run for its printed report; its predictions are unused
  cv_model, _ = cross_validation(model, X_fit, y_fit, kf)

  # refit on every example of the larger part, then predict probabilities
  hold_probs = cv_model.fit(X_fit, y_fit).predict_proba(X_hold)

  return roc_auc_score(y_hold, preds_to_df(hold_probs, X_hold))

In [11]:
def proxy_search(search_space, model_lambda, X, y, test_size=0.2):
  """Grid-search ``search_space`` using ``holdout_proxy`` as the score.

  Args:
    search_space: mapping of parameter name -> iterable of candidate values.
    model_lambda: callable taking a ``params`` dict and returning a model.
    X: feature DataFrame.
    y: label DataFrame.
    test_size: fraction forwarded to ``holdout_proxy``.

  Returns:
    A list with one ``{'params': ..., 'roc_auc': ...}`` dict per
    combination, in ``itertools.product`` order.
  """
  param_names = list(search_space)
  # Build the grid once: the same list gives the total count and the
  # iteration, so the count is right even when values are one-shot iterables.
  combos = list(product(*search_space.values()))
  n = len(combos)

  results = []
  for i, values in enumerate(combos, start=1):
    params = dict(zip(param_names, values))
    print(f"[Proxy Search] ({i} of {n}) Searching with params {params}")
    model = model_lambda(params)
    roc_auc = holdout_proxy(model, X, y, test_size=test_size)
    results.append({
      'params': params,
      'roc_auc': roc_auc
    })

  return results

In [8]:
def xgb_multi_lambda(params):
  """Build a multi-output XGBoost classifier for one parameter combination.

  Args:
    params: dict of keyword arguments forwarded to ``xgb.XGBClassifier``.

  Returns:
    A ``MultiOutputClassifier`` wrapping an ``XGBClassifier`` seeded with
    ``SEED`` and using ``'auc'`` as its evaluation metric.
  """
  return MultiOutputClassifier(xgb.XGBClassifier(seed=SEED, use_label_encoder=False, eval_metric='auc', **params))

In [9]:
# Candidate values per XGBoost hyperparameter; the search tries every combination (3 * 4 * 3 = 36).
search_space = {
  'eta': [0.05, 0.1, 0.2],
  'max_depth': [3, 5, 8, 10],
  'subsample': [0.5, 0.75, 1]
}

In [12]:
# Score every parameter combination with the holdout proxy on the training data.
search = proxy_search(search_space, xgb_multi_lambda, train_features, train_labels)

[Proxy Search] (1 of 36) Searching with params {'eta': 0.05, 'max_depth': 3, 'subsample': 0.5}
[CV] AUC score: 0.8619. Time:   1.95
[CV] AUC score: 0.8630. Time:   1.76
[CV] AUC score: 0.8522. Time:   1.67
[CV] AUC score: 0.8509. Time:   1.72
[CV] AUC score: 0.8662. Time:   1.77
[CV] Average AUC: 0.8589. Total time:   9.03
[Proxy Search] (2 of 36) Searching with params {'eta': 0.05, 'max_depth': 3, 'subsample': 0.75}
[CV] AUC score: 0.8619. Time:   1.63
[CV] AUC score: 0.8626. Time:   1.59
[CV] AUC score: 0.8516. Time:   1.94
[CV] AUC score: 0.8509. Time:   1.52
[CV] AUC score: 0.8657. Time:   1.49
[CV] Average AUC: 0.8585. Total time:   8.32
[Proxy Search] (3 of 36) Searching with params {'eta': 0.05, 'max_depth': 3, 'subsample': 1}
[CV] AUC score: 0.8615. Time:   1.09
[CV] AUC score: 0.8623. Time:   1.20
[CV] AUC score: 0.8516. Time:   1.34
[CV] AUC score: 0.8502. Time:   1.10
[CV] AUC score: 0.8655. Time:   1.11
[CV] Average AUC: 0.8582. Total time:   5.98
[Proxy Search] (4 of 36) S

In [13]:
# Keep the search result with the highest holdout ROC AUC and display it.
search_max = max(search, key=lambda item: item['roc_auc'])
search_max

{'params': {'eta': 0.1, 'max_depth': 5, 'subsample': 0.75},
 'roc_auc': 0.8682050936387379}

In [14]:
# Split the training data into a fitting part and a 20% holdout, seeded with SEED.
X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.2, random_state=SEED)

In [15]:
# XGBoost (not LightGBM): multi-output classifier built with the best parameters
# from the search, cross-validated on the fitting part of the split.
xgb_multi = MultiOutputClassifier(xgb.XGBClassifier(seed=SEED, use_label_encoder=False, eval_metric='auc', **search_max['params']))
xgb_cvd, y_preds_xgb = cross_validation(xgb_multi, X_train, y_train, kf)

[CV] AUC score: 0.8671. Time:   2.07
[CV] AUC score: 0.8695. Time:   2.67
[CV] AUC score: 0.8614. Time:   2.24
[CV] AUC score: 0.8556. Time:   1.97
[CV] AUC score: 0.8720. Time:   1.99
[CV] Average AUC: 0.8651. Total time:  11.10


In [16]:
# fit on every example of the fitting part
xgb_cvd = xgb_cvd.fit(X_train, y_train)
# predict class probabilities for the holdout part
xgb_preds = xgb_cvd.predict_proba(X_test)

In [17]:
# ROC AUC of the predicted probabilities on the holdout part.
roc_auc_score(y_test, preds_to_df(xgb_preds, X_test))

0.8682050936387379

### Submission

In [21]:
# XGBoost (not LightGBM): multi-output classifier built with the best parameters
# from the search, cross-validated on the full training set.
xgb_multi = MultiOutputClassifier(xgb.XGBClassifier(seed=SEED, use_label_encoder=False, eval_metric='auc', **search_max['params']))
xgb_cvd, y_preds_xgb = cross_validation(xgb_multi, train_features, train_labels, kf)

[CV] AUC score: 0.8681. Time:   2.62
[CV] AUC score: 0.8673. Time:   2.58
[CV] AUC score: 0.8658. Time:   2.54
[CV] AUC score: 0.8696. Time:   3.17
[CV] AUC score: 0.8641. Time:   4.13
[CV] Average AUC: 0.8670. Total time:  15.22


In [22]:
# fit on every training example
xgb_cvd = xgb_cvd.fit(train_features, train_labels)
# predict class probabilities for the test set
xgb_preds = xgb_cvd.predict_proba(test_features)

In [23]:
# Write the test-set probabilities to the submission CSV.
preds_to_df(xgb_preds, test_features).to_csv('../submissions/07.csv')