# Intento 5

| Score | Estimación _holdout_ | Timestamp |
| --- | --- | --- |
| **0.8592** | 0.8647 | 2022-01-01 22:16:44 UTC |

- Uso de `StratifiedKFold`.

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import clone
import lightgbm as lgb

# código común
import common.preprocessing as pre
from common.vars import SEED

# display all columns
pd.options.display.max_columns = None

In [2]:
train_features = pd.read_csv('../data/flu_training_set_features.csv', index_col='respondent_id')
train_labels = pd.read_csv('../data/flu_training_set_labels.csv', index_col='respondent_id')
test_features = pd.read_csv('../data/flu_test_set_features.csv', index_col='respondent_id')

In [3]:
def preprocess(train_features, test_features):
  """PREPROCESAMIENTO
  """
  labels = {}
  imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
  _train_features = train_features.astype(str)
  # uso SimpleImputer para "eliminar los nulos", como elimina los nombres de las columnas los ponemos de nuevo
  _train_features = pd.DataFrame(imp.fit_transform(_train_features), index=train_features.index)
  _train_features.columns = train_features.columns

  _all = pd.concat([train_features, test_features])

  # aprendo las etiquetas
  for col in _all.columns:
    labels[col] = LabelEncoder()
    labels[col].fit(_all[col].astype(str))
  
  # aplico el etiquetado
  for col in _train_features.columns:
    _train_features[col] = labels[col].transform(_train_features[col])
  
  _test_features = test_features.astype(str)
  _test_features = pd.DataFrame(imp.transform(_test_features), index=test_features.index)
  _test_features.columns = test_features.columns

  # aplico el mismo etiquetado con los valores de test
  for col in _test_features.columns:
    _test_features[col] = labels[col].transform(_test_features[col])
  
  return _train_features, _test_features

In [4]:
train_features, test_features = preprocess(train_features, test_features)

In [5]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

def cross_validation_mono(model, X, y, cv, attr):
  measures = []
  time_total_0 = time.time()

  for train, test in cv.split(X, y[attr]):
    X_train = X.iloc[train, :]
    X_test = X.iloc[test, :]
    y_train = y[attr].iloc[train].values
    y_test = y[attr].iloc[test].values
    time_iter_0 = time.time()
    _model = model.fit(X_train, y_train)
    time_iter = time.time() - time_iter_0
    preds = _model.predict_proba(X_test)
    y_preds = pd.DataFrame(
      {
        attr: preds[:,1]
      },
      index = y[attr].iloc[test].index
    )
    measure = roc_auc_score(y_test, y_preds)
    print(f"[CV] AUC score: {measure:.4f}. Time: {time_iter:6.2f}")
    measures.append(measure)
  
  time_total = time.time() - time_total_0
  print(f"[CV] Average AUC: {np.mean(measures):.4f}. Total time: {time_total:6.2f}")
  return {
    'model':        model,
    'y_preds':      y_preds,
    'measures':     measures,
    'avg_measure':  np.mean(measures)
  }

def preds_to_df(preds, features):
  return pd.DataFrame(
    {
      'h1n1_vaccine': preds[0][:,1],
      'seasonal_vaccine': preds[1][:,1]
    },
    index = features.index
  )

### _Holdout_ como proxy de ROC en test

In [6]:
# split train features in two
X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.2, random_state=SEED)

In [7]:
# LightGBM
lgbm_h1n1 = lgb.LGBMClassifier(objective='binary', n_estimators=200)
lgbm_seasonal = lgb.LGBMClassifier(objective='binary', n_estimators=200)
print("h1n1_vaccine")
lgbm_h1n1_result = cross_validation_mono(lgbm_h1n1, X_train, y_train, skf, 'h1n1_vaccine')
print("\nseasonal_vaccine")
lgbm_seasonal_result = cross_validation_mono(lgbm_seasonal, X_train, y_train, skf, 'seasonal_vaccine')

h1n1_vaccine
[CV] AUC score: 0.8606. Time:   0.32
[CV] AUC score: 0.8619. Time:   0.33
[CV] AUC score: 0.8708. Time:   0.46
[CV] AUC score: 0.8679. Time:   0.58
[CV] AUC score: 0.8607. Time:   0.42
[CV] Average AUC: 0.8644. Total time:   2.33

seasonal_vaccine
[CV] AUC score: 0.8615. Time:   0.44
[CV] AUC score: 0.8486. Time:   0.51
[CV] AUC score: 0.8604. Time:   0.43
[CV] AUC score: 0.8565. Time:   0.55
[CV] AUC score: 0.8583. Time:   0.43
[CV] Average AUC: 0.8571. Total time:   2.60


In [8]:
# aprendemos con todos los ejemplos
lgbm_h1n1_result_model = lgbm_h1n1_result['model'].fit(X_train, y_train['h1n1_vaccine'])
# aplicamos probabilidad
h1n1_preds = lgbm_h1n1_result_model.predict_proba(X_test)

In [9]:
# aprendemos con todos los ejemplos
lgbm_seasonal_result_model = lgbm_seasonal_result['model'].fit(X_train, y_train['seasonal_vaccine'])
# aplicamos probabilidad
seasonal_preds = lgbm_seasonal_result_model.predict_proba(X_test)

In [10]:
roc_auc_score(y_test, preds_to_df([h1n1_preds, seasonal_preds], X_test))

0.8646884663508434

### Submission

In [11]:
# LightGBM
lgbm_h1n1 = lgb.LGBMClassifier(objective='binary', n_estimators=200)
lgbm_seasonal = lgb.LGBMClassifier(objective='binary', n_estimators=200)
print("h1n1_vaccine")
lgbm_h1n1_result = cross_validation_mono(lgbm_h1n1, train_features, train_labels, skf, 'h1n1_vaccine')
print("\nseasonal_vaccine")
lgbm_seasonal_result = cross_validation_mono(lgbm_seasonal, train_features, train_labels, skf, 'seasonal_vaccine')

h1n1_vaccine
[CV] AUC score: 0.8660. Time:   0.36
[CV] AUC score: 0.8671. Time:   0.48
[CV] AUC score: 0.8653. Time:   0.62
[CV] AUC score: 0.8525. Time:   0.46
[CV] AUC score: 0.8776. Time:   0.54
[CV] Average AUC: 0.8657. Total time:   2.71

seasonal_vaccine
[CV] AUC score: 0.8634. Time:   0.54
[CV] AUC score: 0.8653. Time:   0.51
[CV] AUC score: 0.8597. Time:   0.55
[CV] AUC score: 0.8528. Time:   0.47
[CV] AUC score: 0.8614. Time:   0.79
[CV] Average AUC: 0.8605. Total time:   3.11


In [12]:
# aprendemos con todos los ejemplos
lgbm_h1n1_result_model = lgbm_h1n1_result['model'].fit(train_features, train_labels['h1n1_vaccine'])
# aplicamos probabilidad
h1n1_preds = lgbm_h1n1_result_model.predict_proba(test_features)

In [13]:
# aprendemos con todos los ejemplos
lgbm_seasonal_result_model = lgbm_seasonal_result['model'].fit(train_features, train_labels['seasonal_vaccine'])
# aplicamos probabilidad
seasonal_preds = lgbm_seasonal_result_model.predict_proba(test_features)

In [14]:
preds_to_df([h1n1_preds, seasonal_preds], test_features).to_csv('../submissions/05.csv')