# Intento 8

| Score | Estimación _holdout_ | Timestamp |
| --- | --- | --- |
| **0.8626** | 0.8699 | 2022-01-03 12:16:33 UTC |

- Partir del modelo obtenido en el intento 6.
- Cambiar estrategia de preprocesado.

In [1]:
import pandas as pd
import numpy as np
import time
from itertools import product
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import lightgbm as lgb

# código común
import common.preprocessing as pre
from common.vars import SEED

# display all columns
pd.options.display.max_columns = None

In [2]:
# Load the training features, the training labels and the test features.
# Each CSV is read with its 'respondent_id' column as the frame index.
train_features = pd.read_csv('../data/flu_training_set_features.csv', index_col='respondent_id')
train_labels = pd.read_csv('../data/flu_training_set_labels.csv', index_col='respondent_id')
test_features = pd.read_csv('../data/flu_test_set_features.csv', index_col='respondent_id')

In [14]:
def preprocess(df_train, df_test, pre_onehot=True):
  """Preprocess the train and test feature tables.

  The inputs are copied before any transformation. The steps, in order:

  1. If ``pre_onehot`` is true, the categorical columns that are encoded
     later are imputed first with ``pre.impute(..., strategy='mode')``.
  2. 'race', 'employment_status' and 'census_msa' go through
     ``pre.trans_onehot``; the ordered/binary categorical columns are
     mapped to integers with fixed dictionaries via ``pre.trans_discretize``.
  3. 'hhs_geo_region', 'employment_industry' and 'employment_occupation'
     are encoded with a mapping returned by ``pre.trans_discretize`` for
     the training frame, and that same mapping is applied to the test frame.
  4. Both frames are finally passed through
     ``pre.impute(..., strategy='median')``.

  NOTE(review): train and test are imputed by separate ``pre.impute`` calls,
  so the fill values are presumably computed per frame -- confirm against
  ``common.preprocessing``.

  Args:
    df_train: training feature table.
    df_test: test feature table.
    pre_onehot: whether to run the mode imputation of step 1.

  Returns:
    A ``(train, test)`` tuple with the transformed copies.
  """
  onehot_attrs = ('race', 'employment_status', 'census_msa')
  # Fixed value -> integer codes, applied in this order to both frames.
  fixed_codes = (
    ('age_group', {
      '18 - 34 Years': 1,
      '35 - 44 Years': 2,
      '45 - 54 Years': 3,
      '55 - 64 Years': 4,
      '65+ Years': 5,
    }),
    ('education', {
      '< 12 Years': 1,
      '12 Years': 2,
      'Some College': 3,
      'College Graduate': 4,
    }),
    ('sex', {'Male': 0, 'Female': 1}),
    ('income_poverty', {
      'Below Poverty': 1,
      '<= $75,000, Above Poverty': 2,
      '> $75,000': 3,
    }),
    ('marital_status', {'Not Married': 0, 'Married': 1}),
    ('rent_or_own', {'Rent': 0, 'Own': 1}),
  )
  # Columns whose codes are taken from the training frame (LabelEncoder-like).
  train_coded_attrs = ('hhs_geo_region', 'employment_industry', 'employment_occupation')

  train_out = df_train.copy()
  test_out = df_test.copy()

  # Remove missing values from the categorical columns before encoding them,
  # filling with the majority class (mode).
  if pre_onehot:
    mode_attrs = ['race', 'employment_status', 'census_msa', 'hhs_geo_region', 'employment_industry', 'employment_occupation']
    train_out = pre.impute(train_out, strategy='mode', attrs=mode_attrs)
    test_out = pre.impute(test_out, strategy='mode', attrs=mode_attrs)

  def encode(frame):
    """Apply the one-hot and fixed-dictionary encodings to one frame."""
    for attr in onehot_attrs:
      frame = pre.trans_onehot(frame, attr)
    for attr, codes in fixed_codes:
      # A fresh dict per call, so the helper never sees a shared object.
      frame = pre.trans_discretize(frame, attr, dict(codes))
    return frame

  test_out = encode(test_out)
  train_out = encode(train_out)

  for attr in train_coded_attrs:
    # Without a mapping the helper returns the frame plus the mapping it used.
    train_out, learned_codes = pre.trans_discretize(train_out, attr)
    # Reuse the training mapping so both frames share the same codes.
    test_out = pre.trans_discretize(test_out, attr, learned_codes)

  # Missing values: the remaining attributes are imputed with the median.
  train_out = pre.impute(train_out, strategy='median')
  test_out = pre.impute(test_out, strategy='median')

  return train_out, test_out

In [15]:
# Replace both feature tables with their preprocessed versions
# (pre_onehot keeps its default value of True).
train_features, test_features = preprocess(train_features, test_features)

In [16]:
# Shuffled 5-fold splitter seeded with SEED; it is the splitter passed to
# cross_validation throughout this notebook.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

def cross_validation(model, X, y, cv):
  """Fit and score ``model`` on every split produced by ``cv``.

  For each ``(train, test)`` index pair yielded by ``cv.split(X, y)`` the
  model is fitted on the training rows, the column-1 probabilities of the
  two targets are predicted for the held-out rows, and the resulting
  ``roc_auc_score`` is printed together with the fit time. After the last
  split the mean score and the total elapsed time are printed.

  Args:
    model: estimator with ``fit`` and ``predict_proba``; ``predict_proba``
      must return one 2-D array per target, read as ``[0]`` and ``[1]``.
    X: feature table, indexed positionally with ``.iloc``.
    y: two-dimensional label table, indexed positionally with ``.iloc``.
    cv: splitter exposing ``split(X, y)``.

  Returns:
    ``(model, y_preds)``: the same ``model`` object that was passed in
    (``fit`` is called on it directly, no copy is made here) and the
    prediction frame of the LAST split only.
  """
  fold_scores = []
  started_all = time.time()

  for train_idx, test_idx in cv.split(X, y):
    # The estimator is fed plain arrays; the labels' index is recovered
    # below when the predictions are wrapped in a DataFrame.
    X_fit, X_eval = X.iloc[train_idx, :].values, X.iloc[test_idx, :].values
    y_fit, y_eval = y.iloc[train_idx, :].values, y.iloc[test_idx, :].values

    # Only the fit is timed, not the slicing or the prediction.
    started_fold = time.time()
    fitted = model.fit(X_fit, y_fit)
    fit_seconds = time.time() - started_fold

    probas = fitted.predict_proba(X_eval)
    y_preds = pd.DataFrame(
      {
        'h1n1_vaccine': probas[0][:, 1],
        'seasonal_vaccine': probas[1][:, 1],
      },
      index=y.iloc[test_idx, :].index,
    )

    fold_score = roc_auc_score(y_eval, y_preds)
    print(f"[CV] AUC score: {fold_score:.4f}. Time: {fit_seconds:6.2f}")
    fold_scores.append(fold_score)

  elapsed_all = time.time() - started_all
  print(f"[CV] Average AUC: {np.mean(fold_scores):.4f}. Total time: {elapsed_all:6.2f}")
  return model, y_preds

def preds_to_df(preds, features):
  """Build the two-column probability table from ``predict_proba`` output.

  Args:
    preds: sequence whose items ``[0]`` and ``[1]`` are 2-D arrays; column 1
      of each is taken as the value for the corresponding target.
    features: object whose ``index`` becomes the index of the result.

  Returns:
    A DataFrame with columns 'h1n1_vaccine' and 'seasonal_vaccine', indexed
    like ``features``.
  """
  h1n1_proba = preds[0][:, 1]
  seasonal_proba = preds[1][:, 1]
  table = {
    'h1n1_vaccine': h1n1_proba,
    'seasonal_vaccine': seasonal_proba,
  }
  return pd.DataFrame(table, index=features.index)

### _Holdout_ como proxy del AUC-ROC en test

In [17]:
def holdout_proxy(model, X, y, test_size=0.2):
  """Estimate the test ROC AUC of ``model`` with a single hold-out split.

  The labelled data is split with ``train_test_split`` (seeded with SEED).
  The model is cross-validated on the training part with the module-level
  ``kf`` splitter, refitted on the whole training part, and scored on the
  held-out part.

  Args:
    model: estimator accepted by ``cross_validation``.
    X: feature table.
    y: label table.
    test_size: value forwarded to ``train_test_split`` as ``test_size``.

  Returns:
    The ``roc_auc_score`` of the held-out labels against the predicted
    probabilities.
  """
  # Set aside part of the labelled data as a stand-in for the real test set.
  parts = train_test_split(X, y, test_size=test_size, random_state=SEED)
  X_fit, X_held, y_fit, y_held = parts

  # Cross-validate on the training part; only the returned estimator is kept.
  estimator = cross_validation(model, X_fit, y_fit, kf)[0]

  # Learn from all the examples of the training part.
  estimator = estimator.fit(X_fit, y_fit)

  # Predict probabilities for the held-out part and score them.
  held_proba = estimator.predict_proba(X_held)
  held_frame = preds_to_df(held_proba, X_held)
  return roc_auc_score(y_held, held_frame)

In [18]:
# Split the labelled data in two: 80% to train on and 20% held out for the
# hold-out estimate, seeded with SEED.
X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.2, random_state=SEED)

In [19]:
# LightGBM binary classifier wrapped in MultiOutputClassifier so that both
# label columns are handled, cross-validated on the training part of the
# hold-out split.
lgbm = lgb.LGBMClassifier(objective='binary', n_estimators=200, colsample_bytree=0.5, learning_rate= 0.05, num_leaves=50)
multi_lgbm = MultiOutputClassifier(lgbm)
lgbm_cvd, y_preds_lgbm = cross_validation(multi_lgbm, X_train, y_train, kf)

[CV] AUC score: 0.8688. Time:   1.14
[CV] AUC score: 0.8689. Time:   1.25
[CV] AUC score: 0.8603. Time:   1.16
[CV] AUC score: 0.8553. Time:   1.15
[CV] AUC score: 0.8735. Time:   1.22
[CV] Average AUC: 0.8654. Total time:   6.30


In [20]:
# Fit on all the examples of the training part of the hold-out split
lgbm_cvd = lgbm_cvd.fit(X_train, y_train)
# Predict probabilities for the held-out part
lgbm_preds = lgbm_cvd.predict_proba(X_test)

In [21]:
# Hold-out ROC AUC estimate; as the last expression of the cell, its value
# is what the notebook displays.
roc_auc_score(y_test, preds_to_df(lgbm_preds, X_test))

0.869863706394876

### Submission

In [22]:
# LightGBM with the same hyperparameters as the hold-out run, now
# cross-validated on the full labelled data.
lgbm = lgb.LGBMClassifier(objective='binary', n_estimators=200, colsample_bytree=0.5, learning_rate= 0.05, num_leaves=50)
multi_lgbm = MultiOutputClassifier(lgbm)
lgbm_cvd, y_preds_lgbm = cross_validation(multi_lgbm, train_features, train_labels, kf)

[CV] AUC score: 0.8699. Time:   1.25
[CV] AUC score: 0.8679. Time:   1.29
[CV] AUC score: 0.8668. Time:   1.25
[CV] AUC score: 0.8700. Time:   1.27
[CV] AUC score: 0.8663. Time:   1.31
[CV] Average AUC: 0.8682. Total time:   6.80


In [23]:
# Fit on all the labelled examples
lgbm_cvd = lgbm_cvd.fit(train_features, train_labels)
# Predict probabilities for the test set
lgbm_preds = lgbm_cvd.predict_proba(test_features)

In [24]:
# Write the test-set probabilities, indexed like test_features, to the
# submission file for this attempt.
preds_to_df(lgbm_preds, test_features).to_csv('../submissions/08.csv')