# Intento 1

| Score | Timestamp |
| --- | --- |
| **0.8589** | 2021-12-31 02:08:12 UTC |

- Uso de LightGBM.
- Estrategia de preprocesado básica.

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
import lightgbm as lgb

# código común
import common.preprocessing as pre
from common.vars import SEED

# display all columns: max_columns is set to None so wide frames are not cut off
pd.options.display.max_columns = None

In [2]:
# Load the training features, the training labels and the test features
# from the CSV files under ../data.
train_features = pd.read_csv('../data/flu_training_set_features.csv')
train_labels = pd.read_csv('../data/flu_training_set_labels.csv')
test_features = pd.read_csv('../data/flu_test_set_features.csv')

In [3]:
# respondent_id is not used, so it is removed from all three frames.
# drop() returns a new frame instead of mutating in place, and errors='ignore'
# keeps this cell idempotent: re-running it once the column is already gone
# no longer raises KeyError.
train_features = train_features.drop(columns=['respondent_id'], errors='ignore')
train_labels = train_labels.drop(columns=['respondent_id'], errors='ignore')
test_features = test_features.drop(columns=['respondent_id'], errors='ignore')

In [4]:
# summary statistics of the non-object (numeric) attributes, rounded to 4 decimals
train_features.describe(exclude=object).round(4)

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
count,26615.0,26591.0,26636.0,26499.0,26688.0,26665.0,26620.0,26625.0,26579.0,24547.0,24547.0,25736.0,25887.0,25903.0,14433.0,26316.0,26319.0,26312.0,26245.0,26193.0,26170.0,26458.0,26458.0
mean,1.6185,1.2625,0.0488,0.7256,0.069,0.8256,0.3586,0.3373,0.6773,0.2203,0.3297,0.2833,0.0826,0.1119,0.8797,3.8506,2.3426,2.3577,4.026,2.7192,2.1181,0.8865,0.5346
std,0.9103,0.6181,0.2155,0.4462,0.2534,0.3794,0.4796,0.4728,0.4675,0.4145,0.4701,0.4506,0.2753,0.3153,0.3253,1.0074,1.2855,1.3628,1.0866,1.3851,1.3329,0.7534,0.9282
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0
50%,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2.0,2.0,4.0,2.0,2.0,1.0,0.0
75%,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,5.0,4.0,4.0,5.0,4.0,4.0,1.0,1.0
max,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0


In [5]:
# summary statistics of the object-dtype (categorical) attributes
train_features.describe(include=object)

Unnamed: 0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation
count,26707,25300,26707,26707,22284,25299,24665,25244,26707,26707,13377,13237
unique,5,4,4,2,3,2,2,3,10,3,21,23
top,65+ Years,College Graduate,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,lzgpxyit,"MSA, Not Principle City",fcxhlnwr,xtkaffoo
freq,6843,10097,21222,15858,12777,13555,18736,13560,4297,11645,2468,1778


In [6]:
def preprocess(df_train, df_test, pre_onehot=True):
  """Preprocess the train and test feature frames with the shared `pre` helpers.

  Operates on copies of both frames. Steps, in order:
    1. (optional) mode imputation of a fixed list of categorical attributes;
    2. discretization of every column of `df_train` through
       `pre.trans_discretize`, passing the mapping obtained on the train frame
       on to the test frame;
    3. a final `pre.impute(..., strategy='median')` pass on both frames.

  Args:
    df_train: training features.
    df_test: test features.
    pre_onehot: when True, run the mode imputation of step 1 first.

  Returns:
    Tuple `(train, test)` with the preprocessed frames.
  """
  train_proc, test_proc = df_train.copy(), df_test.copy()

  # missing values of these categorical attributes are filled with the
  # majority class (mode) before any encoding takes place
  if pre_onehot:
    mode_attrs = ['race', 'employment_status', 'census_msa', 'hhs_geo_region', 'employment_industry', 'employment_occupation']
    train_proc = pre.impute(train_proc, strategy='mode', attrs=mode_attrs)
    test_proc = pre.impute(test_proc, strategy='mode', attrs=mode_attrs)

  # label-encoder-like step: each column is discretized on the train frame and
  # the resulting mapping is reused for the test frame
  # NOTE(review): presumably trans_discretize leaves numeric columns untouched — confirm
  for column in df_train.columns:
    train_proc, mapping = pre.trans_discretize(train_proc, column)
    test_proc = pre.trans_discretize(test_proc, column, mapping)

  # MISSING VALUES
  # whatever is still missing is imputed with the median
  # NOTE(review): each frame is imputed separately, so the test frame presumably
  # uses its own statistics rather than the training ones — confirm in pre.impute
  train_proc = pre.impute(train_proc, strategy='median')
  test_proc = pre.impute(test_proc, strategy='median')

  return train_proc, test_proc

In [7]:
# number of missing values per column in the training features
train_features.isna().sum()

h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
marital_status                  1408
r

In [8]:
# number of missing values per column in the test features
test_features.isna().sum()

h1n1_concern                      85
h1n1_knowledge                   122
behavioral_antiviral_meds         79
behavioral_avoidance             213
behavioral_face_mask              19
behavioral_wash_hands             40
behavioral_large_gatherings       72
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            932
child_under_6_months             813
health_worker                    789
health_insurance               12228
opinion_h1n1_vacc_effective      398
opinion_h1n1_risk                380
opinion_h1n1_sick_from_vacc      375
opinion_seas_vacc_effective      452
opinion_seas_risk                499
opinion_seas_sick_from_vacc      521
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4497
marital_status                  1442
r

In [9]:
# dtype of each column in the training features
train_features.dtypes

h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1               float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                             object
income_poverty           

In [10]:
# dtype of each column in the test features
test_features.dtypes

h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1               float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                             object
income_poverty           

In [11]:
# Replace both feature frames with their preprocessed versions.
# NOTE(review): the raw frames are rebound here, so re-running this cell feeds
# already-preprocessed data back into preprocess(); re-run the loading cells first.
train_features, test_features = preprocess(train_features, test_features)

In [12]:
# 5-fold splitter; rows are shuffled with a fixed seed so the folds are reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

def cross_validation(model, X, y, cv):
  """Cross-validate a two-target probabilistic classifier and print the AUC per fold.

  Args:
    model: estimator exposing `fit(X, y)` and `predict_proba(X)`. `predict_proba`
      must return one 2-D array per target (`preds[0]`, `preds[1]`); column 1 of
      each array is the score that gets evaluated.
    X: DataFrame with the features.
    y: DataFrame with the labels, row-aligned with `X`. The two targets are
      reported as `h1n1_vaccine` and `seasonal_vaccine`, in that order.
    cv: splitter whose `split(X, y)` yields `(train, test)` positional index arrays.

  Returns:
    Tuple `(model, y_preds)`: the `model` argument itself (`fit` has been called
    on it once per fold) and a DataFrame with the predicted probabilities of the
    LAST fold's test rows only, indexed like those rows in `y`
    (`None` if `cv` produced no folds).
  """
  measures = []
  y_preds = None  # only stays None when cv yields no folds
  time_total_0 = time.time()

  for train, test in cv.split(X, y):
    # cv.split yields positional indices, so rows are selected with .iloc;
    # .loc would treat the numbers as index labels and pick the wrong rows (or
    # raise KeyError) whenever the frames do not have a default 0..n-1 index
    X_train = X.iloc[train, :].values
    X_test = X.iloc[test, :].values
    y_train = y.iloc[train, :].values
    y_test = y.iloc[test, :].values
    # only the fit call is timed
    time_iter_0 = time.time()
    _model = model.fit(X_train, y_train)
    time_iter = time.time() - time_iter_0
    preds = _model.predict_proba(X_test)
    y_preds = pd.DataFrame(
      {
        'h1n1_vaccine':     preds[0][:,1],
        'seasonal_vaccine': preds[1][:,1],
      },
      index = y.index[test]
    )
    measure = roc_auc_score(y_test, y_preds)
    print(f"[CV] AUC score: {measure:.4f}. Time: {time_iter:6.2f}")
    measures.append(measure)

  time_total = time.time() - time_total_0
  print(f"[CV] Average AUC: {np.mean(measures):.4f}. Total time: {time_total:6.2f}")
  return model, y_preds

def compile_submission(preds, route, format_route='../data/submission_format.csv'):
  """Write a submission CSV from the per-target predicted probabilities.

  Args:
    preds: sequence with one 2-D array per target; `preds[0][:, 1]` fills the
      `h1n1_vaccine` column and `preds[1][:, 1]` fills `seasonal_vaccine`.
    route: path of the CSV file to write (written without the index).
    format_route: path of the submission template CSV; its rows and remaining
      columns are kept as they are. Defaults to the previously hard-coded path.

  Raises:
    ValueError: if the number of predictions differs from the number of rows
      in the template.
  """
  df_submission = pd.read_csv(format_route)
  h1n1_probs = preds[0][:, 1]
  seasonal_probs = preds[1][:, 1]

  # fail early with a clear message instead of a generic pandas length error
  n_rows = len(df_submission)
  if len(h1n1_probs) != n_rows or len(seasonal_probs) != n_rows:
    raise ValueError(
      f"predictions ({len(h1n1_probs)}, {len(seasonal_probs)} rows) do not match "
      f"the submission template ({n_rows} rows)"
    )

  df_submission['h1n1_vaccine'] = h1n1_probs
  df_submission['seasonal_vaccine'] = seasonal_probs
  df_submission.to_csv(route, index=False)

In [13]:
# LightGBM: a binary-objective classifier with 200 estimators, wrapped in
# MultiOutputClassifier so it can be fitted on the multi-column labels,
# then evaluated with the cross-validation helper over the kf folds.
# NOTE(review): no random_state is passed to LGBMClassifier.
lgbm = lgb.LGBMClassifier(objective='binary', n_estimators=200)
multi_lgbm = MultiOutputClassifier(lgbm)
lgbm_cvd, y_preds_lgbm = cross_validation(multi_lgbm, train_features, train_labels, kf)

[CV] AUC score: 0.8622. Time:   1.55
[CV] AUC score: 0.8638. Time:   1.18
[CV] AUC score: 0.8614. Time:   1.38
[CV] AUC score: 0.8648. Time:   1.27
[CV] AUC score: 0.8610. Time:   1.37
[CV] Average AUC: 0.8626. Total time:   7.33


In [14]:
# fit on all the training examples
lgbm_cvd = lgbm_cvd.fit(train_features, train_labels)
# predicted probabilities for the test features
lgbm_preds = lgbm_cvd.predict_proba(test_features)

In [15]:
# write the test-set probabilities to the submission file for this attempt
compile_submission(lgbm_preds, '../submissions/01.csv')