# Intento 14

- Uso de `SMOTENC`.

In [8]:
# standard library
import re
import time

# third-party
import lightgbm as lgb
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.base import clone
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import MinMaxScaler

# shared project code
import common.preprocessing as pre
from common.vars import SEED

# never truncate wide frames: show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [2]:
# Every CSV is read with its respondent_id column as the frame index.
read_options = {'index_col': 'respondent_id'}
train_features = pd.read_csv('../data/flu_training_set_features.csv', **read_options)
train_labels = pd.read_csv('../data/flu_training_set_labels.csv', **read_options)
test_features = pd.read_csv('../data/flu_test_set_features.csv', **read_options)

In [3]:
def preprocess(df_train, df_test, pre_onehot=True):
  """Encode and impute the train and test feature frames.

  Both inputs are copied first, so the caller's frames are not modified.
  The steps run in this order:

  1. If `pre_onehot` is true, the nominal attributes listed in `attrs_imp`
     are imputed with `pre.impute(..., strategy='mode')` before any encoding.
  2. `race`, `employment_status` and `census_msa` go through
     `pre.trans_onehot`.
  3. `age_group`, `education`, `sex`, `income_poverty`, `marital_status` and
     `rent_or_own` are mapped to integers with the fixed dictionaries below.
  4. `hhs_geo_region`, `employment_industry` and `employment_occupation` are
     integer-coded; the mapping returned for the train frame is reused for the
     test frame so both share the same codes.
  5. Both frames go through `pre.impute(..., strategy='median')`.
  6. Column names are stripped of every character outside `[A-Za-z0-9_]`
     (presumably because LightGBM rejects special characters in feature
     names; confirm).
  7. The test frame is aligned to the train frame's columns if they differ.

  Parameters
  ----------
  df_train, df_test : pandas.DataFrame
    Raw feature frames.
  pre_onehot : bool, default True
    Whether to mode-impute the nominal attributes before encoding.

  Returns
  -------
  tuple of (pandas.DataFrame, pandas.DataFrame)
    The processed train and test frames, in that order.
  """
  _df_train = df_train.copy()
  _df_test = df_test.copy()

  # Fill missing values of the nominal attributes before they are encoded.
  if pre_onehot:
    attrs_imp = ['race', 'employment_status', 'census_msa', 'hhs_geo_region', 'employment_industry', 'employment_occupation']
    _df_train = pre.impute(_df_train, strategy='mode', attrs=attrs_imp)
    _df_test = pre.impute(_df_test, strategy='mode', attrs=attrs_imp)

  onehot_attrs = ['race', 'employment_status', 'census_msa']
  # attribute -> {raw value: integer code}; applied in this (insertion) order
  fixed_encodings = {
    'age_group': {
      '18 - 34 Years':  1,
      '35 - 44 Years':  2,
      '45 - 54 Years':  3,
      '55 - 64 Years':  4,
      '65+ Years':      5
    },
    'education': {
      '< 12 Years':       1,
      '12 Years':         2,
      'Some College':     3,
      'College Graduate': 4
    },
    'sex': {
      'Male':   0,
      'Female': 1
    },
    'income_poverty': {
      'Below Poverty':              1,
      '<= $75,000, Above Poverty':  2,
      '> $75,000':                  3
    },
    'marital_status': {
      'Not Married':  0,
      'Married':      1
    },
    'rent_or_own': {
      'Rent': 0,
      'Own':  1
    },
  }

  def trans(df):
    """Apply the one-hot and fixed integer encodings to one frame."""
    for attr in onehot_attrs:
      df = pre.trans_onehot(df, attr)
    for attr, mapping in fixed_encodings.items():
      # hand over a copy so the helper can never alter the shared table
      df = pre.trans_discretize(df, attr, dict(mapping))
    return df

  _df_test = trans(_df_test)
  _df_train = trans(_df_train)

  # These attributes have no meaningful order: let the helper assign codes on
  # the train frame (doing the job of a LabelEncoder) and reuse them for test.
  for attr in ['hhs_geo_region', 'employment_industry', 'employment_occupation']:
    _df_train, discrete_dict = pre.trans_discretize(_df_train, attr)
    _df_test = pre.trans_discretize(_df_test, attr, discrete_dict)

  # Remaining missing values are handled with the median strategy.
  _df_train = pre.impute(_df_train, strategy='median')
  _df_test = pre.impute(_df_test, strategy='median')

  def clean_name(name):
    """Drop every character that is not a letter, digit or underscore."""
    return re.sub('[^A-Za-z0-9_]+', '', name)

  _df_train = _df_train.rename(columns=clean_name)
  _df_test = _df_test.rename(columns=clean_name)

  # The two frames were encoded independently, so nothing above guarantees
  # they end up with the same columns. Align test to train: columns missing in
  # test are added as all-zero (the value of an absent indicator), columns that
  # only exist in test are dropped. Skipped entirely when already identical.
  if not _df_test.columns.equals(_df_train.columns):
    _df_test = _df_test.reindex(columns=_df_train.columns, fill_value=0)

  return _df_train, _df_test

In [4]:
# NOTE: this rebinds train_features/test_features to their preprocessed versions.
# Running the cell a second time would feed already-preprocessed frames back into
# preprocess(); reload the CSVs first if it has to be re-executed.
train_features, test_features = preprocess(train_features, test_features)

### _Holdout_ como proxy de ROC en test

In [5]:
# Keep 20% of the labelled rows aside as a holdout set; the split is
# stratified on the label frame and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  train_features,
  train_labels,
  test_size=0.2,
  stratify=train_labels,
  random_state=SEED,
)

In [6]:
def preds_to_df(preds, features):
  """Build the two-column probability frame from per-target predictions.

  Parameters
  ----------
  preds : sequence
    `preds[0]` and `preds[1]` are 2-D arrays for the h1n1 and seasonal
    targets respectively; column 1 of each is used.
  features : pandas.DataFrame
    Frame whose index labels the rows of the result.

  Returns
  -------
  pandas.DataFrame
    Columns `h1n1_vaccine` and `seasonal_vaccine`, indexed like `features`.
  """
  h1n1_proba = preds[0]
  seasonal_proba = preds[1]
  columns = {
    'h1n1_vaccine': h1n1_proba[:, 1],
    'seasonal_vaccine': seasonal_proba[:, 1],
  }
  return pd.DataFrame(columns, index=features.index)

In [9]:
# Positions of the integer-coded nominal columns, which SMOTENC is told to
# treat as categorical.
categorical_attrs = ('hhs_geo_region', 'employment_industry', 'employment_occupation')
categorical_idx = [pos for pos, name in enumerate(train_features.columns) if name in categorical_attrs]

# oversample -> scale -> classify
pipeline = imbpipeline(steps=[
  ('smote', SMOTENC(categorical_features=categorical_idx, random_state=SEED)),
  ('scaler', MinMaxScaler()),
  ('classifier', lgb.LGBMClassifier(objective='binary')),
])

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
# 1 x 2 x 2 x 2 = 8 candidate settings for the LightGBM step
param_grid = {
  'classifier__n_estimators': [200],
  'classifier__colsample_bytree': [0.25, 0.5],
  'classifier__learning_rate': [0.01, 0.05],
  'classifier__num_leaves': [50, 75],
}
# one search per target, keyed by label name; this cell creates the dict
grid_search = {
  'h1n1_vaccine': GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=skfold,
    n_jobs=1,
    verbose=4,
  )
}
grid_search['h1n1_vaccine'].fit(X_train, y_train['h1n1_vaccine'])

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END classifier__colsample_bytree=0.25, classifier__learning_rate=0.01, classifier__n_estimators=200, classifier__num_leaves=50; total time=   4.4s
[CV 2/5] END classifier__colsample_bytree=0.25, classifier__learning_rate=0.01, classifier__n_estimators=200, classifier__num_leaves=50; total time=   3.4s
[CV 3/5] END classifier__colsample_bytree=0.25, classifier__learning_rate=0.01, classifier__n_estimators=200, classifier__num_leaves=50; total time=   3.2s
[CV 4/5] END classifier__colsample_bytree=0.25, classifier__learning_rate=0.01, classifier__n_estimators=200, classifier__num_leaves=50; total time=   3.6s
[CV 5/5] END classifier__colsample_bytree=0.25, classifier__learning_rate=0.01, classifier__n_estimators=200, classifier__num_leaves=50; total time=   3.9s
[CV 1/5] END classifier__colsample_bytree=0.25, classifier__learning_rate=0.01, classifier__n_estimators=200, classifier__num_leaves=75; total time=   5.3s
[CV 

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123456, shuffle=True),
             estimator=Pipeline(steps=[('smote',
                                        SMOTENC(categorical_features=[27, 30,
                                                                      31],
                                                random_state=123456)),
                                       ('scaler', MinMaxScaler()),
                                       ('classifier',
                                        LGBMClassifier(objective='binary'))]),
             n_jobs=1,
             param_grid={'classifier__colsample_bytree': [0.25, 0.5],
                         'classifier__learning_rate': [0.01, 0.05],
                         'classifier__n_estimators': [200],
                         'classifier__num_leaves': [50, 75]},
             scoring='roc_auc', verbose=4)

In [10]:
# Positions of the integer-coded nominal columns, which SMOTENC is told to
# treat as categorical.
categorical_attrs = ('hhs_geo_region', 'employment_industry', 'employment_occupation')
categorical_idx = [pos for pos, name in enumerate(train_features.columns) if name in categorical_attrs]

# oversample -> scale -> classify
pipeline = imbpipeline(steps=[
  ('smote', SMOTENC(categorical_features=categorical_idx, random_state=SEED)),
  ('scaler', MinMaxScaler()),
  ('classifier', lgb.LGBMClassifier(objective='binary')),
])

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
# 1 x 2 x 2 x 2 = 8 candidate settings for the LightGBM step
param_grid = {
  'classifier__n_estimators': [200],
  'classifier__colsample_bytree': [0.25, 0.5],
  'classifier__learning_rate': [0.01, 0.05],
  'classifier__num_leaves': [50, 75],
}
# added to the grid_search dict created by the h1n1 cell
grid_search['seasonal_vaccine'] = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  scoring='roc_auc',
  cv=skfold,
  n_jobs=1,
  verbose=4,
)
grid_search['seasonal_vaccine'].fit(X_train, y_train['seasonal_vaccine'])

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END classifier__colsample_bytree=0.25, classifier__learning_rate=0.01, classifier__n_estimators=200, classifier__num_leaves=50; total time=   7.6s
[CV 2/5] END classifier__colsample_bytree=0.25, classifier__learning_rate=0.01, classifier__n_estimators=200, classifier__num_leaves=50; total time=   7.0s
[CV 3/5] END classifier__colsample_bytree=0.25, classifier__learning_rate=0.01, classifier__n_estimators=200, classifier__num_leaves=50; total time=   7.2s
[CV 4/5] END classifier__colsample_bytree=0.25, classifier__learning_rate=0.01, classifier__n_estimators=200, classifier__num_leaves=50; total time=   7.0s
[CV 5/5] END classifier__colsample_bytree=0.25, classifier__learning_rate=0.01, classifier__n_estimators=200, classifier__num_leaves=50; total time=   7.2s
[CV 1/5] END classifier__colsample_bytree=0.25, classifier__learning_rate=0.01, classifier__n_estimators=200, classifier__num_leaves=75; total time=   7.4s
[CV 

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123456, shuffle=True),
             estimator=Pipeline(steps=[('smote',
                                        SMOTENC(categorical_features=[27, 30,
                                                                      31],
                                                random_state=123456)),
                                       ('scaler', MinMaxScaler()),
                                       ('classifier',
                                        LGBMClassifier(objective='binary'))]),
             n_jobs=1,
             param_grid={'classifier__colsample_bytree': [0.25, 0.5],
                         'classifier__learning_rate': [0.01, 0.05],
                         'classifier__n_estimators': [200],
                         'classifier__num_leaves': [50, 75]},
             scoring='roc_auc', verbose=4)

In [11]:
# Display the best pipeline selected by the grid search for the h1n1_vaccine target.
grid_search['h1n1_vaccine'].best_estimator_

Pipeline(steps=[('smote',
                 SMOTENC(categorical_features=[27, 30, 31],
                         random_state=123456)),
                ('scaler', MinMaxScaler()),
                ('classifier',
                 LGBMClassifier(colsample_bytree=0.5, learning_rate=0.05,
                                n_estimators=200, num_leaves=50,
                                objective='binary'))])

In [12]:
# Display the best pipeline selected by the grid search for the seasonal_vaccine target.
grid_search['seasonal_vaccine'].best_estimator_

Pipeline(steps=[('smote',
                 SMOTENC(categorical_features=[27, 30, 31],
                         random_state=123456)),
                ('scaler', MinMaxScaler()),
                ('classifier',
                 LGBMClassifier(colsample_bytree=0.5, learning_rate=0.05,
                                n_estimators=200, num_leaves=50,
                                objective='binary'))])

In [13]:
# Holdout check: fit on the training split only (X_train), not on all examples.
# A clone is fitted so that grid_search['h1n1_vaccine'].best_estimator_ is not
# refitted in place and keeps the state the search left it in.
h1n1_result_model = clone(grid_search['h1n1_vaccine'].best_estimator_).fit(X_train, y_train['h1n1_vaccine'])
# class probabilities for the held-out rows
h1n1_preds = h1n1_result_model.predict_proba(X_test)

In [14]:
# Holdout check: fit on the training split only (X_train), not on all examples.
# A clone is fitted so that grid_search['seasonal_vaccine'].best_estimator_ is
# not refitted in place and keeps the state the search left it in.
seasonal_result_model = clone(grid_search['seasonal_vaccine'].best_estimator_).fit(X_train, y_train['seasonal_vaccine'])
# class probabilities for the held-out rows
seasonal_preds = seasonal_result_model.predict_proba(X_test)

In [15]:
# Put the two held-out probability columns into one frame indexed like X_test.
holdout_preds = preds_to_df([h1n1_preds, seasonal_preds], X_test)
# Pick the label columns by name so each target is scored against its own
# predictions whatever the column order of the labels file; the value shown is
# the unweighted (macro) mean of the two per-target ROC AUCs.
roc_auc_score(y_test[holdout_preds.columns], holdout_preds, average='macro')

0.8543942441488444

### Submission

In [16]:
# Final model: learn from every labelled example. A clone is fitted so that
# grid_search['h1n1_vaccine'].best_estimator_ is not overwritten in place.
h1n1_result_model = clone(grid_search['h1n1_vaccine'].best_estimator_).fit(train_features, train_labels['h1n1_vaccine'])
# class probabilities for the competition test set
h1n1_preds = h1n1_result_model.predict_proba(test_features)

In [17]:
# Final model: learn from every labelled example. A clone is fitted so that
# grid_search['seasonal_vaccine'].best_estimator_ is not overwritten in place.
seasonal_result_model = clone(grid_search['seasonal_vaccine'].best_estimator_).fit(train_features, train_labels['seasonal_vaccine'])
# class probabilities for the competition test set
seasonal_preds = seasonal_result_model.predict_proba(test_features)

In [18]:
# one row per test respondent, one probability column per target
submission = preds_to_df([h1n1_preds, seasonal_preds], test_features)
submission.to_csv('../submissions/14.csv')