# Advanced ensembling techniques

Improve results on the dataset from previous homework with:
1. Simple ensembling approaches (mean, median, etc.)
2. Stacking

Airbnb New User Bookings from https://www.kaggle.com/competitions/airbnb-recruiting-new-user-bookings. Our task is to predict which country a new user's first booking destination will be.

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)
np.random.seed(24)


In [2]:
train = pd.read_csv('./data/airbnb-recruiting/train_fe_session.csv')


In [3]:
def get_train_test(df):
    df = df.copy()
    X = df.drop(columns=['country_destination', 'id'])
    y = df['country_destination']
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=24)
    return X_train, X_test, y_train, y_test


def get_feat_types(df):
    numeric_features = df.select_dtypes('number').columns
    categorical_features = df.select_dtypes('object').columns
    return numeric_features, categorical_features


def get_train_test_transform(df, get_test=False):
    df = df.copy()
    X_train, X_test, y_train, y_test = get_train_test(df)
    numeric_features, categorical_features = get_feat_types(X_train)

    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median'))
        ])

    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('enc', OrdinalEncoder(handle_unknown='use_encoded_value',
                               unknown_value=-1))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)],
            verbose_feature_names_out=False)

    preprocessor.set_output(transform='pandas')

    X_train_transform = preprocessor.fit_transform(X_train)

    y_transformer = LabelEncoder()
    y_train_enc = y_transformer.fit_transform(y_train)

    if get_test == True:
        X_test_transform = preprocessor.transform(X_test)
        y_test_enc = y_transformer.transform(y_test)
        return X_train_transform, X_test_transform, y_train_enc, y_test_enc

    return X_train_transform, y_train_enc


In [4]:
target_names = train['country_destination'].unique()
target_names


array(['NDF', 'US', 'other', 'FR', 'CA', 'GB', 'ES', 'IT', 'PT', 'NL',
       'DE', 'AU'], dtype=object)

In [21]:
X_train_transform, y_train_enc = get_train_test_transform(train)


### XGBoost model

In [26]:
num_rounds = 1_000

parameters_xgb = {
    "objective": "multi:softprob",
    "num_class": 12,
    "eta": 0.1,
    "verbosity": 1,
    "seed": 24,
    "eval_metric": "merror",
    "tree_method": "hist",
    "grow_policy": "lossguide",

    # regularization parameters
    "max_depth": 10,
    "max_leaves": 15,
    "subsample": 0.7,
    "colsample_bytree": 0.6,
}

xgb_train = xgb.DMatrix(X_train_transform, y_train_enc)

results = xgb.cv(parameters_xgb, xgb_train, num_rounds,
                 folds=skf, early_stopping_rounds=10, verbose_eval=50)


[0]	train-merror:0.39005+0.00005	test-merror:0.39002+0.00036
[50]	train-merror:0.35189+0.00026	test-merror:0.35394+0.00097
[100]	train-merror:0.34702+0.00067	test-merror:0.35191+0.00083
[133]	train-merror:0.34484+0.00076	test-merror:0.35158+0.00106


In [5]:
parameters_xgb = {
    "n_estimators":135,
    "objective": "multi:softprob",
    "num_class": 12,
    "learning_rate": 0.1,
    "verbosity": 1,
    "random_state": 24,
    "eval_metric": "merror",
    "tree_method": "hist",
    "grow_policy": "lossguide",

    # regularization parameters
    "max_depth": 10,
    "max_leaves": 15,
    "subsample": 0.7,
    "colsample_bytree": 0.6,
}

model_xgb = xgb.XGBClassifier(**parameters_xgb)


### LightGBM model

In [28]:
num_rounds = 1_000

parameters_lgb = {
    "objective": "multiclass",
    "num_class": 12,
    "learning_rate": 0.1,
    "num_leaves": 15,
    "num_threads": 4,
    "seed": 24,
    "metric": "multi_error",
    "verbose": 0,
    "force_row_wise": True,
    "is_unbalance": True,

    #regularization
    "colsample_bytree": 0.6,
    "subsample": 0.7,
    "subsample_freq": 1,
    "min_data_in_leaf": 25
}


X_train_transform, y_train_enc = get_train_test_transform(train)

lgb_train = lgb.Dataset(X_train_transform, y_train_enc)

result = lgb.cv(
    parameters_lgb, lgb_train, num_rounds, folds=skf,
    eval_train_metric=True,
    callbacks=[lgb.early_stopping(10), lgb.log_evaluation(50)]
    )


Training until validation scores don't improve for 10 rounds
[50]	cv_agg's train multi_error: 0.346867 + 0.000438177	cv_agg's valid multi_error: 0.353473 + 0.00097905
Early stopping, best iteration is:
[62]	cv_agg's train multi_error: 0.344563 + 0.000628837	cv_agg's valid multi_error: 0.353074 + 0.00114784


In [6]:
parameters_lgb = {
    "n_estimators": 60,
    "objective": "multiclass",
    "num_class": 12,
    "learning_rate": 0.1,
    "num_leaves": 15,
    "n_jobs": 4,
    "random_state": 24,
    "metric": "multi_error",
    "verbose": 0,
    "force_row_wise": True,
    "is_unbalance": True,

    #regularization
    "colsample_bytree": 0.6,
    "subsample": 0.7,
    "subsample_freq": 1,
    "min_child_samples": 25
}

model_lgb = lgb.LGBMClassifier(**parameters_lgb)


### ExtraTreesClassifier model

In [30]:
def cv_results(model, X, y):
    results = cross_validate(model, X, y, scoring='accuracy',
                            cv=skf, return_train_score=True)
    df_results = pd.DataFrame(results)
    df_results = df_results[['test_score', 'train_score']].apply(lambda x: 1-x)
    return df_results


In [32]:
X_train_transform, X_test_transform, y_train_enc, y_test_enc = get_train_test_transform(train, get_test=True)


In [None]:
model_etc = ExtraTreesClassifier(max_depth=5, max_features=10,
                                 min_samples_leaf=3, random_state=24)

cv_results(model_etc, X_train_transform, y_train_enc)


Unnamed: 0,test_score,train_score
0,0.4159,0.415416
1,0.415039,0.415188
2,0.415531,0.415452


In [34]:
model_etc = ExtraTreesClassifier(max_depth=10, max_features=100,
                                 min_samples_leaf=3, random_state=24)

cv_results(model_etc, X_train_transform, y_train_enc)


Unnamed: 0,test_score,train_score
0,0.364248,0.348674
1,0.364652,0.350395
2,0.363176,0.35094


In [35]:
model_etc = ExtraTreesClassifier(max_depth=10, max_features=150,
                                 min_samples_leaf=5, random_state=24)

cv_results(model_etc, X_train_transform, y_train_enc)


Unnamed: 0,test_score,train_score
0,0.362614,0.347874
1,0.362193,0.348296
2,0.361314,0.348955


In [36]:
model_etc = ExtraTreesClassifier(max_depth=25, max_features=150,
                                 min_samples_leaf=10, random_state=24)

cv_results(model_etc, X_train_transform, y_train_enc)


Unnamed: 0,test_score,train_score
0,0.358117,0.265021
1,0.357203,0.263879
2,0.355692,0.267103


In [7]:
model_etc = ExtraTreesClassifier(max_depth=10, max_features=100,
                                 min_samples_leaf=3, random_state=24)


## Ensembling

In [14]:
xgb_prediction = []
lgb_prediction = []
etc_prediction = []

for train, val in skf.split(X_train_transform, y_train_enc):
    model_xgb.fit(X_train_transform.iloc[train], y_train_enc[train])
    model_lgb.fit(X_train_transform.iloc[train], y_train_enc[train])
    model_etc.fit(X_train_transform.iloc[train], y_train_enc[train])

    xgb_prediction.append([y_train_enc[val], model_xgb.predict_proba(X_train_transform.iloc[val])])
    lgb_prediction.append([y_train_enc[val], model_lgb.predict_proba(X_train_transform.iloc[val])])
    etc_prediction.append([y_train_enc[val], model_etc.predict_proba(X_train_transform.iloc[val])])


In [41]:
print('xgb model, accuracy: ', np.mean([accuracy_score(i[0], np.argmax(i[1], axis=1)) for i in xgb_prediction]))
print('lgb model, accuracy: ', np.mean([accuracy_score(i[0], np.argmax(i[1], axis=1)) for i in lgb_prediction]))
print('etc model, accuracy: ', np.mean([accuracy_score(i[0], np.argmax(i[1], axis=1)) for i in etc_prediction]))


xgb model, accuracy:  0.6477746544858279
lgb model, accuracy:  0.6462871866947763
etc model, accuracy:  0.6359744670883111


### Averaging

In [42]:
np.mean(
    [accuracy_score(xgb_p[0], np.argmax((xgb_p[1]/3 + lgb_p[1]/3 + etc_p[1]/3), axis=1))
     for xgb_p, lgb_p, etc_p in zip(xgb_prediction, lgb_prediction, etc_prediction)]
)


0.6480030452096509

In [43]:
np.mean(
    [accuracy_score(xgb_p[0], np.argmax((xgb_p[1]*0.5 + lgb_p[1]*0.5), axis=1))
     for xgb_p, lgb_p in zip(xgb_prediction, lgb_prediction)]
)


0.6484012649332396

In [44]:
np.mean(
    [accuracy_score(xgb_p[0], np.argmax((xgb_p[1]*0.5 + etc_p[1]*0.5), axis=1))
     for xgb_p, etc_p in zip(xgb_prediction, etc_prediction)]
)


0.6466151323494964

In [45]:
np.mean(
    [accuracy_score(lgb_p[0], np.argmax((lgb_p[1]*0.5 + etc_p[1]*0.5), axis=1))
     for lgb_p, etc_p in zip(lgb_prediction, etc_prediction)]
)


0.6437456078706957

### Geometric mean

In [46]:
np.mean(
    [accuracy_score(xgb_p[0], np.argmax((xgb_p[1] * lgb_p[1] * etc_p[1])**(1/3), axis=1))
     for xgb_p, lgb_p, etc_p in zip(xgb_prediction, lgb_prediction, etc_prediction)]
)


0.6478332162098853

In [47]:
np.mean(
    [accuracy_score(xgb_p[0], np.argmax((xgb_p[1] * lgb_p[1])**(1/2), axis=1))
     for xgb_p, lgb_p in zip(xgb_prediction, lgb_prediction)]
)


0.6486765050363082

In [48]:
np.mean(
    [accuracy_score(xgb_p[0], np.argmax((xgb_p[1] * etc_p[1])**(1/2), axis=1))
     for xgb_p, etc_p in zip(xgb_prediction, etc_prediction)]
)


0.6468435230733193

In [49]:
np.mean(
    [accuracy_score(lgb_p[0], np.argmax((lgb_p[1] * etc_p[1])**(1/2), axis=1))
     for lgb_p, etc_p in zip(lgb_prediction, etc_prediction)]
)


0.6442433825251815

### Weighted average

In [80]:
np.mean(
    [accuracy_score(xgb_p[0], np.argmax((xgb_p[1]*0.35 + lgb_p[1]*0.35 + etc_p[1]*0.3), axis=1))
     for xgb_p, lgb_p, etc_p in zip(xgb_prediction, lgb_prediction, etc_prediction)]
)


0.6479444834855937

In [81]:
np.mean(
    [accuracy_score(xgb_p[0], np.argmax((xgb_p[1]*0.4 + lgb_p[1]*0.4 + etc_p[1]*0.2), axis=1))
     for xgb_p, lgb_p, etc_p in zip(xgb_prediction, lgb_prediction, etc_prediction)]
)


0.648190442726634

In [82]:
np.mean(
    [accuracy_score(xgb_p[0], np.argmax((xgb_p[1]*0.45 + lgb_p[1]*0.45 + etc_p[1]*0.1), axis=1))
     for xgb_p, lgb_p, etc_p in zip(xgb_prediction, lgb_prediction, etc_prediction)]
)


0.6483661278988053

In [84]:
np.mean(
    [accuracy_score(xgb_p[0], np.argmax((xgb_p[1]*0.6 + lgb_p[1]*0.4), axis=1))
     for xgb_p, lgb_p in zip(xgb_prediction, lgb_prediction)]
)


0.6486179433122512

In [86]:
np.mean(
    [accuracy_score(xgb_p[0], np.argmax((xgb_p[1]*0.7 + lgb_p[1]*0.3), axis=1))
     for xgb_p, lgb_p in zip(xgb_prediction, lgb_prediction)]
)


0.6485769501054112

In [108]:
np.mean(
    [accuracy_score(xgb_p[0], np.argmax((xgb_p[1]*0.57 + lgb_p[1]*0.43), axis=1))
     for xgb_p, lgb_p in zip(xgb_prediction, lgb_prediction)]
)


0.6487526352775825

The best result 0.64875 for weighted average of two model: XGBoost and LightGBM. 

## Blending

1. Divide a training dataset on a new train and hold out.
2. Train n-models on the new training dataset and make predictions on hold-out.
3. Concatenate all hold-out predictions and train a new meta-model on this data.
4. Make predictions on test data with models trained on the "new train" dataset.
5. Concatenate these predictions and use a meta-model to get final predictions.


In [13]:
# Divide dataset on train and test
X_train_val, X_test, y_train_val, y_test = get_train_test(train)


In [14]:
# 1. Divide a training dataset on a new train and hold out
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val,
                                                  stratify=y_train_val,
                                                  test_size=0.2, random_state=24)


In [37]:
def pipeline(X_train, y_train, model):
    numeric_features, categorical_features = get_feat_types(X_train)

    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median'))
        ])

    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('enc', OrdinalEncoder(handle_unknown='use_encoded_value',
                               unknown_value=-1))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)],
            verbose_feature_names_out=False)

    preprocessor.set_output(transform='pandas')

    pipeline = Pipeline([
        ('prepros', preprocessor),
        ('classifier', model)
        ])

    pipeline.fit(X_train, y_train)

    return pipeline


In [36]:
y_transformer = LabelEncoder()
y_transformer.fit(y_train)
y_train_enc = y_transformer.transform(y_train)
y_val_enc = y_transformer.transform(y_val)
y_test_enc = y_transformer.transform(y_test)


In [30]:
# 2. Train all models on the new training dataset and make predictions on hold-out
model_xgb_fit = pipeline(X_train, y_train_enc, model_xgb)
pred_xgb = model_xgb_fit.predict_proba(X_val)

model_lgb_fit = pipeline(X_train, y_train_enc, model_lgb)
pred_lgb = model_lgb_fit.predict_proba(X_val)

model_etc_fit = pipeline(X_train, y_train_enc, model_etc)
pred_etc = model_etc_fit.predict_proba(X_val)


In [None]:
# 3.1 Concatenate all hold-out predictions
stack_val_preds = np.column_stack((pred_xgb, pred_lgb, pred_etc))


In [42]:
# specify meta model
meta_model = LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=24, C=0.3)


In [43]:
# 3.2 Train a new meta-model on concatenated predictions
meta_model.fit(stack_val_preds, y_val_enc)


In [31]:
# 4. Make predictions on test data with models trained on the "new train" dataset
test_pred_xgb = model_xgb_fit.predict_proba(X_test)

test_pred_lgb = model_lgb_fit.predict_proba(X_test)

test_pred_etc = model_etc_fit.predict_proba(X_test)


In [None]:
# 5. Concatenate these predictions and use a meta-model to get final predictions
stack_test_preds = np.column_stack((test_pred_xgb, test_pred_lgb, test_pred_etc))

predictions = meta_model.predict_proba(stack_test_preds)


In [54]:
print('accuracy on cross validation: ', np.mean(cross_val_score(meta_model, stack_val_preds,
                                                         y_val_enc, cv=skf, scoring='accuracy')))
print('accuracy on validation: ', meta_model.score(stack_val_preds, y_val_enc))
print('accuracy on test: ', meta_model.score(stack_test_preds, y_test_enc))


accuracy on cross validation:  0.650474349964863
accuracy on validation:  0.6505914734129773
accuracy on test:  0.6496685484059872


## Stacking

1. Split train data on n-folds.
2. Train on each fold a model and make predictions for a hold-out fold.
3. Concatenate hold-out folds predictions and add them as a new meta-feature. Train a final meta-model.
4. Train a first-level model on the whole train data set.
5. Make predictions on a test set with the first-level model. Get your meta-feature for the test set.
6. Concatenate the meta-feature to the test set. Use the meta-model from step 3 to make final predictions

In [8]:
# Divide dataset on train and test
X_train, X_test, y_train, y_test = get_train_test(train)


In [9]:
y_transformer = LabelEncoder()
y_transformer.fit(y_train)
y_train_enc = y_transformer.transform(y_train)
y_test_enc = y_transformer.transform(y_test)


In [11]:
xgb_prediction = []
lgb_prediction = []
etc_prediction = []

# 1. Split train data on n-folds
for i, (train, val) in enumerate(skf.split(X_train, y_train_enc)):
    # 2. Train on each fold a model and make predictions for a hold-out fold
    model_xgb_fit = pipeline(X_train.iloc[train], y_train_enc[train], model_xgb)
    model_lgb_fit = pipeline(X_train.iloc[train], y_train_enc[train], model_lgb)
    model_etc_fit = pipeline(X_train.iloc[train], y_train_enc[train], model_etc)

    xgb_prediction.append([y_train_enc[val], model_xgb_fit.predict_proba(X_train.iloc[val])])
    lgb_prediction.append([y_train_enc[val], model_lgb_fit.predict_proba(X_train.iloc[val])])
    etc_prediction.append([y_train_enc[val], model_etc_fit.predict_proba(X_train.iloc[val])])

    # 3.1 Concatenate hold-out folds predictions and add them as a new meta-feature
    X_train_new = X_train.copy()
    for j in range(12):
        X_train_new.loc[X_train.index[val], f'xgb_{j}'] = xgb_prediction[i][1][:, j]
        X_train_new.loc[X_train.index[val], f'lgb_{j}'] = lgb_prediction[i][1][:, j]
        X_train_new.loc[X_train.index[val], f'etc_{j}'] = etc_prediction[i][1][:, j]


In [12]:
# specify meta model
parameters_xgb = {
    "n_estimators":135,
    "objective": "multi:softprob",
    "num_class": 12,
    "learning_rate": 0.1,
    "verbosity": 1,
    "random_state": 24,
    "eval_metric": "merror",
    "tree_method": "hist",
    "grow_policy": "lossguide",

    # regularization parameters
    "max_depth": 10,
    "max_leaves": 15,
    "subsample": 0.7,
    "colsample_bytree": 0.6,
}

meta_model = xgb.XGBClassifier(**parameters_xgb)

# 3.2 Train a final meta-model
meta_model_fit = pipeline(X_train_new, y_train_enc, meta_model)


In [13]:
# 4. Train a first-level model on the whole train data set
model_xgb_fit_new = pipeline(X_train, y_train_enc, model_xgb)
model_lgb_fit_new = pipeline(X_train, y_train_enc, model_lgb)
model_etc_fit_new = pipeline(X_train, y_train_enc, model_etc)


In [14]:
# 5. Make predictions on a test set with the first-level model.
# Get your meta-feature for the test set
test_pred_xgb = model_xgb_fit_new.predict_proba(X_test)
test_pred_lgb = model_lgb_fit_new.predict_proba(X_test)
test_pred_etc = model_etc_fit_new.predict_proba(X_test)


In [15]:
# 6.1 Concatenate the meta-feature to the test set.
X_test_new = X_test.copy()
for j in range(12):
    X_test_new[f'xgb_{j}'] = test_pred_xgb[:, j]
    X_test_new[f'lgb_{j}'] = test_pred_lgb[:, j]
    X_test_new[f'etc_{j}'] = test_pred_etc[:, j]


In [24]:
# 6.2 Use the meta-model from step 3 to make final predictions
predictions = meta_model_fit.predict_proba(X_test_new)


In [29]:
print('accuracy on cross validation: ', np.mean(cross_val_score(
    meta_model_fit, X_train_new, y_train_enc, cv=skf, scoring='accuracy')))
print('accuracy on train: ', meta_model_fit.score(X_train_new, y_train_enc))
print('accuracy on test: ', meta_model_fit.score(X_test_new, y_test_enc))


accuracy on cross validation:  0.6454673225579761
accuracy on train:  0.6519793862731319
accuracy on test:  0.6472558618912652
