In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score

In [2]:
# Loading everything

lgbm_base = pd.read_csv("models/lgbm-2022-01-24-11-31/OOF.csv", index_col = 0)
lgbm_te = pd.read_csv("models/lgbm-2022-01-24-11-51/OOF.csv", index_col = 0)
svm = pd.read_csv("models/svm-2022-01-24-12-15/OOF.csv", index_col = 0)
cb = pd.read_csv("models/cb-2022-01-24-13-06/OOF.csv", index_col = 0)

train_df = pd.read_csv("data/train.csv")

In [3]:
# Fix svm prediction format

columns = [f"class_{i}" for i in range(7)]

svm[columns] = pd.get_dummies(svm["predicted"])
svm.drop("predicted", axis = 1, inplace = True)

In [4]:
# OOF inference
def inference_func(df):
    return np.argmax(df[columns].to_numpy(), axis = 1) + 1

print("LGBM BASE:", f"{accuracy_score(inference_func(lgbm_base), train_df['Cover_Type']):.4f}")
print("LGBM Target Encoding:", f"{accuracy_score(inference_func(lgbm_te), train_df['Cover_Type']):.4f}")
print("SVM:", f"{accuracy_score(inference_func(svm), train_df['Cover_Type']):.4f}")
print("CB:", f"{accuracy_score(inference_func(cb), train_df['Cover_Type']):.4f}")
averaged = (lgbm_base + lgbm_te + svm + cb) / 4
print("Averaging:", f"{accuracy_score(inference_func(averaged), train_df['Cover_Type']):.4f}")

LGBM BASE: 0.8646
LGBM Target Encoding: 0.8663
SVM: 0.8430
CB: 0.8724
Averaging: 0.8755


In [5]:
# Stacking with OOF

import optuna
from optuna.samplers import TPESampler
from data import validation
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

concat_df = pd.concat([lgbm_base, lgbm_te, svm, cb], axis = 1)

blender_fit = False

if blender_fit:
    def objective(trial):
        trial_params = {'C': trial.suggest_loguniform('C', 1e-04, 1e04)}

        eval_scores = []
        splits = validation(5, 43)

        for i, (train_index, val_index) in enumerate(splits):
            train_split = concat_df.iloc[train_index]
            val_split = concat_df.iloc[val_index]

            model = LogisticRegression(C = trial_params['C'], fit_intercept = False, max_iter = 200)

            model.fit(train_split, train_df.iloc[train_index]['Cover_Type'])
            eval_scores.append(accuracy_score(model.predict(val_split), train_df.iloc[val_index]['Cover_Type']))

        return np.mean(eval_scores)

    sampler = TPESampler(seed=42)  # Make the sampler behave in a deterministic way.
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=10)

In [6]:
# Ensemble score
blender = LogisticRegression(C = 0.1, fit_intercept = False, max_iter = 200)
blender.fit(concat_df, train_df['Cover_Type'])
blender_predicted = blender.predict(concat_df)
print("Ensemble:", f"{accuracy_score(blender_predicted, train_df['Cover_Type']):.4f}")

Ensemble: 0.8788


In [7]:
# Ensemble errors by class
import plotly.express as px

train_df["Error"] = blender_predicted != train_df['Cover_Type']
errors_by_class = train_df.groupby("Cover_Type")["Error"].mean().sort_values(ascending=False)
px.bar(errors_by_class, title = "Error percentage by class", text_auto='.3f')