In [4]:
import math
import random
import lightgbm as lgb
import os
import pickle
import optuna
import numpy as np
import pandas as pd
from statistics import mean
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error
from sklearn.model_selection import GroupKFold

In [5]:
def quantile_loss(y_true, y_pred, quantile):
    """
    Computes the quantile (pinball) loss, averaged over all samples.

    For each sample the residual ``r = y_true - y_pred`` contributes
    ``max(quantile * r, (quantile - 1) * r)``.

    Parameters
    ----------
    y_true : array-like
        Ground truth values

    y_pred : array-like
        Prediction values, matched to ``y_true`` by position

    quantile : float in [0,1]
        Quantile, e.g. 0.5 for the median quantile

    Returns
    -------
    float
        Mean quantile loss over all samples

    Raises
    ------
    ValueError
        If the inputs contain no samples
    """
    # Flatten to plain float arrays so that (a) pandas inputs are matched by
    # position instead of being aligned on their index (which yields NaN for
    # a sliced y_test vs. freshly indexed predictions), (b) lists are accepted,
    # and (c) an (n,) vs. (n, 1) pair does not broadcast to an (n, n) matrix.
    truth = np.asarray(y_true, dtype=float).ravel()
    pred = np.asarray(y_pred, dtype=float).ravel()
    residual = truth - pred
    if residual.size == 0:
        raise ValueError("quantile_loss requires at least one sample")
    # np.mean is vectorised; statistics.mean iterates element by element.
    return np.mean(np.maximum(quantile * residual, (quantile - 1) * residual))


def MBE(y_true, y_pred):
    """
    Mean Bias Error (MBE): the average signed difference ``y_true - y_pred``.

    Both inputs are converted to NumPy arrays and reshaped into column
    vectors of shape ``(len(x), 1)`` before subtraction, so a flat array
    and an ``(n, 1)`` array can be compared element by element.

    Parameters
    ----------
    y_true : np.ndarray
        Ground truth values

    y_pred : np.ndarray
        Prediction values

    Returns
    -------
    float
        Mean of ``y_true - y_pred``; positive when the predictions are,
        on average, below the ground truth.
    """
    truth_col = np.array(y_true)
    truth_col = truth_col.reshape(len(truth_col), 1)

    pred_col = np.array(y_pred)
    pred_col = pred_col.reshape(len(pred_col), 1)

    return (truth_col - pred_col).mean()

In [ ]:
# Load the feature table; the 'Unnamed: 0' column (if present) is skipped.
df = pd.read_csv("./final.csv", usecols=lambda column: column != 'Unnamed: 0')

# Encode missing and infinite values with the sentinel -1. Done without
# inplace=True so re-running the cell never mutates a frame shown earlier.
df = df.fillna(-1).replace([np.inf, -np.inf], -1)

# NOTE: the median is taken after the -1 fill, so any missing support values
# count as -1 here.
print("Median Support: ")
print(df["support"].median())

# Replace ':' in column names (presumably because a downstream model rejects
# special characters in feature names -- TODO confirm).
df.columns = df.columns.str.replace(':', '_')
# One integer id per dataset, used to keep each dataset entirely in train or test.
df["group"] = df['dataset'].astype('category').cat.codes.tolist()

target = "support"

# Hold out 20% of the groups as the test set. A dedicated, seeded generator
# makes the split identical on every Restart & Run All without touching the
# global `random` state.
split_seed = 42
group_ids = df["group"].unique().tolist()
sample_dfs = random.Random(split_seed).sample(group_ids, int(len(group_ids) * 0.2))
test = df[df['group'].isin(sample_dfs)]
train = df[~df['group'].isin(sample_dfs)]

X_train = train.drop(axis=1, columns=target)
y_train = train[target]

X_test = test.drop(axis=1, columns=target)
y_test = test[target]

In [ ]:
# Baseline 1: always predict 0.
mse_zero = mean_squared_error(y_test, np.zeros(len(y_test)))
rmse_zero = math.sqrt(mse_zero)
print("Baseline prediting 0 RMSE: " + str(rmse_zero))

# Baseline 2: always predict the mean of the training target.
# np.mean is vectorised; statistics.mean would iterate the Series in Python.
mse_mean = mean_squared_error(y_test, np.full(len(y_test), np.mean(y_train)))
rmse_mean = math.sqrt(mse_mean)
print("Baseline predicting mean RMSE: " + str(rmse_mean))

# Baseline 3: use the parsimony bootstrap support feature directly as the prediction.
baseline_pred = X_test["parsimony_bootstrap_support"]
mse_baseline = mean_squared_error(y_test, baseline_pred)
# The `squared=False` keyword of mean_squared_error is deprecated and removed
# in recent scikit-learn releases; taking the square root explicitly works on
# every version and matches the two baselines above.
rmse_baseline = math.sqrt(mse_baseline)
mbe_baseline = MBE(y_test, baseline_pred)
mae_baseline = mean_absolute_error(y_test, baseline_pred)
mdae_baseline = median_absolute_error(y_test, baseline_pred)

print("MSE (Mean Squared Error):", mse_baseline)
print("RMSE (Root Mean Squared Error):", rmse_baseline)
print("MBE :", mbe_baseline)
print("MAE (Mean Absolute Error):", mae_baseline)
print("MdAE (Median Absolute Error):", mdae_baseline)

Random forest regressor

In [ ]:
rfe_feature_n = 20
# Fixed seed so the forest -- and therefore the selected feature set -- is
# the same on every run.
rfe_random_state = 42

# Identifier / bookkeeping columns that must not be offered to the selector.
non_feature_columns = ['dataset', 'branchId', 'group']

# Build the candidate feature matrix from the untouched `train` frame rather
# than from X_train: this cell overwrites X_train / X_test below, so deriving
# from them would raise a KeyError when the cell is re-run.
rfe_candidates = train.drop(columns=[target] + non_feature_columns)

model = RandomForestRegressor(n_jobs=-1, n_estimators=250, max_depth=10, min_samples_split=20,
                              min_samples_leaf=10, random_state=rfe_random_state)
rfe = RFE(estimator=model, n_features_to_select=rfe_feature_n)  # Adjust the number of features as needed
rfe.fit(rfe_candidates, y_train)
print(rfe.support_)

# Keep the selected features and re-attach 'group' (the per-dataset id) to
# the feature frames.
selected_features = rfe_candidates.columns[rfe.support_]
selected_features = selected_features.append(pd.Index(['group']))

print("Selected features for RFE: ")
print(selected_features)
X_train = train[selected_features]
X_test = test[selected_features]