In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from boruta import BorutaPy
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV


In [None]:
# Single seed reused for the LightGBM classifier and the Boruta selectors below.
seed = 42

In [None]:
# Training table for the healthy-control model, read from the parent directory.
training_hc = pd.read_csv(
    "../Training_residualshealthycontrol_healthycontrol.csv"
)

In [None]:
# Training table for the neurodegenerative model, read from the parent directory.
training_neuro = pd.read_csv(
    "../Training_residualshealthycontrol_neurodegenerative.csv"
)

In [None]:
# Index rows by the `eid` column; the last CSV column is used as the label
# and every remaining column as a feature.
hc_label_col = training_hc.columns[-1]
hc_by_eid = training_hc.set_index("eid")
X_hc = hc_by_eid.drop(columns=hc_label_col)
y_hc = hc_by_eid[hc_label_col].astype("category")

In [None]:
# Index rows by the `eid` column; the last CSV column is used as the label
# and every remaining column as a feature.
neuro_label_col = training_neuro.columns[-1]
neuro_by_eid = training_neuro.set_index("eid")
X_neuro = neuro_by_eid.drop(columns=neuro_label_col)
y_neuro = neuro_by_eid[neuro_label_col].astype("category")

In [None]:
# Display the categorical target for the neurodegenerative model (sanity check of labels and index).
y_neuro

In [None]:
# Previously saved feature-importance tables, one per training set.
# The cells below read their "Feature" and "Importance" columns.
results_neuro = pd.read_csv("Results/Feature_importances_LightGBM_balanced_Training_residualshealthycontrol_neurodegenerative.csv")

results_hc = pd.read_csv("Results/Feature_importances_LightGBM_balanced_Training_residualshealthycontrol_healthycontrol.csv")

In [None]:
# Candidate features: every distinct feature with a non-zero recorded importance.
neuro_has_importance = results_neuro["Importance"] != 0
features_neuro = results_neuro.loc[neuro_has_importance, "Feature"].unique().tolist()

In [None]:
# Candidate features: every distinct feature with a non-zero recorded importance.
hc_has_importance = results_hc["Importance"] != 0
features_hc = results_hc.loc[hc_has_importance, "Feature"].unique().tolist()

In [None]:
# Number of candidate features retained for the healthy-control model.
len(features_hc)

In [None]:
# Base LightGBM classifier that the grid search clones and tunes.
clf = lgb.LGBMClassifier(
    random_state=seed,
    class_weight='balanced',
    verbose=-1,
    n_jobs=4,
)

In [None]:
# Hyper-parameter grid explored by GridSearchCV for the LightGBM classifier.
lgb_params = {
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.01, 0.1],
    'max_depth': [1, 5, 7, 10, 15],
    'subsample': [0.01, 1],
    # LightGBM only applies row subsampling (bagging) when the bagging
    # frequency is non-zero. With the default subsample_freq=0 both
    # `subsample` values train identical models, so half of the grid would
    # be duplicated work. A frequency of 1 re-samples at every iteration.
    'subsample_freq': [1],
    'colsample_bytree': [0.01, 0.1, 1.0],
    'reg_alpha': [0, 0.1, 10],
    'reg_lambda': [0, 0.1, 10]
}

In [None]:
# Exhaustive 3-fold search over `lgb_params`, ranking candidates by recall.
# NOTE(review): scoring="recall" presumably relies on a binary target whose
# positive class is labelled 1 - confirm against the label column.
grid_search = GridSearchCV(clf, lgb_params, scoring="recall", cv=3, n_jobs=4)
grid_search.fit(X_hc[features_hc], y_hc)

# Keep the refitted winner for the healthy-control model and report the search outcome.
best_model_hc = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)
print("Best CV recall:", grid_search.best_score_)

In [None]:
# Re-run the same GridSearchCV object on the neurodegenerative data.
# This overwrites the search results of the previous fit, so it relies on
# `best_model_hc` having been captured in the preceding cell.
grid_search.fit(X_neuro[features_neuro], y_neuro)

# Keep the refitted winner for the neurodegenerative model and report the search outcome.
best_model_neuro = grid_search.best_estimator_
print("Best params:", grid_search.best_params_)
print("Best CV recall:", grid_search.best_score_)

In [None]:
from sklearn.utils import check_random_state

class BorutaPyForLGB(BorutaPy):
    """BorutaPy variant adapted for LightGBM estimators.

    Differences from the parent class that are visible in this definition:

    * with ``n_estimators='auto'`` the number of trees per iteration is
      ``max(10 * n_not_rejected_features, min_estimators)``;
    * when the estimator's type name contains ``'lightgbm'`` it is reseeded
      each iteration with an integer drawn from the selector's random state
      instead of being handed the ``RandomState`` object itself;
    * pandas inputs are always converted to plain numpy arrays before use.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to ``BorutaPy``. ``self.estimator`` is
        reconfigured through ``set_params`` while fitting.
    n_estimators : int or 'auto', default='auto'
        Fixed number of trees, or ``'auto'`` to derive it per iteration.
    min_estimators : int, default=50
        Lower bound on the number of trees used in ``'auto'`` mode.
    perc, alpha, two_step, max_iter, random_state, verbose
        Forwarded unchanged to ``BorutaPy.__init__``.
    """

    def __init__(self, estimator, n_estimators='auto', min_estimators=50, perc=100, alpha=0.05,
                 two_step=True, max_iter=100, random_state=42, verbose=0):
        super().__init__(estimator, n_estimators=n_estimators, perc=perc, alpha=alpha,
                         two_step=two_step, max_iter=max_iter, random_state=random_state, verbose=verbose)
        # Detected from the type's string representation so that lightgbm
        # classes do not have to be referenced directly here.
        self._is_lightgbm = 'lightgbm' in str(type(self.estimator))
        self.min_estimators = min_estimators

    def _get_tree_num(self, n_features):
        """Return the number of trees to use for ``n_features`` active features.

        In ``'auto'`` mode this is ten trees per feature, but never fewer
        than ``self.min_estimators``; otherwise the configured
        ``self.n_estimators`` is returned unchanged.
        """
        if self.n_estimators == 'auto':
            n_tree = max(int(n_features * 10), self.min_estimators)
            return n_tree
        return self.n_estimators

    def _validate_pandas_input(self, arg):
        """Convert a pandas DataFrame/Series to a numpy array.

        Raises
        ------
        ValueError
            If ``arg`` has no ``.values`` attribute.
        """
        try:
            # `.values` of a Series with an extension dtype (e.g. "category")
            # is an ExtensionArray rather than an ndarray, so coerce explicitly.
            return np.asarray(arg.values)
        except AttributeError:
            raise ValueError("Input needs to be a numpy array or pandas DataFrame/Series.")

    def _fit(self, X, y):
        """Run the Boruta selection loop and store the results on ``self``.

        Sets ``n_features_``, ``support_``, ``support_weak_``, ``ranking_``
        and ``importance_history_``, and returns ``self``. The helper methods
        called here (``_check_params``, ``_add_shadows_get_imps``,
        ``_assign_hits``, ``_do_tests``, ``_nanrankdata``, ``_print_results``)
        are not defined in this class and are presumably inherited from
        ``BorutaPy``.

        Raises
        ------
        ValueError
            If ``max_iter`` is smaller than 2, or if ``X``/``y`` is neither a
            numpy array nor a pandas object.
        """
        # check input params
        self._check_params(X, y)

        # The loop below only runs while `_iter < max_iter` with `_iter`
        # starting at 1. If it never runs, the importance history stays
        # one-dimensional and the slicing after the loop fails with an
        # IndexError, so fail early with a clear message instead.
        if self.max_iter < 2:
            raise ValueError("max_iter must be at least 2 so that at least one Boruta iteration is run.")

        if not isinstance(X, np.ndarray):
            X = self._validate_pandas_input(X)
        if not isinstance(y, np.ndarray):
            y = self._validate_pandas_input(y)

        # Note: this replaces the constructor argument stored on the instance
        # with the object returned by check_random_state.
        self.random_state = check_random_state(self.random_state)
        # setup variables for Boruta
        n_sample, n_feat = X.shape
        _iter = 1
        # holds the decision about each feature:
        # 0  - default state = tentative in original code
        # 1  - accepted in original code
        # -1 - rejected in original code
        dec_reg = np.zeros(n_feat, dtype=int)
        # counts how many times a given feature was more important than
        # the best of the shadow features
        hit_reg = np.zeros(n_feat, dtype=int)
        # these record the history of the iterations; the first row is a
        # placeholder of zeros that is skipped when the history is analysed
        imp_history = np.zeros(n_feat, dtype=float)
        sha_max_history = []

        # set n_estimators
        if self.n_estimators != 'auto':
            self.estimator.set_params(n_estimators=self.n_estimators)

        # main feature selection loop
        while np.any(dec_reg == 0) and _iter < self.max_iter:
            # find optimal number of trees
            if self.n_estimators == 'auto':
                # number of features that aren't rejected
                not_rejected = np.where(dec_reg >= 0)[0].shape[0]
                n_tree = self._get_tree_num(not_rejected)
                self.estimator.set_params(n_estimators=n_tree)

            # make sure we start with a new tree in each iteration
            if self._is_lightgbm:
                # LightGBM gets a fresh integer seed rather than the
                # RandomState object
                self.estimator.set_params(random_state=self.random_state.randint(0, 10000))
            else:
                self.estimator.set_params(random_state=self.random_state)

            # add shadow attributes, shuffle them and train estimator, get imps
            cur_imp = self._add_shadows_get_imps(X, y, dec_reg)

            # get the threshold of shadow importances we will use for rejection
            imp_sha_max = np.percentile(cur_imp[1], self.perc)

            # record importance history
            sha_max_history.append(imp_sha_max)
            imp_history = np.vstack((imp_history, cur_imp[0]))

            # register which feature is more imp than the max of shadows
            hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max)

            # based on hit_reg we check if a feature is doing better than
            # expected by chance
            dec_reg = self._do_tests(dec_reg, hit_reg, _iter)

            # print out confirmed features
            if self.verbose > 0 and _iter < self.max_iter:
                self._print_results(dec_reg, _iter, 0)
            if _iter < self.max_iter:
                _iter += 1

        # we automatically apply R package's rough fix for tentative ones
        confirmed = np.where(dec_reg == 1)[0]
        tentative = np.where(dec_reg == 0)[0]
        # ignore the first row of zeros
        tentative_median = np.median(imp_history[1:, tentative], axis=0)
        # which tentative to keep
        tentative_confirmed = np.where(tentative_median
                                       > np.median(sha_max_history))[0]
        tentative = tentative[tentative_confirmed]

        # basic result variables
        self.n_features_ = confirmed.shape[0]
        self.support_ = np.zeros(n_feat, dtype=bool)
        self.support_[confirmed] = 1
        self.support_weak_ = np.zeros(n_feat, dtype=bool)
        self.support_weak_[tentative] = 1

        # ranking, confirmed variables are rank 1
        self.ranking_ = np.ones(n_feat, dtype=int)
        # tentative variables are rank 2
        self.ranking_[tentative] = 2
        # selected = confirmed and tentative
        selected = np.hstack((confirmed, tentative))
        # all rejected features are sorted by importance history
        not_selected = np.setdiff1d(np.arange(n_feat), selected)
        # large importance values should rank higher = lower ranks -> *(-1)
        imp_history_rejected = imp_history[1:, not_selected] * -1

        # update rank for not_selected features
        if not_selected.shape[0] > 0:
            # calculate ranks in each iteration, then median of ranks across feats
            iter_ranks = self._nanrankdata(imp_history_rejected, axis=1)
            rank_medians = np.nanmedian(iter_ranks, axis=0)
            ranks = self._nanrankdata(rank_medians, axis=0)

            # set smallest rank to 3 if there are tentative feats
            if tentative.shape[0] > 0:
                ranks = ranks - np.min(ranks) + 3
            else:
                # and 2 otherwise
                ranks = ranks - np.min(ranks) + 2
            self.ranking_[not_selected] = ranks
        else:
            # all are selected, thus we set feature supports to True
            self.support_ = np.ones(n_feat, dtype=bool)

        self.importance_history_ = imp_history

        # notify user
        if self.verbose > 0:
            self._print_results(dec_reg, _iter, 1)
        return self

In [None]:
# Hyper-parameter grid for the LightGBM classifier (redefinition of the grid used above).
lgb_params = {
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.01, 0.1],
    'max_depth': [1, 5, 7, 10, 15],
    'subsample': [0.01, 1],
    # LightGBM only applies row subsampling (bagging) when the bagging
    # frequency is non-zero. With the default subsample_freq=0 both
    # `subsample` values train identical models, so half of the grid would
    # be duplicated work. A frequency of 1 re-samples at every iteration.
    'subsample_freq': [1],
    'colsample_bytree': [0.01, 0.1, 1.0],
    'reg_alpha': [0, 0.1, 10],
    'reg_lambda': [0, 0.1, 10]
}

In [None]:
# Fresh, unfitted LightGBM classifier with the same settings as the one tuned above.
clf = lgb.LGBMClassifier(
    n_jobs=4,
    verbose=-1,
    class_weight='balanced',
    random_state=seed,
)


In [None]:
# Preview the first rows of the neurodegenerative feature matrix.
# (The original cell held an unterminated slice expression, which is a
# SyntaxError and stops a Restart & Run All at this point.)
X_neuro.head()

In [None]:
# Boruta reconfigures (set_params) and refits the estimator it is given on
# shadow-augmented data, so hand it an unfitted clone and leave the tuned
# `best_model_neuro` from the grid search untouched.
feat_selector = BorutaPyForLGB(
    clone(best_model_neuro),
    n_estimators='auto',
    verbose=0,
    min_estimators=50,
    random_state=seed,
)
feat_selector.fit(X_neuro[features_neuro], y_neuro)


In [None]:
# Inspect the boolean mask of confirmed features produced by Boruta for the
# neurodegenerative model (one entry per column passed to `fit`).
[feat_selector.support_]

In [None]:

# Map the boolean support mask back onto the candidate feature names.
features_neuro_array = np.asarray(features_neuro)
selected_features = features_neuro_array[feat_selector.support_]
print(selected_features.tolist())


In [None]:
# Boruta reconfigures (set_params) and refits the estimator it is given on
# shadow-augmented data, so hand it an unfitted clone and leave the tuned
# `best_model_hc` from the grid search untouched.
feat_selector_hc = BorutaPyForLGB(
    clone(best_model_hc),
    n_estimators='auto',
    verbose=0,
    min_estimators=50,
    random_state=seed,
)
feat_selector_hc.fit(X_hc[features_hc], y_hc)

In [None]:

# Map the boolean support mask back onto the candidate feature names.
features_hc_array = np.asarray(features_hc)
selected_features_hc = features_hc_array[feat_selector_hc.support_]
print(selected_features_hc.tolist())
