In [33]:
import pandas as pd
import numpy as np
import tscv

In [34]:
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=20000,
    n_features=30,
    n_informative=5,
    n_redundant=10,
    n_repeated=3,
    n_classes=2,
    n_clusters_per_class=1,
    class_sep=0.8,
    random_state=0,
)

In [41]:
y.sum(), y.shape

(9999, (20000,))

In [42]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

min_features_to_select = 1  # Minimum number of features to consider
clf = RandomForestClassifier(n_estimators=1000)
# cv = StratifiedKFold(5)
# cv = TimeSeriesSplit(n_splits=5, gap=100)
cv = tscv.GapKFold(n_splits=5, gap_before=200, gap_after=200)

rfecv = RFECV(
    estimator=clf,
    step=0.1,
    cv=cv,
    scoring="neg_log_loss",
    verbose=1,
    min_features_to_select=min_features_to_select,
    n_jobs=6,
)

In [43]:
rfecv.fit(X, y)

Fitting estimator with 30 features.
Fitting estimator with 27 features.
Fitting estimator with 24 features.
Fitting estimator with 21 features.
Fitting estimator with 18 features.
Fitting estimator with 15 features.
Fitting estimator with 12 features.
Fitting estimator with 9 features.
Fitting estimator with 6 features.
Fitting estimator with 3 features.
Fitting estimator with 30 features.
Optimal number of features: 27


In [47]:
rfecv.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True, False,
        True,  True,  True])

In [50]:
rfecv.get_feature_names_out()

array(['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
       'x11', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20',
       'x21', 'x22', 'x24', 'x25', 'x27', 'x28', 'x29'], dtype=object)

In [16]:
rfecv.cv_results_

{'mean_test_score': array([0.91767973, 0.99184697, 0.9909965 , 0.99084652]),
 'std_test_score': array([0.00371839, 0.00129094, 0.00148488, 0.00164927]),
 'split0_test_score': array([0.92274996, 0.98974999, 0.98899993, 0.98875   ]),
 'split1_test_score': array([0.9147491 , 0.99274916, 0.99249901, 0.99224902]),
 'split2_test_score': array([0.9124874 , 0.99249946, 0.99149939, 0.99099901]),
 'split3_test_score': array([0.92047423, 0.99324695, 0.99249669, 0.9929972 ]),
 'split4_test_score': array([0.91793794, 0.99098927, 0.98948749, 0.98923737])}

In [17]:
rfecv.ranking_

array([2, 3, 3, 2, 2, 1, 2, 3, 1, 3, 3, 1, 2, 1, 1, 1, 2, 1, 3, 1, 2, 3,
       1, 2, 1, 3, 3, 2, 2, 3])

In [65]:
len(rfecv.ranking_), len(rfecv.get_feature_names_out())

(300, 100)

In [66]:
X

array([[ 0.88661908,  1.42146609,  1.10345691, ...,  0.03404206,
         0.23978821,  2.41669025],
       [-0.06950958,  1.007678  ,  0.28973477, ..., -0.57177621,
         1.5297184 ,  0.65710649],
       [ 1.18707986, -0.15726531,  2.39808412, ...,  0.47989883,
         0.65131619, -0.48580281],
       ...,
       [ 0.5187474 ,  0.26190761, -3.56036828, ...,  1.24963992,
         1.31375189,  1.5736302 ],
       [-0.72134957, -0.18112951, -0.395346  , ..., -0.21437854,
         0.59573587,  0.35230313],
       [ 0.47388275,  0.43178594, -1.97444937, ...,  0.12952139,
        -0.85605538, -0.07904277]])