# Brute Force Automated Feature Selection
## Experiments with Tree-based Scikit-learn models | Quantitative Analysis

**Objective**: The primary objective of this notebook is to experiment with tree-based models for multi-step-ahead forecasting using *Quant* features.



In [1]:
import sys
import os

# Resolve the directory the kernel is currently running in. `__file__` is not
# defined inside a notebook, and the previous abspath("__file__") call (a
# string literal, not the variable) resolved to this same directory anyway.
notebook_dir = os.getcwd()

# The project root is the directory holding the `src` package imported by the
# following cell. Only move up one level when we are not already there, so
# re-running this cell does not keep climbing the directory tree.
# NOTE(review): assumes the notebooks folder itself has no `src` subfolder.
if os.path.isdir(os.path.join(notebook_dir, "src")):
    project_root = notebook_dir
else:
    project_root = os.path.abspath(os.path.join(notebook_dir, ".."))

# Change working directory
os.chdir(project_root)

# Verify the change
print(os.getcwd())

/home/michel/projects/tree-based-forecasting-boilerplate


In [2]:
%load_ext autoreload
%autoreload 2

from statsmodels.tsa.seasonal import STL, seasonal_decompose
from sklearn.preprocessing import MinMaxScaler, RobustScaler

from scripts.run_wfv import *
from src.models.train_model import *
from src.models.evaluate_model import *
from src.models.predict_model import *
from src.visualization.data_viz import *

In [4]:
# Read the processed dataset, parse DATE as datetimes, and index the frame by DATE.
processed_csv_path = os.path.join('./data/processed', 'processed_df.csv')
df = pd.read_csv(processed_csv_path, parse_dates=["DATE"]).set_index("DATE")
df.head()

Unnamed: 0_level_0,TICKER,CLOSE,DAY_OF_MONTH,DAY_OF_WEEK,WEEK_OF_MONTH,MONTH,QUARTER,YEAR,CLOSE_LAG_1,CLOSE_LAG_2,CLOSE_MA_3
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-07-05,BOVA11.SA,100.27,5,4,1,7,3,2019,100.0,98.32,98.363
2019-07-08,BOVA11.SA,100.56,8,0,2,7,3,2019,100.27,100.0,99.53
2019-07-10,BOVA11.SA,101.96,10,2,2,7,3,2019,100.56,100.27,100.277
2019-07-11,BOVA11.SA,101.21,11,3,2,7,3,2019,101.96,100.56,100.93
2019-07-12,BOVA11.SA,100.05,12,4,2,7,3,2019,101.21,101.96,101.243


In [5]:
# Keep only the rows whose TICKER is BOVA11.SA.
is_bova11 = df['TICKER'] == 'BOVA11.SA'
df = df[is_bova11]

In [16]:
import numpy as np
from sklearn.model_selection import cross_val_score
from itertools import chain, combinations

def automated_feature_selection(model, features, sample_ratio, cv=5, scoring='accuracy',
                                X=None, y=None, random_state=None):
    """
    Brute-force feature selection: cross-validate the model on every non-empty
    combination of a random sample of the features and keep the best one.

    The number of combinations is 2**k - 1 for k sampled features, and each one
    costs a full cross-validation run, so keep k small.

    Args:
        model: A scikit-learn estimator instance.
        features: A list of feature (column) names to choose from.
        sample_ratio: A float between 0 and 1, the proportion of `features` to
            sample (without replacement) before building the combinations.
        cv: Cross-validation setting forwarded to `cross_val_score` (default=5).
        scoring: Scorer name forwarded to `cross_val_score` (default='accuracy').
            scikit-learn scorers follow a "higher is better" convention; error
            metrics are exposed negated (e.g. 'neg_mean_squared_log_error').
        X: Feature table indexable by a list of column names. When omitted, the
            module-level variable `X` is used (legacy notebook behaviour).
        y: Target values. When omitted, the module-level variable `y` is used.
        random_state: Optional seed for the feature sampling. When None, the
            global NumPy random state is used.

    Returns:
        A tuple containing:
            - A dictionary mapping each feature combination (a tuple of names)
              to its average cross-validation score.
            - A list with the feature combination that achieved the best
              (highest) average score; empty if nothing was evaluated.

    Raises:
        ValueError: If `sample_ratio` is outside [0, 1], or if no data was
            passed and no module-level `X` / `y` exists.
    """
    if not 0 <= sample_ratio <= 1:
        raise ValueError(f"sample_ratio must be between 0 and 1, got {sample_ratio!r}")

    # Backward compatibility: earlier callers relied on notebook-level X and y.
    if X is None:
        X = globals().get('X')
    if y is None:
        y = globals().get('y')
    if X is None or y is None:
        raise ValueError("X and y must be passed explicitly or defined at module level")

    num_features_to_sample = int(len(features) * sample_ratio)
    if num_features_to_sample == 0:
        # Nothing to draw; skip the sampler so an empty feature list is harmless.
        sampled_features = []
    else:
        sampler = np.random if random_state is None else np.random.RandomState(random_state)
        sampled_features = sampler.choice(
            features, size=num_features_to_sample, replace=False
        ).tolist()
    print(f'Sampled features: {sampled_features}')

    # Create powerset of feature combinations (every non-empty subset)
    powerset = chain.from_iterable(
        combinations(sampled_features, r) for r in range(1, len(sampled_features) + 1)
    )

    results = {}  # Dictionary to store results
    # Scorers are "higher is better", so start below any reachable score.
    best_score = float('-inf')
    best_features = []

    for idx, feature_set in enumerate(powerset):
        print(f"Testing {idx}th feature set: {feature_set}")
        X_selected = X[list(feature_set)]
        scores = cross_val_score(model, X_selected, y, cv=cv, scoring=scoring)
        avg_score = np.mean(scores)
        results[feature_set] = avg_score

        # Strict comparison keeps the first combination on ties and ignores NaN.
        if avg_score > best_score:
            best_score = avg_score
            best_features = list(feature_set)

    return results, best_features


In [20]:
from sklearn.linear_model import LinearRegression

# Target is the CLOSE column; every other column except TICKER is a candidate feature.
y = df['CLOSE']
X = df.drop(columns=["CLOSE", "TICKER"])

# sample_ratio=1.0 keeps every candidate, so all non-empty subsets get scored.
candidate_features = X.columns.tolist()
results, best_features = automated_feature_selection(
    LinearRegression(),
    candidate_features,
    1.0,
    scoring='neg_mean_squared_log_error',
)
print("Best Features:", best_features)
print("All Results:", results)


Sampled features: ['QUARTER', 'CLOSE_LAG_2', 'DAY_OF_WEEK', 'CLOSE_LAG_1', 'CLOSE_MA_3', 'YEAR', 'MONTH', 'DAY_OF_MONTH', 'WEEK_OF_MONTH']
Testing 0th feature set: ('QUARTER',)
Testing 1th feature set: ('CLOSE_LAG_2',)
Testing 2th feature set: ('DAY_OF_WEEK',)
Testing 3th feature set: ('CLOSE_LAG_1',)
Testing 4th feature set: ('CLOSE_MA_3',)
Testing 5th feature set: ('YEAR',)
Testing 6th feature set: ('MONTH',)
Testing 7th feature set: ('DAY_OF_MONTH',)
Testing 8th feature set: ('WEEK_OF_MONTH',)
Testing 9th feature set: ('QUARTER', 'CLOSE_LAG_2')
Testing 10th feature set: ('QUARTER', 'DAY_OF_WEEK')
Testing 11th feature set: ('QUARTER', 'CLOSE_LAG_1')
Testing 12th feature set: ('QUARTER', 'CLOSE_MA_3')
Testing 13th feature set: ('QUARTER', 'YEAR')
Testing 14th feature set: ('QUARTER', 'MONTH')
Testing 15th feature set: ('QUARTER', 'DAY_OF_MONTH')
Testing 16th feature set: ('QUARTER', 'WEEK_OF_MONTH')
Testing 17th feature set: ('CLOSE_LAG_2', 'DAY_OF_WEEK')
Testing 18th feature set: ('C

In [21]:
# Display the mapping from each evaluated feature combination to its mean cross-validation score.
results

{('QUARTER',): -0.016149184479824212,
 ('CLOSE_LAG_2',): -0.00045121617284803775,
 ('DAY_OF_WEEK',): -0.015683079957963787,
 ('CLOSE_LAG_1',): -0.000270046655622246,
 ('CLOSE_MA_3',): -0.00035767186415635075,
 ('YEAR',): -0.0116681643260659,
 ('MONTH',): -0.016073870363970493,
 ('DAY_OF_MONTH',): -0.015681585334499638,
 ('WEEK_OF_MONTH',): -0.015680684116753815,
 ('QUARTER', 'CLOSE_LAG_2'): -0.00045175221861210306,
 ('QUARTER', 'DAY_OF_WEEK'): -0.016151603329242325,
 ('QUARTER', 'CLOSE_LAG_1'): -0.0002702060311884637,
 ('QUARTER', 'CLOSE_MA_3'): -0.00035762035994180657,
 ('QUARTER', 'YEAR'): -0.01217207231194341,
 ('QUARTER', 'MONTH'): -0.016192026579962583,
 ('QUARTER', 'DAY_OF_MONTH'): -0.016149946245824227,
 ('QUARTER', 'WEEK_OF_MONTH'): -0.016149037452838154,
 ('CLOSE_LAG_2', 'DAY_OF_WEEK'): -0.00045139624348056434,
 ('CLOSE_LAG_2', 'CLOSE_LAG_1'): -0.0002660634415149779,
 ('CLOSE_LAG_2', 'CLOSE_MA_3'): -0.000359482064895263,
 ('CLOSE_LAG_2', 'YEAR'): -0.0004504077553259018,
 ('CLO