In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from skrebate import ReliefF

df = pd.read_csv('data_training.csv', parse_dates=['datetime'])

train_start, train_end = '2024-01-01', '2024-03-30'
test_start, test_end = '2024-04-01', '2024-06-30'

treino = df[(df['datetime'] >= train_start) & (df['datetime'] <= train_end)].copy()
validacao = df[(df['datetime'] >= test_start) & (df['datetime'] <= test_end)].copy()

for sub_df in [treino, validacao]:
    sub_df.drop(columns=['datetime', 'date', 'close', 'open', 'low', 'high','volume', 'average', 'amount_stock', 'id_ticker', 'business'], 
                inplace=True, errors='ignore')

def remove_non_numeric(df):
    return df.select_dtypes(include=[np.number])

X_train = remove_non_numeric(treino.drop(columns=['trend']))
y_train = treino['trend']

X_valid = remove_non_numeric(validacao.drop(columns=['trend']))
y_valid = validacao['trend']

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_train)


In [2]:
#Funcoes dos metodos

def cfs_subset_eval(X, y):
    corr_matrix = X.corr().abs()
    feature_target_corr = X.apply(lambda col: col.corr(y)).abs()
    selected = feature_target_corr.sort_values(ascending=False).index.tolist()
    return selected

def classifier_attribute_eval(X, y):
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X, y)
    importances = clf.feature_importances_
    return list(X.columns[np.argsort(importances)[::-1]])

def correlation_attribute_eval(X, y):
    corrs = X.apply(lambda col: abs(col.corr(y)))
    return list(corrs.sort_values(ascending=False).index)

def pca_ranking(X):
    pca = PCA(n_components=min(5, X.shape[1]))
    pca.fit(X)
    component_weights = np.abs(pca.components_[0])
    return list(X.columns[np.argsort(component_weights)[::-1]])

def information_gain_eval(X_scaled, y):
    info_gain = mutual_info_classif(X_scaled, y, random_state=42)
    info_gain_series = pd.Series(info_gain, index=X_train.columns)
    return list(info_gain_series.sort_values(ascending=False).index)

def reliefF_eval(X_scaled, y):
    relief = ReliefF(n_neighbors=100, n_features_to_select=X_train.shape[1])
    relief.fit(X_scaled, y)
    relief_scores = pd.Series(relief.feature_importances_, index=X_train.columns)
    return list(relief_scores.sort_values(ascending=False).index)

#Execução tudo

rankings = {
    "CFS_SubsetEval": cfs_subset_eval(X_train, y_train),
    "ClassifierAttributeEval": classifier_attribute_eval(X_train, y_train),
    "CorrelationAttributeEval": correlation_attribute_eval(X_train, y_train),
    "PCA": pca_ranking(X_train),
    "Information_Gain": information_gain_eval(X_scaled, y_train),
    "ReliefF": reliefF_eval(X_scaled, y_train)
}

for method, ranking in rankings.items():
    print(f"\n Top Features - {method}:")
    print(ranking[:6])


 Top Features - CFS_SubsetEval:
['NSMA_3', 'NSMA_11', 'NSMA_5', 'NSMA_9', 'NSMA_7', 'Bands_Norm']

 Top Features - ClassifierAttributeEval:
['Bands_Norm', 'NSMA_3', 'NSMA_5', 'NSMA_9', 'NSMA_11', 'NSMA_7']

 Top Features - CorrelationAttributeEval:
['NSMA_3', 'NSMA_11', 'NSMA_5', 'NSMA_9', 'NSMA_7', 'Bands_Norm']

 Top Features - PCA:
['NSMA_11', 'NSMA_9', 'NSMA_7', 'NSMA_5', 'NSMA_3', 'Bands_Norm']

 Top Features - Information_Gain:
['NSMA_5', 'NSMA_7', 'Bands_Norm', 'NSMA_9', 'NSMA_3', 'NSMA_11']

 Top Features - ReliefF:
['Bands_Norm', 'NSMA_3', 'NSMA_5', 'NSMA_7', 'NSMA_9', 'NSMA_11']
