In [43]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

def perform_preselection(df, threshold_missing, threshold_variance, threshold_domain):
    # Cálculo do percentual de preenchimento (% de missings) das variáveis
    missing_percent = df.isnull().sum() / len(df)
    
    # Seleção das variáveis numéricas
    numeric_vars = df.select_dtypes(include=np.number).columns
    
    # Aplicação do limiar de variância para as variáveis numéricas
    sel = VarianceThreshold(threshold=threshold_variance)
    numeric_vars_selected = sel.fit_transform(df[numeric_vars])
    
    # Seleção das variáveis categóricas
    categorical_vars = df.select_dtypes(include='object').columns
    
    # Preparação dos DataFrames para armazenar as informações sobre a seleção de variáveis
    numeric_selection_info = pd.DataFrame(columns=['VAR', 'PCT_MISSING', 'VARIANCIA'])
    categorical_selection_info = pd.DataFrame(columns=['VAR', 'PCT_MISSING', 'DOMINIO'])
    
    # Pré-seleção das variáveis numéricas
    for var in numeric_vars:
        if missing_percent[var] < threshold_missing:
            numeric_selection_info = pd.concat([numeric_selection_info, pd.DataFrame({'VAR': [var],
                                                                                       'PCT_MISSING': 1,
                                                                                       'VARIANCIA': 0})])
        elif np.var(df[var]) > threshold_variance:
            numeric_selection_info = pd.concat([numeric_selection_info, pd.DataFrame({'VAR': [var],
                                                                                       'PCT_MISSING': 0,
                                                                                       'VARIANCIA': 1})])
    
    # Pré-seleção das variáveis categóricas
    for var in categorical_vars:
        if missing_percent[var] < threshold_missing:
            categorical_selection_info = pd.concat([categorical_selection_info, pd.DataFrame({'VAR': [var],
                                                                                       'PCT_MISSING': 1,
                                                                                       'DOMINIO': 0})])
        elif df[var].value_counts(normalize=True).max() < threshold_domain:
            categorical_selection_info = pd.concat([categorical_selection_info, pd.DataFrame({'VAR': [var],
                                                                                       'PCT_MISSING': 0,
                                                                                       'DOMINIO': 1})])
    
    # Transformando as colunas 'Selection_Reason' em dummies
    # numeric_selection_info = pd.get_dummies(numeric_selection_info, columns=['Selection_Reason'])
    # categorical_selection_info = pd.get_dummies(categorical_selection_info, columns=['Selection_Reason'])
    
    return numeric_selection_info, categorical_selection_info

In [44]:
# Gerando dados fictícios
np.random.seed(0)
data = pd.DataFrame({
    'feature_1': np.random.randn(1000),
    'feature_2': np.random.randint(0, 2, 1000),
    'feature_3': np.random.choice(['A', 'B', 'C'], 1000),
    'feature_4': np.random.randint(1, 6, 1000),
    'feature_5': np.random.choice([np.nan], 1000),
    'target': np.random.randint(0, 2, 1000)
})

In [45]:
numeric_selection, categorical_selection = perform_preselection(data, 0.2, 0.01, 0.8)

  self.variances_ = np.nanvar(X, axis=0)


In [46]:
numeric_selection

Unnamed: 0,VAR,PCT_MISSING,VARIANCIA
0,feature_1,1,0
0,feature_2,1,0
0,feature_4,1,0
0,target,1,0


In [47]:
categorical_selection

Unnamed: 0,VAR,PCT_MISSING,DOMINIO
0,feature_3,1,0


In [42]:
data.isnull().sum()

feature_1       0
feature_2       0
feature_3       0
feature_4       0
feature_5    1000
target          0
dtype: int64