In [11]:
import pandas as pd


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score



from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFE



def drop_null_column(df, drop_list):
    
    for col_name in drop_list:
        # print(col_name, type(col_name))
        df = df.drop(col_name, axis=1)


    return df


def encode_onehot(df):
    
    catcols = df.select_dtypes(exclude = ['int64','float64']).columns
    df = pd.get_dummies(df, columns = catcols)
    
    return df, catcols



In [17]:
def select_feature(df, model):

    #Define a list of available models for selection
    available_models = {
        'ExtraTrees': ExtraTreesClassifier(n_estimators=100),
        'RandomForest': RandomForestClassifier(n_estimators=100),
        'SVM': SVC(kernel='linear'),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'LASSO': Lasso(alpha=0.01),  # Agrega LASSO aquí
        'RFE': RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=13),
        'LGBMC': LGBMClassifier(),
        'LGBMR': LGBMRegressor(),
        'XGB':XGBClassifier(booster='gbtree', importance_type='gain'),
    }

    # Choose the desired model for feature selection
    chosen_model = model

    # Create the selected model
    clf = available_models[chosen_model]

    #Train the model with the data
    clf = clf.fit(df.values, y)

    # Obtain feature importances from the model

    if model == 'LGBMC' or model == 'LGBMR': 
        feature_importances = clf.booster_.feature_importance(importance_type="gain")
    else:        
        feature_importances = clf.feature_importances_

    # Create a SelectFromModel object with the trained classifier
    model = SelectFromModel(clf, prefit=True)

    #Transform the original features to obtain the selected ones

    X_df = model.transform(df.values)

    selected_feature_indices = model.get_support(indices=True)

    #Get the indices of the selected features
    selected_columns = df.columns[selected_feature_indices]
    
    return X_df, selected_columns


In [16]:
ml_churner_df = pd.read_csv("./data/bank_churner.csv")
ml_churner_df = ml_churner_df.drop('cstno', axis=1)
ml_churner_df = ml_churner_df.drop('sex', axis=1)
ml_churner_df.dropna(axis=0, inplace=True)

ml_churner_df, catcols = encode_onehot(ml_churner_df)  


X=ml_churner_df.drop(['is_churned'],axis=1)
y=ml_churner_df['is_churned']


model_list = ['ExtraTrees', 'RandomForest', 'SVM', 'KNN', 'LASSO', 'RFE', 'LGBMC', 'LGBMR', 'XGB']
for model in model_list: 
    X_new, selected_columns = select_feature(X, 'ExtraTrees')
    col_count = len(selected_columns)
    print(f'Model Name: {model}, col_count: {col_count}, Selected Col: {selected_columns}')

Model Name: ExtraTrees, col_count: 14, Selected Col: Index(['age', 'dependent_num', 'mon_on_book', 'tot_product_count',
       'months_inact_for_12m', 'contact_cnt_for_12m', 'credit_line',
       'tot_revol_balance', 'mean_open_to_buy', 'tot_amt_ratio_q4_q1',
       'tot_trans_amt_for_12m', 'tot_trans_cnt_for_12m', 'tot_cnt_ratio_q4_q1',
       'mean_util_pct'],
      dtype='object')
Model Name: RandomForest, col_count: 14, Selected Col: Index(['age', 'dependent_num', 'mon_on_book', 'tot_product_count',
       'months_inact_for_12m', 'contact_cnt_for_12m', 'credit_line',
       'tot_revol_balance', 'mean_open_to_buy', 'tot_amt_ratio_q4_q1',
       'tot_trans_amt_for_12m', 'tot_trans_cnt_for_12m', 'tot_cnt_ratio_q4_q1',
       'mean_util_pct'],
      dtype='object')
Model Name: SVM, col_count: 14, Selected Col: Index(['age', 'dependent_num', 'mon_on_book', 'tot_product_count',
       'months_inact_for_12m', 'contact_cnt_for_12m', 'credit_line',
       'tot_revol_balance', 'mean_open_to_