<a href="https://colab.research.google.com/github/mulcahrj/DATA6545_Final/blob/main/Final_Project_Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np

In [2]:
# Read Excel file and set up in
path = 'https://raw.githubusercontent.com/mulcahrj/DATA6545_Final/main/train_complete.csv'
train_complete = pd.read_csv(path,index_col=0)

In [3]:
# Import the necessary libraries first
#### Correlation method - for classification chi2 only
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
#### RFE method - replace the model with the model you plan to use
from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression
#### feature importance method
#### this method can be used for both topK and cut-off
from sklearn.linear_model import Ridge
#### specifically tree-based feature importance method
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
#### for voting
from collections import Counter

## Select Features

In [4]:

X = train_complete.iloc[:,:-1].values
y = train_complete.iloc[:,-1].values

In [5]:
names = list(train_complete.iloc[:,:-1])
names

['emp.var.rate',
 'cons.price.idx',
 'cons.conf.idx',
 'euribor3m',
 'total_assets',
 'rep_avg_rating',
 'customer_length',
 'customer_ratings',
 'cust_exp_ind',
 'marital_divorced',
 'marital_married',
 'marital_single',
 'marital_unknown',
 'housing_no',
 'housing_unknown',
 'housing_yes',
 'loan_no',
 'loan_unknown',
 'loan_yes',
 'contact_cellular',
 'contact_telephone',
 'day_of_week_fri',
 'day_of_week_mon',
 'day_of_week_thu',
 'day_of_week_tue',
 'day_of_week_wed',
 'poutcome_failure',
 'poutcome_nonexistent',
 'poutcome_success',
 'customer_language_English',
 'customer_language_Other',
 'customer_language_Portuguese',
 'customer_language_Spanish',
 'other_banks_no',
 'other_banks_unknown',
 'other_banks_yes',
 'education_bin_high-school',
 'education_bin_illiterate',
 'education_bin_post high-school',
 'education_bin_pre high-school',
 'education_bin_unknown',
 'qtr_bin_Q1',
 'qtr_bin_Q2',
 'qtr_bin_Q3',
 'qtr_bin_Q4',
 'job_bin_blue-collar',
 'job_bin_other',
 'job_bin_servi

In [6]:
def feature_selector(X, y, model, names, _method="topk", n=int(X.shape[1]/2), fit_X=False, thres = 0.1):
  """voting based feature selector
  - _method: "topk" for top-K method (default), "cutoff" for cut-off based method
  - n: number of features to be selected. only available for top-K method - default half of the total features
  - thres: cut-off threshold (default 0.1), only availabel for cutoff method
  - fit_X: fit_transform X or just return indices
  TODO: include support for regression problems
  """
  #### Placeholder for tests
  # X.shape[0] == y.shape[0]
  if names: # if given feature names
    feature_names = np.array(names)
  else: # otherwise use location
    feature_names = np.array(["X%s" % x for x in range(len(X.shape[0]))])
  if _method == 'topk':
    #### np.argpartition gets the indices of the largest n element from the array in ascending order
    #### [::-1] reverse the order
    corr_features = list(np.argpartition(SelectKBest(score_func=chi2, k=n).fit(X,y).scores_, -n)[-n:][::-1])
    #### binary masking on features, use np.where() to get the indices of selected
    rfe_features = list(np.where(RFE(model, n_features_to_select=n, step=1).fit(X, y).support_)[0])
    #### argsort get sorted indices by values, ::-1] reverse the asceding order
    ridge_features = list(np.argsort(Ridge(alpha=1.0).fit(X, y).coef_)[-n:][::-1])
    #### using ExtraTree
    extratree_features = list(np.argsort(ExtraTreesClassifier().fit(X, y).feature_importances_)[-n:][::-1])
    ### using Random Forest:
    rf_features = list(np.argsort(RandomForestClassifier().fit(X, y).feature_importances_)[-n:][::-1])
    print("Using the Top-K method: ")
    print("Selected features by correlation: ", feature_names[corr_features])
    print("Selected features by RFE: ", feature_names[rfe_features])
    print("Selected features by Ridge coefficients: ", feature_names[ridge_features])
    print("Selected features by Extra Tree feature importance: ", feature_names[extratree_features])
    print("Selected features by Random Forest feature importance: ", feature_names[rf_features])

  elif _method == 'cutoff':
    corr_features, rfe_features = list(), list() #### N/A
    ridge_features = list(np.where(Ridge(alpha=1.0).fit(X, y).coef_ > thres)[0])
    extratree_features = list(np.where(ExtraTreesClassifier().fit(X, y).feature_importances_ > thres)[0])
    rf_features = list(np.where(RandomForestClassifier().fit(X, y).feature_importances_ > thres)[0])
    print("Using the Cutoff method: ")
    # print("Selected features by correlation: ", feature_names[corr_features])
    # print("Selected features by RFE: ", feature_names[rfe_features])
    print("Selected features by Ridge coefficients: ", feature_names[ridge_features])
    print("Selected features by Extra Tree feature importance: ", feature_names[extratree_features])
    print("Selected features by Random Forest feature importance: ", feature_names[rf_features])
  else:
    return("Only Top-K and Cutoff methods are currently supported!")

  ######################################
  #### combine results using voting ####
  ######################################
  counted = Counter(np.concatenate((corr_features, rfe_features, ridge_features, extratree_features, rf_features), axis=None)).most_common(n)

  #### list of tuples (feature_index, votes)
  counted.sort(key = lambda x: x[1], reverse=True)
  final_select_series = pd.Series({feature_names[f]:c for f,c in counted}).sort_values(ascending=False)
  selected_idx = sorted([f for f,c in counted])
  assert len(selected_idx) == n
  print(final_select_series)
  if fit_X:
    return(X[:,selected_idx])
  else:
    return(feature_names[selected_idx])

In [10]:
selected_features = feature_selector(X, y, model=RandomForestClassifier(), n=10, names=names, fit_X=False)
selected_features

Using the Top-K method: 
Selected features by correlation:  ['emp.var.rate' 'euribor3m' 'contact_cellular' 'contact_telephone'
 'poutcome_success' 'qtr_bin_Q1' 'age_bin_56+' 'poutcome_nonexistent'
 'job_bin_other' 'age_bin_25 or younger']
Selected features by RFE:  ['emp.var.rate' 'cons.price.idx' 'cons.conf.idx' 'euribor3m'
 'total_assets' 'rep_avg_rating' 'customer_length' 'housing_yes'
 'poutcome_success' 'customer_language_Portuguese']
Selected features by Ridge coefficients:  ['cons.price.idx' 'poutcome_success' 'cons.conf.idx' 'qtr_bin_Q1'
 'loan_unknown' 'education_bin_illiterate' 'housing_no' 'housing_yes'
 'marital_unknown' 'contact_cellular']
Selected features by Extra Tree feature importance:  ['euribor3m' 'total_assets' 'customer_length' 'rep_avg_rating'
 'emp.var.rate' 'poutcome_success' 'cons.conf.idx' 'cons.price.idx'
 'customer_ratings' 'housing_no']
Selected features by Random Forest feature importance:  ['euribor3m' 'total_assets' 'customer_length' 'rep_avg_rating'
 '

array(['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m',
       'total_assets', 'rep_avg_rating', 'customer_length', 'housing_yes',
       'contact_cellular', 'poutcome_success'], dtype='<U30')

In [14]:
selected_features = ['poutcome_success', 'emp.var.rate', 'euribor3m', 'cons.price.idx',
       'cons.conf.idx', 'total_assets', 'rep_avg_rating', 'customer_length', 'housing_yes',
       'contact_cellular', 'y']
selected_features

['poutcome_success',
 'emp.var.rate',
 'euribor3m',
 'cons.price.idx',
 'cons.conf.idx',
 'total_assets',
 'rep_avg_rating',
 'customer_length',
 'housing_yes',
 'contact_cellular',
 'y']

In [15]:
selected_features = train_complete[selected_features]

In [16]:
selected_features.head()

Unnamed: 0_level_0,poutcome_success,emp.var.rate,euribor3m,cons.price.idx,cons.conf.idx,total_assets,rep_avg_rating,customer_length,housing_yes,contact_cellular,y
campaign,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.573524,0,1.0,0.967665,0.605052,0.408442,0.553302,0.740661,0.787138,1,1,0
0.744386,0,0.14832,0.061928,0.534818,0.235723,0.817121,0.275429,0.357026,1,0,0
0.0,0,1.0,0.968428,0.414483,0.692481,0.19164,0.959432,0.966685,1,1,0
0.0,0,0.849418,0.92914,0.637496,0.680736,0.399434,0.285757,0.369019,1,0,0
0.391017,0,1.0,0.967665,0.605052,0.408442,0.291517,0.244337,0.325443,0,1,0


In [18]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

In [17]:
path = '/content/drive/MyDrive/Classroom/DATA 6545: Machine Learning for Predictive Analysis SP2022/Final Project/selected_features.csv'
selected_features.to_csv(path,index=False)

FileNotFoundError: ignored