In [None]:
# Notebook imports

# Used to define paths to files for data import and export
import os
import os.path as op

# Basic libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns 

# Used for preprocessing and splitting of data 
from sklearn.model_selection import train_test_split

# Different models for feature selection
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SequentialFeatureSelector

# Different models that can be used as classifiers for feature selection
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
# import xgboost as xgb



# Feature selection notebook

### Part 1 : Loading preprocessed data

### Part 2 : Splitting in train and test sets

We need train and test sets to be able to do feature selection using the sklearn models.

In [None]:
# Hold out 20% of the rows as a test set (fixed seed so the split is reproducible).
# NOTE(review): `df` is expected to come from Part 1 (data loading); no loading
# cell is visible in this notebook, so confirm it is defined before running.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.2)

# Name of the label column. It is an empty placeholder here; presumably it has
# to be replaced by the real target column name - TODO confirm.
target_column = ""

# Feature matrix (every column except the label) and label vector for each split
X_train, y_train = df_train.drop(columns=[target_column]), df_train[target_column]
X_test, y_test = df_test.drop(columns=[target_column]), df_test[target_column]

# Keep the feature names so the selection results can be labelled later on
features = X_train.columns

### Part 3 : Feature selection
#### 3.1 Choosing classifier

In [None]:
# Minimum number of features to consider during recursive elimination
min_features_to_select = 15

# Estimator used by the feature-selection steps of this notebook
classifier = LogisticRegression(multi_class="multinomial", max_iter=500)
# Name of the chosen classifier, intended for naming the file saved at the end
name_classifier = "LogisticRegression"

# Other estimators that can be assigned to `classifier` instead:
#   - LogisticRegression(max_iter=500, multi_class="ovr")
#   - RidgeClassifier()
#   - SVC(kernel="linear")
#   - NuSVC(kernel="linear")
#   - DecisionTreeClassifier()
#   - ExtraTreeClassifier()

#### 3.2 Recursive feature elimination (RFE) with cross-validation

In [None]:
# The plain 'f1' scorer only supports binary targets and raises on multiclass
# labels, so switch to the macro-averaged F1 when more than two classes exist.
# Binary problems keep the original 'f1' scorer.
rfecv_scoring = "f1" if np.unique(y_train).size <= 2 else "f1_macro"

# Recursive feature elimination: remove one feature per iteration (step=1) and
# score every candidate subset with stratified 4-fold cross-validation, never
# going below `min_features_to_select` features.
rfecv = RFECV(
    estimator=classifier,
    step=1,
    cv=StratifiedKFold(4),
    scoring=rfecv_scoring,
    min_features_to_select=min_features_to_select,
)
rfecv.fit(X_train, y_train)

In [None]:
# Optimal number of features
print("Optimal number of features : %d" % rfecv.n_features_)

# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the mean
# cross-validation score of each feature subset now lives in `cv_results_`.
# Fall back to the old attribute so the cell still runs on older versions.
if hasattr(rfecv, "cv_results_"):
    rfecv_mean_scores = rfecv.cv_results_["mean_test_score"]
else:
    rfecv_mean_scores = rfecv.grid_scores_

# One score per subset size, starting at the minimum number of features
# (this x-axis is only valid because RFECV is run with step=1).
n_features_tested = range(min_features_to_select,
                          len(rfecv_mean_scores) + min_features_to_select)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (f1_score)")
plt.plot(n_features_tested, rfecv_mean_scores)
plt.show()

In [None]:
# Table of features ordered by RFECV rank (lowest rank first)
df_rfecv = pd.DataFrame({"Feature": features, "Rank": rfecv.ranking_})
df_rfecv = df_rfecv.sort_values(by="Rank")
df_rfecv.head(40)

In [None]:
# Names of the columns kept by RFECV (boolean support mask applied to the feature names)
rfecv_mask = rfecv.get_support()
features_rfecv = features[rfecv_mask]
features_rfecv

#### 3.3 Sequential Feature Selection
##### 3.3.1 SelectFromModel

First, we estimate the importance of each feature by fitting the chosen classifier (a logistic regression by default). Features with the largest absolute coefficients are considered the most important.

In [None]:
# Feature importance from the fitted classifier.
# Note: this fits `classifier` in place on the training data.
importance_fit = classifier.fit(X_train, y_train)

if hasattr(importance_fit, "coef_"):
    # `coef_` can hold several rows of coefficients (e.g. one per class).
    # Collapse them to ONE score per feature with the L1 norm across rows
    # (sum of absolute coefficients), which is also how SelectFromModel
    # aggregates a 2-D `coef_` by default. With a single row this is simply
    # the absolute value of each coefficient.
    importance = np.linalg.norm(np.atleast_2d(importance_fit.coef_), ord=1, axis=0)
else:
    # Estimators without coefficients (e.g. decision trees) expose
    # `feature_importances_` instead.
    importance = np.asarray(importance_fit.feature_importances_)

# One row per feature, most important first
df_importance = pd.DataFrame(
    {"Feature": features, "Importance": importance}
).sort_values(by="Importance", ascending=False)

plt.figure(figsize=(25, 7))
plt.bar('Feature', 'Importance', data=df_importance)
plt.title("Feature importances via coefficients")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Show the ten most important features (the table is already sorted by importance)
df_importance.iloc[:10]

Now we select the features which are most important according to the coefficients using SelectFromModel.

In [None]:
# Work on a 1-D vector of scores: if `importance` still has several rows of
# coefficients, aggregate them per feature with the L1 norm, which is what
# SelectFromModel itself does by default for a 2-D `coef_`.
importance_scores = np.asarray(importance)
if importance_scores.ndim > 1:
    importance_scores = np.linalg.norm(importance_scores, ord=1, axis=0)

# Threshold placed slightly above the 35th largest importance. The rank is
# clamped so that data sets with fewer than 35 features do not raise IndexError.
n_top_features = min(35, importance_scores.size)
threshold = np.sort(importance_scores)[-n_top_features] + 0.01

# Keep only the features whose importance reaches the threshold
sfm = SelectFromModel(classifier, threshold=threshold).fit(X_train, y_train)

features_sfm = features[sfm.get_support()]
print(f"Features selected by SelectFromModel: {features_sfm}")

##### 3.3.2 SequentialFeatureSelection

In [None]:
# Forward sequential feature selection, stopping once 20 features are selected
sfs_forward = SequentialFeatureSelector(
    estimator=classifier,
    direction="forward",
    n_features_to_select=20,
)
sfs_forward = sfs_forward.fit(X_train, y_train)

In [None]:
# Backward sequential feature selection down to 20 features.
# Warning: this cell takes a lot of time to run.
sfs_backward = SequentialFeatureSelector(
    estimator=classifier,
    direction="backward",
    n_features_to_select=20,
)
sfs_backward = sfs_backward.fit(X_train, y_train)

In [None]:
# Feature names kept by the forward selector
forward_mask = sfs_forward.get_support()
features_sfs_forward = features[forward_mask]
print(f"Features selected by forward sequential selection:\n {features_sfs_forward}")

# Feature names kept by the backward selector
backward_mask = sfs_backward.get_support()
features_sfs_backward = features[backward_mask]
print(f"Features selected by backward sequential selection:\n {features_sfs_backward}")

### Part 4 : Discussion