# Testing an implementation of forward step-wise regression

In [1]:
import numpy as np
import pandas as pd 
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso


In [2]:
# Create a toy example: a synthetic regression problem in which only
# 5 of the 20 features are informative. `coef` holds the coefficients
# that make_regression used to generate `y`.
from sklearn.datasets import make_regression

X, y, coef = make_regression(
    n_samples=92,
    n_features=20,
    n_informative=5,
    noise=10,
    coef=True,
    random_state=0,
)

In [3]:
def forward_devel(X, Y, n):
    """
    Greedy forward step-wise feature selection with ordinary least squares.

    Procedure:
    1. Start with no selected features.
    2. For every feature that is not selected yet, fit a linear regression
       (without intercept) on the already selected features plus that
       candidate and record the value of ``model.score`` (R^2) on the same data.
    3. Keep the candidate with the highest score.
    4. Repeat 2 and 3 until ``n`` features are selected or no feature is left.

    Every column of X is divided by its root-mean-square before fitting;
    entries that become NaN through that division (zero-scale columns) are
    replaced by 0.

    Args:
    X(np.ndarray)   -    design matrix (samples in rows, features in columns)
    Y(np.ndarray)   -    response variables
    n(int)          -    number of features to select

    Returns:
    list of int     -    column indices of X, in the order they were selected
    """

    # Scale all columns once, up front. The scaling is computed column by
    # column, so the scaled version of any subset of columns is simply the
    # matching subset of Xs -- no need to redo it for every candidate.
    scale_ = np.sqrt(np.nansum(X ** 2, 0) / X.shape[0])
    Xs = np.nan_to_num(X / scale_)  # zero-scale columns give NaN -> set to 0

    # the list will be filled with the best features, one per iteration
    selected = []
    remaining = list(range(X.shape[1]))

    # stop when nothing is left to try or n features have been selected
    while remaining and len(selected) < n:
        # one score per candidate feature, indexed by the column number
        scores = pd.Series(np.nan, index=remaining, dtype=float)
        for i in remaining:
            # already selected features plus the current candidate
            feats = selected + [i]
            X_feat = Xs[:, feats]

            model = LinearRegression(fit_intercept=False).fit(X_feat, Y)

            # score against the Y argument (not a notebook-level variable)
            scores.loc[i] = model.score(X_feat, Y, sample_weight=None)

        # candidate whose addition gave the best score; stored as a plain int
        best = int(scores.idxmax())
        selected.append(best)
        remaining.remove(best)

    return selected

In [4]:
def forward_scikit(X, Y, n):
    """
    Feature selection through scikit-learn's SequentialFeatureSelector.

    The columns of X are divided by their root-mean-square (entries that
    become NaN are replaced by 0), a selector wrapping an intercept-free
    LinearRegression is fitted on the scaled data, and the positions of the
    columns it keeps are returned.

    Args:
    X(np.ndarray)   -    design matrix
    Y(np.ndarray)   -    response variables
    n               -    passed to the selector as n_features_to_select

    Returns:
    np.ndarray      -    indices of the selected columns, in ascending order
    """
    # root-mean-square of every column (NaNs are ignored in the sum)
    column_rms = np.sqrt(np.nansum(X ** 2, axis=0) / X.shape[0])
    # a zero-scale column turns into NaN after the division; map those to 0
    X_scaled = np.nan_to_num(X / column_rms)

    estimator = LinearRegression(fit_intercept=False)
    sfs = SequentialFeatureSelector(estimator, n_features_to_select=n)
    sfs.fit(X_scaled, Y)

    # boolean mask over the columns -> integer positions of the True entries
    return np.flatnonzero(sfs.get_support())


In [5]:
# select 6 features from the toy data with the hand-written forward selection
a = forward_devel(X, y, n = 6)


In [6]:
# select 6 features from the same data with the scikit-learn based selection
b = forward_scikit(X, y, n = 6)

In [7]:
# `a` lists the features in the order forward_devel picked them, while `b`
# comes from a boolean mask and is therefore in ascending index order;
# compare the two as sets of indices rather than position by position.
print(a, b, sep="\n")

[6, 8, 1, 14, 2, 0]
[ 0  1  2  6  8 14]
