# Feature Selection

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

In [2]:
# in order to select the best features,
# sometimes it can be ambiguous to find out which ones we should be using


 ### Words of caution:
  - What I'm going to demonstrate is a means of eliminating some features in your quest to find the best drivers.
  - THIS IS A PROCESS THAT WILL ADD ONTO YOUR EXPLORATORY DATA ANALYSIS, NOT REPLACE IT

In [3]:
df = pd.read_csv('student_grades.csv')

In [5]:
from sklearn.model_selection import train_test_split

In [7]:
train_val, test = train_test_split(df, train_size=0.8, random_state=1349)
train, validate = train_test_split(train_val, train_size=0.7, random_state=1349)

In [8]:
train.shape, validate.shape, test.shape

((58, 5), (25, 5), (21, 5))

In [9]:
train.head()

Unnamed: 0,student_id,exam1,exam2,exam3,final_grade
90,91,70.0,75,78,72
15,16,85.0,83,87,87
55,56,83.0,80,86,85
75,76,58.0,65,70,68
25,26,70.0,75,78,72


In [17]:
kbest_0 = SelectKBest(f_regression, k=2)

In [18]:
kbest_0

In [19]:
X_train, y_train = train.drop(columns='final_grade'), train.final_grade

In [20]:
X_train.columns

Index(['student_id', 'exam1', 'exam2', 'exam3'], dtype='object')

In [21]:
# fit the object that we just created:
kbest_0.fit(X_train, y_train)

In [22]:
kbest_0.scores_

array([3.54359027e-01, 1.73203112e+03, 3.33762096e+02, 6.43406793e+02])

In [23]:
X_train, y_train = train.drop(
    columns=['final_grade','student_id']
), train.final_grade

In [24]:
# create a new object for selectkbest:
kbest_1 = SelectKBest(f_regression, k=2)
kbest_1.fit(X_train, y_train)

In [25]:
kbest_1.scores_

array([1732.03112475,  333.76209615,  643.40679309])

In [27]:
kbest_1.transform(X_train)[:5]

array([[70.0, '78'],
       [85.0, '87'],
       [83.0, '86'],
       [58.0, '70'],
       [70.0, '78']], dtype=object)

In [28]:
kbest_1.get_feature_names_out()

array(['exam1', 'exam3'], dtype=object)

### Recursive Feature Elimination:
 - start from all, continue to eliminate features until we find best number

In [29]:
from sklearn.linear_model import LinearRegression

In [30]:
# make a model object for our wrapper:
model = LinearRegression()

In [31]:
# now that we have this linear regression model,
# we will also create an object for RFE

In [32]:
rfe = RFE(model, n_features_to_select=2)

In [33]:
rfe

In [34]:
# just like with every other sklearn object, we have created it, 
# and now we need to fit it
rfe.fit(X_train, y_train)

In [35]:
rfe.ranking_

array([1, 2, 1])

In [37]:
pd.DataFrame(
{
    'feature': X_train.columns.to_list(),
    'rfe_ranking': rfe.ranking_
    
})

Unnamed: 0,feature,rfe_ranking
0,exam1,1
1,exam2,2
2,exam3,1


In [38]:
# make a new rfe object
rfe_1 = RFE(model, n_features_to_select=1)
# fit the rfe object
rfe_1.fit(X_train, y_train)
pd.DataFrame(
{
    'feature': X_train.columns.to_list(),
    'rfe_1_ranking': rfe_1.ranking_
    
})

Unnamed: 0,feature,rfe_1_ranking
0,exam1,1
1,exam2,3
2,exam3,2


In [39]:
# Running backwards fro Recursive Feature Elimination:
# We also have sequential feature selection
# this operates very similarly in structure to what we see in 
# RFE, but it builds upwards in significance rather than starting
# from everything and building down

In [40]:
# Sequential:
seq = SequentialFeatureSelector(model, n_features_to_select=2)
# fit the thing now that its been made:
seq.fit(X_train, y_train)


In [41]:
seq.support_

array([ True, False,  True])

In [43]:
X_train.columns[seq.support_]

Index(['exam1', 'exam3'], dtype='object')

In [45]:
X_train[X_train.columns[seq.support_]].head()

Unnamed: 0,exam1,exam3
90,70.0,78
15,85.0,87
55,83.0,86
75,58.0,70
25,70.0,78


In [None]:
# if I wanted to cascade this out into something more valuable:
# think about looping through RFE structures with different values
# of best feature
# a structure that I may want to explore in the future:
# best_score = ####
# for n in range(1,max_num_features):
#     model = LinearRegression()
#     rfe = RFE(model, n_number_of_features=n)
#     best_features = rfe.get_feature_names_out()
#     model_score = score(rfe.predict(X_train))
#     if model_score < best_score:
#           best_score = model_score
#           best_features = X_train.columns[rfe.support_]