In [26]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression, chi2

In [2]:
# Load the sample data from a CSV file (path is relative to the notebook's
# working directory) and preview the first rows.
my_df = pd.read_csv('feature_selection_sample_data.csv')
my_df.head()

Unnamed: 0,output,input1,input2,input3,input4
0,564,534,536,466,599
1,182,264,103,244,519
2,323,272,339,609,474
3,438,416,444,330,582
4,547,520,464,104,412


In [4]:
# Separate the target column from the predictors.
y = my_df['output']                      # target variable
X = my_df.drop(columns = ['output'])     # every remaining column is an input variable

### Regression

In [5]:
# Instantiate the feature selection object.
# f_regression yields an F-score and a p-value per input variable;
# k sets how many features to keep ('all' keeps every one so all scores can be inspected).
feature_selector = SelectKBest(score_func = f_regression, k = 'all')

In [6]:
# Learn relationships: score every column of X against y.
# scikit-learn's fit() returns the fitted selector itself, so `fit` and
# `feature_selector` refer to the same object from here on.
fit = feature_selector.fit(X, y)

In [8]:
# Display the p-value between each input variable and the output variable
# (one entry per column of X, in column order).
fit.pvalues_ # lower p-values indicate a stronger relationship

array([6.41321253e-14, 3.11971032e-14, 3.28616228e-01, 5.11901492e-01])

In [10]:
# Display the F-score between each input variable and the output variable
# (one entry per column of X, in column order).
fit.scores_ # higher f-scores indicate a stronger relationship

array([96.13254595, 99.97291167,  0.97062608,  0.43552725])

In [14]:
# Collect the per-feature statistics into one table (one row per input
# variable) and order it so the smallest p-values come first.
# Built in a single constructor call instead of concatenating three
# one-column frames, and sorted without inplace mutation.
summary_stats = pd.DataFrame({
    'input_var': X.columns,
    'p_value': fit.pvalues_,
    'f_score': fit.scores_,
}).sort_values(by = 'p_value')
summary_stats

Unnamed: 0,input_var,p_value,f_score
1,input2,3.11971e-14,99.972912
0,input1,6.413213e-14,96.132546
2,input3,0.3286162,0.970626
3,input4,0.5119015,0.435527


In [18]:
# Thresholds a variable must meet to be kept.
p_value_threshold = 0.05
f_score_threshold = 5

# Keep only the rows that satisfy both conditions:
# p-value at or below its threshold AND F-score at or above its threshold.
selected_variables = summary_stats[
    summary_stats['p_value'].le(p_value_threshold)
    & summary_stats['f_score'].ge(f_score_threshold)
]

selected_variables

Unnamed: 0,input_var,p_value,f_score
1,input2,3.11971e-14,99.972912
0,input1,6.413213e-14,96.132546


In [22]:
# Build a reduced input frame holding only the selected variables
# (column order follows the row order of selected_variables).
selected_variables_list = list(selected_variables['input_var'])
X_new = X.loc[:, selected_variables_list]
X_new.head()

Unnamed: 0,input2,input1
0,536,534
1,103,264
2,339,272
3,444,416
4,464,520


In [24]:
# The manual process above is preferred for transparency.
# Alternatively, let SelectKBest choose the k best-scoring features itself.
feature_selector_automate = SelectKBest(f_regression, k = 2)
fit = feature_selector_automate.fit(X, y)

# Boolean mask with one entry per column of X: True where the input
# variable was selected. Computed once and reused below.
support_mask = feature_selector_automate.get_support()

# create new dataframe with selected variables
X_new_automate = X.loc[:, support_mask]
X_new_automate.head()

Unnamed: 0,input1,input2
0,534,536
1,264,103
2,272,339
3,416,444
4,520,464


### Classification

In [39]:
# Instantiate the feature selection object.
# chi2 yields a chi-squared score and a p-value per input variable;
# k sets how many features to keep ('all' keeps every one so all scores can be inspected).
feature_selector = SelectKBest(score_func = chi2, k = 'all')

In [40]:
# Learn relationships: score every column of X against y with chi2.
# NOTE(review): X and y are the same objects used in the regression section.
# chi2 is intended for non-negative features and a categorical (class-label)
# target, while the 'output' sample values look continuous — confirm that
# this target is appropriate before relying on these scores.
fit = feature_selector.fit(X, y)

In [41]:
# Display the p-value between each input variable and the output variable
# (one entry per column of X, in column order).
fit.pvalues_ # lower p-values indicate a stronger relationship

array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 9.0052362e-76])

In [42]:
# Display the chi2 score between each input variable and the output variable
# (one entry per column of X, in column order).
fit.scores_ # higher chi2 scores indicate a stronger relationship

array([1845.67078271, 2480.83241048, 4699.49979698,  514.11705149])

In [43]:
# Collect the per-feature p-values and chi2 scores into one table (one row
# per input variable) and order it so the smallest p-values come first.
# Built in a single constructor call instead of concatenating three
# one-column frames, and sorted without inplace mutation.
summary_stats = pd.DataFrame({
    'input_var': X.columns,
    'p_value': fit.pvalues_,
    'chi2_score': fit.scores_,
}).sort_values(by = 'p_value')
summary_stats

Unnamed: 0,input_var,p_value,chi2_score
0,input1,0.0,1845.670783
1,input2,0.0,2480.83241
2,input3,0.0,4699.499797
3,input4,9.005236e-76,514.117051


In [46]:
# Thresholds a variable must meet to be kept
# (placeholder values for the example only; replace as needed).
p_value_threshold = 0.05
chi2_score_threshold = 2000

# Keep only the rows that satisfy both conditions:
# p-value at or below its threshold AND chi2 score at or above its threshold.
selected_variables = summary_stats[
    summary_stats['p_value'].le(p_value_threshold)
    & summary_stats['chi2_score'].ge(chi2_score_threshold)
]

selected_variables

Unnamed: 0,input_var,p_value,chi2_score
1,input2,0.0,2480.83241
2,input3,0.0,4699.499797


In [47]:
# Build a reduced input frame holding only the selected variables
# (column order follows the row order of selected_variables).
selected_variables_list = list(selected_variables['input_var'])
X_new = X.loc[:, selected_variables_list]
X_new.head()

Unnamed: 0,input2,input3
0,536,466
1,103,244
2,339,609
3,444,330
4,464,104
