# Feature Selection

In [1]:
# Let's demonstrate feature selection

# Import necessary dependencies and settings
import numpy as np
import pandas as pd

# Threshold-Based Methods

In [2]:
# Limiting features in bag-of-words based models

from sklearn.feature_extraction.text import CountVectorizer

# Float values for min_df / max_df are document-frequency proportions: a term
# is left out of the vocabulary when it appears in fewer than 10% or in more
# than 85% of the documents. max_features then caps the vocabulary at the
# 2000 terms with the highest term frequency across the corpus; that cap is
# our own choice and can be raised if required.
cv = CountVectorizer(
    max_features=2000,
    max_df=0.85,
    min_df=0.1,
)
print(cv)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.85, max_features=2000, min_df=0.1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [3]:
# Load the Pokemon dataset and one-hot encode its 'Generation' column.
df = pd.read_csv('./datasets_n_images/datasets_module_4/Pokemon.csv')
# pandas >= 2.0 returns boolean dummy columns by default. Pin the dtype to the
# earlier uint8 default so the indicators stay 0/1 integers on every version.
poke_gen = pd.get_dummies(df['Generation'], dtype=np.uint8)
print(poke_gen.head())

   Gen 1  Gen 2  Gen 3  Gen 4  Gen 5  Gen 6
0      1      0      0      0      0      0
1      1      0      0      0      0      0
2      1      0      0      0      0      0
3      1      0      0      0      0      0
4      1      0      0      0      0      0


In [4]:
# Next, remove those one-hot encoded generation features whose variance
# falls below the 0.15 threshold.
# fit() returns the selector itself, so printing the fitted object afterwards
# shows the same configuration summary.

from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=0.15)
vt.fit(poke_gen)
print(vt)

VarianceThreshold(threshold=0.15)


In [5]:
# Show, for each generation column, the variance stored on the fitted selector
# (variances_) alongside the boolean selection mask from get_support().
# The frame is transposed so that every feature becomes a column and the two
# quantities become rows.

pd.DataFrame(
    {
        'variance': vt.variances_,
        'select_feature': vt.get_support(),
    },
    index=poke_gen.columns,
).transpose()

Unnamed: 0,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6
variance,0.164444,0.114944,0.16,0.128373,0.163711,0.0919937
select_feature,True,False,True,False,True,False


The `select_feature` row shows which features were kept: Gen 1, Gen 3 and Gen 5 are marked `True`, and these are exactly the columns whose variance is above the 0.15 threshold. To get the final subset of selected features, you can use the following code.

In [6]:
# Keep every row of the columns flagged by the selector's boolean mask, so
# that poke_gen_subset really is the full selected-feature subset.
poke_gen_subset = poke_gen.iloc[:, vt.get_support()]
# Only the preview is limited to the first rows; the stored frame is not.
print(poke_gen_subset.head())

   Gen 1  Gen 3  Gen 5
0      1      0      0
1      1      0      0
2      1      0      0
3      1      0      0
4      1      0      0
