<b>Filter Method</b>

In [87]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

boston=pd.read_csv("boston.csv")
# Some headers in the CSV carry stray padding (the feature lists printed later
# show ' ZN ' and 'INDUS '); strip it so columns can be addressed by plain name.
boston.columns=boston.columns.str.strip()

# Separate the predictors (X) from the target column MEDV (y)
X=boston.drop("MEDV",axis=1)
y=boston['MEDV']

# Rescale every predictor to the [0, 1] range and rebuild a DataFrame so the
# column names survive (fit_transform returns a bare array).
# NOTE(review): the scaler is fitted on the full dataset before any train/test
# split, so the test rows influence the min/max used for scaling. Consider
# fitting the scaler on the training split only.
scaler=MinMaxScaler(feature_range=(0,1))
X_=scaler.fit_transform(X)
X=pd.DataFrame(X_,columns=X.columns)


# %%
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT
0,0.0,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.0,0.208015,0.287234,0.08968
1,0.000236,0.0,0.242302,0.0,0.17284,0.547998,0.782698,0.348962,0.043478,0.104962,0.553191,0.20447
2,0.000236,0.0,0.242302,0.0,0.17284,0.694386,0.599382,0.348962,0.043478,0.104962,0.553191,0.063466
3,0.000293,0.0,0.06305,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,0.648936,0.033389
4,0.000705,0.0,0.06305,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,0.648936,0.099338


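Use the `VarianceThreshold()` function to remove low-variance features, i.e. features whose values barely change across samples and therefore carry little information.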

In [88]:
# Hold out roughly one third of the rows for testing; the fixed seed keeps the split repeatable
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)
print(X_train.shape, X_test.shape, sep="\n")

(339, 12)
(167, 12)


In [89]:
from sklearn.feature_selection import VarianceThreshold
# Fit the selector on the training split only; features whose training-set
# variance is below the 0.02 threshold are marked for removal.
sel = VarianceThreshold(threshold=0.02).fit(X_train)

# Boolean mask over the columns: True = kept, False = removed
variance_mask = sel.get_support()
print("Features selection ", variance_mask)
print("Selected features ", X.columns[variance_mask].tolist())
print(" Removed features ", X.columns[~variance_mask].tolist())

Features selection  [False  True  True  True  True False  True  True  True  True  True  True]
Selected features  [' ZN ', 'INDUS ', 'CHAS', 'NOX', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']
 Removed features  ['CRIM', 'RM']


In [90]:
# Apply the fitted selector to both splits so they keep the same columns
X_train, X_test = sel.transform(X_train), sel.transform(X_test)
# %%
print(X_train.shape, X_test.shape, sep="\n")


(339, 10)
(167, 10)


<b>SelectKBest()</b> function to select a given number of features using a univariate statistical test.

In [91]:
# Start again from the full feature set: re-create the same 67/33 split (same seed)
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed, test_size=test_size
)
print(X_train.shape, X_test.shape, sep="\n")

(339, 12)
(167, 12)


In [92]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif # use this for classification tasks
from sklearn.feature_selection import f_regression # use this for regression tasks

# Score every feature against the target with the regression F-test and keep the three best
kbest = SelectKBest(score_func=f_regression, k=3).fit(X_train, y_train)

# Boolean mask over the columns: True = one of the k best
kbest_mask = kbest.get_support()

print("Feature selection", kbest_mask)
print("Feature scores", kbest.scores_)
print("Selected features:", X.columns[kbest_mask].tolist())
print("Removed features:", X.columns[~kbest_mask].tolist())

Feature selection [False False False False False  True False False False False  True  True]
Feature scores [ 71.7505991   45.3094539  102.27204507  12.96777535  75.75687056
 442.09927992  46.82483075  22.32450311  54.40234107  94.37168391
 109.47144894 384.84276122]
Selected features: ['RM', 'PTRATIO', 'LSTAT']
Removed features: ['CRIM', ' ZN ', 'INDUS ', 'CHAS', 'NOX', 'AGE', 'DIS', 'RAD', 'TAX']


In [93]:
# Reduce both splits to the columns chosen by the fitted SelectKBest
X_train, X_test = kbest.transform(X_train), kbest.transform(X_test)
# %%
print(X_train.shape, X_test.shape, sep="\n")

(339, 3)
(167, 3)


Function to list features that are correlated

For each correlated pair, only the later column (the second of the pair in column order) is added to the result

In [94]:
def correlatedFeatures(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``dataset.corr()`` and only its lower triangle (row index > column index)
    is inspected. Whenever the absolute correlation of a pair exceeds
    ``threshold``, the name of the later column of that pair is collected.

    Parameters
    ----------
    dataset : DataFrame-like
        Object exposing ``.corr()`` that returns a square correlation matrix.
    threshold : float
        Absolute correlation above which a pair counts as correlated
        (strictly greater than).

    Returns
    -------
    set
        Names of the flagged columns; empty when no pair exceeds the threshold.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns

    # Walk the strict lower triangle so each unordered pair is visited once.
    return {
        names[row]
        for row in range(len(corr_matrix))
        for col in range(row)
        if abs(corr_matrix.iloc[row, col]) > threshold
    }
            
        

In [95]:
# Fresh 67/33 train/test split of the full feature set (seeded for reproducibility)
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=test_size,
    random_state=seed,
)
print(X_train.shape, X_test.shape, sep="\n")


(339, 12)
(167, 12)


In [96]:
# Flag columns whose absolute correlation with an earlier column exceeds 0.85 (training split only)
cf = correlatedFeatures(X_train, threshold=0.85)
cf

{'TAX'}

In [97]:
# Remove the flagged columns from both splits so they stay aligned
X_train = X_train.drop(columns=cf)
X_test = X_test.drop(columns=cf)
print(X_train.shape, X_test.shape, sep="\n")

(339, 11)
(167, 11)


# <b>Forward selection</b>
In this approach, we start with the best single feature and progressively add the best performing of the remaining features.

# <b>Backward selection</b>
In this approach, we start with all features and progressively remove the worst performing of the remaining features.

# <b>Recursive Feature Elimination (RFE)</b>
In this approach, we first train the model with all the features. Then the least important feature is removed and we recursively train models with the remaining features. This is repeated until we reach the desired number of features. This is an extremely thorough approach but at the cost of a considerable amount of computation.




In [98]:
# Rebuild the 67/33 train/test split from the full feature set before running RFE
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed,
)
print(X_train.shape, X_test.shape, sep="\n")

(339, 12)
(167, 12)


In [99]:
# Feature selection using Recursive Feature Elimination
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Create the estimator RFE uses to rank the features
model = LinearRegression()

# Select the best 3 features according to RFE
rfe = RFE(estimator=model, n_features_to_select=3)
rfe.fit(X_train, y_train)

# support_ is the boolean keep-mask; ranking_ is 1 for every selected feature
print("Feature selection", rfe.support_)
print("Feature ranking", rfe.ranking_)
# Derive the names from the fitted mask instead of hard-coding them, so the
# report stays correct if the data or n_features_to_select changes.
print("Selected features:", list(X.columns[rfe.support_]))

Feature selection [False False False False False  True False False False False  True  True]
Feature ranking [ 4  7 10  8  3  1  9  2  6  5  1  1]
Selected features: ['RM', 'PTRATIO', 'LSTAT']


In [100]:
# Keep only the RFE-selected columns in both splits
X_train, X_test = rfe.transform(X_train), rfe.transform(X_test)
print(X_train.shape, X_test.shape, sep="\n")

(339, 3)
(167, 3)
