In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = sns.load_dataset('diamonds')

In [3]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [4]:
df = df.drop(['cut', 'color', 'clarity'], axis = 1)

In [5]:
df

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.20,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,61.0,58.0,2757,6.15,6.12,3.74


# SelectKBest

In [6]:
from sklearn.feature_selection import SelectKBest, f_classif

In [7]:
X = df.drop('price', axis = 1)
y = df['price']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

In [9]:
selector = SelectKBest(score_func = f_classif, k = 5)
#New dataframe with the selected features for later use in the classifier. fit() method works too, if you want only the feature names and their corresponding scores
X_new = selector.fit_transform(X_train, y_train)
names = X_train.columns.values[selector.get_support()]
scores = selector.scores_[selector.get_support()]
names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(data = names_scores, columns=['Feat_names', 'F_Scores'])
#Sort the dataframe for better visualization
ns_df_sorted = ns_df.sort_values(['F_Scores', 'Feat_names'], ascending = [False, True])
print(ns_df_sorted)

  Feat_names   F_Scores
2          x  79.085557
0      carat  74.016978
4          z  37.741028
3          y  35.848754
1      table   1.308916


In [10]:
ns_df.sort_values('F_Scores', ascending=False)

Unnamed: 0,Feat_names,F_Scores
2,x,79.085557
0,carat,74.016978
4,z,37.741028
3,y,35.848754
1,table,1.308916


In [11]:
X_train.columns.values

array(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype=object)

In [12]:
selector.get_support()

array([ True, False,  True,  True,  True,  True])

In [13]:
selector.scores_

array([74.01697785,  1.18891924,  1.30891572, 79.08555731, 35.84875371,
       37.74102791])

In [14]:
ns_df.sort_values('F_Scores', ascending=False)

Unnamed: 0,Feat_names,F_Scores
2,x,79.085557
0,carat,74.016978
4,z,37.741028
3,y,35.848754
1,table,1.308916


# RFE

In [15]:
X = df.drop('price', axis = 1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)

In [16]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [17]:
model = LinearRegression()
rfe = RFE(estimator = model, n_features_to_select = 5)
fit = rfe.fit(X_train, y_train)
print("Num Features: %d" % fit.n_features_ )
print("Selected Features: %s" % fit.support_ )
print("Feature Ranking: %s" % fit.ranking_ )

Num Features: 5
Selected Features: [ True  True  True  True  True False]
Feature Ranking: [1 1 1 1 1 2]


In [18]:
feature_names = np.array(X_train.columns)
print(feature_names[fit.support_])

['carat' 'depth' 'table' 'x' 'y']


In [19]:
model.fit(X_train, y_train)

LinearRegression()

In [20]:
pd.DataFrame(model.coef_, X.columns, columns = ['Coefficient'])

Unnamed: 0,Coefficient
carat,10876.170964
depth,-208.360165
table,-103.757783
x,-1359.864782
y,45.247256
z,28.789999


# Boruta

In [21]:
import numpy as np
### make X_shadow by randomly permuting each column of X
np.random.seed(42)
X_shadow = X.apply(np.random.permutation)
X_shadow.columns = ['shadow_' + feat for feat in X.columns]
### make X_boruta by appending X_shadow to X
X_boruta = pd.concat([X, X_shadow], axis = 1)

In [22]:
X_boruta

Unnamed: 0,carat,depth,table,x,y,z,shadow_carat,shadow_depth,shadow_table,shadow_x,shadow_y,shadow_z
0,0.23,61.5,55.0,3.95,3.98,2.43,0.24,63.7,59.0,5.76,5.84,3.54
1,0.21,59.8,61.0,3.89,3.84,2.31,0.58,63.0,56.0,5.64,4.90,4.27
2,0.23,56.9,65.0,4.05,4.07,2.31,0.40,60.1,58.0,5.65,5.81,2.86
3,0.29,62.4,58.0,4.20,4.23,2.63,0.43,61.0,56.2,5.69,6.54,4.64
4,0.31,63.3,58.0,4.34,4.35,2.75,1.55,62.9,56.0,4.65,6.62,2.69
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,5.75,5.76,3.50,1.05,63.0,56.0,5.51,5.97,3.92
53936,0.72,63.1,55.0,5.69,5.75,3.61,0.47,63.1,59.0,5.36,4.35,4.73
53937,0.70,62.8,60.0,5.66,5.68,3.56,0.33,61.8,55.0,7.38,5.77,3.18
53938,0.86,61.0,58.0,6.15,6.12,3.74,0.90,61.7,57.0,5.29,6.79,3.60


In [28]:
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy

forest = RandomForestRegressor(max_depth = 5)

feat_selector = BorutaPy(forest, n_estimators='auto', verbose=2, random_state=1)

feat_selector.fit(np.array(X_train), np.array(y_train))

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	6
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	6
Tentative: 	0
Rejected: 	0


BorutaPy finished running.

Iteration: 	9 / 100
Confirmed: 	6
Tentative: 	0
Rejected: 	0


BorutaPy(estimator=RandomForestRegressor(max_depth=5, n_estimators=69,
                                         random_state=RandomState(MT19937) at 0x10E1F0140),
         n_estimators='auto', random_state=RandomState(MT19937) at 0x10E1F0140,
         verbose=2)

In [29]:
feat_selector.support_

array([ True,  True,  True,  True,  True,  True])

In [30]:
feat_selector.ranking_

array([1, 1, 1, 1, 1, 1])

In [31]:
feature_df = pd.DataFrame(X_train.columns.tolist(), columns=['features'])
feature_df ['rank']=feat_selector.ranking_

In [32]:
print (feature_df.head(feat_selector.n_features_))

  features  rank
0    carat     1
1    depth     1
2    table     1
3        x     1
4        y     1
5        z     1
