# Performing feature selection.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [3]:
# Load the feature-engineered housing data into a DataFrame.
ds = pd.read_csv(filepath_or_buffer='House_FE.csv')

In [4]:
# Preview the first five rows of the loaded data.
ds.head(n=5)

Unnamed: 0.1,Unnamed: 0,area,rooms,bathroom,parking spaces,floor,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$),total (R$),city_Campinas,city_Porto Alegre,city_Rio de Janeiro,city_São Paulo,animal_not acept,furniture_not furnished
0,0,4.248495,0.693147,0.0,1,1.609438,7.632886,8.101678,5.351858,3.73767,5618,0,0,0,1,0,0
1,1,5.768321,1.386294,1.386294,0,1.609438,7.090077,8.509161,7.467371,4.143135,7973,0,0,0,1,0,1
2,2,4.382027,0.0,0.0,1,1.609438,6.907755,7.937375,4.828314,3.713572,3841,0,1,0,0,0,1
3,3,3.931826,0.693147,0.0,0,1.609438,5.598422,7.013915,3.091042,2.833213,1421,0,1,0,0,0,1
4,4,3.218876,0.0,0.0,0,1.609438,6.327937,6.684612,3.218876,2.397895,836,0,0,0,1,1,1


In [5]:
# Feature matrix: every column of ds except the target 'total (R$)'.
# NOTE(review): the leftover index column 'Unnamed: 0' is still part of x here
# (it is only dropped after selection), so it is passed to the selector as a feature.
x = ds.drop(columns=['total (R$)'])

In [6]:
# Target vector: the 'total (R$)' column.
y = ds.loc[:, 'total (R$)']

We will do the feature selection using Lasso regression, a regularization technique that not only helps to avoid overfitting but also removes unnecessary features from the data.

In [9]:
# Build a selector around a Lasso regressor (alpha = 0.005, fixed random_state).
feature = SelectFromModel(estimator=Lasso(alpha=0.005, random_state=0))
# Fit the selector on the features x and the target y; the fitted selector is
# the cell's displayed output.
feature.fit(X=x, y=y)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [11]:
# Names of the columns the fitted selector kept (looked up by position).
selected = x.columns[feature.get_support(indices=True)]

In [34]:
# Report how many features the selector kept and how many it dropped.
# Both counts come from the selector's own boolean mask, so they always add up
# to the total. Counting `coef_ == 0` can disagree with the mask, because
# SelectFromModel drops every feature whose |coefficient| falls below its
# threshold, not only the coefficients that are exactly zero.
support_mask = feature.get_support()
print('total number of features:', x.shape[1])
print('selected out of total:', int(support_mask.sum()))
print('non selected features:', int((~support_mask).sum()))

total number of features: 16
selected out of total: 15
non selected features: 1


As the output above shows, Lasso removed one of the 16 features, 'floor', because it was not necessary.

In [19]:
# Display the feature matrix restricted to the selected columns.
x.loc[:, selected]

Unnamed: 0.1,Unnamed: 0,area,rooms,bathroom,parking spaces,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$),city_Campinas,city_Porto Alegre,city_Rio de Janeiro,city_São Paulo,animal_not acept,furniture_not furnished
0,0,4.248495,0.693147,0.000000,1,7.632886,8.101678,5.351858,3.737670,0,0,0,1,0,0
1,1,5.768321,1.386294,1.386294,0,7.090077,8.509161,7.467371,4.143135,0,0,0,1,0,1
2,2,4.382027,0.000000,0.000000,1,6.907755,7.937375,4.828314,3.713572,0,1,0,0,0,1
3,3,3.931826,0.693147,0.000000,0,5.598422,7.013915,3.091042,2.833213,0,1,0,0,0,1
4,4,3.218876,0.000000,0.000000,0,6.327937,6.684612,3.218876,2.397895,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10687,10687,4.143135,0.693147,0.000000,1,5.996452,7.298445,3.178054,3.091042,0,1,0,0,1,0
10688,10688,5.652489,1.386294,1.386294,4,8.039157,9.615805,6.880384,5.252273,0,0,0,1,0,1
10689,10689,4.248495,1.098612,1.098612,0,6.887553,8.699515,5.805135,4.356709,0,0,1,0,1,0
10690,10690,4.787492,0.693147,0.693147,2,7.368340,9.392662,5.631212,5.043425,0,0,1,0,0,0


In [20]:
# Keep only the selected columns; df becomes the updated working data.
df = x.loc[:, selected]

In [21]:
# Preview the first five rows of the selected data.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,area,rooms,bathroom,parking spaces,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$),city_Campinas,city_Porto Alegre,city_Rio de Janeiro,city_São Paulo,animal_not acept,furniture_not furnished
0,0,4.248495,0.693147,0.0,1,7.632886,8.101678,5.351858,3.73767,0,0,0,1,0,0
1,1,5.768321,1.386294,1.386294,0,7.090077,8.509161,7.467371,4.143135,0,0,0,1,0,1
2,2,4.382027,0.0,0.0,1,6.907755,7.937375,4.828314,3.713572,0,1,0,0,0,1
3,3,3.931826,0.693147,0.0,0,5.598422,7.013915,3.091042,2.833213,0,1,0,0,0,1
4,4,3.218876,0.0,0.0,0,6.327937,6.684612,3.218876,2.397895,0,0,0,1,1,1


In [22]:
# Drop the extra index column 'Unnamed: 0' from the selected features.
# df is rebuilt from x[selected] instead of from itself, so this cell can be
# re-run safely: `df = df.drop(...)` raises KeyError on a second run because
# the column is already gone.
df = x[selected].drop(columns=['Unnamed: 0'])

In [23]:
# Preview the data again after removing the extra index column.
df.head(n=5)

Unnamed: 0,area,rooms,bathroom,parking spaces,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$),city_Campinas,city_Porto Alegre,city_Rio de Janeiro,city_São Paulo,animal_not acept,furniture_not furnished
0,4.248495,0.693147,0.0,1,7.632886,8.101678,5.351858,3.73767,0,0,0,1,0,0
1,5.768321,1.386294,1.386294,0,7.090077,8.509161,7.467371,4.143135,0,0,0,1,0,1
2,4.382027,0.0,0.0,1,6.907755,7.937375,4.828314,3.713572,0,1,0,0,0,1
3,3.931826,0.693147,0.0,0,5.598422,7.013915,3.091042,2.833213,0,1,0,0,0,1
4,3.218876,0.0,0.0,0,6.327937,6.684612,3.218876,2.397895,0,0,0,1,1,1


In [None]:
# Save the selected features to a CSV file. A path is required: to_csv() with
# no path only returns the CSV text and writes nothing to disk.
# index=False keeps the row index out of the file, so reloading it does not
# create another 'Unnamed: 0' column.
# NOTE(review): df holds only the selected features; the target 'total (R$)'
# is not part of it and is therefore not saved here.
df.to_csv('House_FS.csv', index=False)