# Select Features

Notebook used to automate feature selection; this notebook takes as input the features output by the regressions notebook; it then ranks them in order of importance

In [1]:
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

In [2]:
master = pd.read_parquet('master.parquet',engine='fastparquet')

In [3]:
mask = (master['date'] > '2017-07-01') & (master['date'] <= '2017-10-07')

In [4]:
new = master.loc[mask]
new = new.reset_index()
X = new.drop(columns='index')

In [5]:
new = new.select_dtypes(include=['float64','int64'])

In [6]:
X = new.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.mean())

In [7]:
X = X.drop('price_y', axis = 1)
y = new['price_y']

In [8]:
X = X.drop('index', axis=1)

In [9]:
X = X.drop('scrape_id', axis=1)

In [10]:
X = X.drop('neighbourhood_group_cleansed', axis=1)

In [11]:
X = X.drop('has_availability', axis=1)

In [12]:
X = X.drop('license', axis=1)

In [13]:
X.head(3)

Unnamed: 0,id,host_id,host_listings_count,host_total_listings_count,zipcode,latitude,longitude,accommodates,bathrooms,bedrooms,...,calculated_host_listings_count,reviews_per_month,listing_id,transit_length,transit_variety,transit_vocab_size,transitTextLength,transitTextWordsPerc,transitTextPuncPerc,transitTextDigitsPerc
0,13124681,66918996,1,1,91915.0,32.638694,-116.951252,4,1.5,2.0,...,1,3.0,13124681,3,1,2,1,1.0,0.0,0.0
1,13124681,66918996,1,1,91915.0,32.638694,-116.951252,4,1.5,2.0,...,1,3.0,13124681,3,1,2,1,1.0,0.0,0.0
2,13124681,66918996,1,1,91915.0,32.638694,-116.951252,4,1.5,2.0,...,1,3.0,13124681,3,1,2,1,1.0,0.0,0.0


In [14]:
y = y.replace([np.inf, -np.inf], np.nan)
y = y.fillna(y.mean())

In [15]:
X = X.astype(float)
y = y.astype(float)

In [16]:
include = ['accommodates',
'bathrooms',
'bedrooms',
'square_feet',
'guests_included',
'minimum_nights',
'maximum_nights',
'availability_30',
'availability_60',
'availability_90',
'availability_365',
'number_of_reviews',
'review_scores_rating',
'review_scores_accuracy',
'review_scores_cleanliness',
'review_scores_checkin',
'review_scores_communication',
'review_scores_location',
'review_scores_value',
'calculated_host_listings_count',
'reviews_per_month',
'transit_length',
'transit_variety',
'transit_vocab_size',
'transitTextLength',
'transitTextWordsPerc',
'transitTextPuncPerc',
'transitTextDigitsPerc']

Estimation with Epsilon Support Vector Regression

In [None]:
#This one will freeze

In [None]:
estimator1 = SVR(kernel="linear")
selector1 = RFECV(estimator1, step=3, cv=5)
selector1 = selector1.fit(X[include], y)

In [None]:
selector1.support_ 

In [None]:
selector1.ranking_

In [None]:
for i,k in zip(X.columns, selector1.ranking_):
    print i,k

Estimation with Linear Regression

In [17]:
estimator2 = LinearRegression()
selector2 = RFECV(estimator2, step=3, cv=5)
selector2 = selector2.fit(X[include], y)

In [18]:
selector2.support_ 

array([ True,  True,  True, False, False, False, False, False, False,
       False, False, False, False,  True,  True,  True,  True,  True,
        True, False,  True, False,  True, False, False, False,  True,  True], dtype=bool)

In [19]:
selector2.ranking_

array([1, 1, 1, 5, 5, 2, 6, 4, 3, 6, 6, 4, 2, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1,
       5, 2, 4, 1, 1])

In [20]:
for i,k in zip(X.columns, selector2.ranking_):
    print i,k

id 1
host_id 1
host_listings_count 1
host_total_listings_count 5
zipcode 5
latitude 2
longitude 6
accommodates 4
bathrooms 3
bedrooms 6
beds 6
square_feet 4
guests_included 2
minimum_nights 1
maximum_nights 1
availability_30 1
availability_60 1
availability_90 1
availability_365 1
number_of_reviews 3
review_scores_rating 1
review_scores_accuracy 3
review_scores_cleanliness 1
review_scores_checkin 5
review_scores_communication 2
review_scores_location 4
review_scores_value 1
calculated_host_listings_count 1
