# Feature Selection and LASSO 

# Fire up Graphlab Create

In [1]:
import graphlab

# Load in house sales data

The dataset contains house sales from King County, the region where the city of Seattle, WA is located.

In [2]:
# Load the house sales data from the local 'kc_house_data.gl' path into an SFrame.
sales = graphlab.SFrame('kc_house_data.gl')

This non-commercial license of GraphLab Create for academic use is assigned to nanlee_89@yahoo.com and will expire on December 07, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1529352712.log


# Create new features

In [3]:
from math import log, sqrt
# Square-root versions of the two area columns.
for area_column in ('sqft_living', 'sqft_lot'):
    sales[area_column + '_sqrt'] = sales[area_column].apply(sqrt)

# Squared bedroom count.
sales['bedrooms_square'] = sales['bedrooms'] * sales['bedrooms']

# 'floors' is stored as a string column in this dataset; cast it to float
# first so that the squared feature can be computed from it.
sales['floors'] = sales['floors'].astype(float)
sales['floors_square'] = sales['floors'] * sales['floors']

* Squaring bedrooms increases the separation between houses with few bedrooms (e.g. 1) and houses with many bedrooms (e.g. 4), since 1^2 = 1 but 4^2 = 16. Consequently, this variable will mostly affect houses with many bedrooms.
* On the other hand, taking the square root of sqft_living decreases the separation between big and small houses. The owner may not be exactly twice as happy with a house that is twice as big.

# Linear regression model with L1 penalty

In [4]:
# Inputs offered to the regression models: the raw columns together with the
# engineered square / square-root columns, one name per line.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

In [5]:
# Fit on the full `sales` data with a large L1 penalty and no L2 penalty.
model_all = graphlab.linear_regression.create(
    sales,
    target='price',
    features=all_features,
    l1_penalty=1e10,
    l2_penalty=0.,
    validation_set=None,
    verbose=False)

## Explore weights in the model

In [6]:
# Pull the coefficient table out of the fitted model.
model_all_weights = model_all.get('coefficients')
# Display it sorted on the 'value' column, largest first.
model_all_weights.sort('value', ascending=False)

name,index,value,stderr
(intercept),,274873.05595,
bathrooms,,8468.53108691,
grade,,842.068034898,
sqft_living_sqrt,,350.060553386,
sqft_living,,24.4207209824,
sqft_above,,20.0247224171,
sqft_lot,,0.0,
sqft_lot_sqrt,,0.0,
floors,,0.0,
floors_square,,0.0,


Note that a majority of the weights have been set to zero. So by setting an L1 penalty that's large enough, we are performing a subset selection. 


# Selecting an L1 penalty

In [7]:
# First split: random_split with fraction .9 and seed 1 separates the test
# set from the data used for fitting and model selection.
training_and_validation, testing = sales.random_split(.9, seed=1)

# Second split: the non-test part is divided again (fraction 0.5, seed 1)
# into the training set and the validation set.
training, validation = training_and_validation.random_split(0.5, seed=1)

In [8]:
import numpy as np

In [9]:
def cross_validation_error(l1_penalty, train, validation, output_name, features_list):
    """Fit an L1-penalised linear regression and score it on held-out data.

    A model is created from `train` with the given `l1_penalty` (the L2
    penalty is fixed at 0 and no validation set is passed to the fit), then
    used to predict on `validation`.

    Parameters
    ----------
    l1_penalty : value passed as `l1_penalty` to `linear_regression.create`.
    train : data the model is fitted on.
    validation : data the fitted model predicts on; also indexed by
        `output_name` to obtain the observed target values.
    output_name : name of the target column.
    features_list : feature names passed as `features`.

    Returns
    -------
    The sum of squared differences between the predictions on `validation`
    and `validation[output_name]`.
    """
    fit_options = dict(
        target=output_name,
        features=features_list,
        l1_penalty=l1_penalty,
        l2_penalty=0.,
        validation_set=None,
        verbose=False)
    fitted = graphlab.linear_regression.create(train, **fit_options)

    # Residual sum of squares on the held-out data.
    residuals = fitted.predict(validation) - validation[output_name]
    return (residuals ** 2).sum()

In [10]:
# Candidate penalties: the 13 values produced by np.logspace(1, 7, num=13).
l1_penalty_values = list(np.logspace(1, 7, num=13))

# Validation error of each candidate, stored in the same order as
# l1_penalty_values so the two lists can be zipped together.
l1_penalty_validation_error = []
for l1_penalty in l1_penalty_values:
    l1_penalty_validation_error.append(
        cross_validation_error(l1_penalty, training, validation, 'price', all_features))

## The best value for the `l1_penalty`

In [11]:
# Find the smallest validation error once, rather than re-scanning the whole
# error list on every loop iteration.
lowest_validation_error = min(l1_penalty_validation_error)

# Print every penalty that attains that minimum (ties are all reported).
for l1, error in zip(l1_penalty_values, l1_penalty_validation_error):
    if error == lowest_validation_error:
        print(l1)


10.0


### Number of nonzero weights using selected best l1 value

In [12]:
# Take the penalty with the lowest validation error from the sweep instead of
# hard-coding it, so this cell stays correct if the sweep or the data changes.
# On ties, min() over (error, penalty) pairs picks the smallest penalty.
best_validation_error, best_l1_penalty = min(
    zip(l1_penalty_validation_error, l1_penalty_values))

# NOTE(review): this model is fitted on the full `sales` data, whereas the
# penalty was selected using models fitted on `training` — confirm that
# refitting on all rows (including the test split) is intended.
model_best_l1 = graphlab.linear_regression.create(sales, target='price', features=all_features,
                                              validation_set=None, verbose=False,
                                              l2_penalty=0., l1_penalty=best_l1_penalty)

In [13]:
# Display the coefficient table of model_best_l1 sorted on the 'value' column.
model_best_l1.get('coefficients').sort('value')

name,index,value,stderr
sqft_lot,,-0.0168499198461,
yr_built,,10.1881669529,
sqft_living,,37.4911504798,
sqft_above,,41.3497390696,
yr_renovated,,58.7115840166,
sqft_basement,,118.23242135,
sqft_lot_sqrt,,149.569423985,
sqft_living_sqrt,,1109.39597073,
bedrooms_square,,1479.73787423,
grade,,6205.64105779,


In [14]:
# Count the nonzero entries in the 'value' column of the coefficient table.
model_best_l1['coefficients']['value'].nnz()

18

# Limit the number of nonzero weights

In [15]:
# Sparsity target: later cells compare each model's nonzero-coefficient count
# against this value.
max_nonzeros = 7

## Exploring the larger range of values to find a narrow range with the desired sparsity


In [16]:
# Coarse search grid: 20 log-spaced penalties running from 1e8 up to 1e10.
l1_penalty_values = np.logspace(start=8, stop=10, num=20)

In [17]:
# Per-penalty results, kept in the same order as l1_penalty_values.
l1_penalty_validation_error = []
non_zero_parameters = []

# Iterate over l1_penalty_values itself (rather than rebuilding the grid here)
# so the result lists are guaranteed to line up with the penalties they are
# later zipped against.
for l1_penalty in l1_penalty_values:
    model = graphlab.linear_regression.create(training, target='price',
                                              features=all_features,
                                              verbose=False,
                                              l1_penalty=l1_penalty,
                                              l2_penalty=0.,
                                              validation_set=None)

    # Residual sum of squares of the fitted model on the validation set.
    validation_predicted = model.predict(validation)
    validation_error = ((validation_predicted - validation['price'])**2).sum()

    # Number of nonzero entries in the coefficient 'value' column.
    model_non_zero_weights = model['coefficients']['value'].nnz()

    l1_penalty_validation_error.append(validation_error)
    non_zero_parameters.append(model_non_zero_weights)

### Explore models with number of nonzero weights and corresponding l1 value

In [18]:
# Show each penalty next to the nonzero-coefficient count of its model.
for penalty, nonzero_count in zip(l1_penalty_values, non_zero_parameters):
    print (penalty, nonzero_count)

(100000000.0, 18)
(127427498.57031322, 18)
(162377673.91887242, 18)
(206913808.111479, 18)
(263665089.87303555, 17)
(335981828.6283788, 17)
(428133239.8719396, 17)
(545559478.1168514, 17)
(695192796.1775591, 17)
(885866790.4100832, 16)
(1128837891.6846883, 15)
(1438449888.2876658, 15)
(1832980710.8324375, 13)
(2335721469.0901213, 12)
(2976351441.6313133, 10)
(3792690190.7322536, 6)
(4832930238.571753, 5)
(6158482110.6602545, 3)
(7847599703.514623, 1)
(10000000000.0, 1)


Out of this large range, we want to find the two ends of our desired narrow range of l1_penalty. At one end, we will have l1_penalty values that have too few non-zeros, and at the other end, we will have an l1_penalty that has too many non-zeros.

* The largest l1_penalty that has more non-zeros than `max_nonzeros`
* The smallest l1_penalty that has fewer non-zeros than `max_nonzeros`

In [19]:
# Penalties whose model has MORE nonzero coefficients than max_nonzeros.
l1_penalty_min_range = []
# Penalties whose model has FEWER nonzero coefficients than max_nonzeros.
l1_penalty_max_range = []

for l1, nonzero in zip(l1_penalty_values, non_zero_parameters):
    if nonzero > max_nonzeros:
        l1_penalty_min_range.append(l1)
    elif nonzero < max_nonzeros:
        # Strictly fewer: a penalty that yields exactly max_nonzeros must lie
        # inside the narrow range, not become its upper bound.
        l1_penalty_max_range.append(l1)

# np.max / np.min fail on an empty list; raise the same exception type with a
# message that says which side of the search range is missing.
if not l1_penalty_min_range:
    raise ValueError('no l1_penalty in the searched range gives more than '
                     'max_nonzeros nonzero weights')
if not l1_penalty_max_range:
    raise ValueError('no l1_penalty in the searched range gives fewer than '
                     'max_nonzeros nonzero weights')

# Largest penalty that is still too dense / smallest penalty that is already
# too sparse: the two ends of the narrow search range.
l1_penalty_min = np.max(l1_penalty_min_range)
l1_penalty_max = np.min(l1_penalty_max_range)

In [20]:
# Report the two ends of the narrow search range (Python 2 print statements).
print "l1_penalty_min = ", l1_penalty_min
print "l1_penalty_max = ", l1_penalty_max


l1_penalty_min =  2976351441.6313133
l1_penalty_max =  3792690190.7322536


## Exploring the narrow range of values to find the solution with the right number of non-zeros that has lowest RSS on the validation set 

We will now explore the narrow region of `l1_penalty` values we found:

In [21]:
# Fine search grid: 20 evenly spaced penalties between the two bounds.
l1_penalty_values = np.linspace(l1_penalty_min, l1_penalty_max, num=20)

* For `l1_penalty` in `np.linspace(l1_penalty_min,l1_penalty_max,20)`:
    * Fit a regression model with a given `l1_penalty` on TRAIN data. Specify `l1_penalty=l1_penalty` and `l2_penalty=0.` in the parameter list. When you call `linear_regression.create()` make sure you set `validation_set = None`
    * Measure the RSS of the learned model on the VALIDATION set

Find the model that has the lowest RSS on the VALIDATION set and has sparsity *equal* to `max_nonzeros`.

In [22]:
# Per-penalty results, kept in the same order as l1_penalty_values.
l1_penalty_validation_error = []
non_zero_parameters = []

# Iterate over l1_penalty_values itself (rather than rebuilding the linspace
# grid here) so the result lists are guaranteed to line up with the penalties
# they are later zipped against.
for l1_penalty in l1_penalty_values:
    model = graphlab.linear_regression.create(training, target='price',
                                              features=all_features,
                                              verbose=False,
                                              l1_penalty=l1_penalty,
                                              l2_penalty=0.,
                                              validation_set=None)

    # Residual sum of squares of the fitted model on the validation set.
    validation_predicted = model.predict(validation)
    validation_error = ((validation_predicted - validation['price'])**2).sum()

    # Number of nonzero entries in the coefficient 'value' column.
    model_non_zero_weights = model['coefficients']['value'].nnz()

    l1_penalty_validation_error.append(validation_error)
    non_zero_parameters.append(model_non_zero_weights)

In [23]:
# Show penalty, nonzero-coefficient count and validation RSS for each model.
for penalty, nonzero_count, validation_rss in zip(l1_penalty_values,
                                                  non_zero_parameters,
                                                  l1_penalty_validation_error):
    print (penalty, nonzero_count, validation_rss)

(2976351441.6313133, 10, 966925692362084.5)
(3019316638.9524155, 10, 974019450084556.1)
(3062281836.2735176, 10, 981188367942452.8)
(3105247033.5946198, 10, 989328342459474.0)
(3148212230.915722, 10, 998783211265891.2)
(3191177428.236824, 10, 1008477167020094.0)
(3234142625.557926, 10, 1018298780553819.8)
(3277107822.8790283, 10, 1028247992205977.2)
(3320073020.2001305, 8, 1034616909232828.1)
(3363038217.5212326, 8, 1038554735941040.8)
(3406003414.8423347, 8, 1043237237871703.0)
(3448968612.163437, 7, 1046937488751711.1)
(3491933809.484539, 7, 1051147625612860.9)
(3534899006.805641, 7, 1055992735342999.1)
(3577864204.126743, 7, 1060799531763287.8)
(3620829401.447845, 6, 1065707689498230.1)
(3663794598.768947, 6, 1069464335425586.5)
(3706759796.0900493, 6, 1073504549585599.6)
(3749724993.4111514, 6, 1077632775581416.0)
(3792690190.7322536, 6, 1081867592324110.6)


In [24]:
# Select the penalty programmatically instead of pasting a number from the
# printed table: keep only the penalties whose model has exactly max_nonzeros
# nonzero coefficients and take the one with the lowest validation RSS.
final_candidates = [
    (rss, l1)
    for l1, nonzeros, rss in zip(l1_penalty_values,
                                 non_zero_parameters,
                                 l1_penalty_validation_error)
    if nonzeros == max_nonzeros
]
if not final_candidates:
    raise ValueError('no l1_penalty in the narrow range gives exactly '
                     'max_nonzeros nonzero weights')
final_validation_rss, final_l1_penalty = min(final_candidates)

# Refit on the training data with the selected penalty (no L2 penalty).
model_final = graphlab.linear_regression.create(training, target='price',
                                              features=all_features,
                                              verbose=False,
                                              l1_penalty=final_l1_penalty,
                                              l2_penalty=0.,
                                              validation_set=None)

In [25]:
# Display the final model's coefficient table sorted on 'value', largest first.
model_final['coefficients'].sort('value', ascending = False)

name,index,value,stderr
(intercept),,222253.192544,
bathrooms,,15873.9572593,
grade,,2899.42026975,
sqft_living_sqrt,,690.114773313,
bedrooms,,661.722717782,
sqft_living,,32.4102214513,
sqft_above,,30.0115753022,
sqft_lot_sqrt,,0.0,
floors,,0.0,
floors_square,,0.0,
