In [4]:
import pandas as pd
import numpy as np
import sklearn.linear_model

### Load data sets

In [13]:
# Explicit column dtypes for the King County house-sales CSV files.
# Identifier-like columns (id, zipcode, date) are read as strings.
dtype_dict = {
    'id': str,
    'date': str,
    'zipcode': str,
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'floors': float,
    'lat': float,
    'long': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

### Create new features

In [14]:
from math import log, sqrt

# Engineered features: square roots of the two area columns and squares of
# the bedroom/floor counts. np.sqrt and ** operate on the whole column at
# once instead of calling a Python function per row via .apply().
sales['sqft_living_sqrt'] = np.sqrt(sales['sqft_living'])
sales['sqft_lot_sqrt'] = np.sqrt(sales['sqft_lot'])
sales['bedrooms_square'] = sales['bedrooms'] ** 2
sales['floors_square'] = sales['floors'] ** 2

### Creating first Lasso model

In [17]:
from sklearn import linear_model  # using scikit-learn

# Candidate predictors: raw columns plus the engineered terms.
# The order matters: later cells pair this list positionally with `coef_`.
all_features = [
    'bedrooms', 'bedrooms_square',
    'bathrooms',
    'sqft_living', 'sqft_living_sqrt',
    'sqft_lot', 'sqft_lot_sqrt',
    'floors', 'floors_square',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
]

# Lasso on the full data set with an L1 penalty of 500 and normalized features.
model_all = linear_model.Lasso(alpha=5e2, normalize=True)
model_all.fit(sales[all_features], sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute='auto', tol=0.0001,
   warm_start=False)

## Quiz Question: Which features have been chosen by LASSO, i.e. which features were assigned nonzero weights?

In [20]:
# Learned weights of the first Lasso model, one per entry of `all_features`.
# Parenthesized print gives the same output on Python 2 and also runs on Python 3.
print(model_all.coef_)

[     0.              0.              0.            134.43931396      0.
      0.              0.              0.              0.              0.
  24750.00458561      0.          61749.10309071      0.              0.
     -0.              0.        ]


In [21]:
# Pair each feature name with its weight. list() materializes the pairs so the
# output is the same list of tuples even where zip() returns a lazy iterator.
print(list(zip(all_features, model_all.coef_)))

[('bedrooms', 0.0), ('bedrooms_square', 0.0), ('bathrooms', 0.0), ('sqft_living', 134.43931395540966), ('sqft_living_sqrt', 0.0), ('sqft_lot', 0.0), ('sqft_lot_sqrt', 0.0), ('floors', 0.0), ('floors_square', 0.0), ('waterfront', 0.0), ('view', 24750.004585614093), ('condition', 0.0), ('grade', 61749.103090711666), ('sqft_above', 0.0), ('sqft_basement', 0.0), ('yr_built', -0.0), ('yr_renovated', 0.0)]


## Part 2

In [22]:
# Pre-split train / validation / test files, read with the same dtypes as `sales`.
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)

In [23]:
# Add the same four engineered columns to every split. One loop replaces
# three copy-pasted blocks so the splits cannot drift apart, and the
# column-wise np.sqrt / ** avoid a per-row Python call through .apply().
for split in (testing, training, validation):
    split['sqft_living_sqrt'] = np.sqrt(split['sqft_living'])
    split['sqft_lot_sqrt'] = np.sqrt(split['sqft_lot'])
    split['bedrooms_square'] = split['bedrooms'] ** 2
    split['floors_square'] = split['floors'] ** 2

In [24]:
# 13 candidate L1 penalties, log-spaced from 10^1 to 10^7.
l1_penalty = np.logspace(start=1, stop=7, num=13)

In [25]:
# Show the candidate penalties (parenthesized print also runs on Python 3).
print(l1_penalty)

[  1.00000000e+01   3.16227766e+01   1.00000000e+02   3.16227766e+02
   1.00000000e+03   3.16227766e+03   1.00000000e+04   3.16227766e+04
   1.00000000e+05   3.16227766e+05   1.00000000e+06   3.16227766e+06
   1.00000000e+07]


In [35]:
def fit_and_rss(train, valid, l1_penalty):
    """Fit a Lasso model on `train` and score it on `valid`.

    The model uses `l1_penalty` as alpha with normalize=True. It is fit on the
    columns named in the module-level `all_features`, with 'price' as target.

    Returns a tuple (l1_penalty, rss, model), where rss is the sum of squared
    differences between the predictions on `valid` and valid['price'].
    """
    lasso = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    lasso.fit(train[all_features], train['price'])
    predicted = lasso.predict(valid[all_features])

    # Accumulate squared residuals in row order.
    residual_sq_total = 0
    for estimate, actual in zip(predicted, valid['price']):
        residual_sq_total += (estimate - actual) ** 2

    return (l1_penalty, residual_sq_total, lasso)

In [36]:
# One fit per candidate penalty, trained on `training` and scored on `validation`.
ll = [fit_and_rss(training, validation, penalty) for penalty in l1_penalty]

In [37]:
# Each entry is (l1_penalty, validation RSS, fitted model).
# Parenthesized print gives the same output on Python 2 and also runs on Python 3.
print(ll)

[(10.0, 398213327300134.44, Lasso(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute='auto', tol=0.0001,
   warm_start=False)), (31.622776601683793, 399041900253351.38, Lasso(alpha=31.622776601683793, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute='auto',
   tol=0.0001, warm_start=False)), (100.0, 429791604072558.12, Lasso(alpha=100.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute='auto', tol=0.0001,
   warm_start=False)), (316.22776601683796, 463739831045119.88, Lasso(alpha=316.22776601683796, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute='auto',
   tol=0.0001, warm_start=False)), (1000.0, 645898733633810.75, Lasso(alpha=1000.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute='auto', tol=0.0001,
   warm_start=False)), (3162.2776601683795, 122250

## Quiz Question: Which was the best value for the l1_penalty, i.e. which value of l1_penalty produced the lowest RSS on VALIDATION data?

In [39]:
# Pick the (l1_penalty, rss, model) entry with the lowest validation RSS.
# min(..., key=...) replaces reduce(), which is not a builtin on Python 3.
best = min(ll, key=lambda entry: entry[1])
print(best)

(10.0, 398213327300134.44, Lasso(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute='auto', tol=0.0001,
   warm_start=False))


## Quiz Question: Using the best L1 penalty, how many nonzero weights do you have? Count the number of nonzero coefficients first, and add 1 if the intercept is also nonzero.

In [40]:
# Weights of the best model (third element of the `best` tuple).
print(best[2].coef_)

[ -1.61445628e+04   3.73245384e+02   5.08412433e+04   6.17853560e+02
  -4.44113549e+04   7.85623065e-01  -7.01194765e+02  -0.00000000e+00
   5.01420046e+03   6.19488752e+05   3.80418557e+04   2.49987718e+04
   1.28716235e+05   0.00000000e+00   0.00000000e+00  -3.29383118e+03
   1.00573209e+01]


In [43]:
# Intercept of the best model; it counts as one more weight if it is nonzero.
print(best[2].intercept_)

6630155.66863


In [44]:
# Count the nonzero weights directly rather than subtracting a hand-counted
# number of zeros from len(coef_): the printed coef_ above contains three
# zero entries, so a hard-coded "- 2" over-counts by one.
print(np.count_nonzero(best[2].coef_) + np.count_nonzero(best[2].intercept_))

15


In [46]:
# Nonzero coefficients plus one more if the intercept is nonzero.
sum(np.count_nonzero(term) for term in (best[2].coef_, best[2].intercept_))

15

## Explore feature selection using Lasso Regression

#### Exploring large range of l1_penalty

In [47]:
# Target sparsity: the number of nonzero weights (intercept included) we want.
max_nonzeros = 7
# 20 log-spaced penalties from 10^1 to 10^4 for the coarse search.
l1_penalty_large = np.logspace(1, 4, num=20)
# Parenthesized print gives the same output on Python 2 and also runs on Python 3.
print(l1_penalty_large)

[    10.             14.38449888     20.69138081     29.76351442
     42.81332399     61.58482111     88.58667904    127.42749857
    183.29807108    263.66508987    379.26901907    545.55947812
    784.75997035   1128.83789168   1623.77673919   2335.72146909
   3359.81828628   4832.93023857   6951.92796178  10000.        ]


In [52]:
def fit_and_extrac_useful_info(train, l1_penalty):
    """Fit a Lasso model on `train` and report how sparse it is.

    The model uses `l1_penalty` as alpha with normalize=True. It is fit on the
    columns named in the module-level `all_features`, with 'price' as target.

    Returns a tuple (nonzero_count, model, l1_penalty), where nonzero_count is
    the number of nonzero coefficients plus the number of nonzero intercepts.
    """
    lasso = sklearn.linear_model.Lasso(alpha=l1_penalty, normalize=True)
    lasso.fit(train[all_features], train['price'])

    nonzero_weights = np.count_nonzero(lasso.coef_)
    nonzero_intercept = np.count_nonzero(lasso.intercept_)

    return (nonzero_weights + nonzero_intercept, lasso, l1_penalty)

In [53]:
# One fit per coarse-grid penalty; each entry is (nonzero_count, model, l1_penalty).
ll = [fit_and_extrac_useful_info(training, penalty) for penalty in l1_penalty_large]

In [54]:
# Each entry is (nonzero_count, fitted model, l1_penalty).
# Parenthesized print gives the same output on Python 2 and also runs on Python 3.
print(ll)

[(15, Lasso(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute='auto', tol=0.0001,
   warm_start=False), 10.0), (15, Lasso(alpha=14.384498882876629, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute='auto',
   tol=0.0001, warm_start=False), 14.384498882876629), (15, Lasso(alpha=20.691380811147901, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute='auto',
   tol=0.0001, warm_start=False), 20.691380811147901), (15, Lasso(alpha=29.763514416313178, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute='auto',
   tol=0.0001, warm_start=False), 29.763514416313178), (13, Lasso(alpha=42.813323987193932, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute='auto',
   tol=0.0001, warm_start=False), 42.813323987193932), (12, Lasso(alpha=61.584821106602639, copy_X=True, fit_int

In [58]:
# Bracket the target sparsity. Each entry of `ll` is (nonzero_count, model, l1_penalty).
#   l1_penalty_max: smallest penalty whose model has FEWER than max_nonzeros nonzeros.
#   l1_penalty_min: largest penalty whose model has MORE than max_nonzeros nonzeros.
# min()/max() over a generator replace reduce(filter(...)); reduce is not a
# builtin on Python 3. Either line raises ValueError if no entry qualifies.
l1_penalty_max = min(penalty for nonzeros, _model, penalty in ll if nonzeros < max_nonzeros)
l1_penalty_min = max(penalty for nonzeros, _model, penalty in ll if nonzeros > max_nonzeros)

## Quiz Question: What values did you find for l1_penalty_min and l1_penalty_max?

In [59]:
# Upper bound first, then lower bound (parenthesized print also runs on Python 3).
print(l1_penalty_max)
print(l1_penalty_min)

263.665089873
127.42749857


#### Exploring narrower range of l1_penalty

In [60]:
# 20 evenly spaced penalties between the two bracketing values.
l1_penalty_narrow = np.linspace(l1_penalty_min, l1_penalty_max, num=20)

In [61]:
def fit_and_rss(train, valid, l1_penalty):
    """Fit a Lasso model on `train`; report validation RSS and sparsity.

    Note: this redefines the earlier `fit_and_rss` in this notebook and
    returns a different tuple.

    The model uses `l1_penalty` as alpha with normalize=True. It is fit on the
    columns named in the module-level `all_features`, with 'price' as target.

    Returns a tuple (rss, nonzero_count, model): rss is the sum of squared
    differences between predictions on `valid` and valid['price'];
    nonzero_count is the number of nonzero coefficients plus the number of
    nonzero intercepts.
    """
    lasso = sklearn.linear_model.Lasso(alpha=l1_penalty, normalize=True)
    lasso.fit(train[all_features], train['price'])

    nonzero_total = np.count_nonzero(lasso.coef_) + np.count_nonzero(lasso.intercept_)

    predicted = lasso.predict(valid[all_features])
    squared_errors = ((estimate - actual) ** 2
                      for estimate, actual in zip(predicted, valid['price']))

    return (sum(squared_errors), nonzero_total, lasso)

In [62]:
# One fit per narrow-grid penalty; each entry is (rss, nonzero_count, model).
ll = [fit_and_rss(training, validation, penalty) for penalty in l1_penalty_narrow]

In [65]:
# Keep only the fits with exactly `max_nonzeros` nonzero weights, then take
# the one with the lowest validation RSS. Entries are (rss, nonzero_count, model).
# A list comprehension and min() replace filter()/reduce(): on Python 3
# filter() is lazy and reduce() is not a builtin.
ll = [entry for entry in ll if entry[1] == max_nonzeros]
best = min(ll, key=lambda entry: entry[0])

In [66]:
# Show the winning (rss, nonzero_count, model) entry.
print(best)

(440037365263316.75, 7, Lasso(alpha=156.10909673930755, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute='auto',
   tol=0.0001, warm_start=False))


## Quiz Question: What value of l1_penalty in our narrow range has the lowest RSS on the VALIDATION set and has sparsity equal to `max_nonzeros`?

In [69]:
# The L1 penalty (alpha) of the selected model.
print(best[2].alpha)

156.109096739


## Quiz Question: What features in this model have non-zero coefficients?

In [75]:
# Pair each feature name with its weight in the selected model, then show only
# the nonzero ones. Explicit lists keep the displayed result a list of tuples
# even where zip()/filter() would return lazy iterators.
features = list(zip(all_features, best[2].coef_))
[(name, weight) for name, weight in features if weight != 0.0]

[('bathrooms', 10610.890284398854),
 ('sqft_living', 163.38025164762911),
 ('waterfront', 506451.68711485498),
 ('view', 41960.043554851894),
 ('grade', 116253.55369970684),
 ('yr_built', -2612.2348803574864)]