In [1]:
import pandas as pd

In [2]:
# Explicit dtype for each CSV column; this mapping is handed to pd.read_csv
# so that 'id', 'date' and 'zipcode' are read as strings and the remaining
# columns as int or float as listed.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [4]:
# Load the complete kc_house_data.csv table using the dtypes in dtype_dict.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs where this F: drive layout exists.
data = pd.read_csv(
    filepath_or_buffer='F:\\mlspec\\lasso\\kc_house_data.csv',
    dtype=dtype_dict,
)

In [5]:
from math import log, sqrt

In [6]:
# Derived columns for the full data set, added to `data` in place:
#   *_sqrt   -> element-wise square root of the two area columns
#   *_square -> element-wise square of the bedroom and floor counts
for source_col, sqrt_col in (('sqft_living', 'sqft_living_sqrt'),
                             ('sqft_lot', 'sqft_lot_sqrt')):
    data[sqrt_col] = data[source_col].apply(sqrt)

for source_col, square_col in (('bedrooms', 'bedrooms_square'),
                               ('floors', 'floors_square')):
    data[square_col] = data[source_col] * data[source_col]

In [7]:
from sklearn import linear_model

In [9]:
# Lasso regressor with L1 penalty alpha = 5e2 (500) and normalize=True.
# NOTE(review): `normalize` is not accepted by recent scikit-learn releases —
# confirm the installed version before re-running.
model=linear_model.Lasso(alpha=5e2,normalize=True)

In [13]:
# The 17 predictor columns used by every model in this notebook: the raw
# columns plus the engineered *_square / *_sqrt columns. The order here is
# the order of the columns handed to fit().
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

In [15]:
# Fit the alpha=500 Lasso on the full data set: predictors are the
# all_features columns, target is 'price'. fit() returns the estimator, so
# its repr is shown as the cell output.
model.fit(data[all_features],data['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [17]:
# Show the fitted weights (one per column of all_features); entries equal to
# zero mark features the L1 penalty removed from the model.
model.coef_

array([     0.        ,      0.        ,      0.        ,    134.43931396,
            0.        ,      0.        ,      0.        ,      0.        ,
            0.        ,      0.        ,  24750.00458561,      0.        ,
        61749.10309071,      0.        ,      0.        ,     -0.        ,
            0.        ])

In [18]:
# Load the train / validation / test CSVs with the same explicit dtypes as
# the full table.
# NOTE(review): the paths are absolute and machine-specific.
training = pd.read_csv(
    filepath_or_buffer='F:\\mlspec\\lasso\\wk3_kc_house_train_data.csv',
    dtype=dtype_dict,
)
validation = pd.read_csv(
    filepath_or_buffer='F:\\mlspec\\lasso\\wk3_kc_house_valid_data.csv',
    dtype=dtype_dict,
)
testing = pd.read_csv(
    filepath_or_buffer='F:\\mlspec\\lasso\\wk3_kc_house_test_data.csv',
    dtype=dtype_dict,
)

In [19]:
def _add_engineered_features(frame):
    """Add the four derived feature columns to ``frame`` in place.

    Columns written:
      * ``sqft_living_sqrt`` -- square root of ``sqft_living``
      * ``sqft_lot_sqrt``    -- square root of ``sqft_lot``
      * ``bedrooms_square``  -- ``bedrooms`` multiplied by itself
      * ``floors_square``    -- ``floors`` multiplied by itself

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'sqft_living', 'sqft_lot', 'bedrooms' and 'floors'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for convenience.
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# One helper applied to each split replaces three copy-pasted stanzas, so the
# splits cannot drift apart in how their features are derived.
for split in (testing, training, validation):
    _add_engineered_features(split)

In [20]:
import numpy as np

In [23]:
# Penalty sweep: for each of 13 log-spaced L1 penalties between 1e1 and 1e7,
# fit a Lasso on the training split and store the residual sum of squares it
# achieves on the validation split, keyed by the penalty.
rss = dict()
for l1_penalty in np.logspace(1, 7, num=13):
    # fit() returns the estimator itself, so construction and fitting chain.
    model1 = linear_model.Lasso(alpha=l1_penalty, normalize=True).fit(
        training[all_features], training['price'])
    predicted = model1.predict(validation[all_features])
    rss[l1_penalty] = sum((validation['price'] - predicted) ** 2)

In [24]:
# Show the validation RSS recorded for every penalty in the sweep.
rss

{10.0: 398213327300134.37,
 31.622776601683793: 399041900253348.19,
 100.0: 429791604072557.87,
 316.2277660168379: 463739831045119.5,
 1000.0: 645898733633810.37,
 3162.2776601683795: 1222506859427156.7,
 10000.0: 1222506859427156.7,
 31622.776601683792: 1222506859427156.7,
 100000.0: 1222506859427156.7,
 316227.76601683791: 1222506859427156.7,
 1000000.0: 1222506859427156.7,
 3162277.6601683791: 1222506859427156.7,
 10000000.0: 1222506859427156.7}

In [25]:
# Penalty with the lowest validation RSS. Iterating a dict yields its keys,
# so a bare min(rss) would return the smallest penalty tried regardless of
# its error; key=rss.get ranks the penalties by their recorded RSS instead.
min(rss, key=rss.get)

10.0

In [26]:
# Build the final model with the penalty that gave the lowest validation RSS.
# key=rss.get makes min() compare RSS values; without it min() would compare
# the dict keys and simply pick the smallest penalty in the sweep.
model_test = linear_model.Lasso(alpha=min(rss, key=rss.get), normalize=True)

In [27]:
# Fit the selected-penalty model on the training split only.
model_test.fit(training[all_features],training['price'])

Lasso(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [28]:
# Predict prices for the held-out test split with the selected-penalty model.
predict_test=model_test.predict(testing[all_features])

In [29]:
# Residual sum of squares of those predictions against the test-split prices.
rss_test=sum((testing['price']-predict_test)**2)

In [30]:
# Show the test-split RSS.
rss_test

98467402552698.797

In [31]:
# Show the weights of the selected-penalty model (one per all_features column).
model_test.coef_

array([ -1.61445628e+04,   3.73245384e+02,   5.08412433e+04,
         6.17853560e+02,  -4.44113549e+04,   7.85623065e-01,
        -7.01194765e+02,  -0.00000000e+00,   5.01420046e+03,
         6.19488752e+05,   3.80418557e+04,   2.49987718e+04,
         1.28716235e+05,   0.00000000e+00,   0.00000000e+00,
        -3.29383118e+03,   1.00573209e+01])

In [32]:
# Show the fitted intercept of the selected-penalty model.
model_test.intercept_

6630155.6686283704

In [33]:
# Number of nonzero parameters in the selected model, counting the intercept
# together with the feature weights.
np.count_nonzero(np.append(model_test.coef_, model_test.intercept_))

15

In [34]:
# Target sparsity: the number of nonzero parameters (weights plus intercept)
# that the searches below compare each fitted model against.
max_nonzeros = 7

In [46]:
# Coarse search over 20 log-spaced penalties between 1e1 and 1e4. For each
# penalty a Lasso is fit on the training split and its nonzero parameter
# count (weights plus intercept) is stored in `nonzero`.
#   maxpen collects penalties whose model has MORE than max_nonzeros nonzeros
#   minpen collects penalties whose model has FEWER than max_nonzeros nonzeros
# Penalties that hit max_nonzeros exactly go into neither list.
nonzero = dict()
maxpen, minpen = [], []
for l1_penalty in np.logspace(1, 4, num=20):
    # fit() returns the estimator itself, so construction and fitting chain.
    model2 = linear_model.Lasso(alpha=l1_penalty, normalize=True).fit(
        training[all_features], training['price'])
    nonzero[l1_penalty] = np.count_nonzero(
        np.append(model2.coef_, model2.intercept_))
    if nonzero[l1_penalty] < max_nonzeros:
        minpen.append(l1_penalty)
    elif nonzero[l1_penalty] > max_nonzeros:
        maxpen.append(l1_penalty)

In [47]:
# Show the nonzero-parameter count found for each penalty in the coarse search.
nonzero

{10.0: 15,
 14.384498882876629: 15,
 20.691380811147901: 15,
 29.763514416313175: 15,
 42.813323987193932: 13,
 61.584821106602639: 12,
 88.586679041008225: 11,
 127.42749857031335: 10,
 183.29807108324357: 7,
 263.66508987303581: 6,
 379.26901907322497: 6,
 545.55947811685144: 6,
 784.75997035146065: 5,
 1128.8378916846884: 3,
 1623.776739188721: 3,
 2335.7214690901214: 2,
 3359.8182862837812: 1,
 4832.9302385717519: 1,
 6951.9279617756056: 1,
 10000.0: 1}

In [48]:
# Show the penalties whose models had more than max_nonzeros nonzero parameters.
maxpen

[10.0,
 14.384498882876629,
 20.691380811147901,
 29.763514416313175,
 42.813323987193932,
 61.584821106602639,
 88.586679041008225,
 127.42749857031335]

In [49]:
# Show the penalties whose models had fewer than max_nonzeros nonzero parameters.
minpen

[263.66508987303581,
 379.26901907322497,
 545.55947811685144,
 784.75997035146065,
 1128.8378916846884,
 1623.776739188721,
 2335.7214690901214,
 3359.8182862837812,
 4832.9302385717519,
 6951.9279617756056,
 10000.0]

In [50]:
# Lower end of the fine-search interval: the largest penalty that still left
# more than max_nonzeros nonzero parameters.
l1_penalty_min=max(maxpen)

In [51]:
# Upper end of the fine-search interval: the smallest penalty that already
# left fewer than max_nonzeros nonzero parameters.
l1_penalty_max=min(minpen)

In [52]:
# Show the lower bound of the fine-search interval.
l1_penalty_min

127.42749857031335

In [53]:
# Show the upper bound of the fine-search interval.
l1_penalty_max

263.66508987303581

In [56]:
# Fine search: 20 evenly spaced penalties between l1_penalty_min and
# l1_penalty_max. Each is fit on the training split and scored by RSS on the
# validation split.
#   rss1   -> validation RSS for every penalty tried
#   minrss -> the subset whose model has exactly max_nonzeros nonzero
#             parameters (weights plus intercept)
rss1 = dict()
minrss = dict()
for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, 20):
    # fit() returns the estimator itself, so construction and fitting chain.
    model3 = linear_model.Lasso(alpha=l1_penalty, normalize=True).fit(
        training[all_features], training['price'])
    predicted = model3.predict(validation[all_features])
    rss1[l1_penalty] = sum((validation['price'] - predicted) ** 2)
    if np.count_nonzero(np.append(model3.coef_, model3.intercept_)) != max_nonzeros:
        continue
    minrss[l1_penalty] = rss1[l1_penalty]

In [57]:
# Show the validation RSS of the penalties that gave exactly max_nonzeros
# nonzero parameters.
minrss

{156.10909673930755: 440037365263317.0,
 163.27949628155611: 440777489641605.19,
 170.44989582380464: 441566698090139.0,
 177.6202953660532: 442406413188664.69,
 184.79069490830176: 443296716874312.81,
 191.96109445055032: 444239780526141.0,
 199.13149399279888: 445230739842613.81}

In [58]:
# Among the penalties with exactly max_nonzeros nonzeros, pick the one with
# the lowest validation RSS. A bare min(minrss) would compare the dict keys
# and return the smallest penalty whatever its RSS; key=minrss.get ranks the
# penalties by their recorded RSS.
min(minrss, key=minrss.get)

156.10909673930755

In [60]:
# Refit on the training split with the sparsity-constrained penalty that has
# the lowest validation RSS. key=minrss.get makes min() compare RSS values;
# without it min() would compare the dict keys and pick the smallest penalty.
model4 = linear_model.Lasso(alpha=min(minrss, key=minrss.get), normalize=True)
model4.fit(training[all_features], training['price'])

Lasso(alpha=156.10909673930755, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [61]:
# Show the weights of the sparsity-constrained model; nonzero entries mark
# the features it kept.
model4.coef_

array([ -0.00000000e+00,  -0.00000000e+00,   1.06108903e+04,
         1.63380252e+02,   0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         5.06451687e+05,   4.19600436e+04,   0.00000000e+00,
         1.16253554e+05,   0.00000000e+00,   0.00000000e+00,
        -2.61223488e+03,   0.00000000e+00])

In [62]:
# Show the feature names so each position in the coefficient array above can
# be matched to its column.
all_features

['bedrooms',
 'bedrooms_square',
 'bathrooms',
 'sqft_living',
 'sqft_living_sqrt',
 'sqft_lot',
 'sqft_lot_sqrt',
 'floors',
 'floors_square',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated']