In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [6]:
# Read the full sales table and the train / test / validation tables from CSV.
data, train, test, valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        'kc_house_data.csv',
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_test_data.csv',
        'wk3_kc_house_valid_data.csv',
    )
)

In [7]:
def cols(sales):
    """Return a copy of ``sales`` with four derived feature columns added.

    Added columns:
      * ``sqft_living_sqrt`` - square root of ``sqft_living``
      * ``sqft_lot_sqrt``    - square root of ``sqft_lot``
      * ``bedrooms_square``  - ``bedrooms`` multiplied by itself
      * ``floors_square``    - ``floors`` multiplied by itself

    Parameters
    ----------
    sales : pandas.DataFrame
        Must contain the columns ``sqft_living``, ``sqft_lot``, ``bedrooms``
        and ``floors``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified, so re-running a
        cell that calls this function never changes its own input.

    Raises
    ------
    KeyError
        If one of the four source columns is missing.
    """
    # `assign` builds a new frame instead of writing into the caller's object.
    return sales.assign(
        sqft_living_sqrt=np.sqrt(sales['sqft_living']),
        sqft_lot_sqrt=np.sqrt(sales['sqft_lot']),
        bedrooms_square=sales['bedrooms'] * sales['bedrooms'],
        floors_square=sales['floors'] * sales['floors'],
    )

In [8]:
# Add the derived feature columns to every table used below.
data, train, test, valid = (cols(frame) for frame in (data, train, test, valid))

In [9]:
## model using all data

# Candidate inputs, grouped so each raw column sits next to its derived
# square / square-root companion.
all_features = [
    'bedrooms', 'bedrooms_square',
    'bathrooms',
    'sqft_living', 'sqft_living_sqrt',
    'sqft_lot', 'sqft_lot_sqrt',
    'floors', 'floors_square',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built', 'yr_renovated',
]

# L1-penalised regression on the whole data set.
# NOTE(review): `normalize=` is presumably only accepted by older scikit-learn
# releases — confirm against the version this notebook is pinned to.
model_all = Lasso(alpha=5e2, normalize=True)
model_all.fit(X=data[all_features], y=data['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [25]:
# One row per feature, holding the weight learned by `model_all`.
coef = model_all.coef_
df = pd.DataFrame(data=coef, index=all_features)
df

Unnamed: 0,0
bedrooms,0.0
bedrooms_square,0.0
bathrooms,0.0
sqft_living,134.439314
sqft_living_sqrt,0.0
sqft_lot,0.0
sqft_lot_sqrt,0.0
floors,0.0
floors_square,0.0
waterfront,0.0


In [26]:
## select best l1

# 13 candidate penalties, log-spaced from 1e1 to 1e7
l1_pen = np.logspace(1, 7, num=13)
err = []                        # validation RSS for every penalty, in l1_pen order
min_err, best_l1 = None, None   # running lowest RSS and the penalty that produced it
for l1 in l1_pen:
    # fit on the training table only
    model = Lasso(alpha=l1, normalize=True)
    model.fit(train[all_features], train['price'])

    # residual sum of squares on the validation table
    y = valid['price']
    y_pred = model.predict(valid[all_features])
    e = np.sum((y - y_pred) ** 2)
    err.append(e)

    # skip unless this is the first result or a strict improvement
    if min_err is not None and not e < min_err:
        continue
    min_err = e
    best_l1 = l1

In [27]:
# penalty from the sweep above that gave the lowest validation RSS
best_l1

10.0

In [30]:
# show the candidate penalties, then the validation RSS recorded for each (same order)
print(l1_pen)
print(err)

[1.00000000e+01 3.16227766e+01 1.00000000e+02 3.16227766e+02
 1.00000000e+03 3.16227766e+03 1.00000000e+04 3.16227766e+04
 1.00000000e+05 3.16227766e+05 1.00000000e+06 3.16227766e+06
 1.00000000e+07]
[398213327300134.9, 399041900253346.9, 429791604072559.6, 463739831045121.1, 645898733633800.8, 1222506859427163.0, 1222506859427163.0, 1222506859427163.0, 1222506859427163.0, 1222506859427163.0, 1222506859427163.0, 1222506859427163.0, 1222506859427163.0]


In [31]:
## compute rss on test using selected l1

# refit on the training table with the selected penalty
model = Lasso(alpha=best_l1, normalize=True)
model.fit(train[all_features], train['price'])

# residual sum of squares on the test table
y = test['price']
y_pred = model.predict(test[all_features])
e = np.sum((y - y_pred) ** 2)
print(e)

98467402552698.75


In [32]:
# One row per feature, holding the weight learned by the refitted `model`.
coef = model.coef_
df = pd.DataFrame(data=coef, index=all_features)
df

Unnamed: 0,0
bedrooms,-16144.562757
bedrooms_square,373.245384
bathrooms,50841.24334
sqft_living,617.85356
sqft_living_sqrt,-44411.354867
sqft_lot,0.785623
sqft_lot_sqrt,-701.194765
floors,-0.0
floors_square,5014.200457
waterfront,619488.752486


In [34]:
model.intercept_    # the intercept is counted as a weight too; author's tally: 14 non-zero coefficients + 1 = 15 non-zero features

6630155.66862836

In [36]:
### What if we absolutely wanted to limit ourselves to, say, 7 features?
# upper bound on the number of non-zero weights
max_nonzeros = 7
# 20 log-spaced candidate penalties between 1e1 and 1e4
l1_pen = np.logspace(start=1, stop=4, num=20)

In [85]:
## bracket the l1 penalties whose models straddle `max_nonzeros` non-zero weights

l1_pen=np.logspace(1, 4, num=20)   # ascending, so later iterations are larger penalties

l1_min=None     # largest penalty whose model has MORE than max_nonzeros weights
coef_min=None   # non-zero count observed at l1_min

coef_max=None   # non-zero count observed at l1_max
l1_max=None     # smallest penalty whose model has FEWER than max_nonzeros weights

for l1 in l1_pen:
    model = Lasso(alpha=l1, normalize=True)
    model.fit(train[all_features],train['price'])

    # number of non-zero weights; the intercept counts as one when it is non-zero
    coef = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

    if coef > max_nonzeros:
        # keep overwriting: because l1_pen ascends, the last hit is the largest
        # penalty that is still too dense (ties between neighbouring penalties
        # therefore resolve to the tighter lower bound)
        coef_min = coef
        l1_min = l1
    elif coef < max_nonzeros and l1_max is None:
        # first hit only: the smallest penalty that is already too sparse
        coef_max = coef
        l1_max = l1

In [86]:
# lower end of the penalty bracket found by the sweep above
l1_min

127.42749857031335

In [87]:
# upper end of the penalty bracket found by the sweep above
l1_max

263.6650898730358

In [88]:
## now explore this narrow range of l1
l1_pen = np.linspace(l1_min,l1_max,20)

err=[]          # validation RSS of every candidate, in l1_pen order
min_err=None    # lowest RSS among models with exactly max_nonzeros weights
best_l1=None    # reset here so a value left over from an earlier sweep cannot
                # survive when no candidate reaches the required sparsity

for l1 in l1_pen:
    model = Lasso(alpha=l1, normalize=True)
    model.fit(train[all_features],train['price'])

    y_pred=model.predict(valid[all_features])
    y=valid['price']

    # number of non-zero weights; the intercept counts as one when it is non-zero
    coef = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

    e= np.sum( (y-y_pred)**2 )
    err.append(e)   # recorded for every candidate, sparse enough or not

    # only models with exactly the requested sparsity compete on RSS
    if coef == max_nonzeros and (min_err is None or e < min_err):
        min_err=e
        best_l1=l1

best_pen = best_l1   # same value under the second name this cell defines

In [89]:
# penalty chosen by the narrow sweep above
best_l1

156.10909673930755

In [90]:
## fit using best model
# refit on the training table with the penalty picked by the narrow sweep
model = Lasso(alpha=best_l1, normalize=True)
model.fit(X=train[all_features], y=train['price'])

Lasso(alpha=156.10909673930755, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [91]:
# One row per feature, holding the weight learned by the sparse `model`.
coef = model.coef_
df = pd.DataFrame(data=coef, index=all_features)
df

Unnamed: 0,0
bedrooms,-0.0
bedrooms_square,-0.0
bathrooms,10610.890284
sqft_living,163.380252
sqft_living_sqrt,0.0
sqft_lot,-0.0
sqft_lot_sqrt,-0.0
floors,0.0
floors_square,0.0
waterfront,506451.687115
