In [27]:
import pandas as pd
import numpy as np

# Explicit column dtypes passed to every pd.read_csv call in this notebook.
# 'zipcode', 'date' and 'id' are read as strings; the rest are int or float.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Read kc_house_data.csv into `sales`, forcing the column types from dtype_dict.
sales = pd.read_csv(filepath_or_buffer='kc_house_data.csv', dtype=dtype_dict)

In [2]:
from math import log, sqrt
# Derived feature columns on `sales`: square roots of the two area columns and
# squares of the bedroom and floor counts.
# np.sqrt works on the whole column at once instead of calling math.sqrt once
# per row through .apply.
sales['sqft_living_sqrt'] = np.sqrt(sales['sqft_living'])
sales['sqft_lot_sqrt'] = np.sqrt(sales['sqft_lot'])
sales['bedrooms_square'] = sales['bedrooms'] * sales['bedrooms']
sales['floors_square'] = sales['floors'] * sales['floors']

In [3]:
from sklearn import linear_model  # using scikit-learn

# Every column offered to the Lasso models: raw columns plus the engineered
# sqrt / squared variants, in the order the coefficients are reported.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

# Lasso fit on the whole `sales` table using every column in all_features.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 — confirm the installed version still accepts it.
model_all = linear_model.Lasso(alpha=500.0, normalize=True)
model_all.fit(X=sales[all_features], y=sales['price'])

Lasso(alpha=500.0, normalize=True)

In [11]:
# Fitted coefficients of model_all as a plain list (same order as all_features).
l = [weight for weight in model_all.coef_]

In [24]:
# Pair each feature name with its coefficient so the zeroed-out ones are easy to spot.
d = {feature: weight for feature, weight in zip(all_features, l)}
d

{'bedrooms': 0.0,
 'bedrooms_square': 0.0,
 'bathrooms': 0.0,
 'sqft_living': 134.43931395541438,
 'sqft_living_sqrt': 0.0,
 'sqft_lot': 0.0,
 'sqft_lot_sqrt': 0.0,
 'floors': 0.0,
 'floors_square': 0.0,
 'waterfront': 0.0,
 'view': 24750.004585609488,
 'condition': 0.0,
 'grade': 61749.10309070811,
 'sqft_above': 0.0,
 'sqft_basement': 0.0,
 'yr_built': -0.0,
 'yr_renovated': 0.0}

In [25]:
# Load the three pre-made splits with the same dtypes as the main table.
training = pd.read_csv(filepath_or_buffer='wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv(filepath_or_buffer='wk3_kc_house_valid_data.csv', dtype=dtype_dict)
testing = pd.read_csv(filepath_or_buffer='wk3_kc_house_test_data.csv', dtype=dtype_dict)

In [26]:
def _add_engineered_features(frame):
    """Add the sqrt / squared feature columns to ``frame`` in place.

    Creates 'sqft_living_sqrt', 'sqft_lot_sqrt', 'bedrooms_square' and
    'floors_square' from the raw columns of the same base name, and returns
    the same frame object.
    """
    # np.sqrt is applied to the whole column rather than row by row via .apply.
    frame['sqft_living_sqrt'] = np.sqrt(frame['sqft_living'])
    frame['sqft_lot_sqrt'] = np.sqrt(frame['sqft_lot'])
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# One shared transformation for all three splits, so they cannot drift apart
# the way three hand-copied blocks can.
for _split in (testing, training, validation):
    _add_engineered_features(_split)

In [28]:
# Coarse grid of 13 candidate L1 penalties, log-spaced from 1e1 to 1e7.
l1_penalty = np.logspace(start=1, stop=7, num=13)

In [33]:
# Fit one Lasso per candidate penalty on the training split and record the
# residual sum of squares it reaches on the validation split.
rss_list = []
for penalty in l1_penalty:
    model = linear_model.Lasso(alpha=penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    residuals = model.predict(validation[all_features]) - validation['price']
    rss = np.sum(residuals ** 2)
    rss_list.append((penalty, rss))

In [44]:
# Order the (penalty, RSS) pairs from lowest to highest validation RSS.
rss_list.sort(key=lambda pair: pair[1])
rss_list

[(10.0, 398213327300135.0),
 (31.622776601683793, 399041900253346.9),
 (100.0, 429791604072559.6),
 (316.22776601683796, 463739831045121.06),
 (1000.0, 645898733633800.8),
 (3162.2776601683795, 1222506859427163.0),
 (10000.0, 1222506859427163.0),
 (31622.776601683792, 1222506859427163.0),
 (100000.0, 1222506859427163.0),
 (316227.7660168379, 1222506859427163.0),
 (1000000.0, 1222506859427163.0),
 (3162277.6601683795, 1222506859427163.0),
 (10000000.0, 1222506859427163.0)]

In [46]:
# Refit with the penalty that gave the lowest validation RSS. The value is
# looked up in rss_list (entries are (penalty, rss)) instead of being typed in
# by hand, so it follows the sweep results if the data or grid change.
# float() keeps the estimator repr a plain number.
best_l1_penalty = float(min(rss_list, key=lambda pair: pair[1])[0])
train_model = linear_model.Lasso(alpha=best_l1_penalty, normalize=True)
train_model.fit(training[all_features], training['price'])

Lasso(alpha=10.0, normalize=True)

In [49]:
# Number of non-zero weights in train_model, intercept included.
np.count_nonzero(
    np.append(train_model.coef_, train_model.intercept_)
)

15

In [50]:
# Sparsity target: the non-zero weight count that the later penalty search
# selects on.
max_nonzeros = 7

In [67]:
# Narrower grid for the sparsity search: 20 log-spaced penalties from 1e1 to 1e4.
l1_penalty = np.logspace(start=1, stop=4, num=20)

In [80]:
# For each candidate penalty, fit on the training split and record how many
# weights (coefficients plus intercept) are non-zero.
nonzero_count = []

for penalty in l1_penalty:
    train_model = linear_model.Lasso(alpha=penalty, normalize=True)
    train_model.fit(training[all_features], training['price'])
    # count_nonzero contributes 0 for a zero intercept on its own, so no
    # separate branch is needed for that case.
    counts = np.count_nonzero(train_model.coef_) + np.count_nonzero(train_model.intercept_)
    nonzero_count.append((penalty, counts))

In [81]:
# Show the (l1 penalty, non-zero weight count) pairs collected by the sweep.
nonzero_count

[(10.0, 15),
 (14.38449888287663, 15),
 (20.6913808111479, 15),
 (29.76351441631318, 15),
 (42.81332398719393, 13),
 (61.58482110660264, 12),
 (88.58667904100822, 11),
 (127.42749857031335, 10),
 (183.29807108324357, 7),
 (263.6650898730358, 6),
 (379.26901907322497, 6),
 (545.5594781168514, 6),
 (784.7599703514607, 5),
 (1128.8378916846884, 3),
 (1623.776739188721, 3),
 (2335.7214690901214, 2),
 (3359.818286283781, 1),
 (4832.930238571752, 1),
 (6951.927961775606, 1),
 (10000.0, 1)]

In [82]:
# Largest penalty that still leaves MORE than max_nonzeros non-zero weights;
# used as the lower edge of the finer search interval. Compares against
# max_nonzeros rather than a literal so the target is set in one place.
greater = [penalty for penalty, n_nonzero in nonzero_count if n_nonzero > max_nonzeros]
l1_penalty_min = max(greater)
l1_penalty_min

127.42749857031335

In [83]:
# Smallest penalty that leaves FEWER than max_nonzeros non-zero weights;
# used as the upper edge of the finer search interval. Compares against
# max_nonzeros rather than a literal so the target is set in one place.
smaller = [penalty for penalty, n_nonzero in nonzero_count if n_nonzero < max_nonzeros]
l1_penalty_max = min(smaller)
l1_penalty_max

263.6650898730358

In [84]:
# 20 evenly spaced penalties between the two bracketing values (both included).
l1_penalty = np.linspace(start=l1_penalty_min, stop=l1_penalty_max, num=20)

In [101]:
# Fine sweep: for each penalty, fit on the training split and record both the
# validation RSS and the number of non-zero weights (coefficients plus
# intercept).
nonzero_count_2 = []
rss_list_2 = []
for penalty in l1_penalty:
    train_model = linear_model.Lasso(alpha=penalty, normalize=True)
    train_model.fit(training[all_features], training['price'])
    rss = np.sum((train_model.predict(validation[all_features]) - validation['price'])**2)
    rss_list_2.append((penalty, rss))
    # count_nonzero contributes 0 for a zero intercept on its own, so no
    # separate branch is needed for that case.
    counts = np.count_nonzero(train_model.coef_) + np.count_nonzero(train_model.intercept_)
    nonzero_count_2.append((penalty, counts))

In [102]:
# Penalties from the fine sweep whose fitted model has exactly max_nonzeros
# non-zero weights.
list_of_7 = [
    penalty
    for penalty, n_nonzero in nonzero_count_2
    if n_nonzero == max_nonzeros
]

In [105]:
# Keep only the penalties in list_of_7 and flip each pair to (rss, penalty).
# NOTE(review): this overwrites rss_list_2 with swapped tuples, so running the
# cell a second time filters on RSS values and yields an empty list.
rss_list_2 = [
    (rss, penalty)
    for penalty, rss in rss_list_2
    if penalty in list_of_7
]

In [104]:
# Show the (rss, penalty) pairs kept for the penalties in list_of_7.
rss_list_2

[(440037365263316.56, 156.10909673930755),
 (440777489641605.25, 163.2794962815561),
 (441566698090139.94, 170.44989582380464),
 (442406413188666.25, 177.6202953660532),
 (443296716874315.06, 184.79069490830176),
 (444239780526141.6, 191.96109445055032),
 (445230739842614.2, 199.13149399279888)]

In [110]:
# Among the penalties that hit the sparsity target, pick the one with the
# lowest validation RSS. rss_list_2 holds (rss, penalty) tuples at this point,
# so min() orders by RSS first. Taking list_of_7[0] would instead pick the
# smallest penalty, which is only the best one by coincidence.
# float() keeps the estimator repr a plain number.
best_sparse_penalty = float(min(rss_list_2)[1])
model_1 = linear_model.Lasso(alpha=best_sparse_penalty, normalize=True)
model_1.fit(training[all_features], training['price'])

Lasso(alpha=156.10909673930755, normalize=True)

In [111]:
# Coefficients of model_1 labelled by feature name.
pd.Series(data=model_1.coef_, index=all_features)

bedrooms                -0.000000
bedrooms_square         -0.000000
bathrooms            10610.890284
sqft_living            163.380252
sqft_living_sqrt         0.000000
sqft_lot                -0.000000
sqft_lot_sqrt           -0.000000
floors                   0.000000
floors_square            0.000000
waterfront          506451.687115
view                 41960.043555
condition                0.000000
grade               116253.553700
sqft_above               0.000000
sqft_basement            0.000000
yr_built             -2612.234880
yr_renovated             0.000000
dtype: float64