# 0.

In [1]:
import pandas as pd

# Per-column dtypes handed to pd.read_csv so pandas does not have to infer them.
dtype_dict = {
    # floating-point columns
    'bathrooms': float,
    'sqft_living15': float,
    'price': float,
    'bedrooms': float,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'lat': float,
    # integer columns
    'waterfront': int,
    'sqft_above': int,
    'grade': int,
    'yr_renovated': int,
    'condition': int,
    'sqft_basement': int,
    'yr_built': int,
    'sqft_lot': int,
    'view': int,
    # columns kept as strings
    'zipcode': str,
    'date': str,
    'id': str,
}

# Full data set; used further down to fit the all-features Lasso model.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

# 1.

In [2]:
from math import log, sqrt
# Derived columns on the full `sales` frame: square roots of the two area
# columns, then squares of the bedroom and floor counts.
for derived_name, base_name in (('sqft_living_sqrt', 'sqft_living'),
                                ('sqft_lot_sqrt', 'sqft_lot')):
    sales[derived_name] = sales[base_name].apply(sqrt)

for derived_name, base_name in (('bedrooms_square', 'bedrooms'),
                                ('floors_square', 'floors')):
    sales[derived_name] = sales[base_name] * sales[base_name]

# 2.

In [3]:
from sklearn import linear_model  # using scikit-learn

# Every raw and engineered column used as a regressor, one concept per line.
all_features = [
    'bedrooms', 'bedrooms_square',
    'bathrooms',
    'sqft_living', 'sqft_living_sqrt',
    'sqft_lot', 'sqft_lot_sqrt',
    'floors', 'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

# Lasso on the full data set with a fixed L1 penalty of 500.
model_all = linear_model.Lasso(alpha=500.0, normalize=True)
# Kept as a bare expression so the fitted estimator's repr is the cell output.
model_all.fit(X=sales[all_features], y=sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

# 3.

In [4]:
# Print every feature next to the weight the Lasso fit assigned to it.
# zip() pairs names with coefficients without manual indexing, and the
# single-argument call form of print is valid on both Python 2 and Python 3
# (the original `xrange` / `print` statement only run on Python 2).
for feature_name, weight in zip(all_features, model_all.coef_):
    print(feature_name + ' - ' + str(weight))

bedrooms - 0.0
bedrooms_square - 0.0
bathrooms - 0.0
sqft_living - 134.439313955
sqft_living_sqrt - 0.0
sqft_lot - 0.0
sqft_lot_sqrt - 0.0
floors - 0.0
floors_square - 0.0
waterfront - 0.0
view - 24750.0045856
condition - 0.0
grade - 61749.1030907
sqft_above - 0.0
sqft_basement - 0.0
yr_built - -0.0
yr_renovated - 0.0


# 4.

In [5]:
# Load the three pre-split CSVs (test, train, validation) with the shared
# dtype mapping.
testing, training, validation = [
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in (
        'wk3_kc_house_test_data.csv',
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_valid_data.csv',
    )
]

# Force `floors` to float on every split (a no-op when dtype_dict already
# maps 'floors' to float, but harmless).
for split_frame in (testing, training, validation):
    split_frame['floors'] = split_frame['floors'].astype(float)

In [6]:
def _add_engineered_features(frame):
    """Add the four derived regressors to `frame` in place.

    Columns written:
      * 'sqft_living_sqrt' -- square root of 'sqft_living'
      * 'sqft_lot_sqrt'    -- square root of 'sqft_lot'
      * 'bedrooms_square'  -- 'bedrooms' multiplied by itself
      * 'floors_square'    -- 'floors' multiplied by itself

    Returns the same frame object so the call can be chained.
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# One helper applied to each split replaces three copy-pasted stanzas, so the
# test, training and validation frames cannot drift out of sync.
for _split_frame in (testing, training, validation):
    _add_engineered_features(_split_frame)

# 5.

In [7]:
import numpy as np
# 13 candidate L1 penalties spaced evenly on a log10 scale from 10**1 to
# 10**7, i.e. in half-decade steps.
l1_penalty = np.logspace(start=1, stop=7, num=13)
l1_penalty

array([  1.00000000e+01,   3.16227766e+01,   1.00000000e+02,
         3.16227766e+02,   1.00000000e+03,   3.16227766e+03,
         1.00000000e+04,   3.16227766e+04,   1.00000000e+05,
         3.16227766e+05,   1.00000000e+06,   3.16227766e+06,
         1.00000000e+07])

In [8]:
# One unfitted Lasso estimator per candidate penalty, in the same order as
# `l1_penalty`. A comprehension replaces the `xrange` index loop, which does
# not exist on Python 3.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2 -- pin the scikit-learn version or migrate before
# running on a newer release.
l1_penalty_model = [
    linear_model.Lasso(alpha=penalty, normalize=True)
    for penalty in l1_penalty
]
l1_penalty_model

[Lasso(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=True, positive=False, precompute=False, random_state=None,
    selection='cyclic', tol=0.0001, warm_start=False),
 Lasso(alpha=31.622776601683793, copy_X=True, fit_intercept=True,
    max_iter=1000, normalize=True, positive=False, precompute=False,
    random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
 Lasso(alpha=100.0, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=True, positive=False, precompute=False, random_state=None,
    selection='cyclic', tol=0.0001, warm_start=False),
 Lasso(alpha=316.22776601683796, copy_X=True, fit_intercept=True,
    max_iter=1000, normalize=True, positive=False, precompute=False,
    random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
 Lasso(alpha=1000.0, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=True, positive=False, precompute=False, random_state=None,
    selection='cyclic', tol=0.0001, warm_start=Fa

In [9]:
# Fit every candidate model on the training split. Iterating over the models
# directly avoids `xrange` (Python 2 only); the feature matrix and target are
# sliced once instead of once per model.
training_features = training[all_features]
training_price = training['price']
for candidate_model in l1_penalty_model:
    candidate_model.fit(training_features, training_price)  # learn weights

In [10]:
# Validation error of each fitted candidate, in the same order as
# `l1_penalty`.
# NOTE(review): despite the `rss` name this is np.mean of the squared
# residuals (a mean squared error), not their sum. Every entry is averaged
# over the same validation rows, so the ranking of penalties is the same
# either way.
validation_features = validation[all_features]
l1_penalty_rss = [
    np.mean((candidate_model.predict(validation_features) - validation['price']) ** 2)
    for candidate_model in l1_penalty_model
]
l1_penalty_rss

[41329873098.093834,
 41415869253.071945,
 44607327874.68167,
 48130755687.0908,
 67036713402.57505,
 126881874356.73656,
 126881874356.73656,
 126881874356.73656,
 126881874356.73656,
 126881874356.73656,
 126881874356.73656,
 126881874356.73656,
 126881874356.73656]

# 6.

In [12]:
# Penalty with the lowest validation error. Derived from `l1_penalty_rss`
# rather than a hard-coded index so it stays correct if the data or the
# penalty grid changes.
l1_penalty[int(np.argmin(l1_penalty_rss))]

10.0

In [13]:
# Scratch list built in one literal; not referenced by any other cell
# visible in this notebook.
test = [5, 2, 3, 6]
test

[5, 2, 3, 6]

# 7.

In [23]:
# Fit the selected model on the TRAINING split. Fitting it on `testing`, as
# this cell used to, leaks the held-out data into the model and invalidates
# any score computed afterwards. The best index comes from the validation
# errors rather than being hard-coded.
best_index = int(np.argmin(l1_penalty_rss))
l1_penalty_model[best_index].fit(training[all_features], training['price'])



Lasso(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [None]:
# Held-out error of the selected penalty: fit on the training split, score on
# the TEST split. A fresh estimator is built here so the result does not
# depend on which data the entries of `l1_penalty_model` were last fitted on.
# The validation split already served to choose the penalty, so scoring on it
# again would not be an unbiased estimate.
# NOTE(review): like `l1_penalty_rss`, this is a mean of squared residuals
# (np.mean), not a sum, so the two numbers are directly comparable.
best_index = int(np.argmin(l1_penalty_rss))
best_model = linear_model.Lasso(alpha=l1_penalty[best_index], normalize=True)
best_model.fit(training[all_features], training['price'])
best_rss = np.mean((best_model.predict(testing[all_features]) - testing['price']) ** 2)