In [102]:
import pandas as pd
import numpy as np

In [103]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [104]:
data = pd.read_csv('kc_house_data.csv',dtype=dtype_dict)
train = pd.read_csv('kc_house_train_data.csv',dtype=dtype_dict)
test = pd.read_csv('kc_house_test_data.csv',dtype=dtype_dict)

In [105]:
def get_numpy_data(df, features, output):
    df['constant'] = 1 # add a constant column to an SFrame
    features = ['constant'] + features
    features_matrix = df[features].to_numpy()
    output_array = df[output]
    output_array = output_array.to_numpy()
    return(features_matrix, output_array)

In [106]:
def predict_output(feature_matrix, weights):
    predictions = np.dot(feature_matrix,weights)
    return(predictions)

In [107]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features/norms
    return (normalized_features, norms)

In [108]:
normalize_features(np.array([[3,6,9],[4,8,12]]))

(array([[0.6, 0.6, 0.6],
        [0.8, 0.8, 0.8]]),
 array([ 5., 10., 15.]))

In [109]:
(data_feature,output_data) = get_numpy_data(data,['sqft_living','bedrooms'],'price')

In [110]:
(normalize_data,norms_data) = normalize_features(data_feature)

In [111]:
intial_weights = np.array([1.,4.,1.])

In [112]:
predictions = predict_output(normalize_data,intial_weights)
predictions

array([0.02675867, 0.04339256, 0.01990703, ..., 0.02289873, 0.03178473,
       0.02289873])

In [113]:
normalize_data.shape

(21613, 3)

In [114]:
output_data

array([221900., 538000., 180000., ..., 402101., 400000., 325000.])

In [115]:
ro = []
for i in range(1,len(intial_weights)):
	ro.append(np.dot(normalize_data[:,i],(output_data - predictions + intial_weights[i]*normalize_data[:,i])))

In [116]:
ro

[87939470.82325175, 80966698.66623946]

we have ro[0], ro[1], ro[2]

For W1 to be zero, we need ro[1] in [-lambda/2, lambda/2]

We have -lambda/2 <= ro[1] <= lambda/2

This translates to lambda >= -2ro[1] and lambda >= 2ro[1]

For both conditions to be satisfied, lambda >= 2ro[1] = 1.75e8

Similarly for W2. lambda >= 2ro[2] = 1.62e8.

So, w[i] = 0 if lambda >= 2 * abs(ro[i])

In [117]:
print(2*ro[0])
print(2*ro[1])

175878941.6465035
161933397.3324789


In [118]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    # compute prediction
    prediction = predict_output(feature_matrix,weights)
    # compute ro[i] = SUM[ [feature_i]*(output - prediction + weight[i]*[feature_i]) ]
    ro_i = np.dot(feature_matrix[:,i],(output - prediction + weights[i]*feature_matrix[:,i]))
    
    if i == 0: # intercept -- do not regularize
        new_weight_i = ro_i
    elif ro_i < -l1_penalty/2.:
        new_weight_i = ro_i + l1_penalty/2
    elif ro_i > l1_penalty/2.:
        new_weight_i = ro_i - l1_penalty/2
    else:
        new_weight_i = 0.
    
    return new_weight_i

In [119]:
import math
print( lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],
                   [2./math.sqrt(13),3./math.sqrt(10)]]), np.array([1., 1.]), np.array([1., 4.]), 0.1))

0.4255588466910251


In [120]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    max_change = tolerance*2
    while max_change > tolerance:
        change = []
        for i in range(len(initial_weights)):
            old_wight = initial_weights[i]
            initial_weights[i] = lasso_coordinate_descent_step(i,feature_matrix,output,initial_weights,l1_penalty)
            change.append(np.abs(initial_weights[i]-old_wight))
        max_change = max(change)
    return initial_weights

In [121]:
simple_features = ['sqft_living', 'bedrooms']
my_output = 'price'
initial_weights = np.zeros(3)
l1_penalty = 1e7
tolerance = 1.0

In [122]:
(simple_feature_matrix, output) = get_numpy_data(data, simple_features, my_output)
(normalized_simple_feature_matrix, simple_norms) = normalize_features(simple_feature_matrix) # normalize features

In [123]:
weights = lasso_cyclical_coordinate_descent(normalized_simple_feature_matrix, output,
                                            initial_weights, l1_penalty, tolerance)

In [124]:
weights

array([21624997.95951909, 63157247.20788956,        0.        ])

In [125]:
np.sum((output - predict_output(normalized_simple_feature_matrix,weights))**2)

1630492476715386.5

In [177]:

all_features = ['bedrooms',
                'bathrooms',
                'sqft_living',
                'sqft_lot',
                'floors',
                'waterfront', 
                'view', 
                'condition', 
                'grade',
                'sqft_above',
                'sqft_basement',
                'yr_built', 
                'yr_renovated']


In [178]:
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train, all_features, my_output)
normalized_feature_matrix, norms = normalize_features(feature_matrix.astype(float))

In [179]:
l1_penalty = 1e7
initialize_weights = np.zeros(14)
tolerance = 1

In [180]:
weights1e7 = lasso_cyclical_coordinate_descent(normalized_feature_matrix, output,initialize_weights, l1_penalty, tolerance)

In [181]:
weights1e7

array([24429600.23440313,        0.        ,        0.        ,
       48389174.77154894,        0.        ,        0.        ,
        3317511.21492165,  7329961.81171426,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ])

In [182]:
pd.Series(weights1e7,index=['intercept']+ train_features)

intercept        2.442960e+07
bedrooms         0.000000e+00
bathrooms        0.000000e+00
sqft_living      4.838917e+07
sqft_lot         0.000000e+00
floors           0.000000e+00
waterfront       3.317511e+06
view             7.329962e+06
condition        0.000000e+00
grade            0.000000e+00
sqft_above       0.000000e+00
sqft_basement    0.000000e+00
yr_built         0.000000e+00
yr_renovated     0.000000e+00
dtype: float64

In [183]:
l1_penalty = 1e8
initialize_weights = np.zeros(14)
tolerance = 1.0

In [184]:
weights1e8 = lasso_cyclical_coordinate_descent(normalized_feature_matrix, output,initialize_weights, l1_penalty, tolerance)

In [185]:
weights1e8

array([71114625.71488702,        0.        ,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ])

In [186]:
pd.Series(weights1e8, index=['intercept']+train_features)

intercept        7.111463e+07
bedrooms         0.000000e+00
bathrooms        0.000000e+00
sqft_living      0.000000e+00
sqft_lot         0.000000e+00
floors           0.000000e+00
waterfront       0.000000e+00
view             0.000000e+00
condition        0.000000e+00
grade            0.000000e+00
sqft_above       0.000000e+00
sqft_basement    0.000000e+00
yr_built         0.000000e+00
yr_renovated     0.000000e+00
dtype: float64

In [187]:
l1_penalty = 1e4
tolerance = 5e5
initial_weights = np.zeros(14)

In [188]:
weights1e4 = lasso_cyclical_coordinate_descent(normalized_feature_matrix, output,initialize_weights, l1_penalty, tolerance)

In [189]:
weights1e4

array([ 78564738.34156767, -22097398.92430533,  12791071.87278518,
        93808088.092812  ,  -2013172.75704955,  -4219184.93265016,
         6482842.81753506,   7127408.53480688,   5001664.85469639,
        14327518.43714048, -15770959.15237401,  -5159591.22213149,
       -84495341.76843643,   2824439.49703684])

In [190]:
pd.Series(weights1e4, index=['intercept']+train_features)

intercept        7.856474e+07
bedrooms        -2.209740e+07
bathrooms        1.279107e+07
sqft_living      9.380809e+07
sqft_lot        -2.013173e+06
floors          -4.219185e+06
waterfront       6.482843e+06
view             7.127409e+06
condition        5.001665e+06
grade            1.432752e+07
sqft_above      -1.577096e+07
sqft_basement   -5.159591e+06
yr_built        -8.449534e+07
yr_renovated     2.824439e+06
dtype: float64

In [201]:
weights1e7_normalized = weights1e7 / norms
weights1e8_normalized = weights1e8 / norms
weights1e4_normalized = weights1e4 / norms
weights1e7_normalized[3]

161.31745764611756

In [194]:
(test_feature_matrix, test_output) = get_numpy_data(test, train_features, 'price')

In [198]:
print(sum((test_output - predict_output(test_feature_matrix.astype(float),weights1e4_normalized) )**2))

228459958971392.78


In [199]:
print(sum((test_output - predict_output(test_feature_matrix.astype(float),weights1e7_normalized) )**2))

275962075920366.66


In [200]:
print(sum((test_output - predict_output(test_feature_matrix.astype(float),weights1e8_normalized)) )**2)

478797508251945.75
