In [1]:
import pandas as pd
import numpy as np
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [2]:
def get_numpy_data(data_frame, features, output):
    data_frame['constant']=1

    features=['constant']+features
    
    features_np = data_frame[features].values
    output_np = data_frame[output].values
    
    return features_np, output_np

In [3]:
def predict_outcome(feature_matrix, weights):
    predictions=np.dot(feature_matrix,weights)
    #predictions=np.dot(feature_matrix,weights.reshape((weights.size,1)))
    return(predictions)

In [4]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    return (features/norms, norms)

In [9]:
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
for c in sales.columns:
    print(c, sales[c].dtype)

id object
date object
price float64
bedrooms float64
bathrooms float64
sqft_living float64
sqft_lot int64
floors float64
waterfront int64
view int64
condition int64
grade int64
sqft_above int64
sqft_basement int64
yr_built int64
yr_renovated int64
zipcode object
lat float64
long float64
sqft_living15 float64
sqft_lot15 float64


In [8]:
(features, output) = get_numpy_data(sales, ['sqft_living','bedrooms'], 'price')

In [12]:
normalized_features,norms=normalize_features(features)

In [13]:
weights=np.array([1.,4.,1.])

In [14]:
predictions=predict_outcome(normalized_features,weights)

In [15]:
print(predictions)

[ 0.02675867  0.04339256  0.01990703 ...,  0.02289873  0.03178473
  0.02289873]


In [16]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    # compute prediction
    prediction = predict_outcome(feature_matrix,weights)
    # compute ro[i] = SUM[ [feature_i]*(output - prediction + weight[i]*[feature_i]) ]
    ro_i = np.sum(feature_matrix[:,i]*(output - prediction + weights[i]*feature_matrix[:,i])) 
    # subtract weight[i]*feature[i] because prediction already has this quantity and we want to figure out 
    # whether feature i is really important.
    
    if i == 0: # intercept -- do not regularize
        new_weight_i = ro_i
    elif ro_i < -l1_penalty/2.:
        new_weight_i = ro_i + l1_penalty/2
    elif ro_i > l1_penalty/2.:
        new_weight_i = ro_i - l1_penalty/2
    else:
        new_weight_i = 0.
    
    return new_weight_i

In [18]:
# should print 0.425558846691
import math
print(lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],
                   [2./math.sqrt(13),3./math.sqrt(10)]]), np.array([1., 1.]), np.array([1., 4.]), 0.1))

0.425558846691


In [32]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    weights = np.array(initial_weights)
    max_change = tolerance
    while max_change >= tolerance:        
        max_change = 0
        for i in range(len(weights)):
            old_weight=weights[i]
            weights[i] = lasso_coordinate_descent_step(i,feature_matrix,output,weights,l1_penalty)
            if np.abs(old_weight-weights[i])>max_change:
                max_change = np.abs(old_weight-weights[i])
           # print(i,old_weight-weights[i])
      #  print('max: ', max_change)
                
    return weights

In [22]:
initial_weights = np.zeros(3)
l1_penalty = 1e7
tolerance = 1.0

In [33]:
lasso_cyclical_coordinate_descent(normalized_features, output, initial_weights, l1_penalty, tolerance)

array([ 21625011.88130648,  63157234.47244619,         0.        ])

In [26]:
print(output)

[ 221900.  538000.  180000. ...,  402101.  400000.  325000.]


In [27]:
print(normalized_features)

[[ 0.00680209  0.00353021  0.00583571]
 [ 0.00680209  0.00768869  0.00583571]
 [ 0.00680209  0.00230361  0.00389048]
 ..., 
 [ 0.00680209  0.00305154  0.00389048]
 [ 0.00680209  0.00478673  0.00583571]
 [ 0.00680209  0.00305154  0.00389048]]


In [28]:
print(initial_weights)

[ 0.  0.  0.]


In [46]:
df_house_train=pd.read_csv("kc_house_train_data.csv", dtype=dtype_dict)
df_house_test=pd.read_csv("kc_house_test_data.csv", dtype=dtype_dict)

In [47]:
features_list=['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']
(features, output) = get_numpy_data(df_house_train,features_list, 'price')

In [48]:
normalized_features,norms=normalize_features(features)

In [37]:
df_house_train.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'constant'],
      dtype='object')

In [51]:
initial_weights = np.zeros(14)
l1_penalty = 1e7
tolerance = 1.0

In [52]:
weights1e7=lasso_cyclical_coordinate_descent(normalized_features, output, initial_weights, l1_penalty, tolerance)

In [53]:
print(weights1e7)

[ 24429600.23440313         0.                 0.          48389174.77154895
         0.                 0.           3317511.21492165
   7329961.81171426         0.                 0.                 0.
         0.                 0.                 0.        ]


In [54]:
weights_normalized = weights1e7 / norms

In [55]:
print(weights_normalized)

[  1.85285530e+05   0.00000000e+00   0.00000000e+00   1.61317458e+02
   0.00000000e+00   0.00000000e+00   2.87664705e+05   6.91937041e+04
   0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00]
