In [1]:
import pandas as pd

In [2]:
# Column name -> dtype used when reading the house-sales CSV.
# 'floors' is read as float (not str) so it agrees with the dtype map used for
# the train/test files later in this notebook, where 'floors' is a numeric feature.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [3]:
# Load the full house-sales table with the dtypes from dtype_dict.
# NOTE(review): the path is a hard-coded absolute Windows location; the notebook
# only runs on a machine that has the file at exactly this path.
data = pd.read_csv('F:\\mlspec\\lasso\\ass2\\kc_house_data.csv',dtype=dtype_dict)

In [4]:
import numpy as np

In [5]:
def get_numpy_data(data, features, output):
    """Build a NumPy feature matrix (with an intercept column) and a target array.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns and the output column.
    features : sequence of str
        Names of the columns to use as features, in the desired order.
    output : str
        Name of the target column.

    Returns
    -------
    (feature_matrix, output_array)
        ``feature_matrix`` is a 2-D array whose first column is the constant 1
        followed by the requested feature columns; ``output_array`` holds the
        values of the output column.

    Notes
    -----
    The caller's DataFrame is left untouched: the 'constant' column is added
    to a copy instead of being written into ``data``.
    """
    columns = ['constant'] + list(features)
    # assign() returns a new frame, so ``data`` does not gain a 'constant' column.
    with_constant = data.assign(constant=1)
    # .values is the supported replacement for the removed as_matrix().
    feature_matrix = with_constant[columns].values
    output_array = with_constant[output].values
    return (feature_matrix, output_array)

In [6]:
def predict_output(feature_matrix, weights):
    """Return the linear-model predictions, i.e. the dot product of
    ``feature_matrix`` with ``weights``."""
    return np.dot(feature_matrix, weights)

In [7]:
# NumPy array of the whole table. DataFrame.as_matrix() was deprecated and then
# removed from pandas; .values returns the same array and works on old and new versions.
data_matrix = data.values

In [8]:
from math import sqrt
def normalize_features(features):
    """Scale every column of ``features`` to unit Euclidean (2-) norm.

    Parameters
    ----------
    features : 2-D array-like
        Matrix whose columns are to be normalized.

    Returns
    -------
    (normalized_features, norms)
        ``norms`` is the per-column 2-norm of the input; ``normalized_features``
        is ``features`` with each column divided by its norm.

    Notes
    -----
    A column that is entirely zero has norm 0. Such a column is returned as
    zeros (instead of the NaNs that 0/0 would produce); its entry in ``norms``
    is still reported as 0.
    """
    norms = np.linalg.norm(features, axis=0)
    # Divide zero-norm columns by 1 so they stay zero rather than becoming NaN.
    divisors = np.where(norms == 0, 1.0, norms)
    normalized_features = features / divisors
    return (normalized_features, norms)
# Earlier explicit-loop version of the body of normalize_features, kept for reference:
#     norms = np.zeros(shape=(1, features.shape[1]))
#     for i in range(features.shape[1]):
#         norms[0, i] = sqrt(sum(features[:, i]**2))
#     normalized_features = features / norms

In [9]:
# Two-feature model: build the feature matrix (with intercept) and the target
# array from the full data set.
my_output = 'price'
simple_features = ['sqft_living', 'bedrooms']
simple_feature_matrix, output = get_numpy_data(data, simple_features, my_output)

In [10]:
# Rescale each column to unit 2-norm and keep the norms.
# NOTE(review): this rebinds simple_feature_matrix to its normalized version, so
# re-running this cell on its own recomputes norms from already-normalized data.
simple_feature_matrix, norms = normalize_features(simple_feature_matrix)

In [11]:
# Hand-picked weights (intercept, sqft_living, bedrooms) for the ro[i] check below.
weights = np.array([1.0, 4.0, 1.0])

In [12]:
# Predictions of the two-feature model for the hand-picked weights above.
prediction = predict_output(simple_feature_matrix,weights)

In [13]:
#ro[i] = SUM[ [feature_i]*(output - (prediction - w[i]*[feature_i])) ]
#ro[i] = SUM[ [feature_i]*(output - prediction + w[i]*[feature_i]) ]

In [14]:
ro=[]

In [15]:
# ro[i] = SUM[ [feature_i]*(output - prediction + w[i]*[feature_i]) ]
# The list is built in a single expression so that re-running this cell
# recomputes ro instead of appending three more entries to the existing list.
ro = [
    sum(simple_feature_matrix[:, i]
        * (output - prediction + weights[i] * simple_feature_matrix[:, i]))
    for i in range(simple_feature_matrix.shape[1])
]
# Call form of print: same output under Python 2, and valid under Python 3.
print(ro)

[79400300.014523208, 87939470.823251516, 80966698.666239053]


In [16]:
# Same ro values as a NumPy array.
roarray=np.asarray(ro)

In [17]:
ro

[79400300.014523208, 87939470.823251516, 80966698.666239053]

In [18]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    """Return the new value of weight ``i`` after one LASSO coordinate-descent step.

    Parameters
    ----------
    i : int
        Index of the coordinate (column of ``feature_matrix``) to update.
        Index 0 is treated as the intercept and is not regularized.
    feature_matrix : numpy.ndarray
        2-D feature matrix. The update contains no per-feature scaling term, so
        it presumably expects unit-norm columns (as produced by
        ``normalize_features``) -- confirm against callers.
    output : numpy.ndarray
        Target values, one per row of ``feature_matrix``.
    weights : numpy.ndarray
        Current weight vector; it is only read, never modified.
    l1_penalty : int or float
        L1 regularization strength.

    Returns
    -------
    float
        ``ro_i`` for the intercept; otherwise ``ro_i`` moved towards zero by
        ``l1_penalty / 2`` and set to exactly 0 when ``|ro_i| <= l1_penalty / 2``.
    """
    feature_i = feature_matrix[:, i]
    # compute prediction
    prediction = np.dot(feature_matrix, weights)
    # compute ro[i] = SUM[ [feature_i]*(output - prediction + weight[i]*[feature_i]) ]
    ro_i = np.dot(feature_i, output - prediction + weights[i] * feature_i)

    # One float half-penalty for both the thresholds and the shifts: with an
    # integer l1_penalty, `l1_penalty/2` floor-divides under Python 2 and would
    # disagree with the `l1_penalty/2.` used in the comparisons.
    half_penalty = l1_penalty / 2.

    if i == 0:  # intercept -- do not regularize
        return ro_i
    if ro_i < -half_penalty:
        return ro_i + half_penalty
    if ro_i > half_penalty:
        return ro_i - half_penalty
    return 0.

In [19]:
import math
# Hand-checkable 2x2 example of a single coordinate step.
# print is written as a call so the cell runs under both Python 2 and Python 3.
print(lasso_coordinate_descent_step(
    1,
    np.array([[3./math.sqrt(13), 1./math.sqrt(10)],
              [2./math.sqrt(13), 3./math.sqrt(10)]]),
    np.array([1., 1.]),
    np.array([1., 4.]),
    0.1))

0.425558846691


In [20]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    """Fit LASSO weights by repeatedly sweeping over all coordinates.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D feature matrix; one weight is fitted per column.
    output : numpy.ndarray
        Target values.
    initial_weights : array-like
        Starting weights, one per column. The argument is not modified.
    l1_penalty : int or float
        L1 regularization strength passed to each coordinate step.
    tolerance : float
        A sweep whose largest absolute weight change is below this value ends
        the optimization.

    Returns
    -------
    numpy.ndarray
        A new float array with the fitted weights.
    """
    # Work on a float copy: the caller's initial_weights stays as passed in (so
    # the call can be repeated from the same start), and an integer start
    # vector cannot truncate the updated weights.
    weights = np.array(initial_weights, dtype=float)
    num_features = feature_matrix.shape[1]
    while True:
        steps = []
        for i in range(num_features):
            old_weight = weights[i]
            weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            steps.append(np.absolute(weights[i] - old_weight))
        # Stop once no coordinate moved by `tolerance` or more in a full sweep.
        if max(steps) < tolerance:
            return weights

In [123]:
# Settings for the two-feature LASSO fit: columns, target, zero start vector
# (intercept + 2 features), penalty and convergence tolerance.
my_output = 'price'
simple_features = ['sqft_living', 'bedrooms']
tolerance = 1.0
l1_penalty = 1e7
initial_weights = np.zeros(3)

In [124]:
# Rebuild the two-feature matrix from the full data set, then scale its columns
# to unit norm (the norms are kept in simple_norms).
simple_feature_matrix, output = get_numpy_data(data, simple_features, my_output)
normalized_simple_feature_matrix, simple_norms = normalize_features(simple_feature_matrix)

In [125]:
# Fit the two-feature model by cyclical coordinate descent.
weights = lasso_cyclical_coordinate_descent(
    normalized_simple_feature_matrix, output, initial_weights, l1_penalty, tolerance)

In [126]:
weights

array([ 21624997.95951872,  63157247.20788978,         0.        ])

In [109]:
# Predictions of the fitted two-feature model on the normalized feature matrix.
predict = predict_output(normalized_simple_feature_matrix,weights)

In [110]:
# Residual sum of squares of the two-feature fit (displayed, not stored).
sum((output-predict)**2)

1630492476715378.5

In [27]:
# Column name -> dtype for the train/test CSV files ('floors' is numeric here).
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [28]:
# Load the training split with the dtypes from dtype_dict.
# NOTE(review): hard-coded absolute Windows path.
train = pd.read_csv('F:\\mlspec\\lasso\\kc_house_train_data.csv',dtype=dtype_dict)

In [29]:
# Load the test split with the dtypes from dtype_dict.
# NOTE(review): hard-coded absolute Windows path.
test = pd.read_csv('F:\\mlspec\\lasso\\kc_house_test_data.csv',dtype=dtype_dict)

In [30]:
# The 13 columns used as features for the full model (an intercept column is
# added separately by get_numpy_data).
all_features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
]

In [31]:
# Training feature matrix (intercept + all_features) and training prices.
feature_matrix, output = get_numpy_data(train, all_features, 'price')

In [32]:
# Unit-norm training features; `norms` now holds the training column norms.
norm_feature_matrix, norms = normalize_features(feature_matrix)

In [33]:
# Full-feature LASSO fit: l1_penalty=1e7, tolerance=1.0, zero start vector.
weights1e7 = lasso_cyclical_coordinate_descent(
    norm_feature_matrix, output, np.zeros(len(all_features) + 1), 1e7, 1.0)

In [34]:
weights1e7

array([ 24429600.23440336,         0.        ,         0.        ,
        48389174.77154855,         0.        ,         0.        ,
         3317511.21492165,   7329961.81171433,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ])

In [35]:
# Full-feature LASSO fit: l1_penalty=1e8, tolerance=1.0, zero start vector.
weights1e8 = lasso_cyclical_coordinate_descent(
    norm_feature_matrix, output, np.zeros(len(all_features) + 1), 1e8, 1.0)

In [36]:
weights1e8

array([ 71114625.71488713,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ])

In [37]:
# Full-feature LASSO fit: l1_penalty=1e4, zero start vector. Note the much
# looser tolerance (5e5) compared with the 1e7 and 1e8 fits.
weights1e4 = lasso_cyclical_coordinate_descent(
    norm_feature_matrix, output, np.zeros(len(all_features) + 1), 1e4, 5e5)

In [38]:
weights1e4

array([ 78564738.34156853, -22097398.92430514,  12791071.87278501,
        93808088.09281223,  -2013172.75704975,  -4219184.93265005,
         6482842.81753503,   7127408.53480683,   5001664.85469713,
        14327518.43714117, -15770959.15237413,  -5159591.2221315 ,
       -84495341.76843902,   2824439.49703689])

In [39]:
# Divide the fitted weights by the column norms so they can be applied to
# un-normalized features (as is done with test_feature_matrix further down).
# NOTE(review): relies on `norms` being the training norms from the
# normalize_features(feature_matrix) cell, i.e. on cells having run in order.
norm_weights1e7=weights1e7/norms

In [41]:
norm_weights1e7[3]

161.31745764611625

In [42]:
# Test feature matrix (intercept + all_features) and test prices.
# NOTE(review): this rebinds `output`, which previously held the training
# prices; re-running an earlier fitting cell after this one would pair the
# training features with the test targets.
test_feature_matrix,output = get_numpy_data(test,all_features,'price')

In [44]:
# Test-set predictions for the l1_penalty=1e7 model (rescaled weights, raw features).
predict1e7 = predict_output(test_feature_matrix, norm_weights1e7)

In [45]:
# Test-set predictions for the l1_penalty=1e8 model (weights rescaled by norms).
predict1e8 = predict_output(test_feature_matrix, weights1e8 / norms)

In [46]:
# Test-set predictions for the l1_penalty=1e4 model (weights rescaled by norms).
predict1e4 = predict_output(test_feature_matrix, weights1e4 / norms)

In [47]:
# Residual sum of squares on the test set for the l1_penalty=1e7 model.
rss1e7 = sum((output - predict1e7) ** 2)

In [48]:
rss1e7

275962075920367.44

In [49]:
# Residual sum of squares on the test set for the l1_penalty=1e8 model.
rss1e8 = sum((output - predict1e8) ** 2)

In [50]:
rss1e8

537166151497322.37

In [51]:
# Residual sum of squares on the test set for the l1_penalty=1e4 model.
rss1e4 = sum((output - predict1e4) ** 2)

In [52]:
rss1e4

228459958971392.16