In [41]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib as plt
%matplotlib inline
# Column dtypes shared by every King County housing CSV loaded below.
# 'floors' is read as str here and cast to float later in the notebook;
# 'id', 'zipcode' and 'date' are kept as strings.
dtype_dict = {
    'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
    'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float,
    'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float,
    'floors': str, 'condition': int, 'lat': float, 'date': str,
    'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int,
}

# Reuse the single mapping for all three files so the schemas cannot drift apart.
khouse_train = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
khouse_test = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)
khouse_all = pd.read_csv('assessment data/kc_house_data.csv', dtype=dtype_dict)

In [192]:
def RSS(y, y_est):
    """Return the residual sum of squares between observed and estimated values.

    Parameters
    ----------
    y : object supporting elementwise arithmetic (e.g. NumPy array, pandas Series)
        Observed values.
    y_est : same kind of object as ``y``
        Estimated values.

    Returns
    -------
    The sum of the squared elementwise differences ``(y - y_est) ** 2``.
    """
    residuals = y - y_est
    squared_residuals = residuals ** 2
    return squared_residuals.sum()


def set_data(features, output):
    """Append an intercept column to ``features`` and relabel its columns 0..k-1.

    ``features`` is modified in place and the very same object is returned, so
    callers may keep using either their original reference or the returned one.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns; a column of ones named ``'constant'`` is appended as
        the last column before all columns are relabelled with integers.
    output : any
        Target values; returned unchanged.

    Returns
    -------
    tuple
        ``(features, output)``.
    """
    features['constant'] = 1
    # Assign the labels directly instead of ``set_axis(..., inplace=True)``:
    # the ``inplace`` argument was deprecated in pandas 1.5 and removed in 2.0.
    features.columns = list(range(features.shape[1]))
    return (features, output)


def predict_output(feature_matrix, weights):
    """Return the row-wise weighted sum of the feature columns.

    Parameters
    ----------
    feature_matrix : 2-D NumPy array or pandas DataFrame
        One row per observation, one column per feature.
    weights : 1-D array
        One weight per column; multiplied elementwise against every row.

    Returns
    -------
    The sum over ``axis=1`` of ``feature_matrix * weights``.
    """
    weighted_columns = feature_matrix * weights
    return weighted_columns.sum(axis=1)


def normalize_features(features):
    """Divide every column of ``features`` by that column's norm.

    Parameters
    ----------
    features : 2-D NumPy array or pandas DataFrame

    Returns
    -------
    tuple
        ``(scaled, column_norms)`` where ``column_norms`` is
        ``np.linalg.norm(features, axis=0)`` and ``scaled`` is
        ``features / column_norms``.
    """
    column_norms = np.linalg.norm(features, axis=0)
    scaled = features / column_norms
    return (scaled, column_norms)


def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    """Compute the updated value of weight ``i`` for one lasso coordinate step.

    Parameters
    ----------
    i : int
        Index of the coordinate to update; ``feature_matrix[i]`` is used as the
        feature column (with the integer-labelled DataFrame built by
        ``set_data`` this selects column ``i``).
    feature_matrix : pandas.DataFrame
        Feature columns; the last column is treated as the intercept.
        NOTE(review): the update has no division by the column's squared norm,
        so it presumably expects unit-norm columns -- confirm with callers.
    output : array-like
        Target values.
    weights : indexable of numbers
        Current weights; read only, never modified here.
    l1_penalty : number
        L1 penalty; the soft threshold is ``l1_penalty / 2``.

    Returns
    -------
    The new value for ``weights[i]``.
    """
    column_i = feature_matrix[i]
    current_fit = predict_output(feature_matrix, weights)

    # Correlation of feature i with the residual computed without feature i.
    ro_i = (column_i * (output - current_fit + weights[i] * column_i)).sum()

    # The intercept (last column) is not regularized.
    if i == feature_matrix.shape[1] - 1:
        return ro_i

    # Soft-thresholding: shrink toward zero by half the penalty, or clip to zero.
    threshold = l1_penalty / 2.
    if ro_i < -threshold:
        return ro_i + threshold
    if ro_i > threshold:
        return ro_i - threshold
    return 0.


def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    """Fit lasso weights by cycling through the coordinates until convergence.

    Every sweep updates each weight in turn with
    ``lasso_coordinate_descent_step``; sweeping stops once the largest absolute
    change of any weight within a sweep is no greater than ``tolerance``.

    Parameters
    ----------
    feature_matrix : pandas.DataFrame
        Feature columns, one weight per column.
    output : array-like
        Target values.
    initial_weights : array-like of numbers
        Starting weights. The caller's object is NOT modified.
    l1_penalty : number
        L1 penalty forwarded to each coordinate step.
    tolerance : number
        Convergence threshold on the largest per-sweep weight change.

    Returns
    -------
    numpy.ndarray
        A new float array holding the fitted weights.
    """
    # Work on a private float copy. Aliasing ``initial_weights`` would overwrite
    # the caller's array, making successive fits share (and clobber) one result
    # array, and an integer-typed start vector would truncate every update.
    weights = np.array(initial_weights, dtype=float)

    max_step = tolerance + 1  # guarantees at least one full sweep
    while max_step > tolerance:
        max_step = 0
        for i in range(feature_matrix.shape[1]):
            new_weight = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            step = abs(weights[i] - new_weight)
            if step > max_step:
                max_step = step
            weights[i] = new_weight
    return weights

In [226]:
simple_features = ['sqft_living','bedrooms']
simple_feature_matrix, simple_output = set_data(khouse_all[simple_features].copy(), khouse_all['price'])
simple_feature_matrix, norms = normalize_features(simple_feature_matrix)
initial_weights = np.array([1., 4., 1.])

# For each candidate penalty, print the single-coordinate update of all three
# weights, each computed from the same starting weights.
for penalty in [1.4e8, 1.64e6, 1.73e8, 1.9e8, 2.3e8]:
    nw1, nw2, nw3 = (
        lasso_coordinate_descent_step(j, simple_feature_matrix, simple_output, initial_weights, penalty)
        for j in range(3)
    )
    print('{:e}'.format(penalty), nw1, nw2, nw3)

1.400000e+08 17939467.991878748 10966701.497612491 79400299.86693393
1.640000e+06 87119467.99187875 80146701.49761249 79400299.86693393
1.730000e+08 1439467.991878748 0.0 79400299.86693393
1.900000e+08 0.0 0.0 79400299.86693393
2.300000e+08 0.0 0.0 79400299.86693393


In [141]:
# Fit the two-feature model from all-zero weights and report its RSS on the same data.
initial_weights = np.zeros(3)
l1_penalty = 1e7
tolerance = 1.0
fitted_weights = lasso_cyclical_coordinate_descent(
    simple_feature_matrix, simple_output, initial_weights, l1_penalty, tolerance
)
simple_predictions = predict_output(simple_feature_matrix, fitted_weights)
print('{:e}'.format(RSS(simple_output, simple_predictions)))
print(fitted_weights)

1.630492e+15
[63157255.63750926        0.         21624989.5289373 ]


In [255]:
features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront',
    'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
]
# 'floors' was loaded as str; astype returns a new frame with it cast to float.
more_features = khouse_train[features].astype({'floors': 'float64'})
more_feature_matrix, simple_output = set_data(more_features, khouse_train['price'])
more_feature_matrix, norms = normalize_features(more_feature_matrix)

# One weight per feature plus one for the intercept column added by set_data.
initial_weights = np.zeros(len(features) + 1)
l1_penalty = 1e7
tolerance = 1.0
weights1e7 = lasso_cyclical_coordinate_descent(
    more_feature_matrix, simple_output, initial_weights, l1_penalty, tolerance
)

In [248]:
# Features with a non-zero weight; zip stops after the 13 named features, so
# the trailing intercept weight is not considered.
[name for name, weight in zip(features, weights1e7) if weight]

['sqft_living', 'waterfront', 'view']

In [250]:
l1_penalty = 1e8
tolerance = 1.0
weights1e8 = lasso_cyclical_coordinate_descent(
    more_feature_matrix, simple_output, initial_weights, l1_penalty, tolerance
)
# Features with a non-zero weight (the trailing intercept weight is not considered).
[name for name, weight in zip(features, weights1e8) if weight]

[]

In [243]:
l1_penalty = 1e4
tolerance = 5e5
weights1e4 = lasso_cyclical_coordinate_descent(
    more_feature_matrix, simple_output, initial_weights, l1_penalty, tolerance
)
# Features with a non-zero weight (the trailing intercept weight is not considered).
[name for name, weight in zip(features, weights1e4) if weight]

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated']

In [260]:
# The weights were fitted on column-normalized features; dividing them by the
# training norms lets them be applied to the un-normalized test features.
m_1e7, m_1e8, m_1e4 = (fitted / norms for fitted in (weights1e7, weights1e8, weights1e4))

t_features = khouse_test[features].astype({'floors': 'float64'})
# set_data returns the same (in-place modified) frame it was given.
t_feature_matrix, t_output = set_data(t_features, khouse_test['price'])

# Test-set RSS for each penalty, in the order 1e7, 1e8, 1e4.
for rescaled_weights in (m_1e7, m_1e8, m_1e4):
    print(RSS(t_output, predict_output(t_feature_matrix, rescaled_weights)))

275962058003747.12
537166151497322.75
228434052028129.0


In [259]:
# Display the l1_penalty=1e7 weights divided elementwise by `norms`
# (the last entry corresponds to the intercept column).
weights1e7/norms

array([0.00000000e+00, 0.00000000e+00, 1.61317488e+02, 0.00000000e+00,
       0.00000000e+00, 2.87664706e+05, 6.91936984e+04, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.85285462e+05])

## Notes:
凡是出现链式赋值（chained assignment）的情况，pandas 都无法确定返回的到底是一个引用（view）还是一个拷贝（copy），所以遇到这种情况就会直接报 `SettingWithCopyWarning`。


In [121]:
### NOTICE: ###
# The same [0] lookup means different things: on a DataFrame it selects the
# column labelled 0, on a NumPy array it selects the first row.
rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
a1 = pd.DataFrame(rows)
a2 = np.array(rows)
for container in (a1, a2):
    print(container[0])

0    1
1    4
2    7
Name: 0, dtype: int64
[1 2 3]
