In [1]:
import pandas as pd
import numpy as np

In [162]:
# Column -> dtype mapping passed to pandas so all three CSV files are parsed
# with the same types (identifier-like columns stay strings).
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Full data set, plus the train and test CSV files.
data = pd.read_csv('data/kc_house_data.csv', dtype=dtype_dict)
train = pd.read_csv('data/kc_house_train_data.csv', dtype=dtype_dict)
test = pd.read_csv('data/kc_house_test_data.csv', dtype=dtype_dict)

In [4]:
def get_numpy_data(data, features, output):
    """Build the design matrix and target vector from a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is not modified by this function.
    features : sequence of str
        Names of the columns to use as features, in the desired order.
    output : str
        Name of the target column.

    Returns
    -------
    (features_matrix, output_array) : tuple of numpy.ndarray
        ``features_matrix`` has a leading column of ones (the intercept)
        followed by the requested feature columns; ``output_array`` holds the
        values of the ``output`` column.
    """
    # Add the intercept column on a copy so that calling this function never
    # inserts a 'constant' column into the caller's DataFrame.
    with_constant = data.assign(constant=1)

    # Prepend 'constant' so the intercept is always column 0 of the matrix.
    columns = ['constant'] + list(features)
    features_matrix = with_constant[columns].to_numpy()

    # Target values as a 1-D array.
    output_array = data[output].to_numpy()

    return (features_matrix, output_array)

In [5]:
def predict_outcome(feature_matrix, weights):
    """Return the linear-model predictions: the dot product of
    ``feature_matrix`` with ``weights``."""
    return np.dot(feature_matrix, weights)

In [60]:
def normalize_features(features):
    """Scale every column of ``features`` to unit Euclidean (2-) norm.

    Parameters
    ----------
    features : array_like
        Feature matrix whose columns are to be normalized.

    Returns
    -------
    (normalized_features, norms) : tuple of numpy.ndarray
        ``normalized_features`` is ``features`` with each column divided by
        its norm; ``norms`` holds the original column norms.  A column whose
        norm is 0 (all zeros) is returned unchanged (all zeros) instead of
        being turned into NaN by a 0/0 division; its entry in ``norms`` is
        still 0.
    """
    features = np.asarray(features)
    norms = np.linalg.norm(features, axis=0)
    # Divide by 1 where the norm is 0 so all-zero columns stay zero rather
    # than becoming NaN.
    safe_norms = np.where(norms == 0, 1.0, norms)
    normalized_features = features / safe_norms
    return (normalized_features, norms)

In [63]:
# Sanity check: normalize a small 2x3 matrix and compare the printed values
# with the expected ones listed in the comments.
features, norms = normalize_features(np.array([[3.,6.,9.],[4.,8.,12.]]))
print(features)
# should print
# [[ 0.6  0.6  0.6]
#  [ 0.8  0.8  0.8]]
print(norms)
# should print
# [5.  10.  15.]

[[0.6 0.6 0.6]
 [0.8 0.8 0.8]]
[ 5. 10. 15.]


In [64]:
# Two-feature model on the full data set.
features = ['sqft_living', 'bedrooms']
my_output = 'price'
# features_matrix has a leading constant column followed by the two features.
features_matrix, output = get_numpy_data(data, features, my_output)


In [65]:
# Replace the matrix with its column-normalized version; `norms` holds the
# column norms returned by normalize_features.
features_matrix, norms = normalize_features(features_matrix)

In [66]:
# Starting weights, ordered as the matrix columns: [constant, sqft_living, bedrooms].
initial_weights = [1.,4.,1.]

In [67]:
# Predictions of the linear model under the initial weights.
prediction = predict_outcome(features_matrix, initial_weights)

In [68]:
# ro[i] = SUM[ feature_i * (output - prediction + w[i] * feature_i) ] for each
# of the three columns (constant, sqft_living, bedrooms).
ro = [np.sum(features_matrix[:,i]*(output - prediction + initial_weights[i]*features_matrix[:,i])) for i in range(3)]

In [69]:
# Display the three ro values.
ro

[79400300.0145229, 87939470.82325175, 80966698.66623947]

In [79]:
# The coordinate step sets a non-intercept weight to 0 when
# -l1_penalty/2 <= ro_i <= l1_penalty/2.  For the ro values computed from
# initial_weights, a penalty between 2*ro[2] + 1 and 2*ro[1] - 1 therefore
# zeroes weight 2 on that step while leaving weight 1 non-zero.
l1_penalty_range = [ro[2]*2+1 , ro[1]*2-1]
print(l1_penalty_range)

[161933398.33247894, 175878940.6465035]


In [80]:
def lasso_coordinate_descent_step(i, features_matrix, output, weights, l1_penalty):
    """Return the updated value of weight ``i`` for one LASSO coordinate step.

    Computes ``ro_i = SUM[ feature_i * (output - prediction + weights[i] * feature_i) ]``
    and soft-thresholds it at ``l1_penalty / 2``.  The intercept (``i == 0``)
    is not regularized and is set to ``ro_i`` directly.
    """
    feature_i = features_matrix[:, i]
    # Residual with the contribution of feature i added back in.
    partial_residual = output - predict_outcome(features_matrix, weights) + weights[i] * feature_i
    rho = np.sum(feature_i * partial_residual)

    # Intercept: no regularization.
    if i == 0:
        return rho

    threshold = l1_penalty / 2.
    if rho < -threshold:
        return rho + threshold
    if rho > threshold:
        return rho - threshold
    # Inside the [-threshold, threshold] band the weight is set to zero.
    return 0.

In [117]:
# Check a single coordinate step (i=1) on a hand-built 2x2 feature matrix
# with output [1, 1], weights [1, 4] and l1_penalty 0.1.
# should print 0.425558846691
import math
print(lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],[2./math.sqrt(13),3./math.sqrt(10)]]), 
                                   np.array([1., 1.]), np.array([1., 4.]), 0.1))

0.4255588466910251


In [140]:
def lasso_cyclical_coordinate_descent(features_matrix, output, initial_weights, l1_penalty,
                                      tolerance, max_iterations=None):
    """Fit LASSO weights by cyclical coordinate descent.

    Repeatedly sweeps over all coordinates, updating each weight with
    ``lasso_coordinate_descent_step`` using the most recent values of the
    other weights.  Stops after the first full sweep in which the largest
    change of any single weight is below ``tolerance``.

    Parameters
    ----------
    features_matrix : numpy.ndarray
        Feature matrix passed through to ``lasso_coordinate_descent_step``.
    output : numpy.ndarray
        Target values.
    initial_weights : sequence of numbers
        Starting weights (list, tuple or array); never modified.
    l1_penalty : float
        L1 penalty passed through to ``lasso_coordinate_descent_step``.
    tolerance : float
        Convergence threshold on the largest per-sweep weight change.
    max_iterations : int, optional
        Upper bound on the number of full sweeps.  ``None`` (default) keeps
        iterating until the tolerance criterion is met.

    Returns
    -------
    list or numpy.ndarray
        The fitted weights: a list when ``initial_weights`` was a list,
        otherwise a float ndarray.

    Notes
    -----
    Prints the number of full sweeps performed.
    """
    # Work on a float copy: this leaves the caller's weights untouched and
    # keeps updates from being truncated when integer weights are passed.
    weights = np.array(initial_weights, dtype=float)
    n_sweeps = 0
    converged = False

    while not converged:
        max_change = 0.0

        for i in range(len(weights)):
            old_weight = weights[i]
            # Update weight i using the most recent values of the other weights.
            weights[i] = lasso_coordinate_descent_step(i, features_matrix, output, weights, l1_penalty)
            # Track the largest change seen during this sweep.
            max_change = max(max_change, abs(old_weight - weights[i]))

        # Convergence is checked only after a complete sweep.
        n_sweeps += 1
        converged = max_change < tolerance
        if max_iterations is not None and n_sweeps >= max_iterations:
            break

    print(n_sweeps)
    # Return the same container kind the caller passed in.
    if isinstance(initial_weights, list):
        return weights.tolist()
    return weights

In [141]:
# Settings for the two-feature LASSO fit: start from zero weights.
initial_weights = [0.,0.,0.]
l1_penalty = 1e7
tolerance = 1.0
features = ['sqft_living', 'bedrooms']
my_output = 'price'
# Rebuild the feature matrix (constant column first) and target from `data`.
features_matrix, output = get_numpy_data(data, features, my_output)

# Column-normalize the matrix; `norms` holds the column norms.
features_matrix, norms = normalize_features(features_matrix)

In [142]:
# Run cyclical coordinate descent; the function prints the number of sweeps.
weights=lasso_cyclical_coordinate_descent(features_matrix,output,initial_weights, l1_penalty, tolerance)

93


In [143]:
# Predictions of the fitted model on the normalized feature matrix.
prediction = predict_outcome(features_matrix, weights)

In [144]:
# Residual sum of squares between the actual prices and the predictions.
RSS = ((data['price']-prediction)**2).sum()

In [146]:
# Display the residual sum of squares.
RSS

1630492476715386.5

In [145]:
# Display the fitted weights, ordered [constant, sqft_living, bedrooms].
weights 

[21624997.959519103, 63157247.20788956, 0.0]

# More features

In [163]:
# Feature columns used for the larger LASSO model.
all_features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
]
# Target column.
my_output = 'price'


In [164]:
# Training design matrix (constant column first) and target for all features.
(all_feature_train_matrix, all_train_output) = get_numpy_data(train, all_features, my_output)
# Column-normalized training matrix; `norms` holds the column norms.
normalized_all_train_feature_matrix, norms = normalize_features(all_feature_train_matrix)

In [172]:
# Settings for the all-features fit with l1_penalty = 1e7.
l1_penalty=1e7
# One weight per feature plus one for the constant column.
initial_weights = np.zeros(len(all_features)+1)
tolerance = 1.0
my_output = 'price'
# Training matrix / target and the column-normalized matrix; `norms` holds the
# column norms of the training matrix.
features_matrix, output = get_numpy_data(train, all_features, my_output)
normalized_feature_matrix, norms = normalize_features(features_matrix)


In [173]:
# Fit on the normalized training matrix with l1_penalty = 1e7.
weights1e7=lasso_cyclical_coordinate_descent(normalized_feature_matrix,output,initial_weights, l1_penalty, tolerance)

93


In [174]:
# Show the fitted weights (index 0 is the constant, then all_features in order).
print(weights1e7)

[24429600.23440313        0.                0.         48389174.77154895
        0.                0.          3317511.21492165  7329961.81171426
        0.                0.                0.                0.
        0.                0.        ]


In [175]:
# Refit with a larger penalty (1e8), restarting from zero weights; reuses
# normalized_feature_matrix and output from the earlier cell.
l1_penalty=1e8
initial_weights = np.zeros(len(all_features)+1)
tolerance = 1.0
weights1e8=lasso_cyclical_coordinate_descent(normalized_feature_matrix,output,initial_weights, l1_penalty, tolerance)

2


In [176]:
# Display the weights fitted with l1_penalty = 1e8.
weights1e8

array([71114625.71488702,        0.        ,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ])

In [177]:
# Smaller penalty (1e4) with a much looser convergence tolerance (5e5).
l1_penalty = 1e4
tolerance = 5e5

In [178]:
# Restart from zero weights and fit with l1_penalty = 1e4.
initial_weights = np.zeros( len(all_features)+1 )
weights1e4 =lasso_cyclical_coordinate_descent(normalized_feature_matrix,output,initial_weights, l1_penalty, tolerance)


90


In [179]:
# Display the weights fitted with l1_penalty = 1e4.
weights1e4

array([ 78564738.34156768, -22097398.92430537,  12791071.87278518,
        93808088.09281202,  -2013172.75704955,  -4219184.9326502 ,
         6482842.81753507,   7127408.53480688,   5001664.85469639,
        14327518.43714052, -15770959.15237399,  -5159591.22213149,
       -84495341.76843642,   2824439.49703684])

In [180]:
# Divide each fitted weight by the norm of its feature column so the weights
# can be applied to feature matrices that have not been normalized.
normalized_weights1e4 = weights1e4/norms
normalized_weights1e7 = weights1e7/norms
normalized_weights1e8 = weights1e8/norms

In [181]:
# Index 3 is 'sqft_living' (0 = constant, 1 = bedrooms, 2 = bathrooms).
print(normalized_weights1e7[3])

161.3174576461176


In [182]:
# Test-set design matrix (constant column first) and target prices.
(test_feature_matrix, test_output) = get_numpy_data(test, all_features, 'price')

In [183]:
# Test-set design matrix and target under the names used by the prediction
# and RSS cells.
(all_feature_test_matrix, all_test_output) = get_numpy_data(test, all_features, my_output)

In [184]:
# Predict test prices: the rescaled weights are applied to the raw
# (un-normalized) test feature matrix.
weights1e4_predictions = predict_outcome(all_feature_test_matrix, normalized_weights1e4)


In [None]:
# weights1e4: test-set residual sum of squares for the l1_penalty = 1e4 model
residuals = weights1e4_predictions - all_test_output
squared = residuals**2
weights1e4_RSS = squared.sum()
print(weights1e4_RSS)

228459958971393.22


In [186]:
# weights1e7: test-set residual sum of squares for the l1_penalty = 1e7 model
weights1e7_predictions = predict_outcome(all_feature_test_matrix, normalized_weights1e7)
residuals = weights1e7_predictions - all_test_output
squared = residuals**2
weights1e7_RSS = squared.sum()
print(weights1e7_RSS)

275962075920366.8


In [187]:
# weights1e8: test-set residual sum of squares for the l1_penalty = 1e8 model
weights1e8_predictions = predict_outcome(all_feature_test_matrix, normalized_weights1e8)
residuals = weights1e8_predictions - all_test_output
squared = residuals**2
weights1e8_RSS = squared.sum()
print(weights1e8_RSS)

537166151497322.75
