In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib as plt
%matplotlib inline
khouse_train = pd.read_csv('kc_house_train_data.csv',dtype = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int})
khouse_test = pd.read_csv('kc_house_test_data.csv',dtype = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int})

In [2]:
def simple_linear_regression(input_feature, output):
    x_x = input_feature*input_feature
    x_y = input_feature*output
    x=input_feature
    y=output
    slope = (x_y.sum()-y.sum()*x.sum()/x.size)/(x_x.sum()-x.sum()**2/x.size)
    intercept = y.mean()-slope*x.mean()
    return(intercept, slope)


def get_regression_predictions(input_feature, intercept,slope):
    predicted_output = input_feature*slope+intercept
    return(predicted_output)


def get_residual_sum_of_squares(input_feature, output, intercept,slope):
    RSS = ((output-input_feature*slope-intercept)**2).sum()
    return(RSS)


def RSS(y,y_est):
    return ((y-y_est)**2).sum()

In [3]:
def set_data(features, output):
    df = pd.DataFrame(features).transpose()
    df['constant'] = 1
    df.set_axis([i for i in range(len(features)+1)],axis='columns',inplace=True)
    return (df, output)


def predict_outcome(feature_matrix, weights):
    return (feature_matrix*weights).sum(axis=1)


def feature_derivative(errors, feature):
    _ = -2*feature.transpose()*errors
    return _.sum(axis=1).reset_index()[0]


def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = initial_weights
    i=1
    while not converged:
        print("\n",i)
        i+=1
        print(weights[0],weights[1])
        
        prediction = predict_outcome(feature_matrix, weights)
        errors = output-prediction
        gradient_of_RSS = feature_derivative(errors, feature_matrix)
        print(gradient_of_RSS)
        
        weights = weights - step_size*gradient_of_RSS
        g_value = np.sqrt(np.square(gradient_of_RSS).sum())
        if g_value < tolerance:
            converged = True
    return weights


def regression_gradient_descent2(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = initial_weights
    i=1
    while not converged:
        print(weights)
        prediction = predict_outcome(feature_matrix, weights)
        errors = output-prediction
        gradient_sum_squares = 0
        
        for i in range(len(weights)):
            derivative = -2*(feature_matrix.iloc[:,i]*errors).sum()
            gradient_sum_squares += np.square(derivative)
            weights[i] = weights[i]-step_size*derivative
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
        print(gradient_magnitude)
    return(weights)

## 兩個bug experience:
    1. pd.df乘法會拿兩矩陣的相同column name乘，有名字乘沒名字會爆炸
    2. dtype會影響結果，pd與np的dtype不盡相同

In [8]:
simple_features = [khouse_train['sqft_living']]
(simple_feature_matrix, output) = set_data(simple_features, khouse_train['price'])

initial_weights = np.array([1.,-47000.])
step_size = 4e-12
tolerance = 1e9
simple_weights = regression_gradient_descent2(simple_feature_matrix, output,initial_weights, step_size, tolerance)

[ 1.0e+00 -4.7e+04]
50551530784973.43
[   203.20610681 -46999.91874209]
14163541178533.521
[   259.86026695 -46999.89597714]
3968344689091.9404
[   275.73364443 -46999.88960076]
1111851857734.1477
[   280.1810515  -46999.88781613]
311518945656.9479
[   281.42712718 -46999.88731802]
87281460052.92308
[   281.776253   -46999.88718037]
24454542420.467026
[   281.87407116 -46999.88714371]
6851680150.731333
[   281.90147788 -46999.88713535]
1919705650.4940245
[   281.9091567  -46999.88713492]
537864010.0430896


In [9]:
# model one: 'sqrt_living' -> 'price'
(test_feature_matrix, test_output) = set_data([khouse_test['sqft_living']],khouse_test['price'])
(test_feature_matrix.loc[0]*simple_weights).sum()

356133.2835291987

In [10]:
RSS(test_output, predict_outcome(test_feature_matrix, simple_weights))

275400060699218.56

In [11]:
simple_weights

array([   281.91130816, -46999.8871367 ])

In [11]:
test_feature_matrix.loc[0]*simple_weights

0    347796.175797
1     39587.277847
2         2.075222
Name: 0, dtype: float64

In [5]:
# model two: 'sqrt_living', 'sqrt_living15' -> 'price' 
(test_feature_matrix, test_output) = set_data([khouse_test['sqft_living'],khouse_test['sqft_living15']],khouse_test['price'])
(test_feature_matrix.loc[0]*simple_weights).sum()

366651.4116294939

In [6]:
test_feature_matrix.loc[0]*simple_weights

0    350453.822954
1    116197.557524
2    -99999.968849
Name: 0, dtype: float64

In [127]:
test_output.loc[0]

310000.0

In [7]:
RSS(test_output, predict_outcome(test_feature_matrix, simple_weights))

270263443629803.56

In [None]:
RSS of model1: 275400060699218.56
RSS of model2: 270263443629803.56