In [1]:
import random 
import numpy as np
import time
import matplotlib.pyplot as plt

# make matplotlib figures appear inline
%matplotlib inline 

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Load the data 
# The data is copied from the linear regression exercise of machine learning course from Andrew Ng
# The file ex1data2.txt contains a training set of housing prices in Portland, Oregon. The first column is the 
# size of the house (in square feet), the second column is the number of bedrooms, 
# and the third column is the price of the house, which we want to predict.
file_name = 'dataset/ex1data2.txt'
with open(file_name, 'r') as f:
    house_data = np.loadtxt(file_name, delimiter=',')
    
num_sample = house_data.shape[0] # number of all the samples
X = house_data[:, :2]
y = house_data[:, 2].reshape((-1,1))

# Add intercept term or bias to X
print('X shape: ', X.shape)
print('y shape: ', y.shape)
print('First 10 examples from the dataset')
print(house_data[0:10, :])

X shape:  (47, 2)
y shape:  (47, 1)
First 10 examples from the dataset
[[2.10400e+03 3.00000e+00 3.99900e+05]
 [1.60000e+03 3.00000e+00 3.29900e+05]
 [2.40000e+03 3.00000e+00 3.69000e+05]
 [1.41600e+03 2.00000e+00 2.32000e+05]
 [3.00000e+03 4.00000e+00 5.39900e+05]
 [1.98500e+03 4.00000e+00 2.99900e+05]
 [1.53400e+03 3.00000e+00 3.14900e+05]
 [1.42700e+03 3.00000e+00 1.98999e+05]
 [1.38000e+03 3.00000e+00 2.12000e+05]
 [1.49400e+03 3.00000e+00 2.42500e+05]]


In [3]:
# Feature Normalization
# By looking at the data, features differ by orders of magnitude
# we need to perform feature scaling to make gradient descent converge much more quickly
# Normalization: (x - mean) / std
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
# Add bias dimension
X = np.hstack((X, np.ones((num_sample, 1))))
print('First 10 examples from the dataset')
print(X[0:10, :])
print('First 10 examples target')
print(y[0:10])

First 10 examples from the dataset
[[ 0.13141542 -0.22609337  1.        ]
 [-0.5096407  -0.22609337  1.        ]
 [ 0.5079087  -0.22609337  1.        ]
 [-0.74367706 -1.5543919   1.        ]
 [ 1.27107075  1.10220517  1.        ]
 [-0.01994505  1.10220517  1.        ]
 [-0.59358852 -0.22609337  1.        ]
 [-0.72968575 -0.22609337  1.        ]
 [-0.78946678 -0.22609337  1.        ]
 [-0.64446599 -0.22609337  1.        ]]
First 10 examples target
[[399900.]
 [329900.]
 [369000.]
 [232000.]
 [539900.]
 [299900.]
 [314900.]
 [198999.]
 [212000.]
 [242500.]]


In [4]:
# Now train linear regression model by BGD and SGD algorithms 
# load our implementation
from linreg import LinearRegression
lr_bgd = LinearRegression()
tic = time.time()
losses_bgd = lr_bgd.train(X, y, method='bgd', learning_rate=1e-2, num_iters=1000, verbose=True)
toc = time.time()
print('Traning time for BGD with vectorized version is %f \n' % (toc - tic))
print("BGD coefficient:")
print(lr_bgd.W)

lr_sgd = LinearRegression()
tic = time.time()
losses_sgd = lr_sgd.train(X, y, method='sgd', learning_rate=1e-2, num_iters=3000, verbose=True)
toc = time.time()
print('Traning time for SGD with vectorized version is %f' % (toc - tic))
print("SGD coefficient:")
print(lr_sgd.W)

iteration 0 / 1000 : loss 3082802778528.421875
iteration 100 / 1000 : loss 96034162378.332947
iteration 200 / 1000 : loss 96034162378.332977
iteration 300 / 1000 : loss 96034162378.332977
iteration 400 / 1000 : loss 96034162378.332977
iteration 500 / 1000 : loss 96034162378.332977
iteration 600 / 1000 : loss 96034162378.332977
iteration 700 / 1000 : loss 96034162378.332977
iteration 800 / 1000 : loss 96034162378.332977
iteration 900 / 1000 : loss 96034162378.332977
Traning time for BGD with vectorized version is 0.027220 

BGD coefficient:
[[109447.79646964]
 [ -6578.35485416]
 [340412.65957447]]
iteration 0 / 3000 : loss 60204499976.354958
iteration 300 / 3000 : loss 125093460.625119
iteration 600 / 3000 : loss 1103992656.372859
iteration 900 / 3000 : loss 7894993.529553
iteration 1200 / 3000 : loss 1188550840.746390
iteration 1500 / 3000 : loss 1878538418.457906
iteration 1800 / 3000 : loss 19454657539.127789
iteration 2100 / 3000 : loss 824715444.396756
iteration 2400 / 3000 : loss 

In [5]:
# use sklearn to validate
from sklearn import linear_model
sklr = linear_model.LinearRegression()

sklr.fit(X,y)
print("sklearn coefficient:")
print(sklr.coef_)
print(sklr.intercept_)

sklearn coefficient:
[[109447.79646964  -6578.35485416      0.        ]]
[340412.65957447]
