## Leren: Programming Assignment 2
#### Wim Berkelmans, 10793674
#### Philip Bouman, 10668667


### 1. Read data from file

In [337]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import numpy.linalg as linalg

def loadData(feature, filename='HousesRegr.csv'):
    """Return one column of the housing data file as a 1-D array.

    Parameters
    ----------
    feature : str
        Name of the column to extract: 'MLS', 'Bedrooms', 'Bathrooms',
        'Size' or 'Price' (the first three real features are the inputs,
        'Price' is the target).
    filename : str, optional
        Path of the semicolon-separated data file; its first line is
        skipped as a header.

    Returns
    -------
    numpy.ndarray
        The values of the requested column, one entry per data row.

    Raises
    ------
    ValueError
        If `feature` is not one of the known column names.
    """
    columns = ['MLS', 'Bedrooms', 'Bathrooms', 'Size', 'Price']
    if feature not in columns:
        raise ValueError('unknown feature %r; expected one of: %s'
                         % (feature, ', '.join(columns)))
    # ndmin=2 keeps the array two-dimensional even when the file holds a
    # single data row, so the column slice below always works
    data = np.loadtxt(filename, skiprows=1, delimiter=';', ndmin=2)
    return data[:, columns.index(feature)]

# Pull every column used below out of the data file, in file order.
bedroom, bathroom, size, price = map(
    loadData, ('Bedrooms', 'Bathrooms', 'Size', 'Price'))

# build the design matrix: a column of ones (the x_0's) followed by the feature columns
def designMatrix(n, filename='HousesRegr.csv'):
    """Build the design matrix for a model with `n` parameters.

    Column 0 holds the constant x_0 = 1. Columns 1 .. n-1 are copied from
    the same-numbered columns of the data file (file column 0 is not used).

    Parameters
    ----------
    n : int
        Number of columns of the result, including the x_0 column.
    filename : str, optional
        Path of the semicolon-separated data file; its first line is
        skipped as a header.

    Returns
    -------
    numpy.ndarray
        Array of shape (m, n), where m is the number of data rows.

    Raises
    ------
    IndexError
        If `n` is larger than the number of columns in the file.
    """
    # ndmin=2 keeps a single-row file two-dimensional
    data = np.loadtxt(filename, skiprows=1, delimiter=';', ndmin=2)
    m, n_columns = data.shape
    X = np.ones((m, n))
    if m:
        if n > n_columns:
            raise IndexError('n = %d exceeds the %d columns in %s'
                             % (n, n_columns, filename))
        # one slice assignment copies all requested feature columns;
        # column 0 keeps its ones
        X[:, 1:n] = data[:, 1:n]
    return X

# turn the 1-D array of target values into a column vector
def targetVec(y):
    """Return `y` with a new length-1 axis inserted after its first axis.

    For a 1-D array of m values this yields an (m, 1) column vector.
    """
    # None is the same object as np.newaxis
    return y[:, None]

In [494]:
def regressionVec(X, Y):
    """Fit linear regression in closed form (normal equation).

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix, one row per training example.
    Y : array-like, shape (m,) or (m, 1)
        Target values.

    Returns
    -------
    numpy.ndarray
        Least-squares parameters, shape (n,) or (n, 1) to match `Y`.
    """
    # pinv(X) equals inv(X^T X) X^T when the columns of X are independent,
    # but it also gives a (minimum-norm) least-squares solution when X^T X
    # is singular, e.g. with duplicated or collinear features, where an
    # explicit inverse would fail.
    return np.dot(linalg.pinv(X), Y)

# Small hand-made data set for checking regressionVec by hand:
# X = np.array([[1,2,3],[1,4,5],[1,4,3]])
# Y = np.array([6,6,10])

# inputs: x_0, Bedrooms and Bathrooms; target: Price as a column vector
X = designMatrix(3)
Y = targetVec(price)


Theta = regressionVec(X, Y)
# parenthesized form prints the same on Python 2 and also runs on Python 3
print(Theta)

[[ -64463.72009442]
 [ -32971.93736837]
 [ 234043.11668245]]


In [495]:
def calcCostVec(X, Y, Theta):
    """Return the squared-error cost J = sum((X . Theta - Y)^2) / (2 m).

    m is taken from len(Y). The squared residuals are summed over the
    training examples, so a column-vector Y of shape (m, 1) gives a
    length-1 array rather than a bare scalar.
    """
    n_examples = len(Y)
    residuals = np.dot(X, Theta) - Y
    squared = residuals * residuals

    # a dot product with a vector of ones adds up the squared residuals
    # without an explicit loop
    total = np.dot(np.ones(n_examples), squared)
    return total / (2.0 * n_examples)

# cost of the closed-form solution on the training data
cost = calcCostVec(X, Y, Theta)
# parenthesized form prints the same on Python 2 and also runs on Python 3
print(cost)

[  4.41188991e+10]


In [516]:
# Version 1
def gradientIter(X, Y, Thetas, alpha):
    """Perform one batch gradient-descent step for linear regression.

    All parameters are updated simultaneously from the old values:

        theta_i := theta_i - alpha / m * sum_j (h(x_j) - y_j) * x_j[i]

    with h(x_j) = sum_k theta_k * x_j[k].

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix, one row per training example.
    Y : array-like, m values
        Target values; flat or column-vector input is accepted.
    Thetas : array-like
        Current parameters. Only the first n entries are used, where n is
        the number of columns of `X`.
    alpha : float
        Learning rate.

    Returns
    -------
    numpy.ndarray
        The n updated parameters as a 1-D array.
    """
    X = np.asarray(X, dtype=float)
    # number of training examples, number of features
    m, n = X.shape
    # flatten so flat sequences and column vectors behave the same
    Y = np.asarray(Y, dtype=float).ravel()
    Thetas = np.asarray(Thetas, dtype=float).ravel()[:n]

    # prediction error of every training example, computed once
    errors = np.dot(X, Thetas) - Y
    # entry i is sum_j errors[j] * X[j, i]: the i-th partial derivative
    # of the cost, up to the 1/m factor applied below
    gradient = np.dot(X.T, errors)
    return Thetas - alpha * 1.0 / m * gradient

# Version 2
def gradientIter2(X, Y, Thetas, alpha):
    """Perform one batch gradient-descent step for linear regression.

    Same update rule as gradientIter, organised per training example: the
    contribution of each example to each partial derivative is stored in
    an (m, n) matrix first and summed over the examples afterwards.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix, one row per training example.
    Y : array-like, m values
        Target values; flat or column-vector input is accepted.
    Thetas : array-like
        Current parameters. Only the first n entries are used, where n is
        the number of columns of `X`.
    alpha : float
        Learning rate.

    Returns
    -------
    numpy.ndarray
        The n updated parameters as a 1-D array.
    """
    X = np.asarray(X, dtype=float)
    m, n = X.shape
    # flatten so a column-vector Thetas cannot broadcast against a row of
    # X into an (n, n) product
    Y = np.asarray(Y, dtype=float).ravel()
    Thetas = np.asarray(Thetas, dtype=float).ravel()[:n]

    # prediction error per training example; it does not depend on the
    # feature index, so it is computed once rather than once per feature
    errors = np.dot(X, Thetas) - Y
    # row i holds example i's contribution to every partial derivative
    theta_temp = errors[:, np.newaxis] * X

    # sum over the examples, then apply learning rate and 1/m normalization
    return Thetas - alpha * 1.0 / m * theta_temp.sum(axis=0)
    
def regressionIter(X, Y, Thetas, alpha, n_iter):
    """Run `n_iter` gradient-descent steps using gradientIter.

    Parameters
    ----------
    X, Y : arrays
        Design matrix and target values, passed through to gradientIter.
    Thetas : sequence of float
        Initial parameters.
    alpha : float
        Learning rate.
    n_iter : int
        Number of steps to take.

    Returns
    -------
    The parameters after the last step. When `n_iter` is zero or negative
    no step is taken and `Thetas` is returned unchanged.
    """
    while n_iter > 0:
        # rebinding Thetas directly means there is always a value to
        # return, even when the loop body never runs
        Thetas = gradientIter(X, Y, Thetas, alpha)
        n_iter -= 1
    return Thetas

def regressionIter2(X, Y, Thetas, alpha, n_iter):
    """Run `n_iter` gradient-descent steps using gradientIter2 and record
    the cost along the way.

    Parameters
    ----------
    X, Y : arrays
        Design matrix and target values, passed through to gradientIter2
        and calcCost.
    Thetas : sequence of float
        Initial parameters.
    alpha : float
        Learning rate.
    n_iter : int
        Number of steps to take.

    Returns
    -------
    (thetas, cost)
        `thetas` are the parameters after the last step (the unchanged
        input when `n_iter` is 0). `cost[i]` is the cost of the parameters
        in use *before* step i was applied.
    """
    cost = np.zeros(n_iter)
    for i in range(n_iter):
        cost[i] = calcCost(Thetas, X, Y)
        Thetas = gradientIter2(X, Y, Thetas, alpha)

    return Thetas, cost


# Toy data set used to check the iterative implementation by hand.
X = np.array([[1,2,3],[1,4,5],[1,4,3]])
Y = np.array([6,6,10])

# one initial value per column of X
Thetas = [0.2,0.2,0.2]

# two gradient-descent steps with learning rate 0.01
vals = regressionIter2(X, Y, Thetas, 0.01, 2)
# parenthesized form prints the same on Python 2 and also runs on Python 3
print(vals[0])
# print(vals[1])

# Same run with version 1 of the gradient step:
# val = regressionIter(X, Y, Thetas, 0.01, 2)
# print(val)


[ 0.30023111  0.54391111  0.54176   ]


In [515]:
def calcCost(Thetas, X, Y):
    """Return the squared-error cost J = sum_i (h(x_i) - y_i)^2 / (2 m).

    The sum runs over all m training examples (rows of `X`), with
    h(x_i) = sum_k theta_k * x_i[k].

    Parameters
    ----------
    Thetas : array-like, n values
        Model parameters.
    X : array-like, shape (m, >= n)
        Design matrix; only its first n columns are used.
    Y : array-like, m values
        Target values; flat or column-vector input is accepted.

    Returns
    -------
    float
        The cost of `Thetas` on the given data.
    """
    Thetas = np.asarray(Thetas, dtype=float).ravel()
    n = len(Thetas)
    X = np.asarray(X, dtype=float)[:, :n]
    Y = np.asarray(Y, dtype=float).ravel()
    m = len(X)

    # one residual per training example -- all m of them, not just the
    # first n
    errors = np.dot(X, Thetas) - Y
    return np.dot(errors, errors) / (2.0 * m)