In [1]:
# Mount Google Drive so files under /content/drive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


1. Build a linear regression model that predicts the land price using both the land_area and distance_to_city features. (Data: land_price_1.csv)

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read data from csv file
# NOTE(review): absolute Drive path; the cell only runs where this exact folder exists.
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/fund_ML/land_price_1.csv').to_numpy()

print(data.shape)

# Every column except the last is used as a feature; the last column is the target.
# Presumably the features are land_area and distance_to_city and the target is the
# land price -- TODO confirm the column order in the CSV.
x = data[:, :-1] # np(m, 2)
y = data[:, -1] # np(m)

print('x:', x.shape)
print('y:', y.shape)

(30, 3)
x: (30, 2)
y: (30,)


In [26]:
# Hypothesis: h(x) = th0*x0 + th1*x1 + th2*x2
def h(x, theta):
  '''
  Linear hypothesis: one prediction per row of x.

  x: np(m, 3), one sample per row
  theta: np(3), one weight per column of x
  return: np(m), the product x . theta flattened to 1-D
  '''
  # Turn theta into a column so the product is an explicit (m, 3) x (3, 1) multiply.
  theta_column = theta.reshape(-1, 1)
  predictions = np.dot(x, theta_column)
  # Collapse the (m, 1) result back to a flat vector.
  return predictions.flatten()

def cost(z, y):
  '''
  Mean squared error between predictions and targets.

  z: np(m), predicted values (e.g. the output of h)
  y: np(m), target values
  return: J = sum((z - y)^2) / m
  '''
  # m is taken from the arguments themselves so the result never depends on a
  # notebook-level variable that happens to exist in the kernel.
  m = y.shape[0]
  J = np.sum((z - y)**2) / m
  return J

def gradient(x, y, theta):
  '''
  Gradient of the mean squared error with respect to theta.

  x: np(m, 3), one sample per row
  y: np(m), target values
  theta: np(3), current weights
  return: np(3), (2 / m) * x^T (h(x, theta) - y)
  '''
  n_samples = x.shape[0]
  # Prediction error for every sample under the current weights.
  residual = h(x, theta) - y
  # Column-vector product gives np(3, 1); it is flattened before returning.
  grad_column = np.dot(x.T, residual.reshape(-1, 1)) * 2 / n_samples
  return grad_column.flatten()

### Feature Scaling

In [27]:
def feature_scaling(x):
  '''
  Mean-normalise every column of x: (x - column mean) / (column max - column min).

  x: np(m, n), one feature per column
  return: np(m, n), each column centred on 0 and divided by its range

  A column whose values are all equal has range 0; it is returned as all
  zeros instead of NaN. Dividing by np.std(x, axis=0) instead of the range
  would give standardisation rather than mean normalisation.
  '''
  mu = np.mean(x, axis=0) #np(n)
  value_range = np.max(x, axis=0) - np.min(x, axis=0)
  # For a constant column x - mu is already 0, so dividing by 1 keeps it at 0
  # and avoids the 0 / 0 = NaN that would otherwise propagate into training.
  safe_range = np.where(value_range == 0, 1, value_range)
  return (x - mu) / safe_range

# Scale the raw features, then prepend a column of ones so that theta[0]
# multiplies a constant 1 (the x0 term of the hypothesis).
x_scaled = feature_scaling(x)
x_scaled = np.concatenate((np.ones((x_scaled.shape[0], 1)), x_scaled), axis=1)
print(x_scaled.shape)

(30, 3)


### Training

In [30]:
# Gradient descent over the full data set, starting from theta = 0.
theta = np.zeros(3) # np(3)
alpha = 0.1 # step size applied to the gradient

# NOTE(review): `iter` shadows the Python builtin of the same name for the rest of the session.
iter = 1000

for i in range(iter):
  # The cost is evaluated with the current theta, i.e. before this iteration's update.
  z = h(x_scaled, theta)
  J = cost(z, y)
  print('iter:', i, 'J:', J)

  # Step against the gradient; += updates the theta array in place.
  g = gradient(x_scaled, y, theta)
  theta += -alpha * g


iter: 0 J: 35052.92066666667
iter: 1 J: 24989.42499783066
iter: 2 J: 18454.66270584425
iter: 3 J: 14181.805441363766
iter: 4 J: 11359.949997341486
iter: 5 J: 9469.990356057084
iter: 6 J: 8179.575601714343
iter: 7 J: 7275.882794725609
iter: 8 J: 6622.591428897571
iter: 9 J: 6132.346908802384
iter: 10 J: 5749.137006914896
iter: 11 J: 5437.012634377785
iter: 12 J: 5172.868978806926
iter: 13 J: 4941.825283072093
iter: 14 J: 4734.267760132368
iter: 15 J: 4543.956920547581
iter: 16 J: 4366.816129482998
iter: 17 J: 4200.15615575859
iter: 18 J: 4042.1787607724427
iter: 19 J: 3891.6588777140173
iter: 20 J: 3747.7410931457716
iter: 21 J: 3609.8092865033764
iter: 22 J: 3477.4030948936925
iter: 23 J: 3350.1643501470803
iter: 24 J: 3227.8027020167774
iter: 25 J: 3110.073524263397
iter: 26 J: 2996.7636853898603
iter: 27 J: 2887.682356215128
iter: 28 J: 2782.655044351629
iter: 29 J: 2681.51969709747
iter: 30 J: 2584.1241311853573
iter: 31 J: 2490.3243146707423
iter: 32 J: 2399.983197024481
iter: 33 J

### Evaluation

In [33]:
# Predict with the trained theta on the same x_scaled that was used for training.
z = h(x_scaled, theta)

# Print each prediction next to its target together with the absolute error.
for i, (zi, yi) in enumerate(zip(z, y)):
  print('i:', i, 'zi:', zi, 'yi:', yi, 'diff:', abs(zi - yi))


i: 0 zi: 16.194542966880746 yi: 15.7 diff: 0.49454296688074706
i: 1 zi: 10.244300290594186 yi: 11.3 diff: 1.0556997094058147
i: 2 zi: 41.954123276533096 yi: 42.0 diff: 0.04587672346690397
i: 3 zi: 35.08205638150936 yi: 35.0 diff: 0.08205638150936068
i: 4 zi: 38.08452147043654 yi: 37.7 diff: 0.38452147043653895
i: 5 zi: 75.44188555688574 yi: 75.5 diff: 0.05811444311426328
i: 6 zi: 77.72360539909812 yi: 77.1 diff: 0.6236053990981247
i: 7 zi: 89.34574938066557 yi: 88.7 diff: 0.6457493806655634
i: 8 zi: 121.07362432898378 yi: 122.0 diff: 0.926375671016217
i: 9 zi: 118.72908365769148 yi: 119.1 diff: 0.37091634230851867
i: 10 zi: 126.54228042675678 yi: 125.6 diff: 0.9422804267567813
i: 11 zi: 145.39965700865338 yi: 145.5 diff: 0.10034299134662206
i: 12 zi: 149.60354138024857 yi: 150.0 diff: 0.39645861975142793
i: 13 zi: 145.94801595908746 yi: 145.0 diff: 0.9480159590874564
i: 14 zi: 171.64641006071548 yi: 172.0 diff: 0.3535899392845181
i: 15 zi: 169.26720962165493 yi: 170.0 diff: 0.732790378