## Gradient descent for single feature linear regression

In [None]:
import math, copy
import numpy as np
import matplotlib.pyplot as plt
from lab_utils_uni import plt_house_x, plt_contour_wgrad, plt_divergence, plt_gradients

We use a simple data set with only two data points: a house with 1000 square feet (sqft) that sold for `$300,000` and a house with 2000 square feet that sold for `$500,000`. These two points constitute our *data set* (also called the *training set*). Here, size is measured in units of 1000 sqft and price in units of 1000s of dollars.

| Size (1000 sqft)     | Price (1000s of dollars) |
| -------------------| ------------------------ |
| 1.0               | 300                      |
| 2.0               | 500                      |

In [None]:
# Training examples: house size (1000 sqft) paired with sale price (1000s of dollars).
x_train = np.array([1.0, 2.0])
y_train = np.array([300.0, 500.0])

print(x_train, y_train)

In [None]:
def compute_cost_function(x, y, w, b):
  """Return the squared-error cost of the line f(x) = w*x + b on the data.

  Args:
    x: input values; only x.shape[0] and x[i] are used.
    y: target values, indexed in step with x.
    w: slope of the model.
    b: intercept of the model.

  Returns:
    The sum of squared prediction errors divided by 2*m, where m = x.shape[0].
  """
  n_examples = x.shape[0]

  # Accumulate the squared residuals example by example, in index order.
  squared_error_total = sum((w*x[i] + b - y[i])**2 for i in range(n_examples))

  return (1/(2*n_examples))*squared_error_total

In [None]:
def compute_gradient(x, y, w, b):
  """Return the gradient of the squared-error cost with respect to w and b.

  Args:
    x: input values; only x.shape[0] and x[i] are used.
    y: target values, indexed in step with x.
    w: slope of the model.
    b: intercept of the model.

  Returns:
    (dj_dw, dj_db): the per-example terms (f - y)*x and (f - y), where
    f = w*x + b, each averaged over the m = x.shape[0] examples.
  """
  n_examples = x.shape[0]
  grad_w = 0
  grad_b = 0

  for idx in range(n_examples):
    # Prediction error for this example; it feeds both partial derivatives.
    residual = w*x[idx] + b - y[idx]
    grad_w += residual*x[idx]
    grad_b += residual

  return grad_w/n_examples, grad_b/n_examples

In [None]:
# Hand the training data plus the cost and gradient functions to the lab's
# plotting helper (imported from lab_utils_uni).
# NOTE(review): presumably plots the cost and its gradients over a range of
# w and b -- confirm against lab_utils_uni.
plt_gradients(x_train, y_train, compute_cost_function, compute_gradient)
plt.show()

In [None]:
def gradient_descent(x, y, w_in, b_in, alpha, num_iters, cost_function, gradient_function):
  """Fit w and b by batch gradient descent.

  Args:
    x, y: training data, passed through unchanged to cost_function and
      gradient_function.
    w_in, b_in: initial parameter values.
    alpha: learning rate (step size).
    num_iters: number of update steps to run.
    cost_function: callable (x, y, w, b) -> cost.
    gradient_function: callable (x, y, w, b) -> (dj_dw, dj_db).

  Returns:
    (w, b, J_history, p_history): the final parameters, plus the cost and
    the [w, b] pair recorded after each of (at most) the first 100000 steps.
  """
  max_history = 100000  # cap on recorded steps; keeps the history lists bounded

  J_history = []
  p_history = []
  # Plain rebinding is enough here: the updates below build new values and
  # never modify w_in / b_in in place.
  w = w_in
  b = b_in

  # Report roughly 10 times over the run (every step when num_iters < 10).
  log_every = math.ceil(num_iters/10)

  for i in range(num_iters):
    # Use the caller-supplied gradient routine so alternative implementations
    # can be plugged in.
    dj_dw, dj_db = gradient_function(x, y, w, b)

    # Both updates use gradients evaluated at the same (w, b).
    b = b - alpha*dj_db
    w = w - alpha*dj_dw

    should_log = i % log_every == 0

    # Evaluate the cost only when it is recorded or printed, so the printed
    # value is always the current one, even past the history cap.
    if i < max_history or should_log:
      cost = cost_function(x, y, w, b)

    if i < max_history:
      J_history.append(cost)
      p_history.append([w, b])

    if should_log:
      print(f'Iteration {i:4}: Cost {cost:0.2e} ',
            f'dj_dw: {dj_dw: 0.3e}, dj_db: {dj_db: 0.3e}',
            f'w: {w: 0.3e}, b: {b: 0.3e}')

  return w, b, J_history, p_history

In [None]:
# Start from w = b = 0 and take 10000 steps with a learning rate of 0.01.
w_init = 0
b_init = 0

iterations = 10000
tmp_alpha = 1.0e-2

w_final, b_final, J_hist, p_hist = gradient_descent(
    x_train, y_train, w_init, b_init, tmp_alpha, iterations,
    compute_cost_function, compute_gradient)

print(f'(w, b) found by gradient descent: ({w_final: 8.4f}, {b_final: 8.4f})')

### Cost versus iterations of gradient descent

In [None]:
# Two panels: the first 100 recorded steps, and everything from step 1000 onward.
fig, (ax1, ax2) = plt.subplots(1, 2, constrained_layout=True, figsize=(12, 4))

ax1.plot(J_hist[:100])
# Offset the x values by 1000 so the axis shows the actual iteration number.
ax2.plot(1000 + np.arange(len(J_hist[1000:])), J_hist[1000:])

ax1.set(title='Cost vs iterations(start)', xlabel='Iteration step', ylabel='Cost')
ax2.set(title='Cost vs iterations(end)', xlabel='Iteration step', ylabel='Cost')

plt.show()

In [None]:
# Draw the path taken by gradient descent on top of the cost contours.
fig, ax = plt.subplots(figsize=(12, 6))
plt_contour_wgrad(x_train, y_train, p_hist, ax)

Above, the contour plot shows the $cost(w,b)$ over a range of $w$ and $b$. Cost levels are represented by the rings. Overlaid, using red arrows, is the path of gradient descent. Here are some things to note:
- The path makes steady (monotonic) progress toward its goal.
- Initial steps are much larger than the steps near the goal.

In [None]:
# Zoom in on the region around w = 200, b = 100 to see the final steps
# of gradient descent.
fig, ax = plt.subplots(figsize=(12, 4))
plt_contour_wgrad(
    x_train, y_train, p_hist, ax,
    w_range=[180, 220, 0.5],
    b_range=[80, 120, 0.5],
    contours=[1, 5, 10, 20],
    resolution=0.5,
)

In [None]:
# Repeat the run from w = b = 0, this time with a much larger learning
# rate (0.8) and only 10 steps.
w_init = 0
b_init = 0

iterations = 10
tmp_alpha = 8.0e-1

w_final, b_final, J_hist, p_hist = gradient_descent(
    x_train, y_train, w_init, b_init, tmp_alpha, iterations,
    compute_cost_function, compute_gradient)

In [None]:
# Pass the parameter and cost histories from the large-learning-rate run,
# together with the training data, to the lab's divergence plotting helper
# (imported from lab_utils_uni).
plt_divergence(p_hist, J_hist, x_train, y_train)
plt.show()