In [2]:
import numpy as np

#### Prediction: $\vec{h}_{\vec{w}}(\mathbf{X})$
- This is the model's prediction, calculated by $\mathbf{X}\vec{w}$

In [3]:
# Feature matrix X: one row per city. The first column is all ones
# (presumably the intercept feature); the second holds the population value.
tmp_X = np.array([[1, 9],
                  [1, 2]])
print("Provide two cities and their populations", tmp_X, sep="\n")

# Current parameter vector w, stored as a 2x1 column.
tmp_w = np.array([[1],
                  [2]])
print("View the current parameter vector", tmp_w, sep="\n", end="\n\n")

# Model prediction h = X w: a (2, 2) by (2, 1) matrix product, so the result
# is a 2x1 column holding one prediction per city.
tmp_h = tmp_X @ tmp_w
print("Calculate the model prediction h", tmp_h, sep="\n", end="\n\n")

# Indexing a row of the 2x1 result gives a length-1 array, which is why each
# value appears in square brackets in the message.
print(f"The model predicts {tmp_h[0]} for city 0, and {tmp_h[1]} for city 1")

Provide two cities and their populations
[[1 9]
 [1 2]]
View the current parameter vector
[[1]
 [2]]

Calculate the model prediction h
[[19]
 [ 5]]

The model predicts [19] for city 0, and [5] for city 1


#### Error: $\vec{h}_{\vec{w}}(\mathbf{X}) - \vec{y}$
  - This is the difference between the model prediction and the actual value of y.


In [5]:
# Show the predictions again; tmp_h must already exist from an earlier cell.
print("Model prediction tmp_h", tmp_h, sep="\n", end="\n\n")

# Ground-truth labels y for the two cities, stored as a 2x1 column.
tmp_y = np.array([[10],
                  [6]])
print("True labels for the profits per city", tmp_y, sep="\n", end="\n\n")

# Error = prediction - label, computed element-wise.
tmp_error = tmp_h - tmp_y
print("Error", tmp_error, sep="\n")

# NOTE: the words "positive" and "negative" in this message are hard-coded
# for this example's values; they are not derived from the sign of tmp_error.
print(f"The error for city 0 prediction is {tmp_error[0]} and is positive; the error for city 1 prediction is {tmp_error[1]} and is negative")

Model prediction tmp_h
[[19]
 [ 5]]

True labels for the profits per city
[[10]
 [ 6]]

Error
[[ 9]
 [-1]]
The error for city 0 prediction is [9] and is positive; the error for city 1 prediction is [-1] and is negative


#### Gradient: $\frac{1}{m} \mathbf{X}^T \times \text{Error}$
- This is a vector containing the gradient for each element of the parameter vector $\vec{w}$
  - Since $\vec{w}$ is a column vector with 2 rows, this gradient is also a column vector with 2 rows.
  - The $\frac{1}{m}$ takes the average gradient across all $m$ training examples (97 cities in the full dataset; only 2 in the toy example below).


In [7]:
# Feature matrix X, redefined here so its values are visible next to the
# gradient computation: one row per city.
tmp_X = np.array([[1, 9],
                  [1, 2]])
print("X: two cities and their populations", tmp_X, sep="\n", end="\n\n")

# X transposed: rows become columns, so each row now corresponds to one
# column (feature) of X.
tmp_X_T = tmp_X.T
print("Transpose of X", tmp_X_T, sep="\n", end="\n\n")

# m = number of rows of X, i.e. the number of examples (cities).
tmp_m = len(tmp_X)
print(f"The number of examples (number of cities) is {tmp_m}\n")

# tmp_error is not defined in this cell; it must already exist from an
# earlier cell (presumably the 2x1 column of prediction errors).
print("Error", tmp_error, sep="\n")

# Gradient = (1/m) * X^T . error. The matrix product adds up each feature's
# contribution over all examples; the 1/m factor turns that sum into a mean.
tmp_gradient = (1 / tmp_m) * (tmp_X_T @ tmp_error)
print("Gradient", tmp_gradient, sep="\n")

X: two cities and their populations
[[1 9]
 [1 2]]

Transpose of X
[[1 1]
 [9 2]]

The number of examples (number of cities) is 2

Error
[[ 9]
 [-1]]
Gradient
[[ 4. ]
 [39.5]]


#### Scale by the learning rate: $\alpha$
- $\alpha$ is a positive number, typically much smaller than 1, that scales down the gradient so that each update step is smaller than the raw gradient.


In [8]:
# Learning rate: the factor by which the gradient is multiplied below.
tmp_alpha = 0.01
print(f"Learning rate alpha: {tmp_alpha}")

# tmp_gradient must already exist from an earlier cell.
print("Gradient before scaling by the learning rate:", tmp_gradient, sep="\n", end="\n\n")

# Multiply every gradient component by alpha (element-wise scalar product).
gradient_scaled_by_learning_rate = tmp_gradient * tmp_alpha
print("Gradient after scaling by the learning rate", gradient_scaled_by_learning_rate, sep="\n")

Learning rate alpha: 0.01
Gradient before scaling by the learning rate:
[[ 4. ]
 [39.5]]

Gradient after scaling by the learning rate
[[0.04 ]
 [0.395]]


- Subtract the gradient: $-$
  - Recall that the gradient points in the direction that would INCREASE the cost, so negative one multiplied by the gradient points in the direction that REDUCES the cost.
  - So, to update the weight in the direction that reduces the cost, subtract the gradient.

In [9]:
# Recompute the scaled gradient so this cell shows its own input.
# tmp_alpha and tmp_gradient must already exist from earlier cells.
gradient_scaled_by_learning_rate = tmp_gradient * tmp_alpha
print("Gradient after scaling by the learning rate", gradient_scaled_by_learning_rate, sep="\n", end="\n\n")

# Flip the sign of every component: the update is applied opposite to the
# (scaled) gradient.
direction_of_update = gradient_scaled_by_learning_rate * -1
print("The direction to update the parameter vector", direction_of_update, sep="\n")

Gradient after scaling by the learning rate
[[0.04 ]
 [0.395]]

The direction to update the parameter vector
[[-0.04 ]
 [-0.395]]
