- Vectorizing across multiple examples.

In [44]:
# code for data preparation

import pandas as pd
import numpy as np
import time

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)); works element-wise on arrays."""
    return 1 / (1 + np.exp(-x))


# Iris data (the file has no header row, so columns are numbered 0..4).
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
# Turn column 4 into a binary target: 1 for 'Iris-setosa', 0 otherwise.
df[4] = np.where(df[4] == 'Iris-setosa', 1, 0)

# Transposed so that each column is one sample:
# X holds columns 0, 1, 2 as rows and Y holds the label row.
X = np.array(df[[0, 1, 2]]).transpose()
Y = np.array(df[[4]]).transpose()

J = 0  # total average loss

앞서 비디오에서는 '한 건의 샘플 데이터가 주어졌을 때 for-loop를 이용해 각 레이어의 노드값($z$, $a$)을 계산하는 것'을 벡터화하는 방법에 대해 알아봤었다. (이 내용은 아래 첫 번째 코드가 두 번째 코드로 바뀌는 것이다.)

- Algorithm for Step 1
$$
\begin{align}
&m = \text{number of samples} \\
&n_h = \text{number of nodes in the hidden layer} \\
&n_y = \text{number of nodes in the output layer} \\
&\textbf{for} ~ i=1 ~ \text{to} ~ m ~ \textbf{do} \\
&\qquad \textbf{for} ~ j=1 ~ \text{to} ~ n_h ~ \textbf{do} \\
&\qquad \qquad z^{[1](i)}_j = W^{[1]T}_j \cdot x^{(i)} + b^{[1]}_j \\
&\qquad \qquad a^{[1](i)}_j = \sigma(z^{[1](i)}_j)
&\\
&\qquad \textbf{for} ~ j=1 ~ \text{to} ~ n_y ~ \textbf{do} \\
&\qquad \qquad z^{[2](i)}_j = W^{[2]T}_j \cdot a^{[1](i)} + b^{[2]}_j \\
&\qquad \qquad a^{[2](i)}_j = \sigma(z^{[2](i)}_j)
\end{align}
$$

In [47]:
# Step 1
# Forward pass computed node by node with an explicit for-loop,
# nested inside a for-loop over the samples.

m = X.shape[1]  # number of samples (=150)
n_x = X.shape[0]  # number of features (=3)
n_h = 4  # number of units in the hidden layer
n_y = Y.shape[0]  # size of output layer (= 1)

# Small random weights and zero biases.
W1 = np.random.randn(n_h, n_x) * 0.01
b1 = np.zeros((n_h, 1))
W2 = np.random.randn(n_y, n_h) * 0.01
b2 = np.zeros((n_y, 1))  # one bias per output node (indexed as b2[j] below)

for i in range(m):  # for each sample
    # The loop index must not be reassigned in the body; otherwise every
    # iteration would process the same sample.
    x_i = X[:, i].reshape(-1, 1)  # i'th sample as an (n_x x 1) column vector

    z1 = np.empty((n_h, 1))  # weighted sums of the hidden layer (n_h x 1 column vector)
    a1 = np.empty((n_h, 1))  # activations of the hidden layer (n_h x 1 column vector)
    for j in range(n_h):  # for each node in the hidden layer
        w1_j = W1[j, :].reshape(-1, 1)  # weights[1] of the j'th node (column vector)
        z1[j] = np.dot(w1_j.T, x_i) + b1[j]
        a1[j] = np.tanh(z1[j])

    z2 = np.empty((n_y, 1))  # weighted sums of the output layer (n_y x 1 column vector)
    a2 = np.empty((n_y, 1))  # activations of the output layer (n_y x 1 column vector)
    for j in range(n_y):  # for each node in the output layer
        w2_j = W2[j, :].reshape(-1, 1)  # weights[2] of the j'th node (column vector)
        z2[j] = np.dot(w2_j.T, a1) + b2[j]
        a2[j] = sigmoid(z2[j])

- Algorithm for Step 2
$$
\begin{align}
&m = \text{number of samples} \\
&n_h = \text{number of nodes in the hidden layer} \\
&n_y = \text{number of nodes in the output layer} \\
&\textbf{for} ~ i=1 ~ \text{to} ~ m ~ \textbf{do} \\
&\qquad z^{[1](i)} = W^{[1]} \cdot x^{(i)} + b^{[1]} \\
&\qquad a^{[1](i)} = \sigma(z^{[1](i)})
&\\
&\qquad z^{[2](i)} = W^{[2]} \cdot a^{[1](i)} + b^{[2]} \\
&\qquad a^{[2](i)} = \sigma(z^{[2](i)})
\end{align}
$$

In [46]:
# Step 2
# Forward pass vectorized over the nodes of each layer,
# but still using a for-loop over the samples.

m = X.shape[1]  # number of samples (=150)
n_x = X.shape[0]  # number of features (=3)
n_h = 4  # number of units in the hidden layer
n_y = Y.shape[0]  # size of output layer (= 1)

# Small random weights and zero biases.
W1 = np.random.randn(n_h, n_x) * 0.01
b1 = np.zeros((n_h, 1))
W2 = np.random.randn(n_y, n_h) * 0.01
b2 = np.zeros((n_y, 1))  # one bias per output node

for i in range(m):  # for each sample
    # The loop index must not be reassigned in the body; otherwise every
    # iteration would process the same sample.
    x_i = X[:, i].reshape(-1, 1)  # i'th sample as an (n_x x 1) column vector

    z1 = np.dot(W1, x_i) + b1  # hidden-layer weighted sums (n_h x 1)
    a1 = np.tanh(z1)  # hidden-layer activations (n_h x 1)

    # The output layer consumes the hidden activations a1, not the raw sums z1.
    z2 = np.dot(W2, a1) + b2  # output-layer weighted sums (n_y x 1)
    a2 = sigmoid(z2)  # output-layer activations (n_y x 1)

    # Binary cross-entropy loss of this sample.
    loss = -1 * (np.log(a2) * Y[:, i] + (1 - Y[:, i]) * np.log(1 - a2))

하지만 이렇게 벡터화하더라도 각 데이터 샘플에 대해 위 계산을 반복 하는 것이 필요하다.

이번 비디오에서는 (각 노드 연산을 벡터화하는 것에 추가하여) 각 데이터 샘플에 대한 반복 연산을 벡터화 하는 방법에 대해 알아본다.

(각 노드 연산을 벡터화 하는 것과 무관하게) 입력값 $X \in \mathbb{R}^{n_x \times m}$가 주어졌을 때 각 데이터 샘플 $x \in \mathbb{R}^{n_x \times 1}$에 대해 $z^{[1]} \in \mathbb{R}^{n^{[1]} \times 1}$, $a^{[1]} \in \mathbb{R}^{n^{[1]} \times 1}$, $z^{[2]} \in \mathbb{R}^{n^{[2]} \times 1}$, $a^{[2]} \in \mathbb{R}^{n^{[2]} \times 1}$를 차례로 구하는 과정(+역전파)을 데이터 샘플수(m) 만큼 반복하게 된다.(Step 1과 Step 2 코드에서의 'for i in range(m):')

이 반복을 벡터화하려면 입력 데이터를 $n_x \times m$ 행렬 $X$로 쌓으면 된다. 그러면 이후 계산 값은 $n^{[1]} \times m$ 행렬인 $Z^{[1]}$, $A^{[1]}$과 $n^{[2]} \times m$ 행렬인 $Z^{[2]}$, $A^{[2]}$로 반복 없이 계산 가능하다. bias항 $b^{[l]}$은 $n^{[l]} \times 1$ 행렬인데, $Z^{[l]} = W^{[l]}A^{[l-1]} + b^{[l]}$ (단, $A^{[0]} = X$)를 계산할 때에는 (파이썬 넘파이에 의해) $n^{[l]} \times m$ 행렬로 broadcast된다.

(파이썬 구현 추가)

- Explanation for vectorized implementation

In [35]:
# Step1 : implementation with for-loop, 1 epoch

import pandas as pd
import numpy as np
import time

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)); works element-wise on arrays."""
    return 1 / (1 + np.exp(-x))


# Iris data (the file has no header row, so columns are numbered 0..4).
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
# Turn column 4 into a binary target: 1 for 'Iris-setosa', 0 otherwise.
df[4] = np.where(df[4] == 'Iris-setosa', 1, 0)

# Transposed so that each column is one sample.
X = np.array(df[[0, 1, 2]]).T
Y = np.array(df[[4]]).T

m = X.shape[1]  # number of samples (=150)
n_x = X.shape[0]  # number of features (=3)
n_h = 4  # number of units in the hidden layer
n_y = Y.shape[0]  # size of output layer (= 1)

# Small random weights and zero biases.
W1 = np.random.randn(n_h, n_x) * 0.01
b1 = np.zeros((n_h, 1))
W2 = np.random.randn(n_y, n_h) * 0.01
b2 = np.zeros((n_y, 1))  # one bias per output node (indexed as b2[j] below)

J = 0  # total average loss
for i in range(m):  # for each sample
    # The loop index must not be reassigned in the body: it selects the sample
    # and also drives the running mean of the loss at the end of the body.
    x_i = X[:, i].reshape(-1, 1)  # i'th sample as an (n_x x 1) column vector

    z1 = np.empty((n_h, 1))  # weighted sums of the hidden layer (n_h x 1 column vector)
    a1 = np.empty((n_h, 1))  # activations of the hidden layer (n_h x 1 column vector)
    for j in range(n_h):  # for each node in the hidden layer
        w1_j = W1[j, :].reshape(-1, 1)  # weights[1] of the j'th node (column vector)
        z1[j] = np.dot(w1_j.T, x_i) + b1[j]
        a1[j] = np.tanh(z1[j])

    z2 = np.empty((n_y, 1))  # weighted sums of the output layer (n_y x 1 column vector)
    a2 = np.empty((n_y, 1))  # activations of the output layer (n_y x 1 column vector)
    for j in range(n_y):  # for each node in the output layer
        w2_j = W2[j, :].reshape(-1, 1)  # weights[2] of the j'th node (column vector)
        z2[j] = np.dot(w2_j.T, a1) + b2[j]
        a2[j] = sigmoid(z2[j])

    # Binary cross-entropy loss of this sample.
    loss = -1 * (np.log(a2) * Y[:, i] + (1 - Y[:, i]) * np.log(1 - a2))

    # skipped back prop

    # Running mean of the per-sample losses; after the last sample
    # J equals (1/m) * sum of the losses over all samples.
    J = (J * i + loss) / (i + 1)
J

array([[ 0.69295259]])

In [36]:
# Step2 : implementation vectorized over nodes (for-loop over samples only), 1 epoch

import pandas as pd
import numpy as np
import time

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)); works element-wise on arrays."""
    return 1 / (1 + np.exp(-x))


# Iris data (the file has no header row, so columns are numbered 0..4).
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
# Turn column 4 into a binary target: 1 for 'Iris-setosa', 0 otherwise.
df[4] = np.where(df[4] == 'Iris-setosa', 1, 0)

# Transposed so that each column is one sample.
X = np.array(df[[0, 1, 2]]).T
Y = np.array(df[[4]]).T

m = X.shape[1]  # number of samples (=150)
n_x = X.shape[0]  # number of features (=3)
n_h = 4  # number of units in the hidden layer
n_y = Y.shape[0]  # size of output layer (= 1)

# Small random weights and zero biases.
W1 = np.random.randn(n_h, n_x) * 0.01
b1 = np.zeros((n_h, 1))
W2 = np.random.randn(n_y, n_h) * 0.01
b2 = np.zeros((n_y, 1))  # one bias per output node

J = 0  # total average loss
for i in range(m):  # for each sample
    # The loop index must not be reassigned in the body: it selects the sample
    # and also drives the running mean of the loss at the end of the body.
    x_i = X[:, i].reshape(-1, 1)  # i'th sample as an (n_x x 1) column vector

    z1 = np.dot(W1, x_i) + b1  # hidden-layer weighted sums (n_h x 1)
    a1 = np.tanh(z1)  # hidden-layer activations (n_h x 1)

    # The output layer consumes the hidden activations a1, not the raw sums z1.
    z2 = np.dot(W2, a1) + b2  # output-layer weighted sums (n_y x 1)
    a2 = sigmoid(z2)  # output-layer activations (n_y x 1)

    # Binary cross-entropy loss of this sample.
    loss = -1 * (np.log(a2) * Y[:, i] + (1 - Y[:, i]) * np.log(1 - a2))

    # skipped back prop

    # Running mean of the per-sample losses; after the last sample
    # J equals (1/m) * sum of the losses over all samples.
    J = (J * i + loss) / (i + 1)
J

array([[ 0.69320526]])

In [1]:




# Logistic regression: a single pass over the samples with an explicit Python
# loop that accumulates the cost and the gradients, timed with time.time().
# NOTE(review): this cell uses np, time, sigmoid, X, Y, m and n_x without
# defining them; it presumably relies on earlier cells having been run.

b = 0
w = np.zeros((n_x, 1))

tic = time.time()

J = 0
dw = np.zeros((n_x, 1))  # gradient accumulator, one entry per weight in w
db = 0
for i in range(m):
    xi = X[:, i]
    yi = Y[:, i]

    zi = np.dot(w.T, xi) + b
    ai = sigmoid(zi)
    J += -(yi * np.log(ai) + (1 - yi) * np.log(1 - ai))

    dzi = ai - yi
    # Accumulate the gradient for every feature, not only the first two.
    dw += xi.reshape(-1, 1) * dzi
    db += dzi
J = J / m
dw = dw / m
db = db / m
# Per-feature names kept for code that still refers to them.
dw1, dw2 = dw[0], dw[1]

toc = time.time()
print('Average cost:{}'.format(J))
print('Execution time for loop version  : ', str(1000 * (toc - tic)) + 'ms')

Average cost:[ 0.69314718]
Execution time for loop version  :  16.011476516723633ms
