In [1]:
import tensorflow as tf
import numpy as np

# Multi-feature regression

### Hypothesis

$$ H(x) = w x + b $$

$$ H(x_1, x_2, x_3) = w_1 x_1 + w_2 x_2 + w_3 x_3 + b $$

# Hypothesis without b

$$ H(x_1, x_2, x_3) = w_1 x_1 + w_2 x_2 + w_3 x_3 + b$$

$$ = b + w_1 x_1 + w_2 x_2 + w_3 x_3 $$

$$ = \begin{pmatrix} 1 & x_{ 1 } & x_{ 2 } & x_{ 3 } \end{pmatrix}\cdot \begin{pmatrix} b \\ w_{ 1 } \\ w_{ 2 } \\ w_{ 3 } \end{pmatrix} $$

$$ = XW $$



# Hypothesis using matrix 

### Many x instances

$$ \begin{pmatrix} x_{ 11 } & x_{ 12 } & x_{ 13 } \\ x_{ 21 } & x_{ 22 } & x_{ 23 } \\ x_{ 31 } & x_{ 32 } & x_{ 33 }\\ x_{ 41 } & x_{ 42 } & x_{ 43 }\\ x_{ 51 } & x_{ 52 } & x_{ 53 }\end{pmatrix} \cdot \begin{pmatrix} w_{ 1 } \\ w_{ 2 } \\ w_{ 3 } \end{pmatrix}=\begin{pmatrix} x_{ 11 }w_{ 1 }+x_{ 12 }w_{ 2 }+x_{ 13 }w_{ 3 } \\ x_{ 21 }w_{ 1 }+x_{ 22 }w_{ 2 }+x_{ 23 }w_{ 3 }\\ x_{ 31 }w_{ 1 }+x_{ 32 }w_{ 2 }+x_{ 33 }w_{ 3 } \\ x_{ 41 }w_{ 1 }+x_{ 42 }w_{ 2 }+x_{ 43 }w_{ 3 } \\ x_{ 51 }w_{ 1 }+x_{ 52 }w_{ 2 }+x_{ 53 }w_{ 3 } \end{pmatrix} $$

$$ [5, 3] \cdot [3, 1] = [5, 1] $$

$$ H(X) = XW $$

5는 데이터(instance)의 수, 3은 변수(feature)의 수, 1은 결과

# Hypothesis using matrix (n output)

$$ [n, 3] \cdot [?, ?] = [n, 2] $$

$$ H(X) = XW $$

* n은 데이터(instance)의 개수, 2는 결과 값의 개수로 주어진다.
* 이때, W [?, ?] ⇒ [3, 2]

# WX vs XW

### Theory (Lecture) :
 $$ H(x) = Wx + b  $$

### TensorFlow (Implementation) :

$$ H(X) = XW $$

# Example 1

In [4]:
# Fix TensorFlow's global random seed so the randomly initialised variables
# created in the cells below are reproducible.
tf.random.set_seed(0)

In [5]:
# Toy dataset: five instances, two features (x1, x2) and one target (y).
x1_data = [1, 0, 3, 0, 5]
x2_data = [0, 2, 0, 4, 0]
y_data  = [1, 2, 3, 4, 5]

In [7]:
# One scalar weight per feature plus a bias, each drawn uniformly from [-10, 10).
W1 = tf.Variable(tf.random.uniform((1,), -10.0, 10.0))
W2 = tf.Variable(tf.random.uniform((1,), -10.0, 10.0))
b = tf.Variable(tf.random.uniform((1,), -10.0, 10.0))

learning_rate = tf.Variable(0.001)

# Parameters updated by gradient descent, in the order their gradients are requested.
params = [W1, W2, b]

for step in range(1001):
    # Record the forward pass so the tape can differentiate the cost.
    with tf.GradientTape() as tape:
        hypothesis = W1 * x1_data + W2 * x2_data + b
        cost = tf.reduce_mean(tf.square(hypothesis - y_data))

    # Plain gradient-descent step: param <- param - learning_rate * d(cost)/d(param)
    for param, grad in zip(params, tape.gradient(cost, params)):
        param.assign_sub(learning_rate * grad)

    # Report every 50 steps: the cost is from before this step's update,
    # the parameters are the values after it.
    if step % 50 == 0:
        print("i: {:5} | cost: {:10.6f} | W1: {:10.4f} | W2: {:10.4f} | b: {:10.6f}".format(
            step, cost.numpy(), W1.numpy()[0], W2.numpy()[0], b.numpy()[0]))

i:     0 | cost: 966.489624 | W1:    -6.3849 | W2:    -9.6386 | b:  -1.997182
i:    50 | cost: 290.719482 | W1:    -2.5520 | W2:    -6.0406 | b:   0.083186
i:   100 | cost:  97.416336 | W1:    -0.8520 | W2:    -3.7830 | b:   1.185548
i:   150 | cost:  36.060600 | W1:    -0.1155 | W2:    -2.3524 | b:   1.777458
i:   200 | cost:  14.622516 | W1:     0.1927 | W2:    -1.4381 | b:   2.096439
i:   250 | cost:   6.551729 | W1:     0.3151 | W2:    -0.8494 | b:   2.265759
i:   300 | cost:   3.350713 | W1:     0.3601 | W2:    -0.4675 | b:   2.350938
i:   350 | cost:   2.032307 | W1:     0.3750 | W2:    -0.2176 | b:   2.387711
i:   400 | cost:   1.469417 | W1:     0.3797 | W2:    -0.0524 | b:   2.396086
i:   450 | cost:   1.216593 | W1:     0.3823 | W2:     0.0583 | b:   2.387553
i:   500 | cost:   1.092846 | W1:     0.3854 | W2:     0.1338 | b:   2.368865
i:   550 | cost:   1.023482 | W1:     0.3899 | W2:     0.1865 | b:   2.344070
i:   600 | cost:   0.977404 | W1:     0.3956 | W2:     0.2244 | 

# Example 2

In [8]:
# Feature matrix laid out as 2 x 5: one row per feature, one column per
# instance (the same values as x1_data / x2_data in Example 1).
x_data = [
    [1., 0., 3., 0., 5.],
    [0., 2., 0., 4., 0.]
]
y_data  = [1, 2, 3, 4, 5]

In [9]:
# W is a 1 x 2 row vector (one weight per feature row of x_data); b is a
# single bias. Both start from uniform random values in [-1, 1).
W = tf.Variable(tf.random.uniform((1, 2), -1.0, 1.0))
b = tf.Variable(tf.random.uniform((1,), -1.0, 1.0))


In [12]:
# Display the randomly initialised weight matrix and bias.
W,b

(<tf.Variable 'Variable:0' shape=(1, 2) dtype=float32, numpy=array([[-0.89377856, -0.62153316]], dtype=float32)>,
 <tf.Variable 'Variable:0' shape=(1,) dtype=float32, numpy=array([0.5532179], dtype=float32)>)

In [13]:
# Step size for the manual gradient-descent updates in the training loop below.
learning_rate = tf.Variable(0.001)

In [15]:
# Train with plain gradient descent. W is (1, 2) and x_data is (2, 5), so
# tf.matmul(W, x_data) yields one prediction per instance.
for i in range(1000+1):
    # Only the forward pass (hypothesis and cost) needs to be recorded.
    with tf.GradientTape() as tape:
        hypothesis = tf.matmul(W, x_data) + b
        cost = tf.reduce_mean(tf.square(hypothesis-y_data))

    # Differentiate and update outside the tape context, matching the other
    # training loops in this notebook.
    W_grad, b_grad = tape.gradient(cost, [W, b])
    W.assign_sub(learning_rate * W_grad)
    b.assign_sub(learning_rate * b_grad)

    # Columns: step | cost (before this step's update) | W[0][0] | W[0][1] | b
    if i % 50 == 0:
        print("{:5} | {:10.6f} | {:10.4f} | {:10.4f} | {:10.6f}".format(
            i, cost.numpy(), W.numpy()[0][0], W.numpy()[0][1], b.numpy()[0]))

    0 |  30.003744 |    -0.8693 |    -0.6099 |   0.562821
   50 |   7.836334 |    -0.0238 |    -0.1530 |   0.904018
  100 |   2.377153 |     0.3633 |     0.1291 |   1.070535
  150 |   0.924887 |     0.5399 |     0.3063 |   1.149366
  200 |   0.495451 |     0.6204 |     0.4197 |   1.183373
  250 |   0.350969 |     0.6575 |     0.4935 |   1.194003
  300 |   0.294564 |     0.6755 |     0.5426 |   1.192192
  350 |   0.268330 |     0.6851 |     0.5760 |   1.183638
  400 |   0.253377 |     0.6913 |     0.5994 |   1.171381
  450 |   0.242959 |     0.6961 |     0.6164 |   1.157086
  500 |   0.234499 |     0.7004 |     0.6293 |   1.141689
  550 |   0.226969 |     0.7046 |     0.6394 |   1.125728
  600 |   0.219946 |     0.7087 |     0.6478 |   1.109518
  650 |   0.213252 |     0.7129 |     0.6551 |   1.093248
  700 |   0.206807 |     0.7171 |     0.6615 |   1.077033
  750 |   0.200576 |     0.7213 |     0.6674 |   1.060942
  800 |   0.194542 |     0.7254 |     0.6730 |   1.045016
  850 |   0.18

# Hypothesis without b

In [17]:
import tensorflow as tf

# Fold the bias into the weight matrix: prepend a row of ones to the inputs
# from the previous example so that the first weight plays the role of b.
x_data = [
    [1., 1., 1., 1., 1.],  # constant 1 for every instance (bias term)
    [1., 0., 3., 0., 5.],
    [0., 2., 0., 4., 0.],
]
y_data = [1, 2, 3, 4, 5]

# W is now 1 x 3 (bias weight + two feature weights); no separate b variable.
W = tf.Variable(tf.random.uniform((1, 3), -1.0, 1.0))

learning_rate = 0.001
optimizer = tf.keras.optimizers.SGD(learning_rate)

for step in range(1001):
    with tf.GradientTape() as tape:
        hypothesis = tf.matmul(W, x_data)  # no explicit bias term
        cost = tf.reduce_mean(tf.square(hypothesis - y_data))

    # Single trainable variable, so the gradient list has exactly one entry.
    (W_grad,) = tape.gradient(cost, [W])
    optimizer.apply_gradients([(W_grad, W)])

    # Columns: step | cost | bias weight | weight of 2nd row | weight of 3rd row
    if step % 50 == 0:
        w_bias, w_first, w_second = W.numpy()[0]
        print("{:5} | {:10.6f} | {:10.4f} | {:10.4f} | {:10.4f}".format(
            step, cost.numpy(), w_bias, w_first, w_second))

    0 |  21.931831 |     0.0647 |    -0.7119 |     0.4413
   50 |   4.901238 |     0.3252 |     0.1250 |     0.6045
  100 |   1.138956 |     0.4462 |     0.5157 |     0.6960
  150 |   0.301578 |     0.5006 |     0.6983 |     0.7491
  200 |   0.112023 |     0.5231 |     0.7840 |     0.7811
  250 |   0.067243 |     0.5302 |     0.8246 |     0.8011
  300 |   0.055370 |     0.5299 |     0.8443 |     0.8143
  350 |   0.051222 |     0.5258 |     0.8544 |     0.8233
  400 |   0.049003 |     0.5200 |     0.8601 |     0.8299
  450 |   0.047323 |     0.5133 |     0.8637 |     0.8349
  500 |   0.045832 |     0.5061 |     0.8664 |     0.8389
  550 |   0.044429 |     0.4988 |     0.8687 |     0.8424
  600 |   0.043084 |     0.4915 |     0.8707 |     0.8454
  650 |   0.041785 |     0.4842 |     0.8727 |     0.8481
  700 |   0.040528 |     0.4769 |     0.8746 |     0.8507
  750 |   0.039309 |     0.4698 |     0.8765 |     0.8531
  800 |   0.038127 |     0.4627 |     0.8784 |     0.8555
  850 |   0.03

# Custom Gradient
* tf.keras.optimizers.SGD(): optimizer
* optimizer.apply_gradients(): update

In [18]:
# Multi-variable linear regression (1)

# Two training instances (rows) with two features (columns) each, and one
# target value per instance.
X = tf.constant([[1., 2.],
                 [3., 4.]])
y = tf.constant([[1.5], [3.5]])

# Column-vector weights (2 x 1) and a scalar bias, drawn from a normal distribution.
W = tf.Variable(tf.random.normal((2, 1)))
b = tf.Variable(tf.random.normal((1,)))

# Let a Keras SGD optimizer apply the updates instead of calling assign_sub by hand.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

n_epoch = 1000+1
print("epoch | cost")

trainable = [W, b]
for epoch in range(n_epoch):
    # Forward pass under the tape: here the hypothesis is X @ W + b.
    with tf.GradientTape() as tape:
        y_pred = tf.matmul(X, W) + b
        cost = tf.reduce_mean(tf.square(y_pred - y))

    grads = tape.gradient(cost, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    if epoch % 50 == 0:
        print("{:5} | {:10.6f}".format(epoch, cost.numpy()))


epoch | cost
    0 |   0.932096
   50 |   0.612641
  100 |   0.418868
  150 |   0.286384
  200 |   0.195804
  250 |   0.133873
  300 |   0.091530
  350 |   0.062580
  400 |   0.042786
  450 |   0.029253
  500 |   0.020001
  550 |   0.013675
  600 |   0.009350
  650 |   0.006392
  700 |   0.004371
  750 |   0.002988
  800 |   0.002043
  850 |   0.001397
  900 |   0.000955
  950 |   0.000653
 1000 |   0.000446


# Multi-variable linear regression

In [20]:
# Five instances; the first three columns are the features and the last
# column is the target.
data = np.array([
    # X1,   X2,    X3,   y
    [ 73.,  80.,  75., 152. ],
    [ 93.,  88.,  93., 185. ],
    [ 89.,  91.,  90., 180. ],
    [ 96.,  98., 100., 196. ],
    [ 73.,  66.,  70., 142. ]
], dtype=np.float32)

# Split into features X (5 x 3) and targets y; indexing with the list [-1]
# keeps y two-dimensional (5 x 1) so it lines up with the model output.
X = data[:, :-1]
y = data[:, [-1]]

# One weight per feature (3 x 1) plus a scalar bias, normally distributed at start.
W = tf.Variable(tf.random.normal((3, 1)))
b = tf.Variable(tf.random.normal((1,)))

# Step size for the manual gradient-descent updates.
learning_rate = 0.000001

# hypothesis, prediction function
def predict(X):
    """Return the linear model output ``X @ W + b`` for a batch of inputs.

    ``X`` is matrix-multiplied by the notebook-level weights ``W`` and the
    notebook-level bias ``b`` is added to the result.
    """
    weighted_sum = tf.matmul(X, W)
    return weighted_sum + b

print("epoch | cost")

n_epochs = 2000
for epoch in range(n_epochs + 1):
    # Record the forward pass so the tape can differentiate the mean squared error.
    with tf.GradientTape() as tape:
        residual = predict(X) - y
        cost = tf.reduce_mean(tf.square(residual))

    # Gradient-descent step on both parameters.
    grads = tape.gradient(cost, [W, b])
    for variable, grad in zip([W, b], grads):
        variable.assign_sub(learning_rate * grad)

    # Log the cost (from before this step's update) every 100 epochs.
    if epoch % 100 == 0:
        print("{:5} | {:10.4f}".format(epoch, cost.numpy()))

epoch | cost
    0 | 195206.6562
  100 |    40.8115
  200 |    16.6931
  300 |    16.6015
  400 |    16.5132
  500 |    16.4254
  600 |    16.3381
  700 |    16.2514
  800 |    16.1649
  900 |    16.0791
 1000 |    15.9937
 1100 |    15.9088
 1200 |    15.8243
 1300 |    15.7403
 1400 |    15.6566
 1500 |    15.5736
 1600 |    15.4908
 1700 |    15.4087
 1800 |    15.3268
 1900 |    15.2455
 2000 |    15.1645


In [21]:
# Current values of the weights as a NumPy array.
W.numpy()

array([[0.25667503],
       [1.2582879 ],
       [0.5000149 ]], dtype=float32)

In [22]:
# Current value of the bias as a NumPy array.
b.numpy()

array([-0.13203251], dtype=float32)

In [23]:
# Model output for the training inputs, written out explicitly
# (the same expression that predict(X) evaluates).
tf.matmul(X, W) + b

<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[156.7694 ],
       [180.96947],
       [182.21758],
       [197.82248],
       [136.65329]], dtype=float32)>

# predict

In [26]:
y # labels: the target column sliced from data

array([[152.],
       [185.],
       [180.],
       [196.],
       [142.]], dtype=float32)

In [24]:
predict(X).numpy() # model predictions for the training inputs X

array([[156.7694 ],
       [180.96947],
       [182.21758],
       [197.82248],
       [136.65329]], dtype=float32)

In [27]:
# Predictions for two new instances that are not rows of the training data.
predict([[ 89.,  95.,  92.],[ 84.,  92.,  85.]]).numpy() 

array([[188.25078],
       [179.69243]], dtype=float32)