In [None]:
import numpy as np
# Seed NumPy's global RNG so the noisy targets below are reproducible.
np.random.seed(320)


def f(x):
    """Noise-free target line: slope 0.5, intercept 1.0."""
    return 0.5 * x + 1.0


# 50 evenly spaced inputs on [-1, 1]; each target is the line value plus
# uniform noise drawn from [0, 0.4).
x_train = np.linspace(-1, 1, 50)
y_train = f(x_train) + 0.4 * np.random.rand(len(x_train))

In [None]:
# 손실함수
def loss(w, x_set, y_set):
    """Mean of the half squared errors of the line w[0]*x + w[1].

    Parameters
    ----------
    w : indexable with at least two entries; w[0] is the slope, w[1] the intercept.
    x_set, y_set : indexable sequences of inputs and targets; y_set is
        indexed with the positions of x_set.

    Returns
    -------
    sum_i 0.5 * (w[0]*x_i + w[1] - y_i)**2 divided by len(x_set).
    """
    total = 0.0
    for i, x_i in enumerate(x_set):
        residual = w[0] * x_i + w[1] - y_set[i]
        total += 0.5 * residual ** 2
    return total / len(x_set)

#손실함수의 그래디언트
def loss_grad(w, x_set, y_set):
    """Gradient of the mean half squared error of the line w[0]*x + w[1].

    Parameters
    ----------
    w : indexable; w[0] is the slope, w[1] the intercept. The accumulator is
        sized with len(w) and receives length-2 contributions.
    x_set, y_set : indexable sequences of inputs and targets; y_set is
        indexed with the positions of x_set.

    Returns
    -------
    NumPy array: the per-sample terms residual * [x_i, 1.0] summed and
    divided by len(x_set).
    """
    grad_sum = np.zeros(len(w))
    for i, x_i in enumerate(x_set):
        residual = w[0] * x_i + w[1] - y_set[i]
        grad_sum += residual * np.array([x_i, 1.0])
    return grad_sum / len(x_set)

In [None]:
def generate_batches(batch_size, features, labels):
    """Split features and labels into consecutive mini-batches.

    Parameters
    ----------
    batch_size : step between batch start positions and maximum batch length.
    features, labels : sliceable sequences; batch boundaries are derived
        from len(features) only.

    Returns
    -------
    List of [features_slice, labels_slice] pairs in original order. The last
    pair is shorter when len(features) is not a multiple of batch_size.
    """
    starts = range(0, len(features), batch_size)
    return [
        [features[start:start + batch_size], labels[start:start + batch_size]]
        for start in starts
    ]

In [None]:
# SGD: mini-batch stochastic gradient descent on the linear model
# y = w[0] * x + w[1], using loss / loss_grad / generate_batches defined above.
batch_size = 5 # mini-batch size
lr = 0.1 # learning rate
MaxEpochs = 10 # epoch bound (the loop below runs epochs 0..MaxEpochs inclusive)

paths = []       # parameter vector recorded before every update step
batch_loss = []  # loss on the current mini-batch, evaluated before the update
w0 = np.array([4.0, -1.0]) # 1) initial parameters
search_direction = np.zeros_like(w0)  # placeholder; overwritten on every step

# 2) shuffle the data once (the same order is reused for every epoch)
np.random.seed(320)
idx = np.arange(len(x_train))
np.random.shuffle(idx)
shuffled_x_train = x_train[idx]
shuffled_y_train = y_train[idx]

# algorithm
for epoch in range(MaxEpochs+1): # 5) loop over epochs; range(MaxEpochs+1) gives MaxEpochs+1 passes
    for x_batch, y_batch in generate_batches(batch_size, shuffled_x_train, shuffled_y_train): # 3) build the mini-batches
        paths.append(w0)
        batch_loss.append(loss(w0, x_batch, y_batch))
        grad = loss_grad(w0, x_batch, y_batch) # 4)-1 gradient on the mini-batch
        search_direction = -grad # 4)-2 search direction = steepest descent
        lr = lr # 4)-3 learning-rate schedule (no-op: the step size stays constant)
        # 4)-4 parameter update
        dw = lr * search_direction
        w0 = w0 + dw  # rebinds w0 to a new array, so entries already in paths are not modified
    # Per-epoch report: epoch index, current parameters, loss on the full training set.
    print('{:02d}\t{}\t{:5.4f}'.format(epoch, w0, loss(w0, x_train, y_train)))

In [None]:
from matplotlib.colors import LogNorm
import matplotlib.pyplot as plt
# Sentinel meaning "the caller did not pass norm". None cannot serve as the
# sentinel because norm=None is itself meaningful (matplotlib's default scaling).
_DEFAULT_NORM = object()


def contour_with_path(l, x, y, paths, norm=_DEFAULT_NORM, level=np.logspace(0, 5, 35), minima=None):
    """Draw loss contours with the optimisation trajectory overlaid as arrows.

    Parameters
    ----------
    l : 2-D array of loss values on the grid given by x and y.
    x, y : grid coordinates, passed straight to ``ax.contour``.
    paths : sequence of parameter vectors. After conversion to an array and
        transposition, row 0 supplies the horizontal and row 1 the vertical
        coordinate of each visited point.
    norm : colour normalisation for the contour lines. When omitted, a fresh
        ``LogNorm()`` is created for this call; pass None for matplotlib's
        default scaling.
    level : contour levels.
    minima : optional coordinates unpacked into ``ax.plot`` and drawn as a
        red star.

    Creates a new 7x4 figure and calls ``plt.show()``; returns None.
    """
    # Build the default norm per call. A norm object stores its autoscaled
    # vmin/vmax, so one instance created in the signature would be shared by
    # every call and carry scaling from one plot into the next.
    if norm is _DEFAULT_NORM:
        norm = LogNorm()

    paths = np.array(paths).T
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.contour(x, y, l, levels=level, norm=norm, cmap=plt.cm.jet)
    # One arrow per step: from each recorded point to the next one.
    ax.quiver(paths[0, :-1], paths[1, :-1], paths[0, 1:]-paths[0, :-1], paths[1, 1:]-paths[1, :-1],
        scale_units='xy', angles='xy', scale=1, color='k')
    if minima is not None:
        ax.plot(*minima, 'r*', markersize=18)

    ax.set_xlabel('$a$')
    ax.set_ylabel('$b$')
    plt.show()

# Evaluate the full-training-set loss on a 101x101 grid of candidate
# parameters: W0 spans w[0] in [-5, 7], W1 spans w[1] in [-2, 5].
W0 = np.linspace(-5, 7, 101)
W1 = np.linspace(-2, 5, 101)
W0, W1 = np.meshgrid(W0, W1)
LOSSW = W0 * 0  # zero array with the grid's shape, filled in below
for i in range(W0.shape[0]):
    for j in range(W0.shape[1]):
        wij = np.array([W0[i, j], W1[i, j]])
        LOSSW[i, j] = loss(wij, x_train, y_train)

# Overlay the recorded parameter path on 10 linearly spaced contour levels
# between 0 and 10; norm=None is passed explicitly instead of the log default.
contour_with_path(LOSSW, W0, W1, paths, norm=None, level=np.linspace(0, 10, 10))

In [None]:
# Mini-batch loss recorded before each update step (one point per step).
plt.plot(batch_loss, '.-k', markerfacecolor='none')
plt.grid()
plt.xlabel('step')
plt.ylabel('loss')
plt.title('loss on a batch by SGD')
plt.show()

In [None]:
from matplotlib.patches import Rectangle
def visualize_l2(w, b, x_train, y_train, loss):
    """Plot the data, the line y = w*x + b, and one shaded square per sample.

    Each square is anchored at its data point (x, y), and both its width and
    height equal the signed residual w*x + b - y.

    Parameters
    ----------
    w, b : slope and intercept of the line.
    x_train, y_train : sample coordinates. x_train must support
        ``w * x_train + b`` (e.g. a NumPy array).
    loss : number printed in the title with four decimals. Inside this
        function the name shadows the module-level ``loss`` function.

    Draws on the current pyplot axes and does not call ``plt.show()``;
    returns None.
    """
    plt.plot(x_train, y_train, '.k', markerfacecolor='none')
    plt.plot(x_train, w * x_train + b, '--k')
    currentAxis = plt.gca()
    for xx, yy in zip(x_train, y_train):
        # Signed residual: a negative value makes the patch extend left/down.
        residual = w * xx + b - yy
        currentAxis.add_patch(Rectangle((xx, yy), residual, residual,
                                        alpha=0.1, facecolor='gray', edgecolor='k'))
    plt.grid()
    # Equal axis scaling keeps the residual patches square on screen.
    plt.axis('equal')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title("a={:1.2f}, b={:1.2f}(loss={:5.4f})".format(w, b, loss))

# Show the fit for the current parameters w0 = [slope, intercept]; the title
# carries the loss evaluated on the full training set.
visualize_l2(w0[0], w0[1], x_train, y_train, loss(w0, x_train, y_train))

In [None]:
# Momentum: gradient descent with a velocity term, on the same shuffled
# mini-batches that the SGD cell created.
batch_size = 5 # mini-batch size
epsilon = 0.03 # learning rate
MaxEpochs = 10 # epoch bound (the loop runs epochs 0..MaxEpochs inclusive)

w0 = np.array([4.0, -1.0]) # 1) initial parameters
paths = []       # parameter vector recorded before every update step
batch_loss = []  # mini-batch loss recorded before every update step
alpha = 0.9      # momentum coefficient: fraction of the previous velocity kept

v = np.zeros_like(w0)  # velocity, starts at zero

for epoch in range(MaxEpochs + 1):
    for x_batch, y_batch in generate_batches(batch_size, shuffled_x_train, shuffled_y_train):
        paths.append(w0)
        batch_loss.append(loss(w0, x_batch, y_batch))
        grad = loss_grad(w0, x_batch, y_batch)

        # New velocity = decayed old velocity minus the scaled gradient;
        # the parameters then move by the velocity.
        v = alpha * v - epsilon * grad
        w0 = w0 + v

    # Per-epoch report: epoch index, current parameters, full-training-set loss.
    print('{:02d}\t{}\t{:5.4f}'.format(epoch, w0, loss(w0, x_train, y_train)))

# Loss surface on a 101x101 parameter grid (w[0] in [-5, 7], w[1] in [-2, 5]).
W0 = np.linspace(-5, 7, 101)
W1 = np.linspace(-2, 5, 101)
W0, W1 = np.meshgrid(W0, W1)
LOSSW = W0 * 0  # zero array with the grid's shape, filled in below
for i in range(W0.shape[0]):
    for j in range(W0.shape[1]):
        wij = np.array([W0[i, j], W1[i, j]])
        LOSSW[i, j] = loss(wij, x_train, y_train)

# contour_with_path shows its own figure, so the loss curve below is drawn
# on a separate figure.
contour_with_path(LOSSW, W0, W1, paths, norm=None, level=np.linspace(0, 10, 10))
plt.plot(batch_loss, '.-k', markerfacecolor='none')
plt.grid()
plt.xlabel('step')
plt.ylabel('loss')
plt.title('loss on a batch by Momentum')
plt.show()

In [None]:
# Nesterov momentum: like the momentum update, but the gradient is evaluated
# at the look-ahead point w0 + alpha * v instead of at w0.
batch_size = 5 # mini-batch size
epsilon = 0.03 # learning rate
MaxEpochs = 10 # epoch bound (the loop runs epochs 0..MaxEpochs inclusive)

w0 = np.array([4.0, -1.0]) # 1) initial parameters
paths = []       # parameter vector recorded before every update step
batch_loss = []  # mini-batch loss at w0 (not at the look-ahead point)
alpha = 0.9      # momentum coefficient

v = np.zeros_like(w0)  # velocity, starts at zero

for epoch in range(MaxEpochs + 1):
    for x_batch, y_batch in generate_batches(batch_size, shuffled_x_train, shuffled_y_train):
        paths.append(w0)
        batch_loss.append(loss(w0, x_batch, y_batch))

        # Gradient at the position the current velocity alone would reach.
        grad = loss_grad(w0 + alpha * v , x_batch, y_batch)

        v = alpha * v - epsilon * grad
        w0 = w0 + v

    # Per-epoch report: epoch index, current parameters, full-training-set loss.
    print('{:02d}\t{}\t{:5.4f}'.format(epoch, w0, loss(w0, x_train, y_train)))


# Loss surface on a 101x101 parameter grid (w[0] in [-5, 7], w[1] in [-2, 5]).
W0 = np.linspace(-5, 7, 101)
W1 = np.linspace(-2, 5, 101)
W0, W1 = np.meshgrid(W0, W1)
LOSSW = W0 * 0  # zero array with the grid's shape, filled in below
for i in range(W0.shape[0]):
    for j in range(W0.shape[1]):
        wij = np.array([W0[i, j], W1[i, j]])
        LOSSW[i, j] = loss(wij, x_train, y_train)

# contour_with_path shows its own figure, so the loss curve below is drawn
# on a separate figure.
contour_with_path(LOSSW, W0, W1, paths, norm=None, level=np.linspace(0, 10, 10))
plt.plot(batch_loss, '.-k', markerfacecolor='none')
plt.grid()
plt.xlabel('step')
plt.ylabel('loss')
plt.title('loss on a batch by Nesterov')
plt.show()

In [None]:
# Adagrad: per-parameter step sizes that shrink as squared gradients
# accumulate. Uses the shuffled mini-batches created by the SGD cell.
batch_size = 5 # mini-batch size
MaxEpochs = 10 # epoch bound (the loop runs epochs 0..MaxEpochs inclusive)

w0 = np.array([2.0, 4.0]) # 1) initial parameters
epsilon = 1.0   # global step size
delta = 1E-7    # small constant added to the denominator to avoid division by zero
r = np.zeros_like(w0)  # running sum of squared gradients, one entry per parameter

paths = []       # parameter vector recorded before every update step
batch_loss = []  # mini-batch loss recorded before every update step


for epoch in range(MaxEpochs + 1):
    for x_batch, y_batch in generate_batches(batch_size, shuffled_x_train, shuffled_y_train):
        paths.append(w0)
        batch_loss.append(loss(w0, x_batch, y_batch))

        grad = loss_grad(w0, x_batch, y_batch)
        r += grad ** 2  # in-place accumulation; r never decreases

        # Effective step per parameter: epsilon divided by the root of the
        # accumulated squared gradient.
        adjusted_lr = epsilon / (np.sqrt(r) + delta)
        w0 = w0 - adjusted_lr * grad

    # Per-epoch report: epoch index, current parameters, full-training-set loss.
    print('{:02d}\t{}\t{:5.4f}'.format(epoch, w0, loss(w0, x_train, y_train)))


# Loss surface on a 101x101 parameter grid (w[0] in [-5, 7], w[1] in [-2, 5]).
W0 = np.linspace(-5, 7, 101)
W1 = np.linspace(-2, 5, 101)
W0, W1 = np.meshgrid(W0, W1)
LOSSW = W0 * 0  # zero array with the grid's shape, filled in below
for i in range(W0.shape[0]):
    for j in range(W0.shape[1]):
        wij = np.array([W0[i, j], W1[i, j]])
        LOSSW[i, j] = loss(wij, x_train, y_train)

# contour_with_path shows its own figure, so the loss curve below is drawn
# on a separate figure.
contour_with_path(LOSSW, W0, W1, paths, norm=None, level=np.linspace(0, 10, 10))
plt.plot(batch_loss, '.-k', markerfacecolor='none')
plt.grid()
plt.xlabel('step')
plt.ylabel('loss')
plt.title('loss on a batch by Adagrad')
plt.show()

In [None]:
# RMSProp: like Adagrad, but the squared-gradient statistic is an
# exponentially weighted moving average instead of a running sum.
batch_size = 5 # mini-batch size
MaxEpochs = 10 # epoch bound (the loop runs epochs 0..MaxEpochs inclusive)

w0 = np.array([2.0, 4.0]) # 1) initial parameters
epsilon = 0.25  # global step size
delta = 1E-10   # small constant added to the denominator to avoid division by zero
rho = 0.9       # decay rate of the moving average
r = np.zeros_like(w0)  # moving average of squared gradients, one entry per parameter

paths = []       # parameter vector recorded before every update step
batch_loss = []  # mini-batch loss recorded before every update step

for epoch in range(MaxEpochs + 1):
    for x_batch, y_batch in generate_batches(batch_size, shuffled_x_train, shuffled_y_train):
        paths.append(w0)
        batch_loss.append(loss(w0, x_batch, y_batch))

        grad = loss_grad(w0, x_batch, y_batch)
        # Keep a fraction rho of the old average and blend in the new squared gradient.
        r = rho * r + (1 - rho) * grad ** 2

        adjusted_lr = epsilon / (np.sqrt(r) + delta)
        w0 = w0 - adjusted_lr * grad

    # Per-epoch report: epoch index, current parameters, full-training-set loss.
    print('{:02d}\t{}\t{:5.4f}'.format(epoch, w0, loss(w0, x_train, y_train)))


# Loss surface on a 101x101 parameter grid (w[0] in [-5, 7], w[1] in [-2, 5]).
W0 = np.linspace(-5, 7, 101)
W1 = np.linspace(-2, 5, 101)
W0, W1 = np.meshgrid(W0, W1)
LOSSW = W0 * 0  # zero array with the grid's shape, filled in below
for i in range(W0.shape[0]):
    for j in range(W0.shape[1]):
        wij = np.array([W0[i, j], W1[i, j]])
        LOSSW[i, j] = loss(wij, x_train, y_train)

# contour_with_path shows its own figure, so the loss curve below is drawn
# on a separate figure.
contour_with_path(LOSSW, W0, W1, paths, norm=None, level=np.linspace(0, 10, 10))
plt.plot(batch_loss, '.-k', markerfacecolor='none')
plt.grid()
plt.xlabel('step')
plt.ylabel('loss')
plt.title('loss on a batch by RMSProp')
plt.show()

In [None]:
# Adam: moving averages of the gradient (s) and of the squared gradient (r),
# each bias-corrected with the global step counter t.
batch_size = 5 # mini-batch size
MaxEpochs = 10 # epoch bound (the loop runs epochs 0..MaxEpochs inclusive)

w0 = np.array([2.0, 4.0]) # 1) initial parameters
epsilon = 1.0   # step size
delta = 1E-8    # small constant added to the denominator to avoid division by zero
rho1 = 0.9      # decay rate for the gradient average s
rho2 = 0.999    # decay rate for the squared-gradient average r
s = np.zeros_like(w0)
r = np.zeros_like(w0)
t = 0           # number of update steps taken so far (counts across epochs)
paths = []       # parameter vector recorded before every update step
batch_loss = []  # mini-batch loss recorded before every update step

for epoch in range(MaxEpochs + 1):
    for x_batch, y_batch in generate_batches(batch_size, shuffled_x_train, shuffled_y_train):
        t+=1
        paths.append(w0)
        batch_loss.append(loss(w0, x_batch, y_batch))

        grad = loss_grad(w0, x_batch, y_batch)
        s = rho1 * s + (1 - rho1) * grad
        r = rho2 * r + (1 - rho2) * (grad ** 2)

        # Bias correction: s and r start at zero, so divide by (1 - rho**t)
        # to compensate for the early steps.
        s_hat = s / (1 - rho1 ** (t))
        r_hat = r / (1 - rho2 ** (t))

        w0 = w0 - epsilon * s_hat / (np.sqrt(r_hat) + delta)

    # Per-epoch report: epoch index, current parameters, full-training-set loss.
    print('{:02d}\t{}\t{:5.4f}'.format(epoch, w0, loss(w0, x_train, y_train)))

# Loss surface on a 101x101 parameter grid (w[0] in [-5, 7], w[1] in [-2, 5]).
W0 = np.linspace(-5, 7, 101)
W1 = np.linspace(-2, 5, 101)
W0, W1 = np.meshgrid(W0, W1)
LOSSW = W0 * 0  # zero array with the grid's shape, filled in below
for i in range(W0.shape[0]):
    for j in range(W0.shape[1]):
        wij = np.array([W0[i, j], W1[i, j]])
        LOSSW[i, j] = loss(wij, x_train, y_train)

# contour_with_path shows its own figure, so the loss curve below is drawn
# on a separate figure.
contour_with_path(LOSSW, W0, W1, paths, norm=None, level=np.linspace(0, 10, 10))
plt.plot(batch_loss, '.-k', markerfacecolor='none')
plt.grid()
plt.xlabel('step')
plt.ylabel('loss')
plt.title('loss on a batch by Adam')
plt.show()

In [None]:
# Data for the polynomial-regression part. This rebinds x_train / y_train,
# replacing the linear dataset used by the cells above.
np.random.seed(327)  # reseed so the noise below is reproducible
x_train = np.linspace(-1,1,50)
# Target: 0.25*cos(pi*x) + 0.3*sin(pi*x) plus uniform noise in [-0.2, 0.2).
y_train = 0.25 * np.cos(np.pi * x_train) + 0.3 * np.sin(np.pi * x_train) + 0.2 * (2 * np.random.rand(len(x_train)) - 1)

In [None]:
# Scatter plot of the training samples (black dots).
plt.plot(x_train, y_train, '.k')
plt.grid()
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [None]:
# 다항함수용 함수들
def loss(w, x_set, y_set):
    """Mean of the half squared errors of the polynomial with coefficients w.

    Parameters
    ----------
    w : polynomial coefficients in the order expected by ``np.polyval``
        (highest power first).
    x_set, y_set : indexable sequences of inputs and targets; y_set is
        indexed with the positions of x_set.

    Returns
    -------
    sum_i 0.5 * (polyval(w, x_i) - y_i)**2 divided by len(x_set).
    """
    total = 0.0
    for i, x_i in enumerate(x_set):
        residual = np.polyval(w, x_i) - y_set[i]
        total += 0.5 * residual ** 2
    return total / len(x_set)

# 손실 함수의 그래디언트
def loss_grad(w, x_set, y_set):
    """Gradient of the mean half squared error of the polynomial model.

    Parameters
    ----------
    w : polynomial coefficients in the order expected by ``np.polyval``
        (highest power first).
    x_set, y_set : indexable sequences of inputs and targets; y_set is
        indexed with the positions of x_set.

    Returns
    -------
    NumPy array of length len(w): the per-sample terms
    residual * [x**(len(w)-1), ..., x**1, x**0] summed and divided by
    len(x_set).
    """
    n_coef = len(w)
    # Descending exponents, matching np.polyval's coefficient order.
    exponents = range(n_coef - 1, -1, -1)
    grad_sum = np.zeros(n_coef)
    for i, x_i in enumerate(x_set):
        residual = np.polyval(w, x_i) - y_set[i]
        grad_sum += residual * np.array([x_i ** p for p in exponents])
    return grad_sum / len(x_set)

# 미니 배치 생성 함수
def generate_batches(batch_size, features, labels):
    """Cut features and labels into consecutive mini-batches.

    Batch boundaries are derived from len(features) only. Returns a list of
    [features_slice, labels_slice] pairs in original order; the final pair is
    shorter when len(features) is not a multiple of batch_size.
    """
    n_samples = len(features)
    return [
        [features[lo:lo + batch_size], labels[lo:lo + batch_size]]
        for lo in range(0, n_samples, batch_size)
    ]
# 플로팅
def plot_regression(x_train, y_train, w):
    """Show the training points together with the fitted polynomial curve.

    Parameters
    ----------
    x_train, y_train : sample coordinates, drawn as black dots.
    w : polynomial coefficients in ``np.polyval`` order (highest power first).

    Opens a new 8x6 figure, evaluates the polynomial at 200 evenly spaced
    points on [-1, 1], and calls ``plt.show()``; returns None.
    """
    _, ax = plt.subplots(figsize=(8, 6))

    # Dense evaluation grid so the fitted curve renders smoothly.
    x_fit = np.linspace(-1, 1, 200)
    ax.plot(x_train, y_train, '.k', label='Training data')
    ax.plot(x_fit, np.polyval(w, x_fit), '-r', label='Regression result')

    ax.grid()
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('4th Degree Polynomial Regression')
    ax.legend()
    plt.show()

# "Adam"
# Adam on the 4th-degree polynomial regression problem.
batch_size = 10
lr = 0.1  # not read by the Adam update below (epsilon is the step size)
MaxEpochs = 100  # epoch bound (the loop runs epochs 0..MaxEpochs inclusive)

w0 = np.ones(5)  # initial coefficients of the 4th-degree polynomial (np.polyval order)

# Shuffle the samples once; the same order is reused for every epoch.
np.random.seed(916)
idx = np.arange(len(x_train))
np.random.shuffle(idx)
shuffled_x_train = x_train[idx]
shuffled_y_train = y_train[idx]

epsilon = 1.0   # step size
delta = 1E-8    # small constant added to the denominator to avoid division by zero
rho1 = 0.9      # decay rate for the gradient average s
rho2 = 0.999    # decay rate for the squared-gradient average r
s = np.zeros_like(w0)
r = np.zeros_like(w0)
t = 0           # number of update steps taken so far (counts across epochs)
paths = []       # parameter vector recorded before every update step
batch_loss = []  # mini-batch loss recorded before every update step

for epoch in range(MaxEpochs + 1):
    for x_batch, y_batch in generate_batches(batch_size, shuffled_x_train, shuffled_y_train):
        t += 1
        paths.append(w0)
        batch_loss.append(loss(w0, x_batch, y_batch))

        grad = loss_grad(w0, x_batch, y_batch)
        s = rho1 * s + (1 - rho1) * grad
        r = rho2 * r + (1 - rho2) * grad ** 2

        # Bias correction must use the number of updates applied to s and r,
        # i.e. the per-step counter t. Using the epoch index would apply the
        # same correction to every mini-batch step within an epoch.
        s_hat = s / (1 - rho1 ** t)
        r_hat = r / (1 - rho2 ** t)

        w0 = w0 - epsilon * s_hat / (np.sqrt(r_hat) + delta)
    # Report every 10th epoch: epoch index, coefficients, full-training-set loss.
    if epoch % 10 == 0:
        print('{:02d}\t{}\t{:5.4f}'.format(epoch, w0, loss(w0, x_train, y_train)))


# Plot the final fit
plot_regression(x_train, y_train, w0)
# Plot the loss curve (one point per mini-batch step)
plt.figure(figsize=(10, 6))
plt.plot(batch_loss, label="Adam Loss Curve")
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.title("Loss Curve for Adam Optimization")
plt.legend()
plt.grid()
plt.show()


In [None]:
# "Adagrad"

# Adagrad on the 4th-degree polynomial regression problem.
batch_size = 10
lr = 0.1  # not read by the Adagrad update below (epsilon is the step size)
MaxEpochs = 100  # epoch bound (the loop runs epochs 0..MaxEpochs inclusive)

w0 = np.ones(5)  # initial coefficients of the 4th-degree polynomial

# Shuffle the samples once; the same order is reused for every epoch.
np.random.seed(916)
idx = np.arange(len(x_train))
np.random.shuffle(idx)
shuffled_x_train = x_train[idx]
shuffled_y_train = y_train[idx]

epsilon = 1.0  # global step size
delta = 1E-8   # small constant added to the denominator to avoid division by zero
# rho1, rho2, s and t are assigned here but not read by the loop below.
rho1 = 0.9
rho2 = 0.999
s = np.zeros_like(w0)
r = np.zeros_like(w0)  # running sum of squared gradients, one entry per coefficient
t = 0
paths = []       # parameter vector recorded before every update step
batch_loss = []  # mini-batch loss recorded before every update step

for epoch in range(MaxEpochs + 1):
    for x_batch, y_batch in generate_batches(batch_size, shuffled_x_train, shuffled_y_train):
         paths.append(w0)
         batch_loss.append(loss(w0, x_batch, y_batch))

         grad = loss_grad(w0, x_batch, y_batch)
         r += grad ** 2  # in-place accumulation; r never decreases

         # Effective step per coefficient: epsilon divided by the root of the
         # accumulated squared gradient.
         adjusted_lr = epsilon / (np.sqrt(r) + delta)
         w0 = w0 - adjusted_lr * grad

    # Report every 10th epoch: epoch index, coefficients, full-training-set loss.
    if epoch%10==0:
      print('{:02d}\t{}\t{:5.4f}'.format(epoch, w0, loss(w0, x_train, y_train)))

# Plot the final fit
plot_regression(x_train, y_train, w0)
# Plot the loss curve (one point per mini-batch step)
plt.figure(figsize=(10, 6))
plt.plot(batch_loss, label="Adagrad Loss Curve")
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.title("Loss Curve for Adagrad Optimization")
plt.legend()
plt.grid()
plt.show()