In [97]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

import warnings
warnings.filterwarnings(action = 'ignore')

In [98]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [99]:
X, y = mnist["data"], mnist["target"]

In [101]:
y = y.astype(np.uint8)

In [102]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [103]:
enc.fit(y[:,np.newaxis])

OneHotEncoder()

In [104]:
Y = enc.transform(y[:,np.newaxis]).toarray()

훈련 데이터 50,000개<br>
검증 데이터, 테스트 데이터 10,000개

In [110]:
X_train, X_valid, X_test, y_train, y_valid, y_test = X[:50000], X[50000:60000], X[60000:], Y[:50000], Y[50000:60000], Y[60000:]

In [111]:
X_train = X_train / 255
X_valid = X_valid / 255
X_test = X_test / 255

In [113]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [211]:
def softmax(X, W):
    K = np.size(W, 1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1)))
    Y = B @ A
    return Y

In [414]:
def compute_cost(X, T, W, l = 0):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1)
    cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1)) + 1 / 2 * l * np.sum(W ** 2)
    return cost

기존 cost에 규제항 추가<br>
$\mbox{cost} = \mbox{cost} - \lambda \left\Vert \boldsymbol{W} \right\Vert_2^2$

In [213]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

In [214]:
## 검증함수 구현
def validate(X_valid, y_valid, W_optimal):
    X_val = np.hstack((np.ones((np.size(X_valid, 0), 1)), X_valid))
    T_val = y_valid
    y_pred = predict(X_val, W_optimal)
    score = float(sum(y_pred == np.argmax(T_val, axis=1))) / float(len(y_valid))

    print(f"score = {score}")
    return score

In [368]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.RandomState(seed=42).permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]
    
    acc_history = []
    
    lambdas = [-(10 ** -x) for x in range(10, 0, -1)] + [0] + [10 ** -x for x in range(1, 11)]
    
    for idx, l in enumerate(lambdas):
        K = np.size(T, 1)
        M = np.size(X, 1)
        W = np.zeros((M,K))
        for i in range(iterations):
            j = i % N
            X_batch = X_shuffled[j:j+batch_size]
            T_batch = T_shuffled[j:j+batch_size]
            # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
            if X_batch.shape[0] < batch_size:
                X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
                T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
            W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch)) - l * W
            cost_history[i] = compute_cost(X_batch, T_batch, W, l)
            # 10000번 학습당 cost 출력
#             if i % 10000 == 0:
#                 print(cost_history[i][0])
                
        # 검증 데이터에 대해 결과를 검증하고
        # 정확도를 기록 
        print('lamda:', l)
        acc = validate(X_valid, y_valid, W)
        print('accuracy:', acc)
#         print('가중치', W)
        acc_history.append((lambdas[idx], acc))
        print(acc_history)
        print('\n\n')

    return (cost_history, W)

위의 손실함수 $\mbox{cost} = \mbox{cost} + \lambda \left\Vert \boldsymbol{W} \right\Vert_2^2$에서 규제항을 미분<br>
$\nabla_{\boldsymbol{W}} = \lambda \left\Vert \boldsymbol{W} \right\Vert_2^2 = \nabla_{\boldsymbol{W}} (\lambda \boldsymbol{W}^T \boldsymbol{W}) = 2\lambda \boldsymbol{W} \rightarrow \lambda \boldsymbol{W}$<br>
이에 따라 손실함수는 다음과 같이 변경(상수항을 $\lambda$로 계산하기 때문)<br>
$\displaystyle \mbox{cost} = \mbox{cost} + \frac{1}{2}\lambda \left\Vert \boldsymbol{W} \right\Vert_2^2$<br>
<br>
따라서 가중치 업데이트 시 $\lambda \boldsymbol{W}$ 항을 추가하여 최적의 정확도를 보장하는 $\lambda$를 구하면됨

In [371]:
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01
print('X shape:', X.shape)
print('T shape:', T.shape)

initial_cost = compute_cost(X, T, W)

print("Initial Cost is: {} \n".format(initial_cost[0][0]))

(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64)

X shape: (50000, 785)
T shape: (50000, 10)
Initial Cost is: 2.3024850979937375 

lamda: -1e-10
score = 0.9201
accuracy: 0.9201
[(-1e-10, 0.9201)]



lamda: -1e-09
score = 0.9201
accuracy: 0.9201
[(-1e-10, 0.9201), (-1e-09, 0.9201)]



lamda: -1e-08
score = 0.9201
accuracy: 0.9201
[(-1e-10, 0.9201), (-1e-09, 0.9201), (-1e-08, 0.9201)]



lamda: -1e-07
score = 0.9201
accuracy: 0.9201
[(-1e-10, 0.9201), (-1e-09, 0.9201), (-1e-08, 0.9201), (-1e-07, 0.9201)]



lamda: -1e-06
score = 0.9203
accuracy: 0.9203
[(-1e-10, 0.9201), (-1e-09, 0.9201), (-1e-08, 0.9201), (-1e-07, 0.9201), (-1e-06, 0.9203)]



lamda: -1e-05
score = 0.9214
accuracy: 0.9214
[(-1e-10, 0.9201), (-1e-09, 0.9201), (-1e-08, 0.9201), (-1e-07, 0.9201), (-1e-06, 0.9203), (-1e-05, 0.9214)]



lamda: -0.0001
score = 0.0991
accuracy: 0.0991
[(-1e-10, 0.9201), (-1e-09, 0.9201), (-1e-08, 0.9201), (-1e-07, 0.9201), (-1e-06, 0.9203), (-1e-05, 0.9214), (-0.0001, 0.0991)]



lamda: -0.001
score = 0.0991
accuracy: 0.0991
[(-1e-10, 0.9201)

$\lambda = -0.00001$ 부근에서 높은 accuracy

In [372]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.RandomState(seed=42).permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]
    
    acc_history = []
    
    lambdas = [- x / 10 ** 6 for x in range(5, 99)]
    
    for idx, l in enumerate(lambdas):
        K = np.size(T, 1)
        M = np.size(X, 1)
        W = np.zeros((M,K))
        for i in range(iterations):
            j = i % N
            X_batch = X_shuffled[j:j+batch_size]
            T_batch = T_shuffled[j:j+batch_size]
            # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
            if X_batch.shape[0] < batch_size:
                X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
                T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
            W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch)) - l * W
            cost_history[i] = compute_cost(X_batch, T_batch, W, l)
            # 10000번 학습당 cost 출력
#             if i % 10000 == 0:
#                 print(cost_history[i][0])
                
        # 검증 데이터에 대해 결과를 검증하고
        # 정확도를 기록 
        acc = validate(X_valid, y_valid, W)
#         print('가중치', W)
        acc_history.append(acc)

    return (cost_history, W), acc_history

In [373]:
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01
print('X shape:', X.shape)
print('T shape:', T.shape)

initial_cost = compute_cost(X, T, W)

print("Initial Cost is: {} \n".format(initial_cost[0][0]))

(cost_history, W_optimal), acc_li = batch_gd(X, T, W, learning_rate, iterations, 64)

X shape: (50000, 785)
T shape: (50000, 10)
Initial Cost is: 2.3024850979937375 

score = 0.9208
score = 0.9206
score = 0.921
score = 0.9208
score = 0.9211
score = 0.9214
score = 0.922
score = 0.9223
score = 0.9226
score = 0.9223
score = 0.9219
score = 0.922
score = 0.9223
score = 0.9219
score = 0.9221
score = 0.9222
score = 0.9224
score = 0.9227
score = 0.9225
score = 0.9227
score = 0.9225
score = 0.9221
score = 0.9221
score = 0.9219
score = 0.9218
score = 0.9218
score = 0.9216
score = 0.9219
score = 0.9218
score = 0.9216
score = 0.9215
score = 0.9214
score = 0.921
score = 0.9208
score = 0.9214
score = 0.9213
score = 0.9217
score = 0.9214
score = 0.9213
score = 0.9209
score = 0.9211
score = 0.9209
score = 0.9203
score = 0.9201
score = 0.9199
score = 0.92
score = 0.9201
score = 0.9198
score = 0.9196
score = 0.9197
score = 0.9194
score = 0.919
score = 0.9185
score = 0.9182
score = 0.9181
score = 0.918
score = 0.9177
score = 0.9176
score = 0.9178
score = 0.9174
score = 0.9177
score = 0.91

In [387]:
lambdas = [- x / 10 ** 6 for x in range(5, 99)]
lambda_max = lambdas[np.argmax(acc_li)]
acc_max = np.max(acc_li)
lambda_max, acc_max

(-2.2e-05, 0.9227)

최대 정확도 일 때에는 $\lambda = -0.000022$

In [412]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size, l):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]
    
    for i in range(iterations):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch)) - l * W
        cost_history[i] = compute_cost(X_batch, T_batch, W, l)
        # 1000번 학습당 cost 출력
        if i % 5000 == 0:
            print(cost_history[i][0])
            

    return (cost_history, W)

In [415]:
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01
print('X shape:', X.shape)
print('T shape:', T.shape)


l = -0.000022
initial_cost = compute_cost(X, T, W, l)
print("Initial Cost is: {} \n".format(initial_cost[0][0]))
(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64, l)
reg_acc_val = validate(X_valid, y_valid, W_optimal)
reg_acc_test = validate(X_test, y_test, W_optimal)

l = 0
initial_cost = compute_cost(X, T, W, l)
print("Initial Cost is: {} \n".format(initial_cost[0][0]))
(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64, l)
acc_val = validate(X_valid, y_valid, W_optimal)
acc_test = validate(X_test, y_test, W_optimal)

X shape: (50000, 785)
T shape: (50000, 10)
Initial Cost is: 2.3024850979937375 

2.2811737901504423
0.2890821509838653
0.17526655310082961
0.16062665797344375
0.23256748743398256
0.13629100291513951
0.4234400327849822
0.2792979511099775
0.32546071891895767
0.10408271648352072
score = 0.9211
score = 0.9175
Initial Cost is: 2.3024850979937375 

2.274773453446265
0.39507944751277546
0.3656071111535034
0.1702711549024994
0.3186276078715601
0.25194470438946714
0.20912738704593808
0.21200073534096853
0.2908217090680837
0.18797097157237674
score = 0.9181
score = 0.9135


In [416]:
# 정확도 비교
print('valid acc with regularization:', reg_acc_val)
print('test acc with regularization:', reg_acc_test)
print()
print('valid acc without regularization:', acc_val)
print('test acc without regularization:', acc_test)

valid acc with regularization: 0.9211
test acc with regularization: 0.9175

valid acc without regularization: 0.9181
test acc without regularization: 0.9135


눈에 띄게 성능이 향상된건 아니지만 약간의 성능 향상이 이루어졌다고 볼 수 있음(0.9135에서 0.9175로)