In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

In [2]:
from sklearn.datasets import fetch_openml


mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

  warn(


dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
X, y = mnist["data"].to_numpy(), mnist["target"].to_numpy()

In [4]:
y = y.astype(np.uint8)

In [5]:
from sklearn.preprocessing import OneHotEncoder 

enc = OneHotEncoder()

In [6]:
enc.fit(y[:,np.newaxis])

In [7]:
Y = enc.transform(y[:,np.newaxis]).toarray()

In [8]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

In [9]:
X_train = X_train / 255
X_test = X_test / 255

In [10]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [11]:
def softmax(X, W):
    K = np.size(W, 1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1)))
    Y = B @ A
    return Y

In [14]:
def compute_cost(X, T, W, Lambda=0):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1)
    l2 = (Lambda / 2 * np.linalg.norm(W) ** 2)
    cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1)) + l2
    return cost

In [18]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

In [29]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size, Lambda=0):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.RandomState(42).permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]

    for i in range(iterations):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        l2 = Lambda * W
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch) + l2)
        cost_history[i] = compute_cost(X_batch, T_batch, W, Lambda)
        # if i % 1000 == 0:
        #     print(cost_history[i][0])

    return (cost_history, W)

In [16]:
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01

# initial_cost = compute_cost(X, T, W, 0.5)

# print("Initial Cost is: {} \n".format(initial_cost[0][0]))

(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64, 0)

In [19]:
## Accuracy
X_ = np.hstack((np.ones((np.size(X_test, 0),1)), X_test))
T_ = y_test
y_pred = predict(X_, W_optimal)
score = float(np.sum(y_pred == np.argmax(T_, axis=1))) / float(len(y_test))

print(score)

0.9155


# Best Lambda

In [20]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

In [None]:
from sklearn.datasets import fetch_openml


mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

In [21]:
# 최적 lambda 찾기
## validation 세트 나누기
X, y = mnist["data"].to_numpy(), mnist["target"].to_numpy()
y = y.astype(np.uint8)

from sklearn.preprocessing import OneHotEncoder 


enc = OneHotEncoder()

enc.fit(y[:,np.newaxis])
Y = enc.transform(y[:,np.newaxis]).toarray()

X_train, X_test, y_train, y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

X_train = X_train / 255
X_test = X_test / 255

X_train, X_val, y_train, y_val = X_train[:50000], X_train[50000:], y_train[:50000], y_train[50000:]
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((50000, 784), (10000, 784), (50000, 10), (10000, 10))

In [31]:
best_val_score = 0
best_lamb = 0
best_W = None

for lam in np.arange(0, 0.05, 0.005):
    X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
    T = y_train

    K = np.size(T, 1)
    M = np.size(X, 1)
    W = np.zeros((M,K))

    iterations = 50000
    learning_rate = 0.01

    (cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64, lam)

    ## Accuracy
    X_ = np.hstack((np.ones((np.size(X_val, 0),1)), X_val))
    T_ = y_val
    y_pred = predict(X_, W_optimal)
    score = float(np.sum(y_pred == np.argmax(T_, axis=1))) / float(len(y_test))
    print(f"lambda: {lam:0.03f}, validation score: {score:0.04f}")
    if best_val_score < score:
        best_val_score = score
        best_lamb = lam
        best_W = W_optimal

print(f"best lambda: {best_lamb}, best_score: {best_val_score}")

lambda: 0.000, validation score: 0.9201
lambda: 0.005, validation score: 0.9199
lambda: 0.010, validation score: 0.9197
lambda: 0.015, validation score: 0.9196
lambda: 0.020, validation score: 0.9194
lambda: 0.025, validation score: 0.9193
lambda: 0.030, validation score: 0.9192
lambda: 0.035, validation score: 0.9190
lambda: 0.040, validation score: 0.9189
lambda: 0.045, validation score: 0.9190
best lambda: 0.0, best_score: 0.9201


In [32]:
## Accuracy
X_ = np.hstack((np.ones((np.size(X_test, 0),1)), X_test))
T_ = y_test
y_pred = predict(X_, W_optimal)
score = float(np.sum(y_pred == np.argmax(T_, axis=1))) / float(len(y_test))

print(score)

0.9146
