In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
X, y = mnist["data"], mnist["target"]

In [4]:
y = y.astype(np.uint8)

In [5]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [6]:
enc.fit(y[:,np.newaxis])

OneHotEncoder()

In [7]:
Y = enc.transform(y[:,np.newaxis]).toarray()

In [8]:
# MemoryError: Unable to allocate 26.1 GiB
# 메모리 에러 때문에 데이터셋을 40000개로 줄이고, train / test의 비율은 80:20으로 했습니다.
X_train, X_test, y_train, y_test = X[:32000], X[32000:40000], Y[:32000], Y[32000:40000]

# train / validation / test 각각 60 : 20 : 20
X_validation, X_train = X_train[:8000], X_train[8000:]
y_validation, y_train = y_train[:8000], y_train[8000:]

In [9]:
X_train = X_train / 255
X_test = X_test / 255

# validation
X_validation = X_validation / 255

In [10]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [11]:
def softmax(X, W):
    K = np.size(W, 1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1)))
    Y = B @ A
    return Y

In [12]:
def compute_cost(X, T, W, lambd):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1)
    
    # 원본 cost function 
    cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1)) 
    
    # L2 regularization 적용
    m = T.shape[1]
    cost += (lambd / (2*m)) * (np.sum(np.square(W)))
    
    return cost

In [13]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

In [20]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]

    for i in range(iterations):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch))
        cost_history[i] = compute_cost(X_batch, T_batch, W, lambd)
        if i % 1000 == 0:
            print(cost_history[i][0])

    return (cost_history, W)

In [21]:
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01

lambds = [0.01, 0.1, 0, 1, 10, 100]
lambds_dict = {}
for lambd in lambds:
    initial_cost = compute_cost(X, T, W, lambd)
    # print("Initial Cost is: {} \n".format(initial_cost[0][0]))
    (cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64)
    
    # Validation set에 대한 Accuracy
    X_ = np.hstack((np.ones((np.size(X_test, 0),1)), X_validation))
    T_ = y_validation
    y_pred = predict(X_, W_optimal)
    score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_validation))
    
    lambds_dict[lambd] = score
    print(f'lambd: {lambd}, accuracy score: {score}')

print(lambds_dict)

2.2841203043910494
0.7265323517443094
0.5738705527601549
0.6272789836192995
0.4479204516932398
0.6336586629860627
0.3318540806543483
0.4273903659884456
0.2643448672704315
0.34573919057826075
0.4405870168671068
0.598817955861763
0.43862288435201824
0.4247571480390066
0.2549633994257604
0.27926420249097117
0.4473041817150034
0.31446178117212664
0.1842005191158812
0.5672942612151745
0.36540798048471046
0.40593534883077653
0.33533947424427335
0.4497985730282319
0.3689269035706434
0.4366908891109206
0.5505118191322016
0.4910361478416822
0.4034128581821485
0.5954375550220933
0.2337924403873565
0.3665434002573862
0.2283205508067881
0.3286272392952185
0.4395247554819194
0.5360652564070197
0.3687234437070499
0.40203235263343207
0.24936259870054622
0.262734643726269
0.45710718430598507
0.2624584179243252
0.19289970662537537
0.5801302468538696
0.3517729740674669
0.3763822740508503
0.3118911087330379
0.45791990858037734
0.35175424958043705
0.43544004873972236
lambd: 0.01, accuracy score: 0.917
2.2

In [23]:
max_score = max(lambds_dict.values())
for key, val in lambds_dict.items():
    if val == max_score:
        lambd = key
        print(f'optimal lambd: {lambd}')

optimal lambd: 1


In [24]:
## Accuracy
X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
T_ = y_test
y_pred = predict(X_, W_optimal)
score = float(sum(y_pred == np.argmax(T_, axis=1))) / float(len(y_test))

print(score)

0.905625
