In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import seaborn as sns

import pandas as pd
from tqdm.notebook import tqdm

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
X, y = mnist["data"], mnist["target"]

In [4]:
y = y.astype(np.uint8)

In [5]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [6]:
enc.fit(y[:,np.newaxis])

OneHotEncoder()

In [7]:
Y = enc.transform(y[:,np.newaxis]).toarray()

In [8]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((60000, 784), (10000, 784), (60000, 10), (10000, 10))

In [10]:
X_train = X_train / 255
X_test = X_test / 255

In [11]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [12]:
def softmax(X, W):
    K = np.size(W, 1)
    A = np.exp(X @ W)
    B = np.diag(1 / (np.reshape(A @ np.ones((K,1)), -1)))
    Y = B @ A
    return Y

In [13]:
def compute_cost(X, T, W, lambda_value):
    epsilon = 1e-5
    N = len(T)
    K = np.size(T, 1)
    #cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1))
    cost = - (1/N) * np.ones((1,N)) @ (np.multiply(np.log(softmax(X, W) + epsilon), T)) @ np.ones((K,1)) + (lambda_value * np.sum(np.square(W)))
    
    return cost

In [14]:
def predict(X, W):
    return np.argmax((X @ W), axis=1)

In [15]:
def batch_gd(X, T, W, learning_rate, iterations, batch_size, lambda_value):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]

    for i in tqdm(range(iterations)):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch))
        cost_history[i] = compute_cost(X_batch, T_batch, W, lambda_value)
        if i % 1000 == 0:
            print(cost_history[i][0])

    return (cost_history, W)

In [16]:
X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
T = y_train

K = np.size(T, 1)
M = np.size(X, 1)
W = np.zeros((M,K))

iterations = 50000
learning_rate = 0.01
lambda_value = 0.3

initial_cost = compute_cost(X, T, W, lambda_value)

print("Initial Cost is: {} \n".format(initial_cost[0][0]))

(cost_history, W_optimal) = batch_gd(X, T, W, learning_rate, iterations, 64, lambda_value)

Initial Cost is: 2.30248509799353 



  0%|          | 0/50000 [00:00<?, ?it/s]

2.2773910131939794
4.470476940961929
6.592902827502613
8.590275991011698
10.11673208638778
11.249425511481128
12.486540691975383
13.712689346607442
14.617871816684843
15.552523218580856
16.709525812913213
17.273666070361262
18.318698884894484
18.97005068386087
19.74112132145537
20.411869415421513
21.34868935842162
22.185144924528228
22.99952039466421
23.49526473327301
24.081467844740885
24.593507167413293
25.21993181121561
25.570066262733402
26.282472873533628
26.864471023971173
27.52998966114429
27.759852137067732
28.402572635432175
28.739172150895083
29.549095324660392
30.007583067492565
30.789927574611013
31.4333523976577
31.288996578985273
32.03010590675814
32.098142383013524
32.828101628841004
32.97484592745215
33.499211172218914
33.77504660427997
34.21753243286563
34.6261204849965
34.993146255540665
35.15429520466814
35.877844715003874
36.49793845482982
36.82632891360961
37.23839417366869
37.53835586034911


In [17]:
## Accuracy
X_ = np.hstack((np.ones((np.size(X_test, 0),1)),X_test))
T_ = y_test
y_pred = predict(X_, W_optimal)
score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_test))

print('Accuracy Score : {}'.format(score))

Accuracy Score : 0.9164


##  Validation

In [18]:
X_train, X_valid = X_train[:55000], X_train[55000:]
y_train, y_valid = y_train[:55000], y_train[55000:]

In [24]:
def new_batch_gd(X, T, W, learning_rate, iterations, batch_size, lambda_value):
    N = len(T)
    cost_history = np.zeros((iterations,1))
    shuffled_indices = np.random.permutation(N)
    X_shuffled = X[shuffled_indices]
    T_shuffled = T[shuffled_indices]

    for i in tqdm(range(iterations)):
        j = i % N
        X_batch = X_shuffled[j:j+batch_size]
        T_batch = T_shuffled[j:j+batch_size]
        # batch가 epoch 경계를 넘어가는 경우, 앞 부분으로 채워줌
        if X_batch.shape[0] < batch_size:
            X_batch = np.vstack((X_batch, X_shuffled[:(batch_size - X_batch.shape[0])]))
            T_batch = np.vstack((T_batch, T_shuffled[:(batch_size - T_batch.shape[0])]))
        W = W - (learning_rate/batch_size) * (X_batch.T @ (softmax(X_batch, W) - T_batch))
        cost_history[i] = compute_cost(X_batch, T_batch, W, lambda_value)

    return (cost_history, W)

In [25]:
def valid(lambda_value) :
    X = np.hstack((np.ones((np.size(X_train, 0),1)),X_train))
    T = y_train

    K = np.size(T, 1)
    M = np.size(X, 1)
    W = np.zeros((M,K))

    iterations = 50000
    learning_rate = 0.01

    initial_cost = compute_cost(X, T, W, lambda_value)

    print("Initial Cost : {}".format(initial_cost[0][0]))

    (cost_history, W_optimal) = new_batch_gd(X, T, W, learning_rate, iterations, 64, lambda_value)
    print("Result Cost : {} \n".format(cost_history[-1][0]))
    
    X_ = np.hstack((np.ones((np.size(X_valid, 0),1)),X_valid))
    T_ = y_valid
    y_pred = predict(X_, W_optimal)
    score = float(sum(y_pred == np.argmax(T_, axis=1)))/ float(len(y_valid))
    
    return score

In [26]:
def find_lambda(n) :
    lambda_list = np.random.rand(n)
    
    score_list = []
    for lambda_val in tqdm(lambda_list) :
        score_list.append(valid(lambda_val))
    
    lambda_df = pd.concat([pd.Series(lambda_list), pd.Series(score_list)], axis = 1)
    lambda_df.columns = ['lambda', 'Score']
    lambda_df = lambda_df.sort_values('Score', ascending=False)
    
    return lambda_df 

In [27]:
n = 10
result_df = find_lambda(n)

  0%|          | 0/10 [00:00<?, ?it/s]

Initial Cost is: 2.3024850979938565 



  0%|          | 0/50000 [00:00<?, ?it/s]

Initial Cost is: 2.3024850979938565 



  0%|          | 0/50000 [00:00<?, ?it/s]

Initial Cost is: 2.3024850979938565 



  0%|          | 0/50000 [00:00<?, ?it/s]

Initial Cost is: 2.3024850979938565 



  0%|          | 0/50000 [00:00<?, ?it/s]

Initial Cost is: 2.3024850979938565 



  0%|          | 0/50000 [00:00<?, ?it/s]

Initial Cost is: 2.3024850979938565 



  0%|          | 0/50000 [00:00<?, ?it/s]

Initial Cost is: 2.3024850979938565 



  0%|          | 0/50000 [00:00<?, ?it/s]

Initial Cost is: 2.3024850979938565 



  0%|          | 0/50000 [00:00<?, ?it/s]

Initial Cost is: 2.3024850979938565 



  0%|          | 0/50000 [00:00<?, ?it/s]

Initial Cost is: 2.3024850979938565 



  0%|          | 0/50000 [00:00<?, ?it/s]

Unnamed: 0,lambda,Score
8,0.453561,0.935
4,0.457318,0.9336
7,0.105998,0.9322
1,0.939925,0.9312
6,0.675132,0.9312
9,0.800783,0.9312
2,0.935682,0.9304
3,0.382616,0.9296
5,0.129916,0.9286
0,0.058149,0.9264


In [28]:
result_df

Unnamed: 0,lambda,Score
8,0.453561,0.935
4,0.457318,0.9336
7,0.105998,0.9322
1,0.939925,0.9312
6,0.675132,0.9312
9,0.800783,0.9312
2,0.935682,0.9304
3,0.382616,0.9296
5,0.129916,0.9286
0,0.058149,0.9264
