# Note : 
ID : Percobaan ini dilakukan untuk mengetahui cara kerja dari Softmax dan bagaimana aplikasi back propagation pada Softmax jika digabungkan dengan persamaan linear <br>
EN : This experiment explores how Softmax works and how backpropagation is applied to Softmax when it is combined with a linear (affine) layer. <br>

**Source** <br>
Blog : 
- [Medium Thomas Kurbiel: Cross Entropy & Softmax Derivative](https://medium.com/towards-data-science/derivative-of-the-softmax-function-and-the-categorical-cross-entropy-loss-ffceefc081d1)
- [Raúl Gómez blog](https://gombru.github.io/2018/05/23/cross_entropy_loss/)
- [Eli Bendersky's website](https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/)

In [2]:
import numpy as np 
from matplotlib import pyplot as plt

# Dataset

In [3]:
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader

from torch.nn import functional as F

In [4]:
# Preprocessing pipeline handed to the dataset; ToTensor is its only step.
transform = transforms.Compose([
    transforms.ToTensor(),
])

In [5]:
# MNIST training split rooted at 'data', with `transform` applied to samples.
trainset = torchvision.datasets.MNIST(
    root='data',
    train=True,
    download=True,
    transform=transform,
)
# Shuffled mini-batches of 512 samples; drop_last discards a trailing
# incomplete batch so every batch has the same size.
trainloader = DataLoader(
    trainset,
    batch_size=512,
    shuffle=True,
    drop_last=True,
)

In [6]:
# Number of target classes, taken from the dataset's class list.
num_class = len(trainset.classes)
# Bare expression: the notebook displays the value.
num_class

10

In [7]:
def transform_data(feature, target):
    """Flatten a single image and one-hot encode its label.

    NOTE: this per-sample version is shadowed by the batched
    ``transform_data`` defined immediately after it in the same cell.

    Args:
        feature: Array-like image; it is flattened and divided by 255.
        target: Class index, compared against ``range(num_class)``.

    Returns:
        A ``(feature, target_onehot)`` tuple of NumPy arrays.
    """
    flat_pixels = np.array(feature).flatten()
    scaled_pixels = flat_pixels / 255

    # `num_class` is the module-level class count.
    onehot_bits = [
        1 if class_index == target else 0
        for class_index in range(num_class)
    ]
    return scaled_pixels, np.array(onehot_bits)

def transform_data(feature, target, num_classes=None):
    """Convert a batch of image tensors and labels into NumPy arrays.

    Args:
        feature: Tensor whose first dimension is the batch; every remaining
            dimension is flattened into a single one.
        target: Integer label tensor handed to ``F.one_hot``
            (presumably shape ``(batch,)`` - confirm against the loader).
        num_classes: Width of the one-hot encoding. When omitted, the
            module-level ``num_class`` is used so the encoding always agrees
            with the dataset instead of a hard-coded 10.

    Returns:
        A ``(feature, target_onehot)`` tuple of NumPy arrays; ``feature`` has
        shape ``(batch, -1)`` and ``target_onehot`` gains a trailing axis of
        length ``num_classes``.
    """
    if num_classes is None:
        num_classes = num_class

    batch_size = feature.size(0)
    # reshape (unlike view) also accepts non-contiguous tensors.
    feature = feature.reshape(batch_size, -1).numpy()
    target_onehot = F.one_hot(target, num_classes).numpy()
    return feature, target_onehot

In [8]:
# Sanity check: convert the first batch only and print the resulting shapes.
for feature, target in trainloader:
    feature, target = transform_data(feature, target)
    print(feature.shape, target.shape)
    
    # One batch is enough for the shape check.
    break 

(512, 784) (512, 10)


# Modeling

In [9]:
class Linear:
    """Affine layer computing ``x @ weights + bias``.

    Attributes are public because the training code reads and reassigns
    ``weights`` and ``bias`` directly.
    """

    def __init__(self, input_size, output_size):
        """Draw the parameters from a standard normal distribution.

        Args:
            input_size: Number of input features (rows of ``weights``).
            output_size: Number of outputs (columns of ``weights``).
        """
        weight_shape = (input_size, output_size)
        bias_shape = (1, output_size)
        # Weights are sampled before the bias.
        self.weights = np.random.standard_normal(weight_shape)
        self.bias = np.random.standard_normal(bias_shape)

    def __call__(self, x):
        """Apply the affine map; the single bias row broadcasts over x's rows."""
        projected = np.matmul(x, self.weights)
        return projected + self.bias

In [10]:
class Softmax:
    """Softmax normalisation along a fixed axis.

    Args:
        dim: Axis along which the outputs sum to 1.
    """

    def __init__(self, dim):
        self.dim = dim

    def __call__(self, x):
        """Return ``exp(x) / sum(exp(x))`` computed along ``self.dim``.

        The per-axis maximum is subtracted before exponentiating. Softmax is
        invariant to that shift, and it keeps ``np.exp`` from overflowing to
        ``inf`` (which would otherwise produce NaN) on large logits.

        Args:
            x: Array-like of logits.

        Returns:
            Array with the same shape as ``x`` holding the probabilities.
        """
        x = np.asarray(x)
        if x.size == 0:
            # Nothing to normalise, and max() over an empty axis would raise.
            return np.exp(x)

        shifted = x - x.max(axis=self.dim, keepdims=True)
        exponentials = np.exp(shifted)
        summation = exponentials.sum(axis=self.dim, keepdims=True)
        return exponentials / summation

In [11]:
class CrossEntropyLoss:
    """Categorical cross-entropy averaged over the batch.

    Args:
        eps: Lower bound applied to the predicted probabilities before the
            logarithm. Without it, a probability of exactly 0 yields
            ``log(0) = -inf`` and ``0 * -inf = nan``.
    """

    def __init__(self, eps=1e-12):
        self.eps = eps

    def __call__(self, prediction, ground_truth):
        """Compute the mean of ``-sum(ground_truth * log(prediction))``.

        Args:
            prediction: Predicted probabilities, classes on the last axis.
            ground_truth: Target distribution (e.g. one-hot), same shape.

        Returns:
            Scalar loss: per-sample sums over the last axis, then the mean.
        """
        # Only values below eps are changed; ordinary probabilities pass through.
        safe_prediction = np.maximum(prediction, self.eps)
        entropy = ground_truth * np.log(safe_prediction)
        loss = -entropy.sum(axis=-1)
        return loss.mean()

In [1]:
# def derivative(feature, target, prediction, num_class): # didn't work...
#     A = (target / prediction)
#     A = A.T @ np.ones((1, 10))

#     first_term = p * (1 - p)
#     second_term = -p * p

#     I = np.eye(num_class)
#     for i, vector in enumerate(I):
#         for j, scalar in enumerate(vector): 
#             if I[i, j] == 1.0:
#                 I[i, j] = first_term[0, j] 
#             else : 
#                 I[i, j] = second_term[0, j]
#     B = I 
#     C = feature.T @ np.ones((1, num_class))

#     derivative = A * B @ C.T
#     return derivative.T

# Derivative 

In [12]:
def derivative(feature, target, prediction):
    """Gradients of softmax + cross-entropy w.r.t. the linear layer.

    The combined softmax/cross-entropy derivative with respect to the logits
    is ``prediction - target``. The gradients are summed over the batch;
    no division by the batch size takes place here.

    Args:
        feature: Input batch; its first dimension is the batch size.
        target: Target array, same shape as ``prediction``.
        prediction: Softmax output for ``feature``.

    Returns:
        ``(d_weight, d_bias)``: ``d_weight`` has one row per input feature
        and one column per class; ``d_bias`` is a single row with one
        entry per class.
    """
    residual_t = np.subtract(prediction, target).T
    # Multiplying by a column of ones sums the residual over the batch.
    ones_column = np.ones((len(feature), 1))

    grad_weight = np.matmul(residual_t, feature)
    grad_bias = np.matmul(residual_t, ones_column)
    return np.transpose(grad_weight), np.transpose(grad_bias)

In [13]:
class MainModel:
    """Linear layer followed by a softmax over the last axis."""

    def __init__(self, input_size, output_size):
        """Build the two stages.

        Args:
            input_size: Number of input features of the linear layer.
            output_size: Number of outputs of the linear layer.
        """
        self.linear = Linear(input_size, output_size)
        self.softmax = Softmax(dim=-1)

    def __call__(self, x):
        """Run the forward pass: linear projection, then softmax."""
        logits = self.linear(x)
        probabilities = self.softmax(logits)
        return probabilities

In [14]:
# 28 * 28 inputs (flattened image), one softmax output per class.
model = MainModel(28 * 28, num_class)
# Loss object applied to the model's probabilities and the one-hot targets.
criterion = CrossEntropyLoss()

# Get Data

In [15]:
# Explicit iterator so that single batches can be pulled with next().
iterator = iter(trainloader)

In [16]:
# Pull one batch, convert it to NumPy arrays, and display both shapes.
feature, target = next(iterator)
feature, target = transform_data(feature, target)
feature.shape, target.shape

((512, 784), (512, 10))

In [17]:
# Display the shapes of the current batch again.
feature.shape, target.shape 

((512, 784), (512, 10))

# Forward Propagation

In [18]:
# Forward pass on the current batch: linear layer followed by softmax.
prediction = model(feature)

In [19]:
# Display the shape of the forward-pass output.
prediction.shape

(512, 10)

# Optimizing

In [20]:
# Step size for the manual gradient update in the next cell.
learning_rate = 0.001

In [21]:
# One manual gradient step on the weights only; the bias gradient
# (second return value) is discarded here.
d, _ = derivative(feature, target, prediction)
model.linear.weights = model.linear.weights - learning_rate * d

In [22]:
# The gradient and the weights it is subtracted from are displayed side by side.
d.shape, model.linear.weights.shape

((784, 10), (784, 10))

In [23]:
# `prediction` was computed before the weight update above, so this is the
# loss of the pre-update model on the current batch.
criterion(prediction, target)

np.float64(14.116689562351368)

# Loop Train

In [24]:
from tqdm import tqdm

In [25]:
# Step size for the gradient updates below.
learning_rate = 0.001

# 20 epochs; each epoch is one full pass over trainloader.
for epoch in range(20):
    # Sum of the batch losses seen so far in this epoch.
    cost = 0
    # enumerate starts at 1 so `indices` equals the number of batches processed.
    for indices, (feature, target) in (pbar := tqdm(enumerate(trainloader, start=1), total=len(trainloader))):
        feature, target = transform_data(feature, target)
    
        # Forward pass and loss for this batch.
        prediction = model(feature)
        loss = criterion(prediction, target)
    
        # Gradients for the linear layer's weights and bias.
        derivative_weight, derivative_bias = derivative(feature, target, prediction)
    
        # optimize SGD
        model.linear.weights = model.linear.weights - learning_rate * derivative_weight
        model.linear.bias = model.linear.bias - learning_rate * derivative_bias

        cost = cost + loss
        # Progress bar shows the current batch loss and the running mean loss.
        pbar.set_postfix(loss = loss, cost = cost / indices)


100%|██████████████████████████████████████████████████████████| 117/117 [00:15<00:00,  7.44it/s, cost=2.72, loss=1.24]
100%|█████████████████████████████████████████████████████████| 117/117 [00:16<00:00,  7.25it/s, cost=1.08, loss=0.866]
100%|████████████████████████████████████████████████████████| 117/117 [00:16<00:00,  7.23it/s, cost=0.852, loss=0.665]
100%|████████████████████████████████████████████████████████| 117/117 [00:15<00:00,  7.45it/s, cost=0.747, loss=0.736]
100%|████████████████████████████████████████████████████████| 117/117 [00:15<00:00,  7.41it/s, cost=0.683, loss=0.699]
100%|████████████████████████████████████████████████████████| 117/117 [00:15<00:00,  7.44it/s, cost=0.637, loss=0.621]
100%|████████████████████████████████████████████████████████| 117/117 [00:15<00:00,  7.40it/s, cost=0.603, loss=0.473]
100%|████████████████████████████████████████████████████████| 117/117 [00:16<00:00,  7.29it/s, cost=0.575, loss=0.541]
100%|███████████████████████████████████