Full version of this code is at 
https://colab.research.google.com/github/d2l-ai/d2l-en-colab/blob/master/chapter_linear-networks/softmax-regression-scratch.ipynb

In [1]:
!pip install d2l==0.16.1
!pip install -U mxnet-cu101==1.7.0

Requirement already up-to-date: mxnet-cu101==1.7.0 in /usr/local/lib/python3.6/dist-packages (1.7.0)


In [3]:
# For automatically computing gradients of numpy functions
from mxnet import autograd
# mxnet's version of numpy, which is autograd-friendly
from mxnet import np
# mxnet's neural network + accelerator extensions for numpy
from mxnet import npx
# mxnet's neural network framework (only for data loading here)
from mxnet import gluon
# Tell mxnet to be as numpy-compatible as possible
npx.set_np()

from IPython import display

In [4]:
def filter_shirt_trousers(sample):
    #if data is t-shirt
    if sample[1] == 0:
      return True
    #if data is t-shirt
    if sample[1] == 1:
      return True
    return False

In [5]:
def transform_shirt_trouser_label(data, label):
    #if data is t-shirt
    if label == 0:
      return data, -1
    #if data is t-shirt
    if label == 1:
      return data, 1
    return data, label  

In [6]:
def load_data_fashion_mnist(batch_size=256, resize=None):
    """Download the Fashion-MNIST dataset and then load it into memory."""
    dataset = gluon.data.vision
    trans = [dataset.transforms.ToTensor()]
    if resize:
        trans.insert(0, dataset.transforms.Resize(resize))
    trans = dataset.transforms.Compose(trans)
    mnist_train = dataset.FashionMNIST(train=True).transform_first(trans).filter(filter_shirt_trousers)
    mnist_test = dataset.FashionMNIST(train=False).transform_first(trans).filter(filter_shirt_trousers)

    mnist_train = mnist_train.transform(transform_shirt_trouser_label)
    mnist_test = mnist_test.transform(transform_shirt_trouser_label)

    return (gluon.data.DataLoader(mnist_train, batch_size, shuffle=True,
                                  num_workers=4),
            gluon.data.DataLoader(mnist_test, batch_size, shuffle=False,
                                  num_workers=4))

train_iter, test_iter = load_data_fashion_mnist()

In [7]:
num_inputs = 784
#num_outputs = 10
num_outputs = 2

# Initialize values of parameters in typical numpy syntax
W = np.random.normal(0, 0.01, (num_inputs, num_outputs))
b = np.zeros(num_outputs)
# Tell mxnet we want to keep track of gradients for these parameters
W.attach_grad()
b.attach_grad()

In [8]:
def softmax(X):
    X_exp = np.exp(X)
    partition = X_exp.sum(axis=1, keepdims=True)
    return X_exp / partition  # The broadcasting mechanism is applied here

In [9]:
def net(X):
    # Reshape is necessary because data is originally images
    return softmax(np.dot(X.reshape((-1, W.shape[0])), W) + b)

In [10]:
def cross_entropy(y_hat, y):
    o = y_hat[range(len(y_hat)), y]
    #for i in range(len(y)):
    #  if y[i]==0:
    #    y[i] = -1
    return np.log(1+np.exp(-y * o))
    #return - np.log(y_hat[range(len(y_hat)), y])

In [13]:
def accuracy(y_hat, y):  #@save
    """Compute the number of correct predictions."""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    #print(y_hat)
    #print(y)
    for i in range(len(y_hat)):
      if y_hat[i]==0:
        y_hat[i] = -1
    cmp = y_hat.astype(y.dtype) == y
    print(float(cmp.astype(y.dtype).sum()))
    return float(cmp.astype(y.dtype).sum())

In [14]:
def evaluate_accuracy(net, data_iter):  #@save
    """Compute the accuracy for a model on a dataset."""
    metric = Accumulator(2)  # No. of correct predictions, no. of predictions
    for X, y in data_iter:
        metric.add(accuracy(net(X), y), y.size)
    return metric[0] / metric[1]

In [15]:
class Accumulator:  #@save
    """For accumulating sums over `n` variables."""
    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [16]:
def train_epoch_ch3(net, train_iter, loss, updater):  #@save
    """Train a model within one epoch (defined in Chapter 3)."""
    # Sum of training loss, sum of training accuracy, no. of examples
    metric = Accumulator(3)
    for X, y in train_iter:
        # Compute gradients and update parameters
        with autograd.record():
            y_hat = net(X)
            l = loss(y_hat, y)
        l.backward()
        updater(X.shape[0])
        metric.add(float(l.sum()), accuracy(y_hat, y), y.size)
    # Return training loss and training accuracy
    return metric[0] / metric[2], metric[1] / metric[2]

In [17]:
def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):  #@save
    for epoch in range(num_epochs):
        train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = evaluate_accuracy(net, test_iter)
        train_loss, train_acc = train_metrics
        print(f"{epoch + 1}:\ttrain {train_acc:0.2f}%\ttest {test_acc:0.2f}%")

In [18]:
lr = 0.1

def sgd(params, lr, batch_size):
    """Minibatch stochastic gradient descent."""
    for param in params:
        param[:] = param - lr * param.grad / batch_size

def updater(batch_size):
    return sgd([W, b], lr, batch_size)

In [None]:
num_epochs = 100
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, updater)

127.0
136.0
133.0
121.0
119.0
133.0
146.0
122.0
128.0
145.0
127.0
131.0
145.0
127.0
171.0
139.0
136.0
193.0
189.0
185.0
215.0
224.0
212.0
224.0
226.0
227.0
238.0
234.0
226.0
233.0
231.0
234.0
241.0
230.0
241.0
228.0
241.0
234.0
242.0
237.0
229.0
237.0
236.0
241.0
233.0
246.0
208.0
237.0
239.0
236.0
244.0
237.0
245.0
246.0
194.0
1:	train 0.76%	test 0.94%
242.0
228.0
238.0
236.0
242.0
232.0
238.0
242.0
239.0
233.0
238.0
231.0
238.0
243.0
243.0
233.0
239.0
236.0
244.0
236.0
242.0
237.0
240.0
239.0
239.0
239.0
240.0
240.0
242.0
238.0
249.0
236.0
245.0
240.0
237.0
245.0
246.0
238.0
241.0
243.0
232.0
242.0
247.0
240.0
238.0
240.0
210.0
242.0
242.0
241.0
247.0
239.0
247.0
246.0
196.0
2:	train 0.93%	test 0.95%
242.0
243.0
238.0
235.0
238.0
246.0
243.0
237.0
241.0
241.0
230.0
241.0
238.0
237.0
237.0
241.0
239.0
241.0
243.0
247.0
242.0
238.0
241.0
245.0
240.0
241.0
238.0
237.0
239.0
248.0
241.0
246.0
240.0
246.0
248.0
243.0
248.0
246.0
249.0
238.0
245.0
242.0
238.0
237.0
232.0
244.0
218.0
243.0
