In [1]:
import os

import numpy as np
import torch
import torchvision.datasets as dset
import torch.nn as nn
import torchvision.transforms as transforms

import pyro
import pyro.distributions as dist
import pyro.contrib.examples.util  # patches torchvision
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

In [2]:
#assert pyro.__version__.startswith('1.3.0')
pyro.enable_validation(True)
pyro.distributions.enable_validation(False)
pyro.set_rng_seed(0)
# Enable smoke test - run the notebook cells on CI.
#smoke_test = 'CI' in os.environ

In [3]:
def setup_data_loaders(batch_size=128, use_cuda=False):
    root = './data'
    download = True
    trans = transforms.ToTensor()
    train_set = dset.MNIST(root=root, train=True, transform=trans,
                           download=download)
    test_set = dset.MNIST(root=root, train=False, transform=trans)

    kwargs = {'num_workers': 1, 'pin_memory': use_cuda}
    train_loader = torch.utils.data.DataLoader(dataset=train_set,
        batch_size=batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
        batch_size=batch_size, shuffle=False, **kwargs)
    return train_loader, test_loader

In [4]:
class Decoder(nn.Module):
    def __init__(self, z_dim, hidden_dim):
        super().__init__()
        # setup the two linear transformations used
        self.fc1 = nn.Linear(z_dim, hidden_dim)
        self.fc21 = nn.Linear(hidden_dim, 784)
        # setup the non-linearities
        self.softplus = nn.Softplus()
        self.sigmoid = nn.Sigmoid()

    def forward(self, z):
        # define the forward computation on the latent z
        # first compute the hidden units
        hidden = self.softplus(self.fc1(z))
        # return the parameter for the output Bernoulli
        # each is of size batch_size x 784
        loc_img = self.sigmoid(self.fc21(hidden))
        return loc_img

In [5]:
class Encoder(nn.Module):
    def __init__(self, z_dim, hidden_dim):
        super().__init__()
        # setup the three linear transformations used
        self.fc1 = nn.Linear(784, hidden_dim)
        self.fc21 = nn.Linear(hidden_dim, z_dim)
        self.fc22 = nn.Linear(hidden_dim, z_dim)
        # setup the non-linearities
        self.softplus = nn.Softplus()

    def forward(self, x):
        # define the forward computation on the image x
        # first shape the mini-batch to have pixels in the rightmost dimension
        x = x.reshape(-1, 784)
        # then compute the hidden units
        hidden = self.softplus(self.fc1(x))
        # then return a mean vector and a (positive) square root covariance
        # each of size batch_size x z_dim
        z_loc = self.fc21(hidden)
        z_scale = torch.exp(self.fc22(hidden))
        return z_loc, z_scale

In [6]:
# define the model p(x|z)p(z)
def model(self, x):
    # register PyTorch module `decoder` with Pyro
    pyro.module("decoder", self.decoder)
    with pyro.plate("data", x.shape[0]):
        # setup hyperparameters for prior p(z)
        z_loc = x.new_zeros(torch.Size((x.shape[0], self.z_dim)))
        # z loc torch.Size([256, 50])
        z_scale = x.new_ones(torch.Size((x.shape[0], self.z_dim)))
        # sample from prior (value will be sampled by guide when computing the ELBO)
        z = pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))
        # we sample a Z from a (0, I) normal distribution
        # then we pass it though a nn
        # mu = nn(z)
        # then this mu is used in another dist
        # p(x|z) where z is samples
        # then we sample an x from this
        # the idea is, this nn function learns a distribution
        # that is, what would it be like to sample z from P(z|X)
        # 
        # z shape torch.Size([256, 50])
        # decode the latent code z
        loc_img = self.decoder.forward(z)
        #loc img torch.Size([256, 784])
        # score against actual images
        # bern shape Independent(Bernoulli(probs: torch.Size([256, 784])), 1)
        # 784 is the batch size
        # 256 is the image size
        pyro.sample("obs", dist.Bernoulli(loc_img).to_event(1), obs=x.reshape(-1, 784))

In [7]:
# define the guide (i.e. variational distribution) q(z|x)
def guide(self, x):
    # register PyTorch module `encoder` with Pyro
    pyro.module("encoder", self.encoder)
    with pyro.plate("data", x.shape[0]):
        # use the encoder to get the parameters used to define q(z|x)
        z_loc, z_scale = self.encoder.forward(x)
        # p(z,b) = q(b)mult(i=1 to i=N)q(zi|f(xi))
        
        # given an image, we output a distribution for z
        # then we sample a z. because the guide always gives the
        # approximate posterior, the variational inference
        # sample the latent code z
        pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))

In [8]:
class VAE(nn.Module):
    # by default our latent space is 50-dimensional
    # and we use 400 hidden units
    def __init__(self, z_dim=50, hidden_dim=400, use_cuda=False):
        super().__init__()
        # create the encoder and decoder networks
        self.encoder = Encoder(z_dim, hidden_dim)
        self.decoder = Decoder(z_dim, hidden_dim)

        if use_cuda:
            # calling cuda() here will put all the parameters of
            # the encoder and decoder networks into gpu memory
            self.cuda()
        self.use_cuda = use_cuda
        self.z_dim = z_dim

    # define the model p(x|z)p(z)
    
    
    def model(self, x):
        # register PyTorch module `decoder` with Pyro
        pyro.module("decoder", self.decoder)
        with pyro.plate("data", x.shape[0]):
            # setup hyperparameters for prior p(z)
            z_loc = x.new_zeros(torch.Size((x.shape[0], self.z_dim)))
            z_scale = x.new_ones(torch.Size((x.shape[0], self.z_dim)))
            # sample from prior (value will be sampled by guide when computing the ELBO)
            z = pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))
            # decode the latent code z
            loc_img = self.decoder.forward(z)
            # score against actual images
            # decoder is where the image goes 
            pyro.sample("obs", dist.Bernoulli(loc_img).to_event(1), obs=x.reshape(-1, 784))

    # define the guide (i.e. variational distribution) q(z|x)
    def guide(self, x):
        # register PyTorch module `encoder` with Pyro
        pyro.module("encoder", self.encoder)
        with pyro.plate("data", x.shape[0]):
            # use the encoder to get the parameters used to define q(z|x)
            z_loc, z_scale = self.encoder.forward(x)
            # sample the latent code z
            pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))

    # define a helper function for reconstructing images
    def reconstruct_img(self, x):
        # encode image x
        z_loc, z_scale = self.encoder(x)
        # sample in latent space
        z = dist.Normal(z_loc, z_scale).sample()
        # decode the image (note we don't sample in image space)
        loc_img = self.decoder(z)
        return loc_img

In [9]:
vae = VAE()

optimizer = Adam({"lr": 1.0e-3})

svi = SVI(vae.model, vae.guide, optimizer, loss=Trace_ELBO())

def train(svi, train_loader, use_cuda=False):
    # initialize loss accumulator
    epoch_loss = 0.
    # do a training epoch over each mini-batch x returned
    # by the data loader
    for x, _ in train_loader:
        # if on GPU put mini-batch into CUDA memory
        if use_cuda:
            x = x.cuda()
        # do ELBO gradient and accumulate loss
        epoch_loss += svi.step(x)

    # return epoch loss
    normalizer_train = len(train_loader.dataset)
    total_epoch_loss_train = epoch_loss / normalizer_train
    return total_epoch_loss_train

def evaluate(svi, test_loader, use_cuda=False):
    # initialize loss accumulator
    test_loss = 0.
    # compute the loss over the entire test set
    for x, _ in test_loader:
        # if on GPU put mini-batch into CUDA memory
        if use_cuda:
            x = x.cuda()
        # compute ELBO estimate and accumulate loss
        test_loss += svi.evaluate_loss(x)
    normalizer_test = len(test_loader.dataset)
    total_epoch_loss_test = test_loss / normalizer_test
    return total_epoch_loss_test

In [10]:
LEARNING_RATE = 1.0e-3
USE_CUDA = False
smoke_test = False

# Run only for a single iteration for testing
NUM_EPOCHS = 250 if smoke_test else 250
TEST_FREQUENCY = 5
train_loader, test_loader = setup_data_loaders(batch_size=256, use_cuda=USE_CUDA)
print(len(test_loader))

# clear param store
pyro.clear_param_store()

# setup the VAE
vae = VAE(use_cuda=USE_CUDA)

# setup the optimizer
adam_args = {"lr": LEARNING_RATE}
optimizer = Adam(adam_args)

# setup the inference algorithm
svi = SVI(vae.model, vae.guide, optimizer, loss=Trace_ELBO())

train_elbo = []
test_elbo = []
# training loop
for epoch in range(NUM_EPOCHS):
    total_epoch_loss_train = train(svi, train_loader, use_cuda=USE_CUDA)
    train_elbo.append(-total_epoch_loss_train)
    print("[epoch %03d]  average training loss: %.4f" % (epoch, total_epoch_loss_train))

    if epoch % TEST_FREQUENCY == 0:
        # report test diagnostics
        total_epoch_loss_test = evaluate(svi, test_loader, use_cuda=USE_CUDA)
        test_elbo.append(-total_epoch_loss_test)
        print("[epoch %03d] average test loss: %.4f" % (epoch, total_epoch_loss_test))

40
[epoch 000]  average training loss: 191.0216
[epoch 000] average test loss: 156.0872
[epoch 001]  average training loss: 146.8141
[epoch 002]  average training loss: 133.2540
[epoch 003]  average training loss: 124.6775
[epoch 004]  average training loss: 119.5152
[epoch 005]  average training loss: 116.1240
[epoch 005] average test loss: 113.7908
[epoch 006]  average training loss: 113.7285
[epoch 007]  average training loss: 112.0445
[epoch 008]  average training loss: 110.7292
[epoch 009]  average training loss: 109.7455
[epoch 010]  average training loss: 108.9070
[epoch 010] average test loss: 107.7720
[epoch 011]  average training loss: 108.2513
[epoch 012]  average training loss: 107.6953
[epoch 013]  average training loss: 107.2849
[epoch 014]  average training loss: 106.8870
[epoch 015]  average training loss: 106.4983
[epoch 015] average test loss: 105.9786
[epoch 016]  average training loss: 106.1872
[epoch 017]  average training loss: 105.9363
[epoch 018]  average traini

[epoch 155]  average training loss: 99.9329
[epoch 155] average test loss: 100.3317
[epoch 156]  average training loss: 99.9232
[epoch 157]  average training loss: 99.8928
[epoch 158]  average training loss: 99.8881
[epoch 159]  average training loss: 99.8582
[epoch 160]  average training loss: 99.8505
[epoch 160] average test loss: 100.2576
[epoch 161]  average training loss: 99.8534
[epoch 162]  average training loss: 99.8147
[epoch 163]  average training loss: 99.8133
[epoch 164]  average training loss: 99.7991
[epoch 165]  average training loss: 99.7704
[epoch 165] average test loss: 100.5118
[epoch 166]  average training loss: 99.7751
[epoch 167]  average training loss: 99.7759
[epoch 168]  average training loss: 99.7354
[epoch 169]  average training loss: 99.7310
[epoch 170]  average training loss: 99.7082
[epoch 170] average test loss: 100.2337
[epoch 171]  average training loss: 99.6867
[epoch 172]  average training loss: 99.6605
[epoch 173]  average training loss: 99.6814
[epo

In [11]:
from pyro.nn import PyroSample, PyroModule
from pyro.distributions import Normal, Categorical

class ClassifierBnn(PyroModule):
    
    def __init__(self, num_in = 100, num_hidden = 200, num_out = 10, prior_std = 1.):
        
        # call to father constructor
        super().__init__()
        
        # define prior
        prior = Normal(0, prior_std)
        
        # Define layers
        
        # linear layer 1
        self.linear_layer = PyroModule[torch.nn.Linear](num_in, num_hidden)
        
        # linear alyer parameters as random variables
        self.linear_layer.weights = PyroSample(prior.expand([num_hidden, num_in]).to_event(2))
        self.linear_layer.bias = PyroSample(prior.expand([num_hidden]).to_event(1))
        
        # linear layer 2
        # output dimension is 3 because of the number of classes
        self.output_layer = PyroModule[torch.nn.Linear](num_hidden, num_out)
        
        # linear alyer parameters as random variables****
        self.output_layer.weights = PyroSample(prior.expand([num_out, num_hidden]).to_event(2))
        self.output_layer.bias = PyroSample(prior.expand([num_out]).to_event(1))
        
        # activation function
        #self.activation = torch.nn.functional.softmax()
        
    def forward(self, x, y = None):
            
        # latent variable
        z = self.linear_layer(x)
        z = self.output_layer(z)
        z = torch.nn.functional.log_softmax(z, dim=1)
        # likelihood
        with pyro.plate("data",size = x.shape[0], dim = -1):
            # I think this means each batch is independent            
            # z is the input to the distribution (categorical)
            obs = pyro.sample("obs", Categorical(logits = z), obs=y)
        # return latent variable
        return z

In [12]:
# validate NN

pyro.enable_validation(True)

model = ClassifierBnn()
x, y = next(iter(train_loader))
z_loc, z_scale = vae.encoder(x)
combined_z = torch.cat((z_loc, z_scale), 1)


print(pyro.poutine.trace(model).get_trace(combined_z, y).format_shapes())

         Trace Shapes:            
          Param Sites:            
   linear_layer.weight 200 100    
   output_layer.weight  10 200    
         Sample Sites:            
linear_layer.bias dist       | 200
                 value       | 200
output_layer.bias dist       |  10
                 value       |  10
             data dist       |    
                 value 256   |    
              obs dist 256   |    
                 value 256   |    


In [13]:
pyro.enable_validation(True)
pyro.clear_param_store()
model = ClassifierBnn(num_hidden = 10, prior_std = 1.)

# define guide
from pyro.infer.autoguide import AutoDiagonalNormal
guide = AutoDiagonalNormal(model, init_scale=1e-1)

# define SVI (model for training)
svi = pyro.infer.SVI(model,
                    guide,
                    optim=pyro.optim.ClippedAdam({'lr':1e-3}),
                    # Define conventional ELBO
                     loss=pyro.infer.Trace_ELBO())

In [14]:
from pyro.infer import Predictive
predictive = Predictive(model, guide=guide, num_samples=20)

def predict(x):
    # for a single image, output a mean and sd for category
    yhats = predictive(x)["obs"].double()
    # yhats[0] seems to be integers 0 to 9, len 256
    # prediction for one model, for all items in batch
    # 20, 256
    mean = torch.mean(yhats, axis=0)
    std = torch.std(yhats.float(), 0).numpy()
    # yhats outputs a batch size number of predictions for 20 models
    # yhats seem to be a dictionary of weights
    return mean, std

In [15]:
def evaluate_test(test_loader, encoder):
    accuracy = 0
    for x, y in test_loader:
        z_loc, z_scale = encoder(x)
        combined_z = torch.cat((z_loc, z_scale), 1)
        mean, std = predict(combined_z)
        num_correct_in_batch = torch.sum(torch.eq(mean.int(),y))
        accuracy += num_correct_in_batch.numpy()/len(y)
    return accuracy / (len(test_loader))
        

In [16]:
# Training

num_epochs = 1000

# Define number of epochs
epoch_loss = np.zeros(shape=(num_epochs,))

test_freq = 10
# training
for epoch in range(25):
    i = 0
    for x, y in train_loader:
        i +=1
        # batches of size 256 are being fed in 
        z_loc, z_scale = vae.encoder(x)
        combined_z = torch.cat((z_loc, z_scale), 1)
        loss = svi.step(combined_z, y)
        if i % test_freq == 0:
            test_acc = evaluate_test(test_loader, vae.encoder)
            print("test acc", test_acc)
            print("loss", loss)
            print("mean", mean[0], "y is", y[0])
            print("train acc", accuracy_per_batch)
        mean, std = predict(combined_z)
        accuracy_per_batch = torch.sum(torch.eq(mean.int(),y)).numpy()/len(y)


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


test acc 0.09609375
loss 632.3417816162109
mean tensor(5., dtype=torch.float64) y is tensor(8)
train acc 0.1015625
test acc 0.10517578125
loss 616.0413837432861
mean tensor(3.6500, dtype=torch.float64) y is tensor(5)
train acc 0.09375
test acc 0.10478515625
loss 608.466549873352
mean tensor(4.5000, dtype=torch.float64) y is tensor(9)
train acc 0.09375
test acc 0.10087890625
loss 592.7398691177368
mean tensor(5.5000, dtype=torch.float64) y is tensor(3)
train acc 0.11328125
test acc 0.10458984375
loss 588.482907295227
mean tensor(4.4500, dtype=torch.float64) y is tensor(1)
train acc 0.11328125
test acc 0.103515625
loss 577.2355165481567
mean tensor(5.0500, dtype=torch.float64) y is tensor(6)
train acc 0.09765625
test acc 0.1078125
loss 556.0070905685425
mean tensor(5.0500, dtype=torch.float64) y is tensor(1)
train acc 0.109375
test acc 0.1126953125
loss 552.1666259765625
mean tensor(4.0500, dtype=torch.float64) y is tensor(4)
train acc 0.12890625
test acc 0.1126953125
loss 541.1621026992

test acc 0.55751953125
loss 119.4060320854187
mean tensor(7.0500, dtype=torch.float64) y is tensor(4)
train acc 0.53515625
test acc 0.55263671875
loss 110.86316311359406
mean tensor(4.6000, dtype=torch.float64) y is tensor(4)
train acc 0.55859375
test acc 0.553515625
loss 109.69799137115479
mean tensor(3.9500, dtype=torch.float64) y is tensor(3)
train acc 0.55859375
test acc 0.5560546875
loss 122.8629379272461
mean tensor(4.6500, dtype=torch.float64) y is tensor(3)
train acc 0.57421875
test acc 0.5701171875
loss 112.3459005355835
mean tensor(2.9500, dtype=torch.float64) y is tensor(4)
train acc 0.53125
test acc 0.5716796875
loss 112.2608585357666
mean tensor(2., dtype=torch.float64) y is tensor(6)
train acc 0.5390625
test acc 0.58505859375
loss 108.30894565582275
mean tensor(0.9000, dtype=torch.float64) y is tensor(0)
train acc 0.58203125
test acc 0.5818359375
loss 111.4146671295166
mean tensor(5.2500, dtype=torch.float64) y is tensor(2)
train acc 0.61328125
test acc 0.580078125
loss 1

test acc 0.6755859375
loss 96.33040523529053
mean tensor(0., dtype=torch.float64) y is tensor(7)
train acc 0.6640625
test acc 0.67294921875
loss 84.93203067779541
mean tensor(2.5000, dtype=torch.float64) y is tensor(6)
train acc 0.60546875
test acc 0.67314453125
loss 91.83746993541718
mean tensor(4., dtype=torch.float64) y is tensor(8)
train acc 0.64453125
test acc 0.66845703125
loss 103.55753606557846
mean tensor(2., dtype=torch.float64) y is tensor(8)
train acc 0.671875
test acc 0.67294921875
loss 72.43836784362793
mean tensor(8., dtype=torch.float64) y is tensor(3)
train acc 0.6484375
test acc 0.67548828125
loss 97.56249523162842
mean tensor(1., dtype=torch.float64) y is tensor(5)
train acc 0.67578125
test acc 0.67724609375
loss 76.6351466178894
mean tensor(3.1000, dtype=torch.float64) y is tensor(1)
train acc 0.63671875
test acc 0.67666015625
loss 94.24039816856384
mean tensor(6.8000, dtype=torch.float64) y is tensor(7)
train acc 0.609375
test acc 0.679296875
loss 83.81433773040771

test acc 0.7041015625
loss 69.08839797973633
mean tensor(0.8500, dtype=torch.float64) y is tensor(1)
train acc 0.671875
test acc 0.705078125
loss 95.87399673461914
mean tensor(4., dtype=torch.float64) y is tensor(7)
train acc 0.6328125
test acc 0.70859375
loss 93.88260054588318
mean tensor(1., dtype=torch.float64) y is tensor(8)
train acc 0.6484375
test acc 0.7025390625
loss 78.99757766723633
mean tensor(7., dtype=torch.float64) y is tensor(1)
train acc 0.69140625
test acc 0.70439453125
loss 68.32080459594727
mean tensor(7.6000, dtype=torch.float64) y is tensor(0)
train acc 0.73046875
test acc 0.7169921875
loss 90.9014139175415
mean tensor(4., dtype=torch.float64) y is tensor(3)
train acc 0.7578125
test acc 0.7078125
loss 116.1570143699646
mean tensor(0.5000, dtype=torch.float64) y is tensor(3)
train acc 0.67578125
test acc 0.70107421875
loss 86.52250814437866
mean tensor(9., dtype=torch.float64) y is tensor(1)
train acc 0.7578125
test acc 0.70556640625
loss 63.75248336791992
mean tens

test acc 0.702734375
loss 100.08395195007324
mean tensor(1., dtype=torch.float64) y is tensor(6)
train acc 0.67578125
test acc 0.70849609375
loss 73.46789455413818
mean tensor(2.4000, dtype=torch.float64) y is tensor(9)
train acc 0.65625
test acc 0.71259765625
loss 90.9276533126831
mean tensor(7.1000, dtype=torch.float64) y is tensor(7)
train acc 0.65234375
test acc 0.716015625
loss 84.08003234863281
mean tensor(8.2500, dtype=torch.float64) y is tensor(5)
train acc 0.67578125
test acc 0.72275390625
loss 75.26463508605957
mean tensor(4.2500, dtype=torch.float64) y is tensor(9)
train acc 0.67578125
test acc 0.72275390625
loss 71.55908346176147
mean tensor(2., dtype=torch.float64) y is tensor(3)
train acc 0.69140625
test acc 0.71015625
loss 97.07212162017822
mean tensor(2., dtype=torch.float64) y is tensor(1)
train acc 0.72265625
test acc 0.713671875
loss 98.61316108703613
mean tensor(3.4500, dtype=torch.float64) y is tensor(1)
train acc 0.71875
test acc 0.71396484375
loss 89.688741683959

test acc 0.7181640625
loss 83.18731927871704
mean tensor(4., dtype=torch.float64) y is tensor(1)
train acc 0.6953125
test acc 0.723828125
loss 79.00269031524658
mean tensor(2., dtype=torch.float64) y is tensor(1)
train acc 0.75
test acc 0.73291015625
loss 70.18265056610107
mean tensor(7.5000, dtype=torch.float64) y is tensor(7)
train acc 0.703125
test acc 0.73193359375
loss 79.4325532913208
mean tensor(0., dtype=torch.float64) y is tensor(8)
train acc 0.7109375
test acc 0.73173828125
loss 71.27159690856934
mean tensor(0., dtype=torch.float64) y is tensor(9)
train acc 0.75390625
test acc 0.72509765625
loss 83.1881217956543
mean tensor(7.9000, dtype=torch.float64) y is tensor(1)
train acc 0.703125
test acc 0.72509765625
loss 62.61549186706543
mean tensor(7., dtype=torch.float64) y is tensor(4)
train acc 0.70703125
test acc 0.7193359375
loss 75.17232465744019
mean tensor(8.7500, dtype=torch.float64) y is tensor(0)
train acc 0.69921875
test acc 0.71943359375
loss 97.43040943145752
mean ten

test acc 0.73515625
loss 77.76183891296387
mean tensor(7.2000, dtype=torch.float64) y is tensor(0)
train acc 0.76953125
test acc 0.730078125
loss 96.44160604476929
mean tensor(6., dtype=torch.float64) y is tensor(9)
train acc 0.74609375
test acc 0.7296875
loss 78.185959815979
mean tensor(6.2500, dtype=torch.float64) y is tensor(5)
train acc 0.75390625
test acc 0.7255859375
loss 75.09706211090088
mean tensor(0.5000, dtype=torch.float64) y is tensor(7)
train acc 0.69921875
test acc 0.72666015625
loss 80.95607662200928
mean tensor(5., dtype=torch.float64) y is tensor(3)
train acc 0.66015625
test acc 0.72763671875
loss 88.07360792160034
mean tensor(5., dtype=torch.float64) y is tensor(1)
train acc 0.71484375
test acc 0.733203125
loss 72.94491481781006
mean tensor(5., dtype=torch.float64) y is tensor(1)
train acc 0.69921875
test acc 0.7376953125
loss 59.3075590133667
mean tensor(0., dtype=torch.float64) y is tensor(5)
train acc 0.7109375
test acc 0.731640625
loss 78.73131227493286
mean tens

test acc 0.73212890625
loss 68.95630073547363
mean tensor(4.7500, dtype=torch.float64) y is tensor(0)
train acc 0.7265625
test acc 0.73271484375
loss 84.07904434204102
mean tensor(7., dtype=torch.float64) y is tensor(3)
train acc 0.734375
test acc 0.73515625
loss 73.63449287414551
mean tensor(2., dtype=torch.float64) y is tensor(0)
train acc 0.73828125
test acc 0.74072265625
loss 96.68097114562988
mean tensor(6.6500, dtype=torch.float64) y is tensor(9)
train acc 0.7265625
test acc 0.72724609375
loss 70.44693756103516
mean tensor(7.7000, dtype=torch.float64) y is tensor(8)
train acc 0.71875
test acc 0.73095703125
loss 73.69923830032349
mean tensor(5.0500, dtype=torch.float64) y is tensor(6)
train acc 0.75
test acc 0.73994140625
loss 86.64864611625671
mean tensor(0., dtype=torch.float64) y is tensor(5)
train acc 0.71484375
test acc 0.73662109375
loss 79.24667167663574
mean tensor(5.6000, dtype=torch.float64) y is tensor(1)
train acc 0.73046875
test acc 0.72451171875
loss 71.8776826858520

test acc 0.7408203125
loss 69.76453304290771
mean tensor(8., dtype=torch.float64) y is tensor(7)
train acc 0.71875
test acc 0.741015625
loss 71.40305137634277
mean tensor(3.1500, dtype=torch.float64) y is tensor(6)
train acc 0.7265625
test acc 0.73544921875
loss 84.42366743087769
mean tensor(7.7000, dtype=torch.float64) y is tensor(1)
train acc 0.7109375
test acc 0.74345703125
loss 73.76571083068848
mean tensor(0., dtype=torch.float64) y is tensor(3)
train acc 0.71875
test acc 0.74599609375
loss 69.79293918609619
mean tensor(7.6500, dtype=torch.float64) y is tensor(4)
train acc 0.76953125
test acc 0.729296875
loss 83.04655456542969
mean tensor(8.7000, dtype=torch.float64) y is tensor(1)
train acc 0.7109375
test acc 0.73779296875
loss 87.88827610015869
mean tensor(6., dtype=torch.float64) y is tensor(5)
train acc 0.71484375
test acc 0.7357421875
loss 58.933932304382324
mean tensor(2., dtype=torch.float64) y is tensor(3)
train acc 0.7265625
test acc 0.73095703125
loss 82.6318769454956
me