In [1]:
import torch as t 
import torchvision
import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pylab as plt
from torch.nn.utils import clip_grad_value_
%matplotlib inline
import pickle
from torchvision import datasets, transforms
import random

In [2]:
device = 'cuda' # cuda or cpu

In [44]:
batch_size = 64
init_log_sigma = -3.0 # логарифм дисперсии вариационного распределения при инициализации
prior_sigma = 0.1 # априорная дисперсия
epoch_num = 2 #количество эпох
lamb = [0, 0.1, 0.5, 1, 5, 10, 100, 1000]
# lam = 1.0 # коэффициент перед дивергенцией
hidden_num = 100 # количество нейронов на скрытом слое
t.manual_seed(42) # задаем значение генератора случайных чисел для повторяемости экспериментов
acc_delete = [] 
filename = 'Hypernet_linear_1_sn3' # куда сохранять
lam_hidden_num = 1
start_num = 1
log_lam_low = -2.0
log_lam_high = 2.0
mode = 'linear'

In [24]:
# сохранение данных
def save(file):
    outfile = open(filename, 'wb')
    pickle.dump(file, outfile)
    outfile.close()
    
def load(path = filename):
    infile = open(path, 'rb')
    file = pickle.load(infile)
    infile.close()
    return file
    
    

In [25]:
# загрузка данных
train_data = torchvision.datasets.MNIST('./files/', train=True, download=True,
                             transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                                  torchvision.transforms.Lambda(lambda x: x.view(-1))
                              ]))

test_data = torchvision.datasets.MNIST('./files/', train=False, download=True,
                             transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                                  torchvision.transforms.Lambda(lambda x: x.view(-1))
                              ]))


train_loader = t.utils.data.DataLoader(train_data, batch_size=batch_size, pin_memory=True )
test_loader = t.utils.data.DataLoader(test_data, batch_size=batch_size)


In [26]:
class LowRankNet(nn.Module):
    def __init__(self, size, hidden, gain_const = 1.0, gain_lamb = 1.0,
                 gain_lowrank = .0001,  act= F.relu):    
        nn.Module.__init__(self)        
        self.w = nn.Linear(1, hidden).to(device)
        t.nn.init.xavier_uniform(self.w.weight, gain_lamb)
        # проверка на вектор или матрица
        if isinstance(size, tuple) and len(size) == 2: # если сайз неизменяемый список и его длина 2
            self.in_, self.out_ = size
            self.diagonal = False
        else:
            self.out_ = size
            self.diagonal = True
            
        
        self.one = t.ones(1,device=device) # для упрощения работы с лямбдой. Костыль, можно сделать проще
        self.act = act
        
        if self.diagonal:
            self.w_d = nn.Linear(hidden, self.out_).to(device)
            t.nn.init.xavier_uniform(self.w_d.weight, gain_lowrank)
            # независимая от параметра lambda часть
            self.const = nn.Parameter(t.randn(self.out_, device=device)) 
            
        else:
            self.w_a1 = nn.Linear(hidden, self.in_).to(device)
            t.nn.init.xavier_uniform(self.w_a1.weight, gain_lowrank)
            
            self.w_a2 = nn.Linear(hidden, self.out_).to(device)
            t.nn.init.xavier_uniform(self.w_a2.weight, gain_lowrank)
            
            self.const = nn.Parameter(t.randn(self.in_, self.out_, device=device)) 
            t.nn.init.xavier_uniform(self.const,  gain_const)
            
            
    def forward(self, lam):
        h = self.act(self.w(self.one * lam))        
        if self.diagonal:
            return self.const + self.w_d(h)
        else:
            a1 = self.w_a1(h)
            a2 = self.w_a2(h)
         
            return self.const +  t.matmul(a1.view(-1, 1), a2.view(1, -1))

        
class LinearApprNet(nn.Module):
    def __init__(self, size,  gain_const = 1.0, gain_const2 = 0.000001,  act= lambda x: x):    
        nn.Module.__init__(self)        
        if isinstance(size, tuple) and len(size) == 2:
            self.in_, self.out_ = size
            self.diagonal = False
        else:
            self.out_ = size
            self.diagonal = True
            
        
        self.one = t.ones(1, device=device) # для упрощения работы с лямбдой. Костыль, можно сделать проще
        self.act = act
        
        if self.diagonal:
            # независимая от параметра lambda часть
            self.const = nn.Parameter(t.randn(self.out_, device=device)) 
            self.const2 = nn.Parameter(t.ones(self.out_, device=device) * gain_const2) 
            
            
        else:
            self.const = nn.Parameter(t.randn(self.in_, self.out_, device=device)) 
            t.nn.init.xavier_uniform(self.const,  gain_const)
            self.const2 = nn.Parameter(t.randn(self.in_, self.out_, device=device)) 
            t.nn.init.xavier_uniform(self.const2,  gain_const2)
            
            
    def forward(self, lam):        
        if self.diagonal:
            return self.const + self.const2 * lam
        else:
            return self.const + self.const2 * lam 
                

In [27]:
# проверка что все работает
# случай вектора
n = LowRankNet(100, 10)
n(100)- n(0)

  


tensor([ 1.1579e-03,  4.4358e-04, -3.1865e-04,  3.2437e-04,  9.0420e-05,
         3.2893e-04,  4.0364e-04,  8.0103e-04,  3.8385e-04, -1.4532e-04,
         9.3460e-05, -8.5342e-04, -4.9203e-04,  1.0955e-03,  2.6686e-04,
        -5.0107e-04, -6.9356e-04, -3.5524e-05,  1.4722e-04, -4.9192e-04,
         1.3196e-04,  7.8094e-04, -5.0154e-04,  3.7491e-04, -7.2670e-04,
        -4.2802e-04, -2.1482e-04, -6.1572e-05,  7.6455e-04, -8.5819e-04,
        -5.1409e-04, -3.8001e-04, -2.3258e-04, -5.6469e-04, -4.0627e-04,
        -3.2903e-04,  1.0856e-03, -8.3484e-04, -9.5175e-04,  6.8972e-04,
         1.2767e-04, -1.1337e-03,  3.4571e-04,  6.0070e-04,  5.2279e-04,
        -3.9132e-04, -1.2994e-04, -1.7625e-04,  7.2122e-06, -2.2772e-04,
        -2.9835e-04, -1.7881e-07,  6.0558e-04,  4.9710e-05,  6.5863e-04,
         7.4020e-04,  2.1636e-04,  2.3735e-04,  4.2582e-04,  1.0834e-03,
        -4.0892e-04, -7.6619e-04, -2.2078e-04,  4.9284e-04, -5.7805e-04,
         9.3067e-04,  1.1504e-04,  5.8293e-04, -4.3

In [28]:
# проверка что все работает
# случай матрицы
n = LowRankNet((100, 20), 10)
n(100) - n(0)

  


tensor([[-6.1625e-04,  4.0583e-04, -6.5077e-05,  ...,  3.8743e-07,
          3.6196e-04,  3.4638e-04],
        [ 6.1025e-04, -4.2704e-04, -5.9105e-05,  ..., -2.4839e-04,
         -5.2603e-04, -4.3452e-05],
        [-5.3713e-04,  3.5816e-04, -3.5007e-05,  ...,  4.3936e-05,
          3.4495e-04,  2.4928e-04],
        ...,
        [-6.3423e-04,  4.0828e-04, -1.1307e-04,  ..., -9.2089e-05,
          3.1003e-04,  4.6821e-04],
        [ 6.0985e-04, -4.0704e-04,  3.7774e-05,  ..., -5.3838e-05,
         -3.9433e-04, -2.7824e-04],
        [ 8.3520e-04, -5.5788e-04,  4.9610e-05,  ..., -7.7993e-05,
         -5.4292e-04, -3.7590e-04]], device='cuda:0', grad_fn=<SubBackward0>)

In [29]:
class VarLayer(nn.Module): # вариационная однослойная сеть
    def __init__(self, in_,  out_,   act=F.relu):         
        nn.Module.__init__(self)                    
        self.mean = nn.Parameter(t.randn(in_, out_, device=device)) # параметры средних
        t.nn.init.xavier_uniform(self.mean) 
        self.log_sigma = nn.Parameter(t.ones(in_, out_, device = device)*init_log_sigma) # логарифм дисперсии
        self.mean_b = nn.Parameter(t.randn(out_, device=device)) # то же самое для свободного коэффициента
        self.log_sigma_b = nn.Parameter(t.ones(out_, device=device) * init_log_sigma)
                
        self.in_ = in_
        self.out_ = out_
        self.act = act
        
    def forward(self,x):
        if self.training: # во время обучения - сэмплируем из нормального распределения
            self.eps_w = t.distributions.Normal(self.mean, t.exp(self.log_sigma))
            self.eps_b = t.distributions.Normal(self.mean_b, t.exp(self.log_sigma_b))
        
            w = self.eps_w.rsample()
            b = self.eps_b.rsample()
             
        else:  # во время контроля - смотрим средние значения параметра        
            w = self.mean 
            b = self.mean_b
            
        # функция активации 
        return self.act(t.matmul(x, w)+b)

    def KLD(self):        
        # подсчет дивергенции
        size = self.in_, self.out_
        out = self.out_
        self.eps_w = t.distributions.Normal(self.mean, t.exp(self.log_sigma))
        self.eps_b = t.distributions.Normal(self.mean_b,  t.exp(self.log_sigma_b))
        self.h_w = t.distributions.Normal(t.zeros(size, device=device), t.ones(size, device=device)*prior_sigma)
        self.h_b = t.distributions.Normal(t.zeros(out, device=device), t.ones(out, device=device)*prior_sigma)                
        k1 = t.distributions.kl_divergence(self.eps_w,self.h_w).sum()        
        k2 = t.distributions.kl_divergence(self.eps_b,self.h_b).sum()        
        return k1+k2
    
class VarLayerLowRank(nn.Module): # вариационная однослойная сеть
    def __init__(self, in_,  out_,   act=F.relu):         
        nn.Module.__init__(self)                    
        self.mean = LowRankNet((in_, out_), lam_hidden_num) # параметры средних            
        self.log_sigma = LowRankNet((in_, out_), lam_hidden_num) # логарифм дисперсии
        self.mean_b = LowRankNet( out_, lam_hidden_num) # то же самое для свободного коэффициента
        self.log_sigma_b = LowRankNet( out_, lam_hidden_num)
     
        self.log_sigma.const.data*= 0 # забьем константу нужными нам значениями
        self.log_sigma.const.data+= init_log_sigma
     
        self.log_sigma_b.const.data*= 0 # забьем константу нужными нам значениями
        self.log_sigma_b.const.data+= init_log_sigma
        
        
                
        self.in_ = in_
        self.out_ = out_
        self.act = act
        
    def forward(self,x, l):
        if self.training: # во время обучения - сэмплируем из нормального распределения
            self.eps_w = t.distributions.Normal(self.mean(l), t.exp(self.log_sigma(l)))
            self.eps_b = t.distributions.Normal(self.mean_b(l), t.exp(self.log_sigma_b(l)))
        
            w = self.eps_w.rsample()
            b = self.eps_b.rsample()
             
        else:  # во время контроля - смотрим средние значения параметра        
            w = self.mean(l) 
            b = self.mean_b(l)
            
        # функция активации 
        return self.act(t.matmul(x, w)+b)

    def KLD(self, l):        
        # подсчет дивергенции
        size = self.in_, self.out_
        out = self.out_
        self.eps_w = t.distributions.Normal(self.mean(l), t.exp(self.log_sigma(l)))
        self.eps_b = t.distributions.Normal(self.mean_b(l),  t.exp(self.log_sigma_b(l)))
        self.h_w = t.distributions.Normal(t.zeros(size, device=device), t.ones(size, device=device)*prior_sigma)
        self.h_b = t.distributions.Normal(t.zeros(out, device=device), t.ones(out, device=device)*prior_sigma)                
        k1 = t.distributions.kl_divergence(self.eps_w,self.h_w).sum()        
        k2 = t.distributions.kl_divergence(self.eps_b,self.h_b).sum()        
        return k1+k2
    

class VarLayerLinearAppr(nn.Module): # вариационная однослойная сеть
    def __init__(self, in_,  out_,   act=F.relu):         
        nn.Module.__init__(self)                    
        self.mean = LinearApprNet((in_, out_)) # параметры средних            
        self.log_sigma = LinearApprNet((in_, out_)) # логарифм дисперсии
        self.mean_b = LinearApprNet( out_) # то же самое для свободного коэффициента
        self.log_sigma_b = LinearApprNet( out_)
     
        self.log_sigma.const.data*= 0 # забьем константу нужными нам значениями
        self.log_sigma.const.data+= init_log_sigma
     
        self.log_sigma_b.const.data*= 0 # забьем константу нужными нам значениями
        self.log_sigma_b.const.data+= init_log_sigma
        
        
                
        self.in_ = in_
        self.out_ = out_
        self.act = act
        
    def forward(self, x, l):
        if self.training: # во время обучения - сэмплируем из нормального распределения
            self.eps_w = t.distributions.Normal(self.mean(l), t.exp(self.log_sigma(l)))
            self.eps_b = t.distributions.Normal(self.mean_b(l), t.exp(self.log_sigma_b(l)))
        
            w = self.eps_w.rsample()
            b = self.eps_b.rsample()
             
        else:  # во время контроля - смотрим средние значения параметра        
            w = self.mean(l) 
            b = self.mean_b(l)
            
        # функция активации 
        return self.act(t.matmul(x, w)+b)

    def KLD(self, l):        
        # подсчет дивергенции
        size = self.in_, self.out_
        out = self.out_
        self.eps_w = t.distributions.Normal(self.mean(l), t.exp(self.log_sigma(l)))
        self.eps_b = t.distributions.Normal(self.mean_b(l),  t.exp(self.log_sigma_b(l)))
        self.h_w = t.distributions.Normal(t.zeros(size, device=device), t.ones(size, device=device)*prior_sigma)
        self.h_b = t.distributions.Normal(t.zeros(out, device=device), t.ones(out, device=device)*prior_sigma)                
        k1 = t.distributions.kl_divergence(self.eps_w,self.h_w).sum()        
        k2 = t.distributions.kl_divergence(self.eps_b,self.h_b).sum()        
        return k1+k2    

In [30]:
l = VarLayerLinearAppr(784, 10)



In [31]:
l.log_sigma_b(100)

tensor([-2.9999, -2.9999, -2.9999, -2.9999, -2.9999, -2.9999, -2.9999, -2.9999,
        -2.9999, -2.9999], device='cuda:0', grad_fn=<AddBackward0>)

In [32]:
class VarSeqNet(nn.Sequential):    
    # класс-обертка на случай, если у нас многослойная нейронная сеть
    def KLD(self, lam = None):
        k = 0
        for l in self: 
            if lam is None:
                k+=l.KLD()
            else:
                k+=l.KLD(lam)
                
        return k
    
    def forward(self, x, lam = None):
        if lam is None:
            for l in self:
                x = l(x)
            return x
        else:
            for l in self:
                x = l(x, lam)
            return x
    


In [46]:
def train_batches(net, loss_fn, optimizer, i, out, out_loss, kld, loss, epoch):
    for id, (x,y) in enumerate(train_loader):  
            id+=1
            if device == 'cuda':
                x = x.cuda()
                y = y.cuda()            
            optimizer.zero_grad() 
            loss[i] = 0 
            #for _ in range(5):
            log_lam = np.random.uniform(low=log_lam_low, high=log_lam_high)
            lam = 10**log_lam 
            #lam = 1.0
            lam_param = lam/10**(log_lam_high) # нормируем вход
            out[i] = net(x, lam_param)
            # правдоподобие должно суммироваться по всей обучающей выборке
            # в случае батчей - она приводится к тому же порядку 
            out_loss[i] = loss_fn(out[i], y)* len(train_data)                
            kld[i] =  net.KLD(lam_param) *lam
        


            loss[i] += (out_loss[i]+kld[i])       
            if id %100 == 0:           
                print ("Number of net:",i, loss[i].data, out_loss[i].data, kld[i].data, lam)            
                    
            loss[i].backward()       
            clip_grad_value_(net.parameters(), 1.0) # для стабильности градиента. С этим можно играться
            optimizer.step()

In [34]:
def statistic(net, loss_fn, i, kld, loss, out, out_loss):
    net.eval()  
    kld[i] =  net.KLD(1) 
    loss[i] = kld[i]
    for x,y in test_loader:
         if device == 'cuda':
            x = x.cuda()
            y = y.cuda()          
    out[i] = net(x, 1)   
    out_loss[i] = loss_fn(out[i], y)* len(train_data)
    #  print(out_loss[i])
    # print(loss[i])
    loss[i] += out_loss[i]
    net.train()
    print (loss[i])
    return loss[i]


    

In [35]:
# рассмотрим для примера сеть, состояющую из двух слоев
# второй слой - softmax. По сути для обучения задавать активацию явно не нужно, она забита в nn.CrossEntropyLoss
def init_nets(loss_fn_nets):
    for i in range(start_num):
        if mode == 'lowrank':
            nets.append(VarSeqNet(VarLayerLowRank(784,  hidden_num), VarLayerLowRank(hidden_num, 10, act=lambda x:x)))
        elif mode == 'linear':
            nets.append(VarSeqNet(VarLayerLinearAppr(784,  hidden_num), VarLayerLinearAppr(hidden_num, 10, act=lambda x:x)))
        else:
            raise ValueError('Bad mode')
        optimizer_nets.append(optim.Adam(nets[i].parameters(), lr=0.001))
        loss_fn_nets.append(nn.CrossEntropyLoss())
    loss_graph=[[],[],[]]
    out = [None, None, None]
    out_loss = [None, None, None]
    kld = [None, None, None]
    loss = [None, None, None]
    return out, out_loss, kld, loss, loss_graph

def train_nets(out, out_loss, kld, loss, loss_graph):
    for epoch in range(epoch_num):             
        for i,net in enumerate(nets):
            train_batches(net,loss_fn_nets[i], optimizer_nets[i],i, out, out_loss, kld, loss, epoch)
        print ('end of epoch: ', epoch)   
        for i,net in enumerate(nets):
            print("Number of net:",i)        
            loss_graph[i].append(statistic(net, loss_fn_nets[i], i, kld, loss, out, out_loss))

        

In [36]:
#print(loss_graf)
def graph_loss_func(loss_graph, nets):
    for i,net in enumerate(nets): 
        plt.plot(loss_graph[i])
    plt.ylabel('Loss function')
    plt.xlabel('Number of epoche')
    plt.show()
#print(out_loss)

#graph_loss_func()

In [37]:
def test_acc(out): # точность классификации
    acc = []
    for i,net in enumerate(nets):
        correct = 0
        net.eval()
        for x,y in test_loader:
            if device == 'cuda':
                x = x.cuda()
                y = y.cuda()     
            out[i] = net(x)    
            correct += out[i].argmax(1).eq(y).sum().cpu().numpy()
        acc.append(correct / len(test_data))
    print(sum(acc)/len(acc))   
    return(acc)
#test_acc(out)

In [38]:
# коэффициенты информативности, см. статью practical variational inference
# попробуем удалять параметры первого слоя по этому коэффициенту

def init_coeff(prune_coef, mu, sigma):
    for i,net in enumerate(nets): 
        mu.append(net[0].mean) 
        sigma.append(t.exp(2*net[0].log_sigma))
        prune_coef.append((mu[i]**2/sigma[i]).cpu().detach().numpy())  


In [39]:
# будем удалять по 10% от модели и смотреть качество
def delete_10(acc_delete, prune_coef, mu, sigma, nets, out):
    acc_delete = []
    sorted_coefs = []
    for i, net in enumerate(nets):
        sorted_coefs.append(np.sort(prune_coef[i].flatten()))
    for j in range(10):
        for i,net in enumerate(nets): 
            ids = (prune_coef[i] <= sorted_coefs[i][round(j/10*len(sorted_coefs[i]))]) 
            net[0].mean.data*=(1-t.tensor(ids*1.0, device=device, dtype=t.float))
            print ('nonzero params: ', (abs(net[0].mean)>0).float().mean())
        acc_delete.append(test_acc(out))
    return acc_delete    


In [40]:
def graph(acc_delete, lamb):
    proc = [0,10,20,30,40,50,60,70,80,90]
    plt.rcParams['figure.figsize'] = 12, 12
    for k, lam in enumerate(lamb):
        acc_delete_n = np.array(acc_delete[k])
        plt.plot(proc, np.mean(acc_delete_n, 1), label = 'lambda = {}'.format(str(lam)))
        # откладываем ошибку вокруг среднего, альфа - прозрачность линии
        plt.fill_between(proc, np.mean(acc_delete_n, 1)  + np.std(acc_delete_n, 1) , np.mean(acc_delete_n, 1) - np.std(acc_delete_n, 1) , alpha = 0.5 )
    plt.ylabel('Точность классификации', fontsize = 20)
    plt.xlabel('Процент удаления', fontsize = 20)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.legend(loc='best')
    plt.savefig('Linear_1_sm3')
    plt.show()

#acc_delete = load('save_array_0.1')    
#graph(acc_delete, lamb)


    

In [41]:
# проверяем, что фокусов тут нет, удаляем оставшиеся 10%\
def delete_last10():
    flag = 0
    for j in range(10):
        for i,net in enumerate(nets): 
            if (flag == 0):
                sorted_coefs = np.sort(prune_coef[i].flatten())
                flag = 1
            ids = (prune_coef[i] <= sorted_coefs[round((0.9+j/100)*len(sorted_coefs))]) 
            net[0].mean.data*=(1-t.tensor(ids*1.0, device=device, dtype=t.float))
            print ('nonzero params: ', (abs(net[0].mean)>0).float().mean())
        (test_acc())
    for i,net in enumerate(nets):
        net[0].mean.data*=0
        print ('nonzero params: ', (abs(net[0].mean)>0).float().mean())
    (test_acc())
    
#delete_last10()    

In [47]:
loss_fn_nets =[]
nets = []
optimizer_nets = []
mu_glob = []
sigma_glob = []
prune_coef_glob = []
init_nets_output =  init_nets(loss_fn_nets)
train_nets(init_nets_output[0], init_nets_output[1], init_nets_output[2], init_nets_output[3], init_nets_output[4])
old_nets = nets[:]

for k,lam in enumerate(lamb):
    for i in enumerate(nets):
        new_net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x))
        for j in range(0, 2): # бежим по слоям
            lam_param = lam / 10**log_lam_high
            new_net[j].mean.data*=0
            new_net[j].mean.data+=old_nets[i][j].mean(lam_param)
            new_net[j].mean_b.data*=0
            new_net[j].mean_b.data+=old_nets[i][j].mean_b(lam_param)
            new_net[j].log_sigma.data*=0
            new_net[j].log_sigma.data+=old_nets[i][j].log_sigma(lam_param)
            new_net[j].log_sigma_b.data*=0
            new_net[j].log_sigma_b.data+=old_nets[i][j].log_sigma_b(lam_param)
            
    nets[i] = new_net        
    acc_delete.append(None)
    init_coeff(prune_coef_glob, mu_glob, sigma_glob)
    acc_delete[k]= delete_10(acc_delete[k], prune_coef_glob, mu_glob, sigma_glob, nets, init_nets_output[0])

    
init_coeff(prune_coef_glob, mu_glob, sigma_glob)    
#graph_loss_func()
graph(acc_delete,lamb)
save(acc_delete)




Number of net: 0 tensor(500917.2500, device='cuda:0') tensor(59102.7148, device='cuda:0') tensor(441814.5312, device='cuda:0') 12.89463822153301
Number of net: 0 tensor(1926606.7500, device='cuda:0') tensor(106888.0625, device='cuda:0') tensor(1819718.6250, device='cuda:0') 59.30711073278689
Number of net: 0 tensor(369872.2188, device='cuda:0') tensor(36620.9922, device='cuda:0') tensor(333251.2188, device='cuda:0') 11.148246522463536
Number of net: 0 tensor(1752076.2500, device='cuda:0') tensor(39810.8828, device='cuda:0') tensor(1712265.3750, device='cuda:0') 71.3160947476965
Number of net: 0 tensor(125515.5781, device='cuda:0') tensor(71420.8359, device='cuda:0') tensor(54094.7461, device='cuda:0') 2.033441577886245
Number of net: 0 tensor(1364090.2500, device='cuda:0') tensor(76970.9766, device='cuda:0') tensor(1287119.2500, device='cuda:0') 69.05615273325493
Number of net: 0 tensor(450298.6875, device='cuda:0') tensor(45138.7539, device='cuda:0') tensor(405159.9375, device='cuda:0

  """


TypeError: list indices must be integers or slices, not tuple

In [257]:
new_net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x))
i = 0 
for j in range(0, 2): # бежим по слоям
    new_net[j].mean.data*=0
    new_net[j].mean.data+=old_nets[i][j].mean(lam)
    new_net[j].mean_b.data*=0
    new_net[j].mean_b.data+=old_nets[i][j].mean_b(lam)
    new_net[j].log_sigma.data*=0
    new_net[j].log_sigma.data+=old_nets[i][j].log_sigma(lam)
    new_net[j].log_sigma_b.data*=0
    new_net[j].log_sigma_b.data+=old_nets[i][j].log_sigma_b(lam)

  """


In [258]:
(0, VarSeqNet(
  (0): VarLayerLinearAppr(
    (mean): LinearApprNet()
    (log_sigma): LinearApprNet()
    (mean_b): LinearApprNet()
    (log_sigma_b): LinearApprNet()
  )
  (1): VarLayerLinearAppr(
    (mean): LinearApprNet()
    (log_sigma): LinearApprNet()
    (mean_b): LinearApprNet()
    (log_sigma_b): LinearApprNet()
  )
)) 0

0

In [243]:
new_net(x)

tensor([[1213.8149,  885.8934, 1215.5417,  883.6002,  768.9736, 1339.3053,
         1235.9641, 1001.3223, 1015.4268, 1144.4662],
        [1113.5548,  825.9999, 1122.1970,  829.0692,  703.5684, 1222.4860,
         1139.6187,  909.0403,  941.6868, 1039.6541],
        [1339.7222, 1000.9510, 1352.3000,  987.5376,  854.2031, 1484.1738,
         1375.6842, 1108.0099, 1130.4620, 1266.8187],
        [1002.4268,  716.7179,  990.4283,  723.5088,  620.0521, 1093.7192,
         1012.1857,  808.2672,  829.2755,  925.5312],
        [1214.7711,  881.5259, 1217.4650,  878.8027,  773.8610, 1341.7554,
         1239.9010,  995.3021, 1017.7711, 1147.7567],
        [1264.0046,  948.8234, 1279.3776,  936.2783,  807.3743, 1401.2827,
         1299.3010, 1048.4663, 1071.0332, 1198.0283],
        [1162.2483,  848.7446, 1162.3492,  849.2804,  742.9947, 1287.5316,
         1187.1021,  956.5825,  976.6001, 1098.8002],
        [1161.8960,  854.9814, 1167.5698,  849.1383,  741.3011, 1291.7340,
         1193.3240,  9