In [8]:
import torch as t 
import torchvision
import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pylab as plt
from torch.nn.utils import clip_grad_value_
%matplotlib inline
import pickle
from torchvision import datasets, transforms
import random

In [9]:
device = 'cuda' # cuda or cpu

In [30]:
batch_size = 64
init_log_sigma = -3.0 # логарифм дисперсии вариационного распределения при инициализации
prior_sigma = 0.1 # априорная дисперсия
epoch_num = 50 #количество эпох
lamb = [0, 0.1, 0.5, 1, 5, 10, 100, 1000]
# lam = 1.0 # коэффициент перед дивергенцией
hidden_num = 100 # количество нейронов на скрытом слое
t.manual_seed(42) # задаем значение генератора случайных чисел для повторяемости экспериментов
acc_delete = [] 
filename = 'Hypernet_lowrank_lamhn1' # куда сохранять
lam_hidden_num = 1
start_num = 1
log_lam_low = -2.0
log_lam_high = 2.0
warmup_epochs = 3 
mode = 'lowrank'

In [11]:
# сохранение данных
def save(file):
    outfile = open(filename, 'wb')
    pickle.dump(file, outfile)
    outfile.close()
    
def load(path = filename):
    infile = open(path, 'rb')
    file = pickle.load(infile)
    infile.close()
    return file
    
    

In [12]:
# загрузка данных
train_data = torchvision.datasets.MNIST('./files/', train=True, download=True,
                             transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                                  torchvision.transforms.Lambda(lambda x: x.view(-1))
                              ]))

test_data = torchvision.datasets.MNIST('./files/', train=False, download=True,
                             transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5,), (0.5,)),
                                  torchvision.transforms.Lambda(lambda x: x.view(-1))
                              ]))


train_loader = t.utils.data.DataLoader(train_data, batch_size=batch_size, pin_memory=True )
test_loader = t.utils.data.DataLoader(test_data, batch_size=batch_size)


In [13]:
class LowRankNet(nn.Module):
    def __init__(self, size, hidden, gain_const = 1.0, gain_lamb = 1.0,
                 gain_lowrank = .0001,  act= lambda x: x):    
        nn.Module.__init__(self)        
        self.w = nn.Linear(1, hidden).to(device)
        t.nn.init.xavier_uniform(self.w.weight, gain_lamb)
        # проверка на вектор или матрица
        if isinstance(size, tuple) and len(size) == 2: # если сайз неизменяемый список и его длина 2
            self.in_, self.out_ = size
            self.diagonal = False
        else:
            self.out_ = size
            self.diagonal = True
            
        
        self.one = t.ones(1,device=device) # для упрощения работы с лямбдой. Костыль, можно сделать проще
        self.act = act
        
        if self.diagonal:
            self.w_d = nn.Linear(hidden, self.out_).to(device)
            t.nn.init.xavier_uniform(self.w_d.weight, gain_lowrank)
            # независимая от параметра lambda часть
            self.const = nn.Parameter(t.randn(self.out_, device=device)) 
            
        else:
            self.w_a1 = nn.Linear(hidden, self.in_).to(device)
            t.nn.init.xavier_uniform(self.w_a1.weight, gain_lowrank)
            
            self.w_a2 = nn.Linear(hidden, self.out_).to(device)
            t.nn.init.xavier_uniform(self.w_a2.weight, gain_lowrank)
            
            self.w_b = nn.Linear(hidden, self.out_).to(device)
            t.nn.init.xavier_uniform(self.w_b.weight, gain_lowrank)
            
            self.const = nn.Parameter(t.randn(self.in_, self.out_, device=device)) 
            t.nn.init.xavier_uniform(self.const,  gain_const)
            
            
    def forward(self, lam):
        h = self.act(self.w(self.one * lam))        
        if self.diagonal:
            return self.const + self.w_d(h)
        else:
            a1 = self.w_a1(h)
            a2 = self.w_a2(h)
            b = self.w_b(h)
         
            return self.const +  t.matmul(a1.view(-1, 1), a2.view(1, -1)) + b

        
class LinearApprNet(nn.Module):
    def __init__(self, size,  gain_const = 1.0, gain_const2 = 0.000001,  act= lambda x: x):    
        nn.Module.__init__(self)        
        if isinstance(size, tuple) and len(size) == 2:
            self.in_, self.out_ = size
            self.diagonal = False
        else:
            self.out_ = size
            self.diagonal = True
            
        
        self.one = t.ones(1,device=device) # для упрощения работы с лямбдой. Костыль, можно сделать проще
        self.act = act
        
        if self.diagonal:
            # независимая от параметра lambda часть
            self.const = nn.Parameter(t.randn(self.out_, device=device)) 
            self.const2 = nn.Parameter(t.ones(self.out_, device=device) * gain_const2) 
            
            
        else:
            self.const = nn.Parameter(t.randn(self.in_, self.out_, device=device)) 
            t.nn.init.xavier_uniform(self.const,  gain_const)
            self.const2 = nn.Parameter(t.randn(self.in_, self.out_, device=device)) 
            t.nn.init.xavier_uniform(self.const2,  gain_const2)
            
            
    def forward(self, lam):        
        if self.diagonal:
            return self.const + self.const2 * lam
        else:
            return self.const + self.const2 * lam 
                

In [14]:
# проверка что все работает
# случай вектора
n = LowRankNet(100, 10)
n(100)- n(0)

  


tensor([ 1.2171e-03,  2.7072e-04, -1.7269e-03, -7.0226e-04, -5.9742e-04,
         2.1653e-03, -4.3578e-04,  9.9391e-04, -3.8922e-04,  2.2368e-03,
         1.0792e-03, -7.2050e-04,  1.8463e-03,  3.1313e-03,  3.4079e-03,
        -9.5746e-04, -2.6060e-03, -9.3341e-05,  9.2231e-04, -3.2520e-04,
        -1.9574e-03, -1.0403e-03, -1.1552e-03, -1.9388e-03, -6.0648e-04,
         9.6023e-05,  1.5191e-03, -9.0298e-04,  1.2062e-04, -3.1674e-04,
        -6.9535e-04,  8.0307e-04,  1.7796e-03,  1.4316e-03, -9.2125e-04,
        -1.3005e-03,  2.3808e-03, -2.3137e-04, -2.2714e-03,  5.8860e-04,
        -7.6669e-04, -3.7073e-04, -4.5758e-04,  1.0692e-03,  1.1842e-03,
        -1.2034e-04, -9.3532e-04,  4.8220e-05, -2.8507e-03,  2.6738e-03,
        -1.5985e-03,  6.3312e-04,  1.6755e-03,  1.2929e-03,  4.1199e-04,
         4.4294e-04,  4.3845e-04, -6.0153e-04,  2.2393e-03,  1.4299e-03,
         2.7205e-03, -3.5923e-03, -2.3419e-04, -3.5461e-03, -5.7304e-04,
        -3.5763e-04, -7.5352e-04,  6.7401e-04,  2.1

In [15]:
# проверка что все работает
# случай матрицы
n = LowRankNet((100, 20), 10)
n(100) - n(0)

  


tensor([[-0.0018, -0.0042, -0.0005,  ..., -0.0009, -0.0017,  0.0036],
        [-0.0010, -0.0064, -0.0013,  ..., -0.0002, -0.0046,  0.0019],
        [-0.0017, -0.0045, -0.0008,  ..., -0.0013, -0.0023,  0.0039],
        ...,
        [-0.0017, -0.0044, -0.0009,  ..., -0.0017, -0.0023,  0.0045],
        [-0.0011, -0.0064, -0.0011,  ...,  0.0002, -0.0044,  0.0014],
        [-0.0010, -0.0068, -0.0011,  ...,  0.0006, -0.0047,  0.0008]],
       device='cuda:0', grad_fn=<SubBackward0>)

In [16]:
class VarLayer(nn.Module): # вариационная однослойная сеть
    def __init__(self, in_,  out_,   act=F.relu):         
        nn.Module.__init__(self)                    
        self.mean = nn.Parameter(t.randn(in_, out_, device=device)) # параметры средних
        t.nn.init.xavier_uniform(self.mean) 
        self.log_sigma = nn.Parameter(t.ones(in_, out_, device = device)*init_log_sigma) # логарифм дисперсии
        self.mean_b = nn.Parameter(t.randn(out_, device=device)) # то же самое для свободного коэффициента
        self.log_sigma_b = nn.Parameter(t.ones(out_, device=device) * init_log_sigma)
                
        self.in_ = in_
        self.out_ = out_
        self.act = act
        
    def forward(self,x):
        if self.training: # во время обучения - сэмплируем из нормального распределения
            self.eps_w = t.distributions.Normal(self.mean, t.exp(self.log_sigma))
            self.eps_b = t.distributions.Normal(self.mean_b, t.exp(self.log_sigma_b))
        
            w = self.eps_w.rsample()
            b = self.eps_b.rsample()
             
        else:  # во время контроля - смотрим средние значения параметра        
            w = self.mean 
            b = self.mean_b
            
        # функция активации 
        return self.act(t.matmul(x, w)+b)

    def KLD(self):        
        # подсчет дивергенции
        size = self.in_, self.out_
        out = self.out_
        self.eps_w = t.distributions.Normal(self.mean, t.exp(self.log_sigma))
        self.eps_b = t.distributions.Normal(self.mean_b,  t.exp(self.log_sigma_b))
        self.h_w = t.distributions.Normal(t.zeros(size, device=device), t.ones(size, device=device)*prior_sigma)
        self.h_b = t.distributions.Normal(t.zeros(out, device=device), t.ones(out, device=device)*prior_sigma)                
        k1 = t.distributions.kl_divergence(self.eps_w,self.h_w).sum()        
        k2 = t.distributions.kl_divergence(self.eps_b,self.h_b).sum()        
        return k1+k2
    
class VarLayerLowRank(nn.Module): # вариационная однослойная сеть
    def __init__(self, in_,  out_,   act=F.relu):         
        nn.Module.__init__(self)                    
        self.mean = LowRankNet((in_, out_), lam_hidden_num) # параметры средних            
        self.log_sigma = LowRankNet((in_, out_), lam_hidden_num) # логарифм дисперсии
        self.mean_b = LowRankNet( out_, lam_hidden_num) # то же самое для свободного коэффициента
        self.log_sigma_b = LowRankNet( out_, lam_hidden_num)
     
        self.log_sigma.const.data*= 0 # забьем константу нужными нам значениями
        self.log_sigma.const.data+= init_log_sigma
     
        self.log_sigma_b.const.data*= 0 # забьем константу нужными нам значениями
        self.log_sigma_b.const.data+= init_log_sigma
        
        
                
        self.in_ = in_
        self.out_ = out_
        self.act = act
        
    def forward(self,x, l):
        if self.training: # во время обучения - сэмплируем из нормального распределения
            self.eps_w = t.distributions.Normal(self.mean(l), t.exp(self.log_sigma(l)))
            self.eps_b = t.distributions.Normal(self.mean_b(l), t.exp(self.log_sigma_b(l)))
        
            w = self.eps_w.rsample()
            b = self.eps_b.rsample()
             
        else:  # во время контроля - смотрим средние значения параметра        
            w = self.mean(l) 
            b = self.mean_b(l)
            
        # функция активации 
        return self.act(t.matmul(x, w)+b)

    def KLD(self, l):        
        # подсчет дивергенции
        size = self.in_, self.out_
        out = self.out_
        self.eps_w = t.distributions.Normal(self.mean(l), t.exp(self.log_sigma(l)))
        self.eps_b = t.distributions.Normal(self.mean_b(l),  t.exp(self.log_sigma_b(l)))
        self.h_w = t.distributions.Normal(t.zeros(size, device=device), t.ones(size, device=device)*prior_sigma)
        self.h_b = t.distributions.Normal(t.zeros(out, device=device), t.ones(out, device=device)*prior_sigma)                
        k1 = t.distributions.kl_divergence(self.eps_w,self.h_w).sum()        
        k2 = t.distributions.kl_divergence(self.eps_b,self.h_b).sum()        
        return k1+k2
    

class VarLayerLinearAppr(nn.Module): # вариационная однослойная сеть
    def __init__(self, in_,  out_,   act=F.relu):         
        nn.Module.__init__(self)                    
        self.mean = LinearApprNet((in_, out_)) # параметры средних            
        self.log_sigma = LinearApprNet((in_, out_)) # логарифм дисперсии
        self.mean_b = LinearApprNet( out_) # то же самое для свободного коэффициента
        self.log_sigma_b = LinearApprNet( out_)
     
        self.log_sigma.const.data*= 0 # забьем константу нужными нам значениями
        self.log_sigma.const.data+= init_log_sigma
     
        self.log_sigma_b.const.data*= 0 # забьем константу нужными нам значениями
        self.log_sigma_b.const.data+= init_log_sigma
        
        
                
        self.in_ = in_
        self.out_ = out_
        self.act = act
        
    def forward(self,x, l):
        if self.training: # во время обучения - сэмплируем из нормального распределения
            self.eps_w = t.distributions.Normal(self.mean(l), t.exp(self.log_sigma(l)))
            self.eps_b = t.distributions.Normal(self.mean_b(l), t.exp(self.log_sigma_b(l)))
        
            w = self.eps_w.rsample()
            b = self.eps_b.rsample()
             
        else:  # во время контроля - смотрим средние значения параметра        
            w = self.mean(l) 
            b = self.mean_b(l)
            
        # функция активации 
        return self.act(t.matmul(x, w)+b)

    def KLD(self, l):        
        # подсчет дивергенции
        size = self.in_, self.out_
        out = self.out_
        self.eps_w = t.distributions.Normal(self.mean(l), t.exp(self.log_sigma(l)))
        self.eps_b = t.distributions.Normal(self.mean_b(l),  t.exp(self.log_sigma_b(l)))
        self.h_w = t.distributions.Normal(t.zeros(size, device=device), t.ones(size, device=device)*prior_sigma)
        self.h_b = t.distributions.Normal(t.zeros(out, device=device), t.ones(out, device=device)*prior_sigma)                
        k1 = t.distributions.kl_divergence(self.eps_w,self.h_w).sum()        
        k2 = t.distributions.kl_divergence(self.eps_b,self.h_b).sum()        
        return k1+k2    

In [17]:
l = VarLayerLinearAppr(784, 10)



In [18]:
l.log_sigma_b(100)

tensor([-2.9999, -2.9999, -2.9999, -2.9999, -2.9999, -2.9999, -2.9999, -2.9999,
        -2.9999, -2.9999], device='cuda:0', grad_fn=<AddBackward0>)

In [19]:
class VarSeqNet(nn.Sequential):    
    # класс-обертка на случай, если у нас многослойная нейронная сеть
    def KLD(self, lam = None):
        k = 0
        for l in self: 
            if lam is None:
                k+=l.KLD()
            else:
                k+=l.KLD(lam)
                
        return k
    
    def forward(self, x, lam = None):
        if lam is None:
            for l in self:
                x = l(x)
            return x
        else:
            for l in self:
                x = l(x, lam)
            return x
    


In [20]:
def train_batches(net, loss_fn, optimizer, i, out, out_loss, kld, loss, epoch):
    for id, (x,y) in enumerate(train_loader):  
            id+=1
            if device == 'cuda':
                x = x.cuda()
                y = y.cuda()            
            optimizer.zero_grad() 
            loss[i] = 0 
            #for _ in range(5):
            log_lam = np.random.uniform(low=log_lam_low, high=log_lam_high)
            lam = 10**log_lam 
            #lam = 1.0
            lam_param = lam/10**(log_lam_high) # нормируем вход
            out[i] = net(x, lam_param)
            # правдоподобие должно суммироваться по всей обучающей выборке
            # в случае батчей - она приводится к тому же порядку 
            out_loss[i] = loss_fn(out[i], y)* len(train_data) 
            #if epoch > warmup_epochs:                
            kld[i] =  net.KLD( lam_param) *lam
            #else:
            #    kld[i] =  net.KLD(lam_param) * 0.0


            loss[i] += (out_loss[i]+kld[i])       
            if id %100 == 0:           
                print ("Number of net:",i, loss[i].data, out_loss[i].data, kld[i].data, lam)            
                    
            loss[i].backward()       
            clip_grad_value_(net.parameters(), 1.0) # для стабильности градиента. С этим можно играться
            optimizer.step()

In [21]:
def statistic(net, loss_fn, i, kld, loss, out, out_loss):
    net.eval()  
    kld[i] =  net.KLD(1) 
    loss[i] = kld[i]
    for x,y in test_loader:
         if device == 'cuda':
            x = x.cuda()
            y = y.cuda()          
    out[i] = net(x, 1)   
    out_loss[i] = loss_fn(out[i], y)* len(train_data)
    #  print(out_loss[i])
    # print(loss[i])
    loss[i] += out_loss[i]
    net.train()
    print (loss[i])
    return loss[i]


    

In [22]:
# рассмотрим для примера сеть, состояющую из двух слоев
# второй слой - softmax. По сути для обучения задавать активацию явно не нужно, она забита в nn.CrossEntropyLoss
def init_nets(loss_fn_nets):
    for i in range(start_num):
        if mode == 'lowrank':
            nets.append(VarSeqNet(VarLayerLowRank(784,  hidden_num), VarLayerLowRank(hidden_num, 10, act=lambda x:x)))
        elif mode == 'linear':
            nets.append(VarSeqNet(VarLayerLinearAppr(784,  hidden_num), VarLayerLinearAppr(hidden_num, 10, act=lambda x:x)))
        else:
            raise ValueError('Bad mode')
        optimizer_nets.append(optim.Adam(nets[i].parameters(), lr=0.001))
        loss_fn_nets.append(nn.CrossEntropyLoss())
    loss_graph=[[],[],[]]
    out = [None, None, None]
    out_loss = [None, None, None]
    kld = [None, None, None]
    loss = [None, None, None]
    return out, out_loss, kld, loss, loss_graph

def train_nets(out, out_loss, kld, loss, loss_graph):
    for epoch in range(epoch_num):             
        for i,net in enumerate(nets):
            train_batches(net,loss_fn_nets[i], optimizer_nets[i],i, out, out_loss, kld, loss, epoch)
        print ('end of epoch: ', epoch)   
        for i,net in enumerate(nets):
            print("Number of net:",i)        
            loss_graph[i].append(statistic(net, loss_fn_nets[i], i, kld, loss, out, out_loss))

        

In [23]:
#print(loss_graf)
def graph_loss_func(loss_graph, nets):
    for i,net in enumerate(nets): 
        plt.plot(loss_graph[i])
    plt.ylabel('Loss function')
    plt.xlabel('Number of epoche')
    plt.show()
#print(out_loss)

#graph_loss_func()

In [24]:
def test_acc(out): # точность классификации
    acc = []
    for i,net in enumerate(nets):
        correct = 0
        net.eval()
        for x,y in test_loader:
            if device == 'cuda':
                x = x.cuda()
                y = y.cuda()     
            out[i] = net(x)    
            correct += out[i].argmax(1).eq(y).sum().cpu().numpy()
        acc.append(correct / len(test_data))
    print(sum(acc)/len(acc))   
    return(acc)
#test_acc(out)

In [25]:
# коэффициенты информативности, см. статью practical variational inference
# попробуем удалять параметры первого слоя по этому коэффициенту

def init_coeff(prune_coef, mu, sigma):
    for i,net in enumerate(nets): 
        mu.append(net[0].mean) 
        sigma.append(t.exp(2*net[0].log_sigma))
        prune_coef.append((mu[i]**2/sigma[i]).cpu().detach().numpy())  


In [26]:
# будем удалять по 10% от модели и смотреть качество
def delete_10(acc_delete, prune_coef, mu, sigma, nets, out):
    acc_delete = []
    sorted_coefs = []
    for i, net in enumerate(nets):
        sorted_coefs.append(np.sort(prune_coef[i].flatten()))
    for j in range(10):
        for i,net in enumerate(nets): 
            ids = (prune_coef[i] <= sorted_coefs[i][round(j/10*len(sorted_coefs[i]))]) 
            net[0].mean.data*=(1-t.tensor(ids*1.0, device=device, dtype=t.float))
            print ('nonzero params: ', (abs(net[0].mean)>0).float().mean())
        acc_delete.append(test_acc(out))
    return acc_delete    


In [27]:
def graph(acc_delete, lamb):
    proc = [0,10,20,30,40,50,60,70,80,90]
    plt.rcParams['figure.figsize'] = 12, 12
    for k, lam in enumerate(lamb):
        acc_delete_n = np.array(acc_delete[k])
        plt.plot(proc, np.mean(acc_delete_n, 1), label = 'lambda = {}'.format(str(lam)))
        # откладываем ошибку вокруг среднего, альфа - прозрачность линии
        plt.fill_between(proc, np.mean(acc_delete_n, 1)  + np.std(acc_delete_n, 1) , np.mean(acc_delete_n, 1) - np.std(acc_delete_n, 1) , alpha = 0.5 )
    plt.ylabel('Точность классификации', fontsize = 20)
    plt.xlabel('Процент удаления', fontsize = 20)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.legend(loc='best')
    plt.savefig('1')
    plt.show()

#acc_delete = load('save_array_0.1')    
#graph(acc_delete, lamb)


    

In [28]:
# проверяем, что фокусов тут нет, удаляем оставшиеся 10%\
def delete_last10():
    flag = 0
    for j in range(10):
        for i,net in enumerate(nets): 
            if (flag == 0):
                sorted_coefs = np.sort(prune_coef[i].flatten())
                flag = 1
            ids = (prune_coef[i] <= sorted_coefs[round((0.9+j/100)*len(sorted_coefs))]) 
            net[0].mean.data*=(1-t.tensor(ids*1.0, device=device, dtype=t.float))
            print ('nonzero params: ', (abs(net[0].mean)>0).float().mean())
        (test_acc())
    for i,net in enumerate(nets):
        net[0].mean.data*=0
        print ('nonzero params: ', (abs(net[0].mean)>0).float().mean())
    (test_acc())
    
#delete_last10()    

In [None]:
loss_fn_nets =[]
nets = []
optimizer_nets = []
mu_glob = []
sigma_glob = []
prune_coef_glob = []
init_nets_output =  init_nets(loss_fn_nets)
train_nets(init_nets_output[0], init_nets_output[1], init_nets_output[2], init_nets_output[3], init_nets_output[4])
old_nets = nets[:]

for k,lam in enumerate(lamb):
    for i in range(len(nets)):
        new_net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x))
        for j in range(0, 2): # бежим по слоям
            lam_param = lam / 10**log_lam_high
            new_net[j].mean.data*=0
            new_net[j].mean.data+=old_nets[i][j].mean(lam_param)
            new_net[j].mean_b.data*=0
            new_net[j].mean_b.data+=old_nets[i][j].mean_b(lam_param)
            new_net[j].log_sigma.data*=0
            new_net[j].log_sigma.data+=old_nets[i][j].log_sigma(lam_param)
            new_net[j].log_sigma_b.data*=0
            new_net[j].log_sigma_b.data+=old_nets[i][j].log_sigma_b(lam_param)
            
    nets[i] = new_net
            

        
    acc_delete.append(None)
    init_coeff(prune_coef_glob, mu_glob, sigma_glob)
    acc_delete[k]= delete_10(acc_delete[k], prune_coef_glob, mu_glob, sigma_glob, nets, init_nets_output[0])

    
init_coeff(prune_coef_glob, mu_glob, sigma_glob)    
#graph_loss_func()
graph(acc_delete,lamb)
save(acc_delete)


  


Number of net: 0 tensor(1.6137e+08, device='cuda:0') tensor(1.3356e+08, device='cuda:0') tensor(27804318., device='cuda:0') 32.77495446971793
Number of net: 0 tensor(55984864., device='cuda:0') tensor(54412196., device='cuda:0') tensor(1572666.7500, device='cuda:0') 3.4568149074219034
Number of net: 0 tensor(21751420., device='cuda:0') tensor(19422400., device='cuda:0') tensor(2329019., device='cuda:0') 8.58096363149618
Number of net: 0 tensor(8824769., device='cuda:0') tensor(5582224.5000, device='cuda:0') tensor(3242544.5000, device='cuda:0') 16.319677612708162
Number of net: 0 tensor(5704550., device='cuda:0') tensor(4189776.2500, device='cuda:0') tensor(1514774., device='cuda:0') 8.661383336424572
Number of net: 0 tensor(4099642.5000, device='cuda:0') tensor(3925222.7500, device='cuda:0') tensor(174419.6406, device='cuda:0') 1.1088039190546484
Number of net: 0 tensor(4153422.2500, device='cuda:0') tensor(4095002.7500, device='cuda:0') tensor(58419.4648, device='cuda:0') 0.389209623

end of epoch:  5
Number of net: 0
tensor(21989424., device='cuda:0', grad_fn=<AddBackward0>)
Number of net: 0 tensor(288560.1875, device='cuda:0') tensor(91781.6172, device='cuda:0') tensor(196778.5781, device='cuda:0') 0.6931179937489332
Number of net: 0 tensor(383588., device='cuda:0') tensor(237081.2969, device='cuda:0') tensor(146506.6875, device='cuda:0') 0.5292638079065602
Number of net: 0 tensor(717916.8125, device='cuda:0') tensor(576660.4375, device='cuda:0') tensor(141256.3906, device='cuda:0') 0.514791949659259
Number of net: 0 tensor(2103907., device='cuda:0') tensor(1102609.8750, device='cuda:0') tensor(1001297., device='cuda:0') 3.601693750742778
Number of net: 0 tensor(497183.1875, device='cuda:0') tensor(435105.2812, device='cuda:0') tensor(62077.9062, device='cuda:0') 0.22823561125845612
Number of net: 0 tensor(207177.9844, device='cuda:0') tensor(189898.6875, device='cuda:0') tensor(17279.3027, device='cuda:0') 0.0646409530718608
Number of net: 0 tensor(897225., devic

Number of net: 0 tensor(130104.0547, device='cuda:0') tensor(129125.5156, device='cuda:0') tensor(978.5408, device='cuda:0') 0.010129809479755092
end of epoch:  11
Number of net: 0
tensor(5227215.5000, device='cuda:0', grad_fn=<AddBackward0>)
Number of net: 0 tensor(214722.0781, device='cuda:0') tensor(191025.9219, device='cuda:0') tensor(23696.1621, device='cuda:0') 0.2358912291788127
Number of net: 0 tensor(308990.5000, device='cuda:0') tensor(216964.4219, device='cuda:0') tensor(92026.0859, device='cuda:0') 0.8963489715966227
Number of net: 0 tensor(415396.7188, device='cuda:0') tensor(344602.8750, device='cuda:0') tensor(70793.8359, device='cuda:0') 0.6786749471050655
Number of net: 0 tensor(7440338.5000, device='cuda:0') tensor(1316947.1250, device='cuda:0') tensor(6123391.5000, device='cuda:0') 54.05978446372039
Number of net: 0 tensor(879440.1875, device='cuda:0') tensor(244406.8906, device='cuda:0') tensor(635033.3125, device='cuda:0') 5.920278808293043
Number of net: 0 tensor(

Number of net: 0 tensor(9967566., device='cuda:0') tensor(1207787.5000, device='cuda:0') tensor(8759778., device='cuda:0') 36.918974185218964
Number of net: 0 tensor(3588469.2500, device='cuda:0') tensor(423227.6875, device='cuda:0') tensor(3165241.5000, device='cuda:0') 14.502772196819356
end of epoch:  17
Number of net: 0
tensor(2.8746e+10, device='cuda:0', grad_fn=<AddBackward0>)
Number of net: 0 tensor(1958797.7500, device='cuda:0') tensor(468243.2500, device='cuda:0') tensor(1490554.5000, device='cuda:0') 6.906385171824923
Number of net: 0 tensor(1.5816e+09, device='cuda:0') tensor(6.4318e+08, device='cuda:0') tensor(9.3846e+08, device='cuda:0') 81.81031012831652
Number of net: 0 tensor(1545692.5000, device='cuda:0') tensor(477463.0625, device='cuda:0') tensor(1068229.3750, device='cuda:0') 4.79551098126083
Number of net: 0 tensor(683448.6875, device='cuda:0') tensor(634358.8125, device='cuda:0') tensor(49089.8633, device='cuda:0') 0.21359321264298078
Number of net: 0 tensor(34827

In [257]:
new_net = VarSeqNet(VarLayer(784,  hidden_num), VarLayer(hidden_num, 10, act=lambda x:x))
i = 0 
for j in range(0, 2): # бежим по слоям
    new_net[j].mean.data*=0
    new_net[j].mean.data+=old_nets[i][j].mean(lam)
    new_net[j].mean_b.data*=0
    new_net[j].mean_b.data+=old_nets[i][j].mean_b(lam)
    new_net[j].log_sigma.data*=0
    new_net[j].log_sigma.data+=old_nets[i][j].log_sigma(lam)
    new_net[j].log_sigma_b.data*=0
    new_net[j].log_sigma_b.data+=old_nets[i][j].log_sigma_b(lam)

  """


In [258]:
lam

0

In [243]:
new_net(x)

tensor([[1213.8149,  885.8934, 1215.5417,  883.6002,  768.9736, 1339.3053,
         1235.9641, 1001.3223, 1015.4268, 1144.4662],
        [1113.5548,  825.9999, 1122.1970,  829.0692,  703.5684, 1222.4860,
         1139.6187,  909.0403,  941.6868, 1039.6541],
        [1339.7222, 1000.9510, 1352.3000,  987.5376,  854.2031, 1484.1738,
         1375.6842, 1108.0099, 1130.4620, 1266.8187],
        [1002.4268,  716.7179,  990.4283,  723.5088,  620.0521, 1093.7192,
         1012.1857,  808.2672,  829.2755,  925.5312],
        [1214.7711,  881.5259, 1217.4650,  878.8027,  773.8610, 1341.7554,
         1239.9010,  995.3021, 1017.7711, 1147.7567],
        [1264.0046,  948.8234, 1279.3776,  936.2783,  807.3743, 1401.2827,
         1299.3010, 1048.4663, 1071.0332, 1198.0283],
        [1162.2483,  848.7446, 1162.3492,  849.2804,  742.9947, 1287.5316,
         1187.1021,  956.5825,  976.6001, 1098.8002],
        [1161.8960,  854.9814, 1167.5698,  849.1383,  741.3011, 1291.7340,
         1193.3240,  9