In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import torch.nn.init as init
import torch.nn.functional as F
import visdom
import copy
import torch.nn.utils.prune as prune
from tqdm.notebook import tqdm
import numpy as np
import timeit

# custom librarys (model, parameters...) Lottery_Ticket_Prac/custom/utils.py
import custom.utils as cu

In [2]:
torch.manual_seed(55)
torch.cuda.manual_seed_all(55)
torch.backends.cudnn.enabled = False

In [3]:
# cuda setting
GPU_NUM = 1
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device)

print ('Available devices ', torch.cuda.device_count())
print ('Current cuda device ', torch.cuda.current_device())
print(torch.cuda.get_device_name(device))

print("cpu와 cuda 중 다음 기기로 학습함:", device, '\n')

Available devices  2
Current cuda device  1
GeForce RTX 2080 Ti
cpu와 cuda 중 다음 기기로 학습함: cuda:1 



In [4]:
# set model type
model_type = 'LeNet300'
#model_type = 'Conv6'

best_accu = []

# model, parameter get
param = cu.parameters()

if model_type == 'LeNet300':
    model = cu.LeNet300().to(device)
elif model_type == 'Conv6':
    model = cu.Conv6().to(device)
#elif ...
    
param.type(model_type)    
model_init = copy.deepcopy(model)
criterion = nn.CrossEntropyLoss().to(device)

# parameter check
print('\n'.join("%s: %s" % item for item in param.__dict__.items()))

model_type: LeNet300
lr: 0.0012
epochs: 50
batch_size: 60
weight_decay: 0.0012
iteration: 0
prune_per_c: 1
prune_per_f: 0.2
prune_per_o: 0.1
noi: 12
trainset: Dataset MNIST
    Number of datapoints: 60000
    Root location: ../MNIST_data/
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )
valset: empty
testset: Dataset MNIST
    Number of datapoints: 10000
    Root location: ../MNIST_data/
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )
train_loader: <torch.utils.data.dataloader.DataLoader object at 0x7f711cdcf1d0>
val_loader: empty
test_loader: <torch.utils.data.dataloader.DataLoader object at 0x7f712486ed10>


In [5]:
# visdom setting
vis = visdom.Visdom()
vis.close(env="main")

Tracker_type = "Accuracy_Tracker"
title = model_type + "_" + Tracker_type

# make plot
vis_plt = vis.line(X=torch.Tensor(1).zero_(), Y=torch.Tensor(1).zero_(), 
                    opts=dict(title = title,
                              legend=['100.0'],
                             showlegend=True,
                              xtickmin = 0,
                              xtickmax = 20000,
                              ytickmin = 0.95,
                              ytickmax = 0.99
                             )
                   )

def visdom_plot(loss_plot, num, loss_value, name):
    vis.line(X = num,
            Y = loss_value,
            win = loss_plot,
            name = name,
            update = 'append'
            )

Setting up a new session...


In [6]:
# change parameter for test (class에 직접 접근하여 변경)
param.epochs = 5
param.noi = 5
"""
"""

'\n'

In [7]:
#a = '12345.weight_mask'

In [8]:
#a[:(len(a)-12)]

In [9]:
#cp_mask

In [10]:
# train, test, prune function
def train(model, dataloader, optimizer, criterion, cp_mask):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, label) in enumerate(dataloader):
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, label)
        loss.backward()
        # freeze 0-weight gradient
        """
        if cp_mask:
            i = 0
            for name, p in model.named_parameters():
                if 'weight' in name:
                    p.grad.data *= cp_mask[i]
                    i += 1
        """
        optimizer.step()
        running_loss += loss / len(dataloader)
    return running_loss

def test(model, dataloader, criterion):
    model.eval()
    correct = 0
    total = 0
    test_loss = 0
    with torch.no_grad():
        for data, label in dataloader:
            data, label = data.to(device), label.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            loss = criterion(outputs, label)

            test_loss += loss / len(dataloader)
            total += label.size(0)
            correct += (predicted == label).sum().item()
        # 로더 -> 배치 개수 로더.dataset -> 전체 길이, 
    return (correct/total), test_loss

# prune function
# pruning mask 생성 -> mask 복사 -> init값 복사 -> prune 진행
def weight_init(model1, model2, c_rate, f_rate, o_rate):
    # layer별로 지정된 rate만큼 prune mask 생성
    for name, module in model1.named_modules():
        if isinstance(module, nn.Conv2d):
            prune.l1_unstructured(module, name = 'weight', amount = c_rate)
        if isinstance(module, nn.Linear):
            if name != 'fc3':
                prune.l1_unstructured(module, name = 'weight', amount = f_rate)
            else:
                prune.l1_unstructured(module, name = 'weight', amount = o_rate)
                        
    # mask 복사
    """
    cp_mask = []
    for name, mask in model1.named_buffers():
        cp_mask.append(mask)
    """
    cp_mask = {}
    for name, mask in model1.named_buffers():
        cp_mask[name[:(len(name)-12)]] = mask
    
    # init 값을 model에 복사
    for name, p in model1.named_parameters():
        if 'weight_orig' in name:
            for name2, p2 in model2.named_parameters():
                if name[0:len(name) - 5] in name2:
                    p.data = copy.deepcopy(p2.data)
        if 'bias_orig' in name:
            for name2, p2 in model2.named_parameters():
                if name[0:len(name) - 5] in name2:
                    p.data = copy.deepcopy(p2.data)
                    
    # prune 진행
    for name, module in model1.named_modules():
        if isinstance(module, nn.Conv2d):
            prune.remove(module, name = 'weight')
        elif isinstance(module, nn.Linear):
            prune.remove(module, name = 'weight')            
    # copy된 mask return
    return cp_mask

# weight count function
# dict type ['Layer name' : [all, non_zero, zero, ratio]]
def weight_counter(model):
    layer_weight = {'all.weight':[0, 0, 0, 0]}
    
    for name, p in model.named_parameters():
        if 'weight' in name:
            remain, pruned = (p != 0).sum().item(), (p == 0).sum().item()
            layer_weight[name] = [remain+pruned, remain, pruned, round((remain/(remain+pruned))*100, 2)]
            
    for i in layer_weight.keys():
        for j in range(0, 3):
            layer_weight['all.weight'][j] += layer_weight[i][j]
    layer_weight['all.weight'][3] = round(layer_weight['all.weight'][1]/layer_weight['all.weight'][0]*100, 2)

    print("Layer".center(12), "Weight".center(39), "Ratio(%)".rjust(7), sep='')
    for i in layer_weight.keys():
        print("%s" % i.ljust(13), ":",
              ("%s (%s | %s)" % (layer_weight[i][0], layer_weight[i][1], layer_weight[i][2])).center(36),
              ("%.2f" % layer_weight[i][3]).rjust(7),
              sep=''
             )       
    return layer_weight

# print best accuracy in each iteration
def best_accuracy(best_accu):
    print(best_accu)
    print(len(best_accu))
    print("Maximum accuracy weight remaining")
    for i in range(len(best_accu)):
        print("Remaining weight %.1f %% " % (best_accu[i][0]),
             "Epoch %d" % best_accu[i][1],
             "Accu %.4f" % best_accu[i][2])

In [11]:
#optimizer = optim.Adam(model.parameters(), lr = param.lr, weight_decay = param.weight_decay)

def make_test (i, cp_mask):
   # this i refers to the parameter (which evaluates to the /value/ passed)
   return (lambda grad: grad.mul_(cp_mask[i]))
"""
i = 0
for name, data in model.named_parameters():
    if 'weight' in name:
        hook = model.fc1.weight.register_hook(make_test(i))
        i += 1

optimizer = optim.Adam(model.parameters(), lr = param.lr, weight_decay = param.weight_decay)
"""

print(make_test(i, cp_mask))

optimizer = optim.Adam(model.parameters(), lr = param.lr, weight_decay = param.weight_decay)


cp_mask = weight_init(model, model_init, 
                       (1),
                       (0.5),
                       (0.5)
                      )
a = model.fc1.weight.clone()
#a = model.fc1.weight.grad.mul_(cp_mask[0])
#hook = model.fc1.weight.register_hook(a)

for batch_idx, (data, label) in enumerate(param.train_loader):
        print(model.fc1.weight.grad)
        data, label = data.to(device), label.to(device)
        print(model.fc1.weight.grad)
        optimizer.zero_grad()
        print(model.fc1.weight.grad)
        outputs = model(data)
        print(model.fc1.weight.grad)
        loss = criterion(outputs, label)
        print(model.fc1.weight.grad)
        loss.backward()
        print(model.fc1.weight.grad)
        
        break

print(model.fc1.weight.grad)

optimizer.zero_grad()

module.weight.register_hook(lambda x, name=name: x*mask_dict[name])


i = 0
for name, data in range(model.named_parameters()):
    print(name)
    if 'weight' in name:
        module.weight.register_hook(lambda x, name=name: x*mask_dict[name])
        #a = (lambda grad : grad.mul_(cp_mask[i]))
        #print(a)
        #hook = model.fc1.weight.register_hook(make_test(i, cp_mask))
        i += 1

i = 0
for name, module in model.named_modules():
    print(name)
    i += 1

j = 0
for name, data in (model.named_parameters()):
    if 'weight' in name:
        #module.weight.register_hook(lambda x, name=name: x*mask_dict[name])
        #a = copy.deepcopy(lambda grad : grad.mul_(cp_mask[j]))
        #print(a)
        hook = data.register_hook(lambda grad, name=name : grad.mul_(cp_mask[j]))
        #print(hook)
        j += 1

#def apply_mask(model, mask_dict):
i = 0
for name, module in model.named_modules():
    if 'fc' in name:
        module.weight.data *= cp_mask[i]
        #checking if names of layers and it's tensor shapes match with that of masks
        print('module name is:', name, 'and weight size is:', module.weight.size()) 
        print('corresponding tensor is:', cp_mask[i].shape) #matching shapes for multiplication

        module.weight.register_hook(lambda grad, name=name : grad.mul_(cp_mask[j]))
        i += 1

In [12]:
for i in range(param.noi):
    best_accu.append(0)
    best_accu[i] = [0, 0, 0]
    cp_mask = {}
    

    # pruning 및 mask 복사
    # layer별 prune rate를 입력
    cp_mask = weight_init(model, model_init, 
                           (1 - ((1-param.prune_per_c) ** i)),
                           (1 - ((1-param.prune_per_f) ** i)),
                           (1 - ((1-param.prune_per_o) ** i))
                          )
    print(model.fc1.weight[0][300:325])
    
    if i != 0:
        hook.remove()

    for name, module in model.named_modules():
        if 'fc' in name:
            module.weight.register_hook(lambda grad, name=name : grad.mul_(cp_mask[name]))
    optimizer = optim.Adam(model.parameters(), lr = param.lr, weight_decay = param.weight_decay)
    """
    j = 0
    for name, module in model.named_modules():
        if 'fc' in name:
            module.weight.register_hook(lambda grad, j=j : grad.mul_(cp_mask[j]))
            j += 1
    optimizer = optim.Adam(model.parameters(), lr = param.lr, weight_decay = param.weight_decay)
    """
    """
    j = 0
    for name, data in (model.named_parameters()):
        if 'weight' in name:
            #module.weight.register_hook(lambda x, name=name: x*mask_dict[name])
            #a = copy.deepcopy(lambda grad : grad.mul_(cp_mask[j]))
            #print(a)
            hook = data.register_hook(lambda grad, name=name : grad.mul_(cp_mask[j]))
            #print(hook)
            j += 1
    optimizer = optim.Adam(model.parameters(), lr = param.lr, weight_decay = param.weight_decay)
    """
    
    
    
    """hook = model.fc1.weight.register_hook(lambda grad: grad.mul_(cp_mask[0]))
    hook = model.fc2.weight.register_hook(lambda grad: grad.mul_(cp_mask[1]))
    hook = model.fc3.weight.register_hook(lambda grad: grad.mul_(cp_mask[2]))
    optimizer = optim.Adam(model.parameters(), lr = param.lr, weight_decay = param.weight_decay)"""
    
    
    
    
    # prune 진행 후 남은 weight 수 확인
    weight_counts = weight_counter(model)
    # 총 weight 중 남은 weight의 수 저장 (visdom plot시 사용하기 위함)
    remaining_weight = weight_counts['all.weight'][3]
    # 시작 시간 check
    start_time = timeit.default_timer()
    
    
    print("Learning start!\n")
    for epoch in tqdm(range(param.epochs)):
        # epoch가 0일때 정확도 계산
        if epoch == 0:
            accuracy, test_loss = test(model, param.test_loader, criterion)
            visdom_plot(vis_plt,torch.Tensor([accuracy]), torch.Tensor([0]),
                        str(remaining_weight)
                       )
            print('[epoch : %d]' % (epoch),
             '(r_loss: x.xxxxx)',
             '(t_loss: x.xxxxx)',
             '(accu: %.4f)' % (accuracy)
             )
        # model training    
        running_loss = train(model, param.train_loader, optimizer, criterion, cp_mask)
        
        # val_set이 있을 경우 val_set을 통해 loss, accu를 구한다.
        if param.valset == 'empty':
            accuracy, test_loss = test(model, param.test_loader, criterion)
        else:
            accuracy, test_loss = test(model, param.val_loader, criterion)
        
        # visdom plot (plot window, x-axis, y-axis, label name)
        visdom_plot(vis_plt, torch.Tensor([(epoch+1) * 1000]), torch.Tensor([accuracy]),
                    str(remaining_weight)
                   )
        
        # best accuracy list (weight_remain, epoch, accuracy)
        if best_accu[i][2] <= accuracy:
            best_accu[i] = [remaining_weight, epoch, accuracy]
        
        print('[epoch : %d]' % (epoch+1),
             '(r_loss: %.5f)' % (running_loss),
             '(t_loss: %.5f)' % (test_loss),
             '(accu: %.4f)' % (accuracy)
             )
        
    print("---------------------------------------")
    weight_counts = weight_counter(model)
    print("---------------------------------------")    
    print(model.fc1.weight[0][300:325])
    stop_time = timeit.default_timer()    
    print("Finish!",
          "(Best accu: %.4f)" % best_accu[i][2],
          "(Time taken(sec) : %.2f)" % (stop_time - start_time),
          "\n\n\n\n\n\n\n")

# iteration별 최고 정확도 확인
best_accuracy(best_accu)

tensor([-0.0364, -0.0253,  0.0110, -0.0491, -0.0176, -0.0388, -0.0201,  0.0141,
         0.0521,  0.0196,  0.0637, -0.0456, -0.0173, -0.0687,  0.0146,  0.0409,
        -0.0027,  0.0476, -0.0080,  0.0368,  0.0281,  0.0333, -0.0130,  0.0091,
         0.0487], device='cuda:1', grad_fn=<SliceBackward>)
   Layer                     Weight                Ratio(%)
all.weight   :        266200 (266200 | 0)          100.00
fc1.weight   :        235200 (235200 | 0)          100.00
fc2.weight   :         30000 (30000 | 0)           100.00
fc3.weight   :          1000 (1000 | 0)            100.00
Learning start!



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[epoch : 0] (r_loss: x.xxxxx) (t_loss: x.xxxxx) (accu: 0.0735)
[epoch : 1] (r_loss: 0.20805) (t_loss: 0.14887) (accu: 0.9502)
[epoch : 2] (r_loss: 0.10718) (t_loss: 0.11349) (accu: 0.9627)
[epoch : 3] (r_loss: 0.09354) (t_loss: 0.10031) (accu: 0.9679)
[epoch : 4] (r_loss: 0.08629) (t_loss: 0.09715) (accu: 0.9680)
[epoch : 5] (r_loss: 0.08111) (t_loss: 0.08932) (accu: 0.9716)

---------------------------------------
   Layer                     Weight                Ratio(%)
all.weight   :        266200 (266200 | 0)          100.00
fc1.weight   :        235200 (235200 | 0)          100.00
fc2.weight   :         30000 (30000 | 0)           100.00
fc3.weight   :          1000 (1000 | 0)            100.00
---------------------------------------
tensor([-0.0063,  0.0142, -0.0029,  0.0006,  0.0260,  0.0216,  0.0080,  0.0015,
         0.0015,  0.0009, -0.0002,  0.0003, -0.0036, -0.0059, -0.0121,  0.0129,
         0.0123,  0.0176,  0.0139,  0.0391,  0.0520,  0.0293,  0.0387,  0.0185,
        -

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[epoch : 0] (r_loss: x.xxxxx) (t_loss: x.xxxxx) (accu: 0.1287)
[epoch : 1] (r_loss: 0.19947) (t_loss: 0.11859) (accu: 0.9628)
[epoch : 2] (r_loss: 0.10571) (t_loss: 0.10958) (accu: 0.9642)
[epoch : 3] (r_loss: 0.08939) (t_loss: 0.12707) (accu: 0.9587)
[epoch : 4] (r_loss: 0.08335) (t_loss: 0.09968) (accu: 0.9676)
[epoch : 5] (r_loss: 0.07658) (t_loss: 0.11073) (accu: 0.9639)

---------------------------------------
   Layer                     Weight                Ratio(%)
all.weight   :      266200 (213060 | 53140)         80.04
fc1.weight   :      235200 (188160 | 47040)         80.00
fc2.weight   :        30000 (24000 | 6000)          80.00
fc3.weight   :          1000 (900 | 100)            90.00
---------------------------------------
tensor([ 0.0080, -0.0153, -0.0423, -0.0184, -0.0123,  0.0106,  0.0048,  0.0033,
         0.0033,  0.0020,  0.0000, -0.0000, -0.0113, -0.0175, -0.0063,  0.0486,
         0.0512,  0.0271, -0.0130, -0.0366, -0.0037,  0.0454,  0.0833,  0.0574,
         

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[epoch : 0] (r_loss: x.xxxxx) (t_loss: x.xxxxx) (accu: 0.2045)
[epoch : 1] (r_loss: 0.19382) (t_loss: 0.11184) (accu: 0.9658)
[epoch : 2] (r_loss: 0.10102) (t_loss: 0.10812) (accu: 0.9644)
[epoch : 3] (r_loss: 0.08698) (t_loss: 0.10264) (accu: 0.9686)
[epoch : 4] (r_loss: 0.07906) (t_loss: 0.08797) (accu: 0.9723)
[epoch : 5] (r_loss: 0.07493) (t_loss: 0.09596) (accu: 0.9704)

---------------------------------------
   Layer                     Weight                Ratio(%)
all.weight   :      266200 (170538 | 95662)         64.06
fc1.weight   :      235200 (150528 | 84672)         64.00
fc2.weight   :       30000 (19200 | 10800)          64.00
fc3.weight   :          1000 (810 | 190)            81.00
---------------------------------------
tensor([-1.1500e-03,  8.6840e-04, -5.4905e-05,  2.3534e-04, -1.6003e-03,
        -9.5676e-03,  3.6602e-05, -6.5233e-05, -6.8787e-05, -4.7163e-05,
         0.0000e+00, -0.0000e+00,  1.0154e-03,  7.0820e-04, -1.4696e-03,
         6.1298e-03,  1.0568e-

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[epoch : 0] (r_loss: x.xxxxx) (t_loss: x.xxxxx) (accu: 0.2783)
[epoch : 1] (r_loss: 0.18543) (t_loss: 0.10953) (accu: 0.9651)
[epoch : 2] (r_loss: 0.09727) (t_loss: 0.09330) (accu: 0.9704)
[epoch : 3] (r_loss: 0.08337) (t_loss: 0.09006) (accu: 0.9723)
[epoch : 4] (r_loss: 0.07435) (t_loss: 0.07700) (accu: 0.9768)
[epoch : 5] (r_loss: 0.07193) (t_loss: 0.10080) (accu: 0.9677)

---------------------------------------
   Layer                     Weight                Ratio(%)
all.weight   :      266200 (136511 | 129689)        51.28
fc1.weight   :      235200 (120422 | 114778)        51.20
fc2.weight   :       30000 (15360 | 14640)          51.20
fc3.weight   :          1000 (729 | 271)            72.90
---------------------------------------
tensor([-0.0000, -0.0000,  0.0000, -0.0000,  0.0237,  0.0091, -0.0000,  0.0000,
         0.0000,  0.0000,  0.0000, -0.0000, -0.0000, -0.0000, -0.0030, -0.0075,
        -0.0380, -0.0176, -0.0050, -0.0005,  0.0231,  0.0343,  0.0102, -0.0256,
        -

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[epoch : 0] (r_loss: x.xxxxx) (t_loss: x.xxxxx) (accu: 0.2605)
[epoch : 1] (r_loss: 0.18025) (t_loss: 0.10397) (accu: 0.9687)
[epoch : 2] (r_loss: 0.09443) (t_loss: 0.10736) (accu: 0.9683)
[epoch : 3] (r_loss: 0.08079) (t_loss: 0.08034) (accu: 0.9737)
[epoch : 4] (r_loss: 0.07250) (t_loss: 0.08598) (accu: 0.9726)
[epoch : 5] (r_loss: 0.06974) (t_loss: 0.08804) (accu: 0.9729)

---------------------------------------
   Layer                     Weight                Ratio(%)
all.weight   :      266200 (109282 | 156918)        41.05
fc1.weight   :      235200 (96338 | 138862)         40.96
fc2.weight   :       30000 (12288 | 17712)          40.96
fc3.weight   :          1000 (656 | 344)            65.60
---------------------------------------
tensor([-0.0000, -0.0000,  0.0000, -0.0000,  0.0365,  0.0442, -0.0000,  0.0000,
         0.0000,  0.0000,  0.0000, -0.0000, -0.0000, -0.0000, -0.0310, -0.0110,
         0.0021, -0.0024,  0.0060,  0.0000,  0.0446,  0.0371,  0.0449,  0.0140,
        -

In [13]:
model.fc1.weight[0][300:325]

tensor([-0.0000, -0.0000,  0.0000, -0.0000,  0.0365,  0.0442, -0.0000,  0.0000,
         0.0000,  0.0000,  0.0000, -0.0000, -0.0000, -0.0000, -0.0310, -0.0110,
         0.0021, -0.0024,  0.0060,  0.0000,  0.0446,  0.0371,  0.0449,  0.0140,
        -0.0279], device='cuda:1', grad_fn=<SliceBackward>)

In [14]:
len(cp_mask['fc1'])

300

In [15]:
for i in range (3):
    lambda a : a + 1
    print (a)

NameError: name 'a' is not defined

In [None]:
print(model.fc1.weight[0])