In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import torch.nn.init as init
import torch.nn.functional as F
import visdom
import copy
import torch.nn.utils.prune as prune
from tqdm.notebook import tqdm
import numpy as np
import timeit

# custom librarys
# model, parameters
import custom.utils as cu

In [2]:
torch.manual_seed(55)
torch.cuda.manual_seed_all(55)
torch.backends.cudnn.enabled = False

In [3]:
GPU_NUM = 1
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device)

print ('Available devices ', torch.cuda.device_count())
print ('Current cuda device ', torch.cuda.current_device())
print(torch.cuda.get_device_name(device))

print("cpu와 cuda 중 다음 기기로 학습함:", device, '\n')

Available devices  2
Current cuda device  1
GeForce RTX 2080 Ti
cpu와 cuda 중 다음 기기로 학습함: cuda:1 



In [4]:
# visdom setting
vis = visdom.Visdom()
vis.close(env="main")

# make plot
vis_plt = vis.line(X=torch.Tensor(1).zero_(), Y=torch.Tensor(1).zero_(), 
                    opts=dict(title = 'LeNet300_Accuracy_Tracker',
                              legend=['100'],
                             showlegend=True,
                              xtickmin = 0,
                              xtickmax = 20000,
                              ytickmin = 0.95,
                              ytickmax = 0.99
                             )
                   )

def visdom_plot(loss_plot, loss_value, num, name):
    vis.line(X = num,
            Y = loss_value,
            win = loss_plot,
            name = name,
            update = 'append'
            )

Setting up a new session...


In [None]:

optimizer = optim.SGD(model.parameters(), lr=1.0)
criterion = nn.CrossEntropyLoss()

batch_size = 10
x = torch.randn(batch_size, 2)
target = torch.randint(0, 2, (batch_size,))

# Get weight before training
w0 = model[0].weight.detach().clone()

# Single training iteration
optimizer.zero_grad()
output = model(x)
loss = criterion(output, target)
loss.backward()
print('Gradient: ', model[0].weight.grad)
optimizer.step()

# Compare weight update
w1 = model[0].weight.detach().clone()
print('Weights updated ', w0!=w1)

In [7]:
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 2)
)

In [8]:
# Create Gradient mask
gradient_mask = torch.zeros(2, 2)
gradient_mask[0, 0] = 1.0
model[0].weight.register_hook(lambda grad: grad.mul_(gradient_mask))

<torch.utils.hooks.RemovableHandle at 0x7f3696e5e5d0>

In [9]:
gradient_mask

tensor([[1., 0.],
        [0., 0.]])

In [14]:
model.state_dict().keys()

odict_keys(['0.weight', '0.bias', '2.weight', '2.bias'])

In [10]:
optimizer = optim.SGD(model.parameters(), lr=1.0)
criterion = nn.CrossEntropyLoss()

batch_size = 10
x = torch.randn(batch_size, 2)
target = torch.randint(0, 2, (batch_size,))

# Get weight before training
w0 = model[0].weight.detach().clone()

# Single training iteration
optimizer.zero_grad()
output = model(x)
loss = criterion(output, target)
loss.backward()
print('Gradient: ', model[0].weight.grad)
optimizer.step()

# Compare weight update
w1 = model[0].weight.detach().clone()
print('Weights updated ', w0!=w1)

Gradient:  tensor([[-0.0006, -0.0000],
        [-0.0000,  0.0000]])
Weights updated  tensor([[ True, False],
        [False, False]])


In [21]:
model[2].weight.grad[0] * [0, 0]

TypeError: only integer tensors of a single element can be converted to an index

In [5]:
model_type = 'Conv6'

In [6]:
#switch = 0
best_accu = []

In [7]:
param = cu.parameters()

In [8]:
param.__dict__

{'lr': 'empty',
 'epochs': 'empty',
 'batch_size': 'empty',
 'weight_decay': 'empty',
 'iteration': 'empty',
 'remaining_weight_conv': 'empty',
 'remaining_weight_fc': 'empty',
 'remaining_weight_ffc': 'empty',
 'prune_per_c': 'empty',
 'prune_per_f': 'empty',
 'prune_per_o': 'empty',
 'noi': 'empty',
 'trainset': 'empty',
 'valset': 'empty',
 'testset': 'empty',
 'train_loader': 'empty',
 'val_loader': 'empty',
 'test_loader': 'empty'}

In [9]:
param.type(model_type)

In [10]:
param.__dict__

{'lr': 0.0003,
 'epochs': 50,
 'batch_size': 60,
 'weight_decay': 0.003,
 'iteration': 0,
 'remaining_weight_conv': 'empty',
 'remaining_weight_fc': 'empty',
 'remaining_weight_ffc': 'empty',
 'prune_per_c': 0.15,
 'prune_per_f': 0.2,
 'prune_per_o': 0.1,
 'noi': 12,
 'trainset': Dataset CIFAR10
     Number of datapoints: 50000
     Root location: ../CIFAR10/
     Split: Train
     StandardTransform
 Transform: Compose(
                ToTensor()
                Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.247, 0.243, 0.261))
            ),
 'valset': Dataset CIFAR10
     Number of datapoints: 50000
     Root location: ../CIFAR10/
     Split: Train
     StandardTransform
 Transform: Compose(
                ToTensor()
                Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.247, 0.243, 0.261))
            ),
 'testset': Dataset CIFAR10
     Number of datapoints: 10000
     Root location: ../CIFAR10/
     Split: Test
     StandardTransform
 Transform: Compose(
                

In [11]:
param.epochs = 5

In [12]:
param.epochs

5

# parameter
lr = 0.0012
#epochs = 50
#epochs = 20
epochs = 30
batch_size = 60
weight_decay = 1.2e-3
iteration = 0
remaining_weight = 1
prune_per = 0.2
# number of iteration
noi = 11

switch = 0
best_accu = []
# 마지막 layer의 Pruning rate는 기존의 1/2
# prune_per_ll = prune_per/2

cp_mask

transforms = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])

mnist_train = dsets.MNIST(root='../MNIST_data/',
                         train=True,
                         transform=transforms,
                         download=True)
mnist_test = dsets.MNIST(root='../MNIST_data/',
                        train=False,
                        transform=transforms,
                        download=True)

train_loader = torch.utils.data.DataLoader(dataset=mnist_train,
                                         batch_size=batch_size,
                                         shuffle=True,
                                         drop_last=True)
test_loader = torch.utils.data.DataLoader(dataset=mnist_test,
                                         shuffle=False,
                                         drop_last=True)

In [13]:
# train, test, prune function
def train(model, dataloader, optimizer, criterion, cp_mask):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, label) in enumerate(dataloader):
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, label)
        loss.backward()
        # 0-weight 학습 방지
        if cp_mask:
            i = 0
            for name, p in model.named_parameters():
                if 'weight' in name:
                    p.grad.data *= cp_mask[i]
                    i += 1
        optimizer.step()
        running_loss += loss / len(dataloader)
    return running_loss

def test(model, dataloader, criterion):
    model.eval()
    correct = 0
    test_loss = 0
    with torch.no_grad():
        for data, label in dataloader:
            data, label = data.to(device), label.to(device)
            outputs = model(data)
            #test_loss += F.nll_loss(outputs, label, reduction='sum').item() # sum up batch loss
            loss = criterion(outputs, label)
            predicted = outputs.data.max(1, keepdim=True)[1]
            correct += predicted.eq(label.data.view_as(predicted)).sum().item()
            
            test_loss += loss / len(dataloader)
        accuracy =  correct / len(dataloader)
        # 로더 -> 배치 개수 로더.dataset -> 전체 길이, 
    return accuracy, test_loss

# prune function
# pruning mask 생성 -> mask 복사 -> init값 복사 -> prune 진행
def weight_init(model1, model2, c_rate, f_rate, o_rate):
    # prune mask 생성
    for name, module in model1.named_modules():
        if isinstance(module, nn.Conv2d):
            prune.l1_unstructured(module, name = 'weight', amount = c_rate)
        if isinstance(module, nn.Linear):
            # bottle neck 방지
            if name != 'fc3':
                prune.l1_unstructured(module, name = 'weight', amount = f_rate)
                break
            else:
                prune.l1_unstructured(module, name = 'weight', amount = o_rate)
                        
    # mask 복사
    cp_mask = []
    for name, mask in model1.named_buffers():
        cp_mask.append(mask)
    
    # init 값을 model에 복사
    for name, p in model1.named_parameters():
        if 'weight_orig' in name:
            for name2, p2 in model2.named_parameters():
                if name[0:len(name) - 5] in name2:
                    p.data = copy.deepcopy(p2.data)
                    break
        if 'bias_orig' in name:
            for name2, p2 in model2.named_parameters():
                if name[0:len(name) - 5] in name2:
                    p.data = copy.deepcopy(p2.data)
                    break
                    
    # prune 진행
    for name, module in model1.named_modules():
        
        if isinstance(module, nn.Conv2d):
            prune.remove(module, name = 'weight')
        elif isinstance(module, nn.Linear):
            prune.remove(module, name = 'weight')
    # copy된 mask return
    return cp_mask

In [14]:
print(len(param.train_loader), len(param.val_loader))

750 83


def test(model, dataloader, criterion):
    model.eval()
    correct = 0.0
    total = 0.0
    with torch.no_grad():
        for data, label in dataloader:
            data, label = data.to(device), label.to(device)
            outputs = model(data)
            
            predicted = torch.argmax(outputs.data, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()
            accuracy = (correct/total)

    return accuracy

In [15]:
model = cu.Conv6().to(device)
model_init = copy.deepcopy(model)

criterion = nn.CrossEntropyLoss().to(device)
#optimizer = optim.Adam(model.parameters(), lr = lr, weight_decay = 1.2e-3)

In [16]:
#EPS = 1e-6
# number of weight
a = ((model.fc1.weight != 0).sum(dim=1)).sum(dim=0) + ((model.fc2.weight != 0).sum(dim=1)).sum(dim=0) + ((model.fc3.weight != 0).sum(dim=1)).sum(dim=0)
#b = ((model.fc1.weight == 0).sum(dim=1)).sum(dim=0) + ((model.fc2.weight == 0).sum(dim=1)).sum(dim=0) + ((model.fc3.weight == 0).sum(dim=1)).sum(dim=0)
b = ((model.fc1.weight == 0).sum(dim=1)).sum(dim=0) + ((model.fc2.weight == 0).sum(dim=1)).sum(dim=0) + ((model.fc3.weight == 0).sum(dim=1)).sum(dim=0)

now = (a + b)

In [17]:
def calc_now(model):
    fc1_1 = ((model.fc1.weight != 0).sum(dim=1)).sum(dim=0).item()
    fc1_0 = ((model.fc1.weight == 0).sum(dim=1)).sum(dim=0).item()
    fc1 = fc1_1 + fc1_0
    fc1_p = fc1_0 / fc1_1
    fc2_1 = ((model.fc2.weight != 0).sum(dim=1)).sum(dim=0).item()
    fc2_0 = ((model.fc2.weight == 0).sum(dim=1)).sum(dim=0).item()
    fc2 = fc2_1 + fc2_0
    fc3_1 = ((model.fc3.weight != 0).sum(dim=1)).sum(dim=0).item()
    fc3_0 = ((model.fc3.weight == 0).sum(dim=1)).sum(dim=0).item()
    fc3 = fc3_1 + fc3_0
    #print(fc1, fc2, fc3, fc1+fc2+fc3, fc1_1 + fc2_1 + fc3_1 ,fc1_0 + fc2_0 + fc3_0)
    print("Remaining weight %.1f %%" %(((fc1_1+fc2_1+fc3_1)/(fc1+fc2+fc3))*100))
    print('total weight :',
        '%d' % (fc1+fc2+fc3),
         '(%d |' % (fc1_1+fc2_1+fc3_1),
         '%d)' % (fc1_0+fc2_0+fc3_0)
         )
    print('fc1 :',
        '%d' % fc1,
         '(%d |' % fc1_1,
         '%d)' % fc1_0
         )
    print('fc2 :',
        '%d' % fc2,
         '(%d |' % fc2_1,
         '%d)' % fc2_0
         )
    print('fc3 :',
        '%d' % fc3,
         '(%d |' % fc3_1,
         '%d)' % fc3_0
         )

a

b

weight_init(model, model_init, 1 - weight_remaining)

weight_init(model, model_init, 1 - weight_remaining)

model.state_dict().keys()

print(model.fc3.weight_orig)

for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        prune.l1_unstructured(module, name = 'weight', amount = 0.9)

    # init 값 복사
for name, p in model.named_parameters():
     if 'weight_orig' in name:
        for name2, p2 in model_init.named_parameters():
            if name[0:len(name) - 5] in name2:
                p.data = copy.deepcopy(p2.data)
                break
    if 'bias_orig' in name:
        for name2, p2 in modelinit.named_parameters():
            if name[0:len(name) - 5] in name2:
                p.data = copy.deepcopy(p2.data)
                break

for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        prune.remove(module, name = 'weight')

print(model.fc3.weight[0][0])

print(model_init.fc3.weight[0][0])

In [None]:
for i in range(param.noi):
    best_accu.append(0)
    best_accu[i] = [0, 0, 0]
    cp_mask = []
    remaining_weight = param.remaining_weight_f
    if i != 0:
        # x1 = 1 * (1-0.2)
        # x2 = 1 * (1-0.2) * (1-0.2)
        # ...
        # xn = 1 * (1-0.2) ** n -> 남은 weight
        # pruning weight 1 - (1-0.2)**n
        
        # 필요한 값은 pruning weight 
        # c = conv f = fc o = output layer
        param.remaining_weight_c = (1-param.prune_per_c) ** i
        param.remaining_weight_f = (1-param.prune_per_f) ** i
        param.remaining_weight_o = (1-param.prune_per_o) ** i
        #remaining_weight = param.remaining_weight_f
        #1- 남은 웨이트 -> prune 할 비율
        # pruning 및 mask 복사
        cp_mask = weight_init(model, model_init,
                              1 - param.remaining_weight_c,
                              1 - param.remaining_weight_f,
                              1 - param.remaining_weight_o
                             )
        #switch = 1
    optimizer = optim.Adam(model.parameters(), lr = param.lr, weight_decay = param.weight_decay)
    print("Learning start!\n")
    calc_now(model)
    
    print(model.fc3.weight[0])
    
    start_time = timeit.default_timer()
    #pw = ((model.fc1.weight == 0).sum(dim=1)).sum(dim=0) + ((model.fc2.weight == 0).sum(dim=1)).sum(dim=0) + ((model.fc3.weight == 0).sum(dim=1)).sum(dim=0)
    #print('pruned weight (All | Pruned) %d |' % now,'%d' % pw)
    #print(model.fc3.weight[0][0])
    #print(model_init.fc3.weight[0][0])
    
    
    
    
    for epoch in tqdm(range(param.epochs)):
        # epoch가 0일때 정확도 계산
        if epoch == 0:
            accuracy, test_loss = test(model, param.test_loader, criterion)
            visdom_plot(vis_plt, torch.Tensor([accuracy]), torch.Tensor([0]),
                        str(round(remaining_weight*100, 1))
                       )
            print('[epoch : %d]' % (epoch),
             '(loss: x.xxxxx)',
             '(accu: %.4f)' % (accuracy)
             )
        # model training    
        running_loss = train(model, param.train_loader, optimizer, criterion, cp_mask)
        # val_set이 있을 경우 val_set을 통해 loss, accu를 구한다.
        if param.valset == 'empty':
            accuracy, test_loss = test(model, param.test_loader, criterion)
        else:
            accuracy, test_loss = test(model, param.val_loader, criterion)
        
        # visdom plot
        visdom_plot(vis_plt, torch.Tensor([accuracy]), torch.Tensor([(epoch+1) * 1000]),
                    str(round(remaining_weight*100, 1))
                   )
        
        # best accuracy list (weight_remain, epoch, accuracy)
        if best_accu[i][2] <= accuracy:
            best_accu[i] = [remaining_weight, epoch, accuracy]
        
        print('[epoch : %d]' % (epoch+1),
             '(r_loss: %.5f)' % (running_loss),
             '(t_loss: %.5f)' % (test_loss),
             '(accu: %.4f)' % (accuracy)
             )
    stop_time = timeit.default_timer()
    #print(model.fc3.weight[0][0])
    #print(model_init.fc3.weight[0][0])
    
    print(model.fc3.weight[0])
    
    print("Finish!",
          "(Best accu: %.4f)" % best_accu[i][2],
          "(Time taken(sec) : %.2f)" % (stop_time - start_time),
          "\n\n\n\n\n\n\n")
    #calc_now(model)

    

Learning start!

Remaining weight 100.0 %
total weight : 1116672 (1116672 | 0)
fc1 : 1048576 (1048576 | 0)
fc2 : 65536 (65536 | 0)
fc3 : 2560 (2560 | 0)
tensor([ 7.6890e-03, -3.6649e-02,  5.9590e-02, -4.2926e-02,  2.7237e-02,
         5.2710e-02, -5.4603e-02,  3.5584e-02,  2.7941e-02, -3.2613e-02,
        -5.8250e-02, -4.6459e-02,  9.4622e-03,  4.9968e-02,  5.2741e-02,
        -7.3652e-03, -4.9151e-02, -3.7955e-02,  1.2158e-03,  5.4745e-02,
        -1.7738e-02, -4.9666e-02,  3.2432e-02,  2.1323e-02, -1.7187e-02,
        -3.6458e-02,  5.1150e-02, -6.1049e-03, -1.0101e-02,  1.0226e-02,
        -3.4342e-02,  9.4833e-04, -4.0037e-02, -2.6993e-02, -2.1734e-02,
        -1.0343e-02,  4.8679e-02, -1.1301e-02, -4.4951e-02, -2.4664e-03,
        -1.6751e-02,  4.2464e-02,  5.2488e-02, -8.3954e-03,  5.9923e-02,
         5.4813e-02, -3.3796e-02,  4.9552e-02, -2.6248e-02,  3.5886e-02,
        -3.0951e-02,  4.2183e-03, -3.6124e-02, -6.3934e-04, -3.2179e-02,
        -4.8573e-02, -3.5609e-02, -3.6832e-0

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

[epoch : 0] (loss: x.xxxxx) (accu: 0.1000)


[epoch : 0] (loss: x.xxxxx) (accu: 0.1053)
[epoch : 1] (r_loss: 0.00008) (t_loss: -8.30549) (accu: 0.9617)
[epoch : 2] (r_loss: 0.00012) (t_loss: -8.05022) (accu: 0.9668)
[epoch : 3] (r_loss: 0.00021) (t_loss: -8.19537) (accu: 0.9714)
[epoch : 4] (r_loss: 0.00009) (t_loss: -7.94015) (accu: 0.9688)
[epoch : 5] (r_loss: 0.00006) (t_loss: -8.75366) (accu: 0.9603)


1
[epoch : 0] (loss: x.xxxxx) (accu: 0.1053)
[epoch : 1] (loss: 0.00008) (accu: 0.9617)
[epoch : 2] (loss: 0.00012) (accu: 0.9668)
[epoch : 3] (loss: 0.00021) (accu: 0.9714)
[epoch : 4] (loss: 0.00009) (accu: 0.9688)
[epoch : 5] (loss: 0.00006) (accu: 0.9603)
2
[epoch : 0] (loss: x.xxxxx) (accu: 0.0980)
[epoch : 1] (loss: 0.00002) (accu: 0.9651)
[epoch : 2] (loss: 0.00004) (accu: 0.9611)
[epoch : 3] (loss: 0.00003) (accu: 0.9703)
[epoch : 4] (loss: 0.00007) (accu: 0.9706)
[epoch : 5] (loss: 0.00002) (accu: 0.9663)

In [None]:
print(model.fc3.weight[0])

In [None]:
print("Maximum accuracy per weight remaining")
for i in range(len(best_accu)):
    print("Remaining weight %.1f %% " % (best_accu[i][0] * 100),
         "Epoch %d" % best_accu[i][1],
         "Accu %.4f %%" % best_accu[i][2])

In [None]:
print(model.fc3.weight)

for name, p in model.named_parameters():
    EPS = 1e-6
    if 'weight' in name:
        tensor = p.data.cpu().numpy()
        grad_tensor = p.grad.data.cpu().numpy()
        grad_tensor = np.where(tensor < EPS, 0, grad_tensor)
        p.grad.data = torch.from_numpy(grad_tensor).to(device)
        print(p.grad.data)

데이터 숫자 60000
배치 길이 60
배치 개수 1000
epoch = 50

이터레이션 횟수 50000