In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import torch.nn.init as init
import torch.nn.functional as F
import visdom
import copy
import torch.nn.utils.prune as prune
from tqdm.notebook import tqdm
import numpy as np
import timeit
from functools import partial

# custom librarys (model, parameters...)
import custom.utils as cu

torch.manual_seed(55)
torch.cuda.manual_seed_all(55)
torch.backends.cudnn.enabled = False

GPU_NUM = 1
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device)

print ('Available devices ', torch.cuda.device_count())
print ('Current cuda device ', torch.cuda.current_device())
print(torch.cuda.get_device_name(device))

print("cpu와 cuda 중 다음 기기로 학습함:", device, '\n')

Available devices  2
Current cuda device  1
GeForce RTX 2080 Ti
cpu와 cuda 중 다음 기기로 학습함: cuda:1 



In [2]:
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 2)
)
model
print(model[0].weight)
batch_size = 10
x = torch.randn(batch_size, 2)
target = torch.randint(0, 2, (batch_size,))

print(target)

gradient_mask = torch.zeros(2, 2)
gradient_mask[0, 0] = 1.0
gradient_mask[1, 1] = 1.0
gradient_mask[1, 0] = 1.0
print(gradient_mask)
criterion = nn.CrossEntropyLoss()

for name, i in model.named_parameters():
    if '0.weight' in name:
        i.data *= gradient_mask

Parameter containing:
tensor([[0.4748, 0.6191],
        [0.3449, 0.0954]], requires_grad=True)
tensor([0, 1, 1, 0, 1, 0, 0, 0, 1, 0])
tensor([[1., 0.],
        [1., 1.]])


In [None]:
model[0].weight.register_hook(lambda grad: grad.mul_(gradient_mask))
optimizer = optim.SGD(model.parameters(), lr=1.0 ,weight_decay = 0.003)
for i in range(200):
    # Get weight before training
    w0 = model[0].weight.detach().clone()

    # Single training iteration
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, target)
    loss.backward()
    #print('Gradient: ', model[0].weight.grad)

    optimizer.step()
    if i == 0:
        print('Gradient: ', model[0].weight.grad)
        print(model[0].weight, '\n')
    elif i == 99:
        print('Gradient: ', model[0].weight.grad)
        print(model[0].weight, '\n')

In [None]:
optimizer = optim.SGD(model.parameters(), lr=1.0 ,weight_decay = 0.003)
for i in range(200):
    # Get weight before training
    w0 = model[0].weight.detach().clone()

    # Single training iteration
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, target)
    loss.backward()
    #print('Gradient: ', model[0].weight.grad)

    for name, p in model.named_parameters():
        if '0.weight' in name:
            p.grad.data *= gradient_mask
    print(model[0].weight.grad)   
    optimizer.step()
    #print(model[0].weight.grad)
    print(model[0].weight)
    """if i == 0:
        print('Gradient: ', model[0].weight.grad)
        print(model[0].weight, '\n')
    elif i == 99:
        print('Gradient: ', model[0].weight.grad)
        print(model[0].weight, '\n')"""

In [6]:
optimizer = optim.SGD(model.parameters(), lr=1.0 ,weight_decay = 0.003)
EPS = 1e-8
for i in range(1):
    # Get weight before training
    w0 = model[0].weight.detach().clone()
    print(model[0].weight)
    print(model[0].weight.grad)
    print(model[0].weight.grad.data)
    # Single training iteration
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, target)
    loss.backward()
    #print('Gradient: ', model[0].weight.grad)
    print(model[0].weight)
    print(model[0].weight.grad)
    print(model[0].weight.grad.data)
    for name, p in model.named_parameters():
        if '0.weight' in name:
            print(p)
            print(p.grad.data)
            tensor = p.data.numpy()
            grad_tensor = p.grad.data.numpy()
            print(grad_tensor)
            grad_tensor = np.where(tensor < EPS, 0, grad_tensor)
            print(grad_tensor)
            p.grad.data = torch.from_numpy(grad_tensor)
            print(p.grad.data)
    #print(model[0].weight.grad)    
    optimizer.step()
    #print(model[0].weight)
    """
    w1 = model[0].weight.detach().clone()

    if i == 0:
        print('Gradient: ', model[0].weight.grad)
        print(model[0].weight, '\n')
    elif i == 99:
        print('Gradient: ', model[0].weight.grad)
        print(model[0].weight, '\n')"""



Parameter containing:
tensor([[0.4513, 0.0000],
        [0.3790, 0.0638]], requires_grad=True)
tensor([[ 0.0084,  0.0000],
        [-0.0138,  0.0134]])
tensor([[ 0.0084,  0.0000],
        [-0.0138,  0.0134]])
Parameter containing:
tensor([[0.4513, 0.0000],
        [0.3790, 0.0638]], requires_grad=True)
tensor([[ 0.0081, -0.0060],
        [-0.0136,  0.0134]])
tensor([[ 0.0081, -0.0060],
        [-0.0136,  0.0134]])
Parameter containing:
tensor([[0.4513, 0.0000],
        [0.3790, 0.0638]], requires_grad=True)
tensor([[ 0.0081, -0.0060],
        [-0.0136,  0.0134]])
[[ 0.00811383 -0.00604693]
 [-0.01363752  0.01343278]]
[[ 0.00811383  0.        ]
 [-0.01363752  0.01343278]]
tensor([[ 0.0081,  0.0000],
        [-0.0136,  0.0134]])


In [None]:
sum(w0 != w1)

In [None]:
(0 != w1).sum()

In [None]:
        for name, p in model.named_parameters():
            if 'weight' in name:
                tensor = p.data.cpu().numpy()
                grad_tensor = p.grad.data.cpu().numpy()
                grad_tensor = np.where(tensor < EPS, 0, grad_tensor)
                p.grad.data = torch.from_numpy(grad_tensor).to(device)

In [None]:
model_type = 'LeNet300'
#model_type = 'Conv6'

In [None]:
# train, test, prune function
def train(model, dataloader, optimizer, criterion, cp_mask):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, label) in enumerate(dataloader):
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, label)
        loss.backward()
        # 0-weight 학습 방지
        """
        if cp_mask:
            i = 0
            for name, p in model.named_parameters():
                if 'weight' in name:
                    p.grad.data *= cp_mask[i]
                    i += 1
        """            
        optimizer.step()
        running_loss += loss / len(dataloader)
    return running_loss

def test(model, dataloader, criterion):
    model.eval()
    correct = 0
    total = 0
    test_loss = 0
    with torch.no_grad():
        for data, label in dataloader:
            data, label = data.to(device), label.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            #test_loss += F.nll_loss(outputs, label, reduction='sum').item() # sum up batch loss
            loss = criterion(outputs, label)
            #predicted = outputs.data.max(1, keepdim=True)[1]
            #correct += predicted.eq(label.data.view_as(predicted)).sum().item()
            
            test_loss += loss / len(dataloader)
            total += label.size(0)
            correct += (predicted == label).sum().item()
        #accuracy =  correct / len(dataloader)
        # 로더 -> 배치 개수 로더.dataset -> 전체 길이, 
    return (correct/total), test_loss

# prune function
# pruning mask 생성 -> mask 복사 -> init값 복사 -> prune 진행
def weight_init(model1, model2, c_rate, f_rate, o_rate):
    # layer별로 지정된 rate만큼 prune mask 생성
    for name, module in model1.named_modules():
        if isinstance(module, nn.Conv2d):
            prune.l1_unstructured(module, name = 'weight', amount = c_rate)
        if isinstance(module, nn.Linear):
            if name != 'fc3':
                prune.l1_unstructured(module, name = 'weight', amount = f_rate)
            else:
                prune.l1_unstructured(module, name = 'weight', amount = o_rate)
                        
    # mask 복사
    cp_mask = []
    for name, mask in model1.named_buffers():
        cp_mask.append(mask)
    
    # init 값을 model에 복사
    for name, p in model1.named_parameters():
        if 'weight_orig' in name:
            for name2, p2 in model2.named_parameters():
                if name[0:len(name) - 5] in name2:
                    p.data = copy.deepcopy(p2.data)
        if 'bias_orig' in name:
            for name2, p2 in model2.named_parameters():
                if name[0:len(name) - 5] in name2:
                    p.data = copy.deepcopy(p2.data)
                    
    # prune 진행
    for name, module in model1.named_modules():
        if isinstance(module, nn.Conv2d):
            prune.remove(module, name = 'weight')
        elif isinstance(module, nn.Linear):
            prune.remove(module, name = 'weight')
            
    # copy된 mask return
    return cp_mask

# weight count function
# dict type['name' : [all, non_zero, zero, ratio]]
def weight_counter(model):
    layer_weight = {'all.weight':[0, 0, 0, 0]}
    
    for name, p in model.named_parameters():
        if 'weight' in name:
            remain, pruned = (p != 0).sum().item(), (p == 0).sum().item()
            layer_weight[name] = [remain+pruned, remain, pruned, round((remain/(remain+pruned))*100, 2)]
            
    for i in layer_weight.keys():
        for j in range(0, 3):
            layer_weight['all.weight'][j] += layer_weight[i][j]
    layer_weight['all.weight'][3] = round(layer_weight['all.weight'][1]/layer_weight['all.weight'][0]*100, 2)

    print("Layer".center(12), "Weight".center(39), "Ratio(%)".rjust(7), sep='')
    for i in layer_weight.keys():
        print("%s" % i.ljust(13), ":",
              ("%s (%s | %s)" % (layer_weight[i][0], layer_weight[i][1], layer_weight[i][2])).center(36),
              ("%.2f" % layer_weight[i][3]).rjust(7),
              sep=''
             )
        
    return layer_weight

In [None]:
param = cu.parameters()

if model_type == 'LeNet300':
    model = cu.LeNet300().to(device)
elif model_type == 'Conv6':
    model = cu.Conv6().to(device)
    
param.type(model_type)    
model_init = copy.deepcopy(model)
criterion = nn.CrossEntropyLoss().to(device)

In [None]:
weight_counter(model)

In [None]:
i = 0
cp_mask = []
cp_mask = weight_init(model, model_init, 
                           (0.5),
                           (0.2),
                           (0.1)
                          )

In [None]:
optimizer = optim.Adam(model.parameters(), lr = param.lr, weight_decay = param.weight_decay)

In [None]:
running_loss = train(model, param.train_loader, optimizer, criterion, cp_mask)

In [None]:
def grad_hook_template(param, name, grad):
    print(f'Receive grad for {name} w whape {grad.shape}')

In [None]:
a = 0
for name, i in model.named_parameters():
    if 'weight' in name:
        #print(len(i))
        #print(len(b[name]))
        print(f'Register hook for {name}')
        i.register_hook(partial(grad_hook_template, name=name, i=i))
        a += 1
        #partial(grad_hook_template, name=name, param=param)
        #lambda grad: grad_hook_template(param, name, grad)

In [None]:
for name, i in model.named_parameters():
    #print(name, i)
    if 'fc1.weight' in name:
        a = i.register_hook(lambda grad: grad * cp_mask[0]).is_leaf
    elif 'fc2.weight' in name:
        b = i.register_hook(lambda grad: grad * cp_mask[1])
    elif 'fc3.weight' in name:
        c = i.register_hook(lambda grad: grad * cp_mask[2])

In [None]:
q = 0
for name, i in model.named_parameters():
    if 'weight' in name:
        i.Variable_hook(lambda grad:grad.mul_(cp_mask[q]))
        q += 1

In [None]:
q = 0
for name, i in model.named_parameters():
    if 'weight' in name:
        i.register_hook(lambda grad:grad.mul_(b[name]))
        

In [None]:
a = []
b = {}
h = []
c = 0
for i in model.state_dict().keys():
    if 'weight' in i:
        print(i)
        b[i] = cp_mask[c]
        h.append(i)
        c += 1
print(h)

In [None]:
for i in range(len(h)):
    for name, j in model.named_parameters():
        if h[i] in name:
            print(h[i])
            j.register_hook(lambda grad : grad.mul_(cp_mask[i]))
    #if 
    #print(i)

a = []
for i in range(len(cp_mask)):
    a.append(i)
print(a)

In [None]:
a = 0

In [None]:
q = 0
for name, i in model.named_parameters():
    if 'weight' in name:
        a = copy.deepcopy(q)
        print(a, q)
        i.register_hook(lambda grad:grad.mul_(cp_mask[a]))
        q += 1


In [None]:
grads = {}
def save_grad(name):
    def hook(grad):
        grads[name] = grad
    print(hook)
    return hook

In [None]:
a = 0
for name, i in model.named_parameters():
    if 'weight' in name:
        #print(len(i))
        #print(len(b[name]))
        i.register_hook(save_grad(lambda grad:grad.mul_(b[name])))
        

In [None]:
grads

In [None]:
torch.utils.hooks.RemovableHandle

In [None]:
a

In [None]:
a = torch.FloatTensor()

In [None]:
b = torch.FloatTensor((1, 2, 3))

In [None]:
b

In [None]:
c = torch.FloatTensor((11, 22, 33, 44))

In [None]:
c

In [None]:
d = torch.cat(tuple(b), dim=1)

In [None]:
type(a)

In [None]:
cp_mask.state_dict().keys()

In [None]:
model.fc1.weight.register_hook(lambda grad: grad * cp_mask[0])
model.fc2.weight.register_hook(lambda grad: grad * cp_mask[1])
model.fc3.weight.register_hook(lambda grad: grad * cp_mask[2])

In [None]:
a

In [None]:
w0 = model.fc3.weight.detach().clone()

In [None]:
w0

In [None]:
optimizer = optim.Adam(model.parameters(), lr = param.lr, weight_decay = param.weight_decay)

In [None]:
print(cp_mask)

In [None]:
model.fc2.weight

In [None]:
for name, p in model.named_parameters():
    if name =='fc2.weight':
        print(p)

In [None]:
j = 0
for name, p in model.named_parameters():
    if 'weight' in name:
        print(j)
        #p.register_hook(lambda grad: grad.mul_(cp_mask[j]))
        #print(name)
        #print(p[0], cp_mask[j][0])
        print(cp_mask[j])
        j += 1

In [None]:
for i in range(param.noi):
    best_accu.append(0)
    best_accu[i] = [0, 0, 0]
    cp_mask = []

    # pruning 및 mask 복사
    # layer별 prune rate를 입력
    cp_mask = weight_init(model, model_init, 
                           (1 - ((1-param.prune_per_c) ** i)),
                           (1 - ((1-param.prune_per_f) ** i)),
                           (1 - ((1-param.prune_per_o) ** i))
                          )
    #model2[0].weight.register_hook(lambda grad: grad * gradient_mask)
    j = 0
    for name, p in model.named_parameters():
        if 'weight' in name:
            p.register_hook(lambda grad: grad.mul_(cp_mask[j]))
            print(name)
            print(p[0], cp_mask[j][0])
            j += 1
    
    
    optimizer = optim.Adam(model.parameters(), lr = param.lr, weight_decay = param.weight_decay)
    print("Learning start!\n")
    # weight 개수 계산 및 저장
    
    #iteration 횟수 = i
    
    weight_counts = weight_counter(model)
    
    #print(model.conv1.weight[0])
    #print(model.fc3.weight[0])
    
    
    remaining_weight = weight_counts['all.weight'][3]
    
    start_time = timeit.default_timer()
    
    for epoch in tqdm(range(param.epochs)):
        # epoch가 0일때 정확도 계산
        if epoch == 0:
            accuracy, test_loss = test(model, param.test_loader, criterion)
            visdom_plot(vis_plt,torch.Tensor([accuracy]), torch.Tensor([0]),
                        str(remaining_weight)
                       )
            print('[epoch : %d]' % (epoch),
             '(r_loss: x.xxxxx)',
             '(t_loss: x.xxxxx)',
             '(accu: %.4f)' % (accuracy)
             )
        # model training    
        running_loss = train(model, param.train_loader, optimizer, criterion, cp_mask)
        
        # val_set이 있을 경우 val_set을 통해 loss, accu를 구한다.
        if param.valset == 'empty':
            accuracy, test_loss = test(model, param.test_loader, criterion)
        else:
            accuracy, test_loss = test(model, param.val_loader, criterion)
        
        # visdom plot
        visdom_plot(vis_plt, torch.Tensor([accuracy]), torch.Tensor([(epoch+1) * 1000]),
                    str(remaining_weight)
                   )
        
        # best accuracy list (weight_remain, epoch, accuracy)
        if best_accu[i][2] <= accuracy:
            best_accu[i] = [remaining_weight, epoch, accuracy]
        
        print('[epoch : %d]' % (epoch+1),
             '(r_loss: %.5f)' % (running_loss),
             '(t_loss: %.5f)' % (test_loss),
             '(accu: %.4f)' % (accuracy)
             )
    stop_time = timeit.default_timer()
    #print(model.fc3.weight[0][0])
    #print(model_init.fc3.weight[0][0])
    
    #print(model.fc3.weight[0])
    
    #print(model.conv1.weight[0])
    #print(model.fc3.weight[0])
    
    print("Finish!",
          "(Best accu: %.4f)" % best_accu[i][2],
          "(Time taken(sec) : %.2f)" % (stop_time - start_time),
          "\n\n\n\n\n\n\n")

In [None]:
print(model.conv1.weight[0][0])

In [None]:
model2.state_dict().keys()

In [None]:
print(model2[0].weight.grad)

In [None]:
 for name, p in model2.named_modules():
        print(p)

In [None]:
if x > 0:
	value = 10
else:
	value = 20

value = 10 if x > 0 else 20

In [None]:
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 2)
)
print(model[0].weight)
# Create Gradient mask
gradient_mask = torch.zeros(2, 2)
gradient_mask[0, 0] = 1.0
model[0].weight.register_hook(lambda grad: grad.mul_(gradient_mask))

optimizer = optim.SGD(model.parameters(), lr=1.0)
criterion = nn.CrossEntropyLoss()

batch_size = 10
x = torch.randn(batch_size, 2)
target = torch.randint(0, 2, (batch_size,))

# Get weight before training
w0 = model[0].weight.detach().clone()

# Single training iteration
optimizer.zero_grad()
output = model(x)
loss = criterion(output, target)
loss.backward()
print('Gradient: ', model[0].weight.grad)
optimizer.step()

In [None]:
print(model[0].weight)
# Create Gradient mask
gradient_mask = torch.zeros(2, 2)
gradient_mask[0, 0] = 1.0
model[0].weight.register_hook(lambda grad: grad.mul_(gradient_mask))

optimizer = optim.SGD(model.parameters(), lr=1.0)
criterion = nn.CrossEntropyLoss()

batch_size = 10
x = torch.randn(batch_size, 2)
target = torch.randint(0, 2, (batch_size,))

# Get weight before training
w0 = model[0].weight.detach().clone()

# Single training iteration
optimizer.zero_grad()
output = model(x)
loss = criterion(output, target)
loss.backward()
print('Gradient: ', model[0].weight.grad)
optimizer.step()

In [None]:
criterion = nn.CrossEntropyLoss()
model2 = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 2)
)
print(model2[0].weight)

In [None]:
# Create Gradient mask
gradient_mask = torch.zeros(2, 2)
gradient_mask[0, 0] = 1.0
model2[0].weight.register_hook(lambda grad: grad.mul_(gradient_mask))
#model2[0].weight.register_hook(lambda grad: grad * gradient_mask)
print(model2[0].weight)
print(gradient_mask)

In [None]:
optimizer = optim.SGD(model2.parameters(), lr=1.0, weight_decay = 0.003)

In [None]:
print(target)

In [None]:
batch_size = 10
x = torch.randn(batch_size, 2)
target = torch.randint(0, 2, (batch_size,))

optimizer.zero_grad()
output = model2(x)
loss = criterion(output, target)
loss.backward()
optimizer.step()

In [None]:
print('Gradient: ', model2[0].weight, model2[2].weight)

In [None]:
# Get weight before training
#w0 = model2[0].weight.detach().clone()

# Single training iteration
optimizer.step()

# Compare weight update
w1 = model2[0].weight.detach().clone()
print('Weights updated ', w0!=w1)

In [None]:
print(model2[0].weight[:,:])

In [None]:
print(model2[0].weight[0:1,0:1])

In [None]:
model2[0].weight[0:1,0:1].requires_grad.zero_()

In [None]:
for name, param in model2.named_parameters():
    if 'weight' in name:
        param.data[0:1, 0:1].zero_()
        param.data[0:1, 0:1].requires_grad = False
        print(param.data)

In [None]:
optimizer = optim.SGD(filter(lambda p: p.requires_grad, model2.parameters()), lr=0.001, momentum=0.9)

In [None]:
print(“Freezing Parameters(1->10) on the Convolution Layer”,child)
for param in child.parameters():
param.data[:,1:10,:,:].zero_()
param.data[:,1:10,:,:].requires_grad = False

optimizer_ft = OPTIM.SGD(filter(lambda p: p.requires_grad, model_ft.parameters()), lr=0.001, momentum=0.9)