In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import torch.nn.init as init
import torch.nn.functional as F
import visdom
import copy
import torch.nn.utils.prune as prune
from tqdm.notebook import tqdm
import numpy as np
import timeit
import sys
from torch.utils.data.sampler import SubsetRandomSampler

In [2]:
torch.manual_seed(55)
torch.cuda.manual_seed_all(55)
torch.backends.cudnn.enabled = False

In [3]:
# cuda setting. GPU_NUM = 사용할 GPU의 번호
GPU_NUM = 1
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device)
print ('Available devices :', torch.cuda.device_count())
print ('Current cuda device : %d (%s))' % (torch.cuda.current_device(), torch.cuda.get_device_name(device)))
print("cpu와 cuda 중 다음 기기로 학습함:", device, '\n')

Available devices : 2
Current cuda device : 1 (GeForce RTX 2080 Ti))
cpu와 cuda 중 다음 기기로 학습함: cuda:1 



In [4]:
# Lenet 300 100 10
lr = 0.0012
epochs = 3
batch_size = 60
weight_decay = 1.2e-3
test_iter= 1
prune_per_c = 1
prune_per_f = 0.2
prune_per_o = 0.1
prune_iters = 1

# dataset
transform = transforms.Compose([
    transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))
])

trainset = dsets.MNIST(root='../MNIST_data/',
                         train=True,
                         transform = transform,
                         download=True)
testset = dsets.MNIST(root='../MNIST_data/',
                        train=False,
                        transform = transform,
                        download=True)
valset = dsets.MNIST('../MNIST_data/',
                         train=True,
                         transform = transform,
                         download=True)

# validation set 분류
validation_ratio = (1/12)
num_train = len(trainset)
indices = list(range(num_train))
# 설정한 비율만큼 분할 시의 data 갯수
split = int(np.floor(validation_ratio * num_train))
# shuffle
np.random.shuffle(indices)
# data 분할
train_idx, val_idx = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(val_idx)

train_loader = torch.utils.data.DataLoader(dataset = trainset,
                                          batch_size = batch_size,
                                          sampler = train_sampler,
                                          drop_last = True)

val_loader = torch.utils.data.DataLoader(dataset = valset,
                                          batch_size = batch_size,
                                          sampler = val_sampler,
                                          drop_last = True)

test_loader = torch.utils.data.DataLoader(dataset = testset,
                                          shuffle = False,
                                          drop_last = True)
class Lenet300(nn.Module):
    def __init__(self):
        super(Lenet300, self).__init__()
        
        self.fc1 = nn.Linear(28*28, 300, bias = True)
        self.fc2 = nn.Linear(300, 100, bias = True)
        self.fcout = nn.Linear(100, 10, bias = True)
        
        init.xavier_normal_(self.fc1.weight)
        init.xavier_normal_(self.fc2.weight)
        init.xavier_normal_(self.fcout.weight)
        init.normal_(self.fc1.bias)
        init.normal_(self.fc2.bias)
        init.normal_(self.fcout.bias)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fcout(x)
        return x

model = Lenet300().to(device)
model_init = copy.deepcopy(model)

In [5]:
class Lenet300(nn.Module):
    def __init__(self):
        super(Lenet300, self).__init__()
        
        self.fc1 = nn.Linear(28*28, 300, bias = True)
        self.fc2 = nn.Linear(300, 100, bias = True)
        self.fcout = nn.Linear(100, 10, bias = True)
        
        init.xavier_normal_(self.fc1.weight)
        init.xavier_normal_(self.fc2.weight)
        init.xavier_normal_(self.fcout.weight)
        init.normal_(self.fc1.bias)
        init.normal_(self.fc2.bias)
        init.normal_(self.fcout.bias)
        
    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fcout(x)
        return x

model = Lenet300().to(device)
model_init = copy.deepcopy(model)

In [9]:
for name, p in model.named_parameters():
    if 'weight_orig' in name:
        for name2, p2 in model_init.named_parameters():
            if name[0:len(name) - 5] in name2:
                print(name, name2)
    if 'bias' in name:
        for name2, p2 in model_init.named_parameters():
            if name in name2:
                print(name, name2)

fc1.bias fc1.bias
fc1.weight_orig fc1.weight
fc2.bias fc2.bias
fc2.weight_orig fc2.weight
fcout.bias fcout.bias
fcout.weight_orig fcout.weight


In [None]:
weight_counter(model)

In [None]:
for name, p in model.named_parameters():
    print(name)
     #   if 'weight' in name:
   #         remain, pruned = (p != 0).sum().item(), (p == 0).sum().item()
      #      print(name, remain+pruned, pruned)

In [None]:
for name, p in model.named_parameters():
    print(name)
        if 'weight' in name:
            remain, pruned = (p != 0).sum().item(), (p == 0).sum().item()
            print(name, remain+pruned, pruned)

In [12]:
print(model.state_dict().keys())

odict_keys(['fc1.bias', 'fc1.weight_orig', 'fc1.weight_mask', 'fc2.bias', 'fc2.weight_orig', 'fc2.weight_mask', 'fcout.bias', 'fcout.weight_orig', 'fcout.weight_mask'])


In [None]:
a = (model.fcout.weight != 0).sum().item() + (model.fcout.weight == 0).sum().item()
b = (model.fcout.weight != 0).sum().item()
#a = (model.fcout.weight_mask != 0).sum().item() + (model.fcout.weight_mask == 0).sum().item()

print(a, b)

#model.fcout.weight

In [13]:
model.fcout.weight[5][40:60]

tensor([ 0.2773, -0.0032, -0.1322, -0.0011, -0.1686,  0.0941,  0.3244, -0.0791,
         0.0287, -0.0016, -0.0512, -0.0388, -0.0821,  0.1490,  0.1307, -0.2078,
         0.1664, -0.0302,  0.0005,  0.3474], device='cuda:1',
       grad_fn=<SliceBackward>)

In [8]:
module = model.fc1
prune.ln_structured(module, name="weight", amount=0.2, n=1, dim=1)
module = model.fc2
prune.ln_structured(module, name="weight", amount=0.2, n=1, dim=1)
module = model.fcout
prune.ln_structured(module, name="weight", amount=0.1, n=1, dim=1) 

Linear(in_features=100, out_features=10, bias=True)

In [17]:
model.fcout.weight[5][40:60]

tensor([ 0.2773, -0.0032, -0.1322, -0.0011, -0.1686,  0.0941,  0.3244, -0.0791,
         0.0287, -0.0016, -0.0512, -0.0388, -0.0821,  0.1490,  0.0000, -0.2078,
         0.1664, -0.0000,  0.0005,  0.3474], device='cuda:1',
       grad_fn=<SliceBackward>)

In [19]:
model.fcout.weight_orig[5][40:60]

tensor([ 0.2773, -0.0032, -0.1322, -0.0011, -0.1686,  0.0941,  0.3244, -0.0791,
         0.0287, -0.0016, -0.0512, -0.0388, -0.0821,  0.1490,  0.1307, -0.2078,
         0.1664, -0.0302,  0.0005,  0.3474], device='cuda:1',
       grad_fn=<SliceBackward>)

In [None]:
module = model.fc1
for hook in module._forward_pre_hooks.values():
    if hook._tensor_name == "weight":  # select out the correct hook
        break
module = model.fc2
for hook in module._forward_pre_hooks.values():
    if hook._tensor_name == "weight":  # select out the correct hook
        break
module = model.fcout
for hook in module._forward_pre_hooks.values():
    if hook._tensor_name == "weight":  # select out the correct hook
        break

In [24]:
module = model.fc1
prune.remove(module, 'weight')
module = model.fc2
prune.remove(module, 'weight')
module = model.fcout
prune.remove(module, 'weight')


Linear(in_features=100, out_features=10, bias=True)

In [None]:
module = model.fcout
prune.ln_structured(module, name="weight", amount=0.1, n=1, dim=1)
#for hook in module._forward_pre_hooks.values():
    #if hook._tensor_name == "weight":  # select out the correct hook
        #break

a = copy.deepcopy(model.fcout.weight_mask)
prune.remove(module, 'weight')


for name, module in model.named_modules():
        if 'fcout' in name:
            print(name)
            module.weight.register_hook(lambda grad, name=name : grad.mul_(a))



In [6]:
module = model.fcout
prune.ln_structured(module, name="weight", amount=0.1, n=1, dim=1)
for hook in module._forward_pre_hooks.values():
    print(hook)
    if hook._tensor_name == "weight":  # select out the correct hook
        break
prune.remove(module, 'weight')

<torch.nn.utils.prune.LnStructured object at 0x7fbc0d2aef10>


Linear(in_features=100, out_features=10, bias=True)

In [None]:
print(model.fcout.weight_mask)

In [None]:
for name, module in model.named_modules():
    print(name)

In [None]:
print(model.fcout.weight_mask)

In [None]:
# visdom setting
vis = visdom.Visdom()
vis.close(env="main")

Tracker_type = "Accuracy_Tracker"
title = fname + "_" + Tracker_type

# make plot
vis_plt = vis.line(X=torch.Tensor(1).zero_(), Y=torch.Tensor(1).zero_(), 
                    opts=dict(title = title,
                              legend=['100.0'],
                              showlegend=True,
                              xtickmin = 0,
                              xtickmax = 50000,
                              ytickmin = 0.94,
                              ytickmax = 0.99
                             )
                   )


# visdom append plot
def visdom_plot(loss_plot, num, loss_value, name):
    vis.line(X = num,
            Y = loss_value,
            win = loss_plot,
            name = str(name),
            update = 'append'
            )
    
def result_plot():
    x = []
    for i in range(param.epochs+1):
        x.append(i*1000)

    for name in test_result['Average of trials']:
        visdom_plot(vis_plt, torch.Tensor(x), torch.Tensor(test_result['Average of trials'][name][2]),
                            name)

In [15]:
# train, test, prune, util function
def train(model, dataloader, optimizer, criterion):
    model.train()
    running_loss = 0.0
    for batch_idx, (data, label) in enumerate(dataloader):
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()
        running_loss += loss / len(dataloader)
    return running_loss

def train2(model, dataloader, optimizer, criterion):
    EPS = 1e-6
    model.train()
    running_loss = 0.0
    for batch_idx, (data, label) in enumerate(dataloader):
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, label)
        loss.backward()
                # Freezing Pruned weights by making their gradients Zero
        for name, p in model.named_parameters():
            if 'weight' in name:
                tensor = p.data.cpu().numpy()
                grad_tensor = p.grad.data.cpu().numpy()
                grad_tensor = np.where(tensor < EPS, 0, grad_tensor)
                p.grad.data = torch.from_numpy(grad_tensor).to(device)
                
        optimizer.step()
        running_loss += loss / len(dataloader)
    return running_loss

def test(model, dataloader, criterion):
    model.eval()
    correct = 0
    total = 0
    test_loss = 0
    with torch.no_grad():
        for data, label in dataloader:
            data, label = data.to(device), label.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            loss = criterion(outputs, label)

            test_loss += loss / len(dataloader)
            total += label.size(0)
            correct += (predicted == label).sum().item()
        # 로더 -> 배치 개수 로더.dataset -> 전체 길이, 
    return (correct/total), test_loss

def weight_counter(model):
    layer_weight = {'all.weight':[0, 0, 0, 0]}
    
    for name, p in model.named_parameters():
        if 'weight' in name:
            remain, pruned = (p != 0).sum().item(), (p == 0).sum().item()
            layer_weight[name] = [remain+pruned, remain, pruned, round((remain/(remain+pruned))*100, 2)]
            
    for i in layer_weight.keys():
        for j in range(0, 3):
            layer_weight['all.weight'][j] += layer_weight[i][j]
    layer_weight['all.weight'][3] = round(layer_weight['all.weight'][1]/layer_weight['all.weight'][0]*100, 2)
    print("------------------------------------------------------------\n",
          "Layer".center(12), "Weight".center(39), "Ratio(%)".rjust(7), sep='')
    for i in layer_weight.keys():
        
        print("%s" % i.ljust(13), ":",
              ("%s (%s | %s)" % (layer_weight[i][0], layer_weight[i][1], layer_weight[i][2])).center(36),
              ("%.2f" % layer_weight[i][3]).rjust(7),
              sep=''
             )
    print("------------------------------------------------------------")
    return layer_weight

In [16]:
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr = lr, weight_decay = weight_decay)

In [20]:
for prune_iter in range(prune_iters):
    print(weight_counter(model))

    start_t = timeit.default_timer()

    for epoch in range(epochs):
        
        running_loss = train2(model, train_loader, optimizer, criterion)
        accuracy, test_loss = test(model, test_loader, criterion)
        print('[epoch : %d] (l_loss: %.5f) (t_loss: %.5f) (accu: %.4f)' %
                  ((epoch+1), (running_loss), (test_loss), (accuracy)))
    stop_t = timeit.default_timer()

    print("Finish! (Time taken(sec) : %.2f) \n\n" %
          ((stop_t - start_t)))
    print(weight_counter(model))
#result_plot()

------------------------------------------------------------
   Layer                     Weight                Ratio(%)
all.weight   :        266200 (266200 | 0)          100.00
fc1.weight_orig:        235200 (235200 | 0)          100.00
fc2.weight_orig:         30000 (30000 | 0)           100.00
fcout.weight_orig:          1000 (1000 | 0)            100.00
------------------------------------------------------------
{'all.weight': [266200, 266200, 0, 100.0], 'fc1.weight_orig': [235200, 235200, 0, 100.0], 'fc2.weight_orig': [30000, 30000, 0, 100.0], 'fcout.weight_orig': [1000, 1000, 0, 100.0]}
[epoch : 1] (l_loss: 0.41935) (t_loss: 0.26503) (accu: 0.9252)
[epoch : 2] (l_loss: 0.26556) (t_loss: 0.27264) (accu: 0.9181)
[epoch : 3] (l_loss: 0.22458) (t_loss: 0.20909) (accu: 0.9393)
Finish! (Time taken(sec) : 41.08) 


------------------------------------------------------------
   Layer                     Weight                Ratio(%)
all.weight   :        266200 (266190 | 10)         

In [None]:
print(weight_counter(model))

In [21]:
model.fcout.weight[5][40:60]

tensor([-3.9923e-42, -3.2728e-03,  7.1228e-42, -8.7679e-42, -2.9133e-42,
         1.4909e-01,  4.3131e-01, -3.7409e-05,  1.3305e-02, -4.7378e-04,
        -3.2579e-04, -1.1653e-03, -1.4359e-03,  1.3418e-01,  0.0000e+00,
         3.4892e-42,  2.1135e-01,  0.0000e+00,  5.5071e-43,  3.6802e-01],
       device='cuda:1')

In [22]:
model.fcout.weight_orig[5][40:60]

tensor([-3.9923e-42, -3.2728e-03,  7.1228e-42, -8.7679e-42, -2.9133e-42,
         1.4909e-01,  4.3131e-01, -3.7409e-05,  1.3305e-02, -4.7378e-04,
        -3.2579e-04, -1.1653e-03, -1.4359e-03,  1.3418e-01,  5.1568e-42,
         3.4892e-42,  2.1135e-01,  1.9831e-41,  5.5071e-43,  3.6802e-01],
       device='cuda:1', grad_fn=<SliceBackward>)

In [23]:
model.fcout.weight_mask[5][40:60]

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0.,
        1., 1.], device='cuda:1')

In [25]:
model.fcout.weight[5][40:60]

tensor([-3.9923e-42, -3.2728e-03,  7.1228e-42, -8.7679e-42, -2.9133e-42,
         1.4909e-01,  4.3131e-01, -3.7409e-05,  1.3305e-02, -4.7378e-04,
        -3.2579e-04, -1.1653e-03, -1.4359e-03,  1.3418e-01,  0.0000e+00,
         3.4892e-42,  2.1135e-01,  0.0000e+00,  5.5071e-43,  3.6802e-01],
       device='cuda:1', grad_fn=<SliceBackward>)

AttributeError: 'Linear' object has no attribute 'weight_orig'