Modules:
* **pruned_layers.py**, implements pruning of DNNs, which removes insignificant weight parameters to reduce storage, using two methods: pruning by percentage and pruning by standard deviation.
* **train_util.py**, includes the training process of DNNs with pruned connections.
* **quantize.py**, applies the quantization (weight sharing) part on the DNN to reduce the storage of weight parameters.
* **huffman_coding.py**, applies Huffman coding to the weights of the DNN to further compress their size.

Files Created:
* **net_before_pruning.pt**, the weight parameters before applying pruning on DNN weight parameters.
* **net_after_pruning.pt**, the weight parameters after applying pruning on DNN weight parameters.
* **net_after_quantization.pt**, the weight parameters after applying quantization (weight sharing) on DNN weight parameters.
* **codebook_vgg16.npy**, the quantization codebook of each layer after applying quantization (weight sharing).
* **huffman_encoding.npy**, the encoding map of each item within the quantization codebook in the whole DNN architecture.
* **huffman_freq.npy**, the frequency map of each item within the quantization codebook in the whole DNN.

This work uses VGG16_half, which is a down-scaled version of VGG16 using a width multiplier of 0.5. See the implementation in **vgg16.py** for more details.

In [1]:
import os
#from google.colab import drive
#drive.mount('/content/drive')

# Change the current working directory to the Google drive folder for path to dataloader
#os.chdir('/content/drive/MyDrive/')

from vgg16 import VGG16, VGG16_half
from train_util import train, finetune_after_prune, test
from quantize import quantize_whole_model
from huffman_coding import huffman_coding
from summary import summary
import torch
import numpy as np
from prune import prune

import time

# Pick the compute device once; later cells reuse `device` when moving the model.
has_cuda = torch.cuda.is_available()
device = 'cuda' if has_cuda else 'cpu'

# Announce which device was selected.
print("Training on GPU..." if device == 'cuda' else "Training on CPU...")


Mounted at /content/drive
Training on GPU...


### Full-precision model training

In [2]:
# Build VGG16_half (VGG16 with a width multiplier of 0.5) and place it on the
# device chosen in the setup cell.
net = VGG16_half().to(device)

# Uncomment to start from a previously saved checkpoint instead:
#net.load_state_dict(torch.load("net_before_pruning.pt"))

# Hyperparameter tuning log.
# NOTE(review): the trailing number on each line is presumably the resulting
# test accuracy (%), and "<--" presumably marks the configuration that was kept
# -- confirm with the author.

# Learning-rate sweep (reg fixed at 0.01)
#train(net, epochs=10, batch_size=233, lr=0.00, reg=0.0)
#train(net, epochs=10, batch_size=128, lr=0.001, reg=0.01)  #73.25
#train(net, epochs=10, batch_size=128, lr=0.007, reg=0.01)  # 84.8
#train(net, epochs=10, batch_size=128, lr=0.009, reg=0.01)  # 84.9
#train(net, epochs=10, batch_size=128, lr=0.01, reg=0.01)   #85.07
#train(net, epochs=30, batch_size=128, lr=0.01, reg=0.01)   # 89.75
#train(net, epochs=45, batch_size=128, lr=0.01, reg=0.01)   # 90.52    <--
#train(net, epochs=10, batch_size=128, lr=0.02, reg=0.01)   #83.6
#train(net, epochs=10, batch_size=128, lr=0.1, reg=0.01)    # 55.56

# Regularization sweep (lr fixed at 0.01)
#train(net, epochs=10, batch_size=128, lr=0.01, reg=0.0001) #82.0%
#train(net, epochs=10, batch_size=128, lr=0.01, reg=0.001)  #82.4
#train(net, epochs=10, batch_size=128, lr=0.01, reg=0.1)    #35.12

# High learning rate with weak regularization
#train(net, epochs=10, batch_size=128, lr=0.1, reg=0.0001)  #71.0%

In [3]:
# Load the best weight parameters saved before pruning.
# `map_location=device` remaps the checkpoint tensors onto the device selected in
# the setup cell, so one code path serves both GPU and CPU runs instead of two
# near-identical branches.
net.load_state_dict(torch.load("net_before_pruning.pt", map_location=device))
print('loading to GPU...' if device == 'cuda' else 'loading to CPU...')

# Evaluate the restored model.
test(net)

loading to GPU...
Files already downloaded and verified
Test Loss=0.3275, Test accuracy=90.5200


In [None]:
# Show the model summary before any pruning is applied, framed by banner lines.
banner_top = "-----Summary before pruning-----"
banner_bottom = "-------------------------------"

print(banner_top)
summary(net)
print(banner_bottom)

### Pruning & Finetune with pruned connections

In [5]:
# Prune the network, either by percentage or by standard-deviation clipping,
# then evaluate it without any fine-tuning.
#
# NOTE(review): presumably `q` is only used by method='percentage' and `s` only
# by method='std' -- confirm against prune.py. The trailing comments record the
# sparsity and accuracy observed for each setting.

# method='percentage' experiments
#prune(net, method='percentage', q=0.25, s=0.75)        #25.00% sparsity | 84.5% accuracy
#prune(net, method='percentage', q=0.6680, s=0.75)      #66.79% sparsity | 83.24% accuracy
#prune(net, method='percentage', q=0.6681, s=0.75)      #66.80% sparsity | 83.15% accuracy

# method='std' experiments (the s=0.75 setting is the active call below)
#prune(net, method='std', q=0.25, s=0.046)    # 6.76% sparse, 90.52% acc
#prune(net, method='std', q=0.25, s=0.05)     # 7.31% sparse, 90.50% acc
#prune(net, method='std', q=0.25, s=0.1)      # 13.94% sparse, 90.46% acc
#prune(net, method='std', q=0.25, s=0.3)      # 35.92% sparse, 90.34% acc
#prune(net, method='std', q=0.25, s=0.5)      # 52.16% sparse, 89.76% acc
#prune(net, method='std', q=0.25, s=1.0)      # 76.99% sparse, 68.14% acc
#prune(net, method='std', q=0.25, s=1.5)      # 88.90% sparse, 18.60% acc
#prune(net, method='std', q=0.25, s=1.7)      # 91.65% sparse, 10.04% acc
#prune(net, method='std', q=0.25, s=2.0)      # 94.50% sparse, 10% acc

# Active configuration: 66.80% sparse, 84.50% acc
prune(
    net,
    method='std',
    q=0.25,
    s=0.75,
)

# Evaluate the freshly pruned model.
test(net)


Files already downloaded and verified
Test Loss=0.5701, Test accuracy=84.5000


In [None]:
# Show the model summary again now that pruning has been applied.
banner_top = "\n-----Summary after pruning-----"
banner_bottom = "-------------------------------"

print(banner_top)
summary(net)
print(banner_bottom)

In [None]:
# Fine-tuning the model after pruning.
# Every call in this cell is commented out, so running the cell does nothing;
# the lines below are kept as a log of the fine-tuning experiments.
# NOTE(review): the trailing "#<number> E<k>" presumably means the best test
# accuracy (%) and the epoch at which it was reached -- confirm with the author.
# NOTE(review): the next cell loads "net_after_pruning.pt", which is presumably
# written by finetune_after_prune -- confirm against train_util.py; that file
# must already exist when all calls here stay commented out.
#finetune_after_prune(net, epochs=50, batch_size=128, lr=0.001, reg=5e-5)

#finetune_after_prune(net, epochs=10, batch_size=128, lr=0.01, reg=0.01)  #77.65 E?

# Learning-rate sweep (reg fixed at 0.01)
#finetune_after_prune(net, epochs=35, batch_size=128, lr=0.0001, reg=0.01)  #90.64 E23
#finetune_after_prune(net, epochs=35, batch_size=128, lr=0.0005, reg=0.01)  #89.52 E8
#finetune_after_prune(net, epochs=35, batch_size=128, lr=0.001, reg=0.01)   #88.31 E5
#finetune_after_prune(net, epochs=35, batch_size=128, lr=0.005, reg=0.01)   #83.36 E22
#finetune_after_prune(net, epochs=45, batch_size=128, lr=0.01, reg=0.01)    #79.12 E34
#finetune_after_prune(net, epochs=35, batch_size=128, lr=0.05, reg=0.01)    #50.64 E10, dropped to 12 @E34
#finetune_after_prune(net, epochs=35, batch_size=128, lr=0.1, reg=0.01)     # (no result recorded)


# Optional summary of the fine-tuned model.
#print("-----Summary After Finetuning (After Pruning) -----")
#summary(net)
#print("-------------------------------")


In [8]:
# Load the best weight parameters saved after pruning.
# `map_location=device` remaps the checkpoint tensors onto the device selected in
# the setup cell, so one code path serves both GPU and CPU runs instead of two
# near-identical branches.
net.load_state_dict(torch.load("net_after_pruning.pt", map_location=device))
print('loading to GPU...' if device == 'cuda' else 'loading to CPU...')

# Predict using the pruned and finetuned model
test(net)



loading to GPU...
Files already downloaded and verified
Test Loss=0.3470, Test accuracy=90.6400


### Quantization

In [None]:
# Quantize (weight-share) the whole model with 4 bits; `centers` is the
# resulting codebook that the later cells pass to test() and huffman_coding().
centers = quantize_whole_model(net, bits=4)

# Persist the quantized weights, then the codebook.
quantized_state = net.state_dict()
torch.save(quantized_state, "net_after_quantization.pt")
np.save("codebook_vgg16.npy", centers)


In [None]:
# Bit-width sweep log.
# NOTE(review): the trailing percentage is presumably the test accuracy obtained
# after quantizing with that many bits -- confirm with the author.
#centers = quantize_whole_model(net, bits=2)  #25.17%
#centers = quantize_whole_model(net, bits=3)  #87.18%
#centers = quantize_whole_model(net, bits=4)  #90.12%
#centers = quantize_whole_model(net, bits=5)  #90.43%
#centers = quantize_whole_model(net, bits=8)  #90.63%

# Optional: reload the saved quantized weights instead of reusing the in-memory model.
#if(torch.cuda.is_available()):
#  net.load_state_dict(torch.load("net_after_quantization.pt"))
#  print('loading to GPU...')
#else:
#  net.load_state_dict(torch.load("net_after_quantization.pt", map_location=torch.device('cpu')))
#  print('loading to CPU...')

# Evaluate the quantized model; `centers` comes from the quantization cell above.
test(net, is_quantized=True, centers=centers)

# Show the model summary for the quantized network, framed by banner lines.
banner_top = "-----Summary After Test After Quantization -----"
banner_bottom = "-------------------------------"

print(banner_top)
summary(net)
print(banner_bottom)


### Huffman Coding

In [None]:
# Huffman-code the quantized model using the codebook from the quantization cell.
frequency_map, encoding_map = huffman_coding(net, centers)

# Spell out the ".npy" extension so the filenames match the documented outputs
# (huffman_encoding.npy / huffman_freq.npy) and the style used for
# "codebook_vgg16.npy". np.save appends ".npy" when it is missing, so the files
# written to disk are the same as before.
# NOTE(review): if these maps are dicts, np.save stores them as pickled object
# arrays and np.load will need allow_pickle=True -- confirm against huffman_coding.py.
np.save("huffman_encoding.npy", encoding_map)
np.save("huffman_freq.npy", frequency_map)