In [1]:
# Auto-reload imported modules before each cell runs, so edits to the project's
# .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from math import ceil
import torch
from torch.utils.data import DataLoader
from torch.autograd import Variable
import torch.optim as optim

import matplotlib.pyplot as plt
%matplotlib inline

import sys
sys.path.append('..')
from utils.input_pipeline import get_image_folders
from utils.training import train
from utils.quantization import optimization_step, quantize, initial_scales

# check that a CUDA device is available (tensors are moved to the GPU with .cuda() later on)
torch.cuda.is_available()

True

In [3]:
# Ask cuDNN to benchmark its convolution algorithms and use the fastest one
# (helps most when input shapes stay the same from batch to batch).
torch.backends.cudnn.benchmark = True

In [4]:
# Learning rate shared by every optimizer in this notebook: the model's own
# optimizer, the full precision kernels' optimizer and the scaling factors' optimizer.
LEARNING_RATE = 1e-4  # learning rate for all possible weights
# Passed as `t` to quantize() and optimization_step(); its exact role is defined
# in utils.quantization (presumably the ternarization threshold factor - confirm there).
HYPERPARAMETER_T = 0.15  # hyperparameter for quantization

# Create data iterators

In [5]:
# mini-batch size of the training loader; also used to compute n_batches
# (the validation loader uses its own hard-coded size)
batch_size = 64

In [6]:
# get_image_folders() (project helper) supplies the training and validation datasets
train_folder, val_folder = get_image_folders()

# training batches: shuffled, `batch_size` samples each
train_iterator = DataLoader(
    train_folder,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# validation batches: fixed order, 256 samples each
val_iterator = DataLoader(
    val_folder,
    batch_size=256,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

# size of the training set (displayed as the cell's output)
train_size = len(train_folder.imgs)
train_size

100000

# Model

In [7]:
from get_densenet import get_model

In [8]:
# get_model() (project helper) returns the network, its loss and an optimizer
# whose parameter groups are used below to select the kernels to quantize
model, loss, optimizer = get_model(learning_rate=LEARNING_RATE)

# load pretrained full precision weights (accuracy ~73%) as the starting point
model.load_state_dict(torch.load('../vanilla_densenet_big/model_step5.pytorch_state'))

#### Keep a copy of the full precision kernels

In [9]:
# Snapshot the full precision weights of every parameter in the optimizer's
# second parameter group. Each snapshot is an independent clone with
# requires_grad=True, i.e. a separate tensor from the model's own parameter.
# NOTE: this must run before the quantization loop, which overwrites the data
# of these same parameters.
all_fp_kernels = [
    Variable(param.data.clone(), requires_grad=True)
    for param in optimizer.param_groups[1]['params']
]
# Per the original author's note, this group holds the kernel tensors of all
# convolutional layers except the first conv layer.

#### Initial quantization

In [10]:
# initial scaling factors, one (w_p, w_n) tuple per quantized layer;
# filled in by the quantization loop and later used to build optimizer_sf
initial_scaling_factors = []

In [11]:
# The live model parameters that will be quantized: a new list holding the very
# same tensor objects as the optimizer's second parameter group.
all_kernels = list(optimizer.param_groups[1]['params'])

In [12]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every (w_p, w_n) pair; optimizer_sf is built from this list
# and would otherwise receive twice as many scaling-factor tensors.
initial_scaling_factors = []

# Both lists come from the same parameter group; a mismatch means stale
# notebook state, and zip() would silently drop the unmatched kernels.
assert len(all_kernels) == len(all_fp_kernels), \
    'all_kernels and all_fp_kernels are out of sync - re-run the cells above'

for k, k_fp in zip(all_kernels, all_fp_kernels):

    # choose initial scaling factors from the full precision copy
    w_p_initial, w_n_initial = initial_scales(k_fp.data)
    initial_scaling_factors.append((w_p_initial, w_n_initial))

    # overwrite the model's kernel with the quantized version of the copy
    k.data = quantize(k_fp.data, w_p_initial, w_n_initial, t=HYPERPARAMETER_T)

#### Parameter updaters (optimizers)

In [13]:
# optimizer for updating only all_fp_kernels
# (the full precision copies, which are separate tensors from the model's parameters)
optimizer_fp = optim.Adam(all_fp_kernels, lr=LEARNING_RATE)

In [14]:
# Adam optimizer over the scaling factors only: every entry of
# initial_scaling_factors becomes one trainable two-element CUDA tensor
# holding its (w_p, w_n) pair.
optimizer_sf = optim.Adam(
    [
        Variable(torch.FloatTensor([scale_p, scale_n]).cuda(), requires_grad=True)
        for scale_p, scale_n in initial_scaling_factors
    ],
    lr=LEARNING_RATE,
)

# Train

In [15]:
# number of passes over the training set
n_epochs = 5

# batches per epoch: integer ceiling of train_size / batch_size, so a final
# partial batch is counted as well
n_batches = (train_size + batch_size - 1) // batch_size

# total number of batches in the train set (displayed as the cell's output)
n_batches

1563

In [16]:
%%time
def optimization_step_fn(model, loss, x_batch, y_batch):
    """Run one training step through utils.quantization.optimization_step.

    Adapts the project's `optimization_step` to the four-argument callable
    that is handed to `train`, filling in the notebook-level optimizers and
    the quantization hyperparameter.

    Args:
        model: the network being trained.
        loss: the loss object, forwarded unchanged.
        x_batch: one batch of inputs (presumably images - confirm in `train`).
        y_batch: the matching batch of targets (presumably labels).

    Returns:
        Whatever `optimization_step` returns; its definition is not visible here.
    """
    # model optimizer, full precision kernels optimizer, scaling factors
    # optimizer - passed in this exact order
    optimizers = [optimizer, optimizer_fp, optimizer_sf]
    return optimization_step(
        model, loss, x_batch, y_batch,
        optimizer_list=optimizers, t=HYPERPARAMETER_T
    )
# Run training for n_epochs; train() receives optimization_step_fn as its
# per-batch step callable, and whatever it returns is kept in all_losses.
all_losses = train(
    model, loss, optimization_step_fn,
    train_iterator, val_iterator, n_epochs
)
# Columns of the log printed below:
# epoch | logloss | accuracy | top5_accuracy | time
# (for each metric the first value is train, the second is val; time looks like
# seconds per epoch - the five values add up to the reported wall time)

0  3.543 2.475  0.221 0.403  0.466 0.689  2233.851
1  2.544 2.199  0.389 0.461  0.674 0.745  2219.524
2  2.256 1.983  0.449 0.512  0.726 0.775  2218.226
3  2.099 1.870  0.483 0.534  0.755 0.794  2218.565
4  1.990 1.844  0.505 0.548  0.772 0.793  2219.560
CPU times: user 3h 13min 58s, sys: 23min 55s, total: 3h 37min 53s
Wall time: 3h 5min 9s


# Save

In [17]:
# Move the model to the CPU before serializing (the trailing `;` just
# suppresses the cell's output).
model.cpu();
# Only the model's state dict is written; all_fp_kernels and the scaling factor
# tensors held by optimizer_sf are not saved.
torch.save(model.state_dict(), 'model_ternary_quantization.pytorch_state')