In [None]:
# IPython autoreload in mode 2: reload all modules before executing code
%load_ext autoreload
%autoreload 2

In [None]:
from math import ceil
import torch
from torch.utils.data import DataLoader
from torch.autograd import Variable
import torch.optim as optim

import matplotlib.pyplot as plt
%matplotlib inline

import sys
sys.path.append('..')
from utils.input_pipeline import get_image_folders
from utils.training import train
from utils.quantization import optimization_step, quantize, initial_scales

# shown as the cell output: True when a CUDA device can be used
# (the scaling factors further down are created with .cuda())
torch.cuda.is_available()

In [None]:
# let cuDNN benchmark its convolution algorithms and reuse the fastest one
torch.backends.cudnn.benchmark = True

In [None]:
# learning rate for all possible weights: given to get_model() and to the
# optimizers of the full precision kernels and of the scaling factors
LEARNING_RATE = 1e-4
# hyperparameter for quantization: passed as `t` to quantize() and optimization_step()
HYPERPARAMETER_T = 0.15

# Create data iterators

In [None]:
# batch size of the training iterator; also used to compute n_batches
batch_size = 512

In [None]:
# image folders for the train and validation sets
train_folder, val_folder = get_image_folders()

# training batches: shuffled, `batch_size` images each
train_iterator = DataLoader(
    train_folder,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True
)

# validation batches: fixed order, 256 images each
val_iterator = DataLoader(
    val_folder,
    batch_size=256,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

# number of training samples (displayed as the cell output)
train_size = len(train_folder.imgs)
train_size

# Model

In [None]:
from get_densenet import get_model

In [None]:
# build the network together with its loss and optimizer;
# optimizer.param_groups[1] holds the kernels that are quantized later on
model, loss, optimizer = get_model(learning_rate=LEARNING_RATE)

# load pretrained model, accuracy ~48%
# (these weights are the starting point for the quantization below)
model.load_state_dict(torch.load('../vanilla_densenet_small/model.pytorch_state'))

#### keep copy of full precision kernels

In [None]:
# Full precision copies of the kernels in optimizer.param_groups[1]:
# kernel tensors of all convolutional layers
# (with the exception of the first conv layer).
# Every copy is a separate Variable that requires gradients.
all_fp_kernels = [
    Variable(weight.data.clone(), requires_grad=True)
    for weight in optimizer.param_groups[1]['params']
]

#### initial quantization 

In [None]:
# scaling factors for each quantized layer:
# one (w_p, w_n) pair per kernel, filled in by the quantization loop
initial_scaling_factors = []

In [None]:
# these kernels will be quantized:
# the optimizer's own parameters from group 1 (same objects, new list)
all_kernels = list(optimizer.param_groups[1]['params'])

In [None]:
# Start from an empty list every time this cell runs. Without this, running
# the cell a second time would append a second copy of every (w_p, w_n) pair,
# and the scaling factor optimizer would be built from the duplicated list.
initial_scaling_factors = []

for k, k_fp in zip(all_kernels, all_fp_kernels):

    # choose initial scaling factors from the full precision copy
    w_p_initial, w_n_initial = initial_scales(k_fp.data)
    initial_scaling_factors.append((w_p_initial, w_n_initial))

    # do quantization, this uses hyperparameter 't';
    # the result replaces the data of the kernel used by the model's optimizer,
    # the full precision copy k_fp is left untouched
    k.data = quantize(k_fp.data, w_p_initial, w_n_initial, t=HYPERPARAMETER_T)

#### parameter updaters

In [None]:
# optimizer for updating only all_fp_kernels
# (the second entry of optimizer_list in optimization_step_fn)
optimizer_fp = optim.Adam(all_fp_kernels, lr=LEARNING_RATE)

In [None]:
# optimizer for updating only scaling factors:
# each (w_p, w_n) pair becomes one trainable two-element CUDA tensor
optimizer_sf = optim.Adam(
    [
        Variable(torch.FloatTensor([scale_p, scale_n]).cuda(), requires_grad=True)
        for scale_p, scale_n in initial_scaling_factors
    ],
    lr=LEARNING_RATE
)

# Train

In [None]:
n_epochs = 10
# ceil() counts the final, possibly smaller, batch as a step as well
n_batches = ceil(train_size/batch_size)

# total number of batches in the train set
# (passed to train() as steps_per_epoch)
n_batches

In [None]:
%%time
def optimization_step_fn(model, loss, x_batch, y_batch):
    """Run one training step through `optimization_step`.

    Adapter with the four-argument signature that is handed to `train`:
    it forwards the model, the loss and the batch unchanged and adds the
    three optimizers and the quantization hyperparameter taken from the
    notebook's global scope.

    Returns whatever `optimization_step` returns.
    """
    optimizers = [optimizer, optimizer_fp, optimizer_sf]
    step_result = optimization_step(
        model, loss, x_batch, y_batch,
        optimizer_list=optimizers, t=HYPERPARAMETER_T
    )
    return step_result
# all_losses is the history returned by train(); the plotting cells below
# read every record by index: 0 as the epoch, 1/2 as train/val loss,
# 3/4 as train/val accuracy, 5/6 as train/val top5 accuracy
all_losses = train(
    model, loss, optimization_step_fn,
    train_iterator, val_iterator,
    n_epochs=n_epochs, steps_per_epoch=n_batches, n_validation_batches=40
)
# presumably the column layout of the progress log printed by train():
# epoch logloss  accuracy    top5_accuracy time  (first value: train, second value: val)

# Loss/epoch plots

In [None]:
# x axis shared by all plots: field 0 of every history record
epochs = [record[0] for record in all_losses]

# fields 1 and 2: train and val loss
plt.plot(epochs, [record[1] for record in all_losses], label='train')
plt.plot(epochs, [record[2] for record in all_losses], label='val')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend();

In [None]:
# fields 3 and 4: train and val accuracy
plt.plot(epochs, [record[3] for record in all_losses], label='train')
plt.plot(epochs, [record[4] for record in all_losses], label='val')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend();

In [None]:
# fields 5 and 6: train and val top5 accuracy
plt.plot(epochs, [record[5] for record in all_losses], label='train')
plt.plot(epochs, [record[6] for record in all_losses], label='val')
plt.xlabel('epoch')
plt.ylabel('top5_accuracy')
plt.legend();

# Save

In [None]:
# move the model to the CPU, then write its state dict to disk
torch.save(model.cpu().state_dict(), 'model_ternary_quantization.pytorch_state')