<h2>Googlenet Compression</h2>

<p>Training Googlenet on CIFAR-100, and applying Deep Compression.</p>

In [31]:
import argparse
import math
import os
import re
import sys
import time
from itertools import islice

# `Iterable` lives in `collections.abc`; the alias in `collections` was removed in Python 3.10.
try:
    from collections.abc import Iterable
except ImportError:
    from collections import Iterable

import matplotlib.pyplot as plt
import numpy as np
import torch
from PIL import Image
from torch import nn
from torch import optim
from torch.autograd import Variable
from torch.optim.lr_scheduler import _LRScheduler
from torchvision import datasets,transforms, models

<h3>Hyperparameter Settings</h3>

In [47]:
batchsize = 128  # mini-batch size handed to the DataLoaders
l_r = 0.1  # presumably the SGD learning rate -- TODO confirm it is actually consumed downstream
w = 1  # NOTE(review): purpose not evident from the visible cells -- confirm or remove

<h3>Loading and preprocessing data:</h3>
<p>Let us calculate the mean and standard deviation of the dataset. We'll use this in the transforms later.</p>

In [33]:
def compute_mean_std(dataset):
    """
    Compute the per-channel mean and standard deviation of an image dataset.

    Pixel values are divided by 255 before the statistics are taken.

    :param dataset: iterable of (image, label) pairs; every image must convert through
                    ``np.asarray`` to an array of identical shape (H, W, C) with C >= 3
    :return:
        (mean, std), each a list of three Python floats for channels 0, 1 and 2
    :raises ValueError: if the dataset yields no images
    """
    # One pass over the dataset instead of one pass per channel.
    images = [np.asarray(im) for im, _ in dataset]
    if not images:
        raise ValueError("cannot compute mean/std of an empty dataset")

    # Shape (N, H, W, 3); the division promotes the data to float64.
    pixels = np.stack(images)[..., :3] / 255

    # Reduce over every axis except the channel axis.  `float()` replaces the
    # removed `np.asscalar` and yields plain Python floats.
    mean = [float(v) for v in pixels.mean(axis=(0, 1, 2))]
    std = [float(v) for v in pixels.std(axis=(0, 1, 2))]

    return mean, std

In [None]:
#cifar_norm_train=datasets.CIFAR100(data_dir, train=True, transform=None, target_transform=None, download=False)
#cifar_norm_test=datasets.CIFAR100(data_dir, train=False, transform=None, target_transform=None, download=False)
#train_mean,train_std = compute_mean_std(cifar_norm_train)
#test_mean, test_std = compute_mean_std(cifar_norm_test)
#print(train_mean,train_std)
#print(test_mean,test_std)

In [34]:
# Per-channel mean/std values cached as literals so that compute_mean_std()
# does not have to be re-run on every execution of the notebook.
train_mean,train_std = [0.5070751592371341, 0.48654887331495067, 0.4409178433670344],[0.2673342858792403, 0.2564384629170882, 0.27615047132568393]
test_mean, test_std = [0.508796412760417, 0.48739301317401906, 0.4419422112438727],[0.2682515741720801, 0.25736373644781246, 0.2770957707973041]

Padding with 4 zeros, taking 32 by 32 crops, random flipping and normalization by values found above.

In [35]:
# Training-time augmentation: zero-pad 4 px on every side, take a random 32x32
# crop of the padded image, flip horizontally at random, then convert to a
# tensor and normalise with the training-set statistics.
# RandomCrop keeps the pixel scale; RandomResizedCrop would additionally rescale
# a random sub-region, which is not what the notebook text describes.
train_transforms = transforms.Compose([
    transforms.Pad(4, fill=0),
    transforms.RandomCrop(32),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(train_mean, train_std),
])

# Evaluation uses no augmentation, only tensor conversion and normalisation.
# NOTE(review): normalisation here uses test-set statistics; many pipelines reuse
# the training statistics instead -- confirm which is intended.
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(test_mean, test_std),
])

In [36]:
# Location of the pre-downloaded CIFAR-100 archive (download=False below).
data_dir = '../input/cifar-100-python/'

# Build the train split first, then the test split, each with its own transform pipeline.
cifar_train, cifar_test = (
    datasets.CIFAR100(data_dir, train=is_train, transform=pipeline,
                      target_transform=None, download=False)
    for is_train, pipeline in ((True, train_transforms), (False, test_transforms))
)

In [37]:
# Both loaders use the same batch size and reshuffle on every pass.
trainloader, testloader = (
    torch.utils.data.DataLoader(split, batch_size=batchsize, shuffle=True)
    for split in (cifar_train, cifar_test)
)

<h3>Visualizing CIFAR</h3>
<p>Just to see what's actually in there.</p>

In [None]:
#for inputs,labels in testloader:
#    break
#images = inputs.numpy()
#for i in range(3):
#    plt.figure()
#    plt.imshow(images[i].T)

<h2>Utils</h2>

In [38]:
def iter_str_every(iterable, k):
    """
    Lazily cut an iterable of strings into joined pieces of at most k items.

    :param iterable: iterable yielding str items (e.g. a string, character by character)
    :param k: int, maximum number of items joined into one piece
    :return:
        generator of str; stops as soon as a joined piece would be empty
    """
    source = iter(iterable)

    def next_piece():
        return ''.join(islice(source, k))

    # iter(callable, sentinel) keeps calling until an empty piece comes back.
    yield from iter(next_piece, '')


def get_sparsity(param):
    """
    Fraction of entries in a tensor that are exactly zero.

    :param param: tensor to inspect
    :return:
        float, number of zero entries divided by the total number of entries
    """
    zero_count = (param == 0).sum()
    return float(zero_count) / param.numel()


class AverageMeter(object):
    """Keeps the most recent value together with a running sum, count and average."""

    def __init__(self):
        self.val = self.avg = self.sum = self.count = 0

    def reset(self):
        """Return every statistic to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def _add(self, total, n):
        # Shared bookkeeping: grow sum/count; the average is only refreshed
        # while the count is positive.
        self.sum += total
        self.count += n
        if self.count > 0:
            self.avg = self.sum / self.count

    def update(self, val, n=1):
        """Record `val` as the latest value, counted `n` times (adds val * n to the sum)."""
        self.val = val
        self._add(val * n, n)

    def accumulate(self, val, n=1):
        """Add an already-summed `val` covering `n` samples; the latest value is left untouched."""
        self._add(val, n)


class Logger(object):
    """
    Minimal text logger backed by a file opened in write mode.

    Can be used as a context manager so that the file is closed even when the
    body raises:

        with Logger(path) as log:
            log.write("message")
    """

    def __init__(self, file_path):
        """
        write log to file
        :param file_path: str, path to the file (truncated if it already exists)
        """
        self.f = open(file_path, 'w')
        self.fid = self.f.fileno()
        self.filepath = file_path

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never swallow an exception raised inside the `with` body.
        return False

    def close(self):
        """
        close log file
        :return:
            whatever the underlying file object's close() returns
        """
        return self.f.close()

    def flush(self):
        """
        flush Python's buffer and ask the OS to commit the file to disk
        :return:
            void
        """
        self.f.flush()
        os.fsync(self.fid)

    def write(self, content, wrap=True, flush=False, verbose=False):
        """
        write file and optionally flush buffer to the disk
        :param content: str
        :param wrap: bool, whether to add '\n' at the end of the content
        :param flush: bool, whether to flush buffer to the disk, default=False
        :param verbose: bool, whether to print the content, default=False
        :return:
            void
        """
        if verbose:
            print(content)
        if wrap:
            content += "\n"
        self.f.write(content)
        if flush:
            self.flush()


class StageScheduler(object):
    """Maps a global epoch index to a (stage, epoch-within-stage) pair."""

    def __init__(self, max_num_stage, stage_step=45):
        """
        :param max_num_stage: int, number of stages; raised to the number of
                              entries in 'stage_step' when that is larger
        :param stage_step: length (in epochs) of each stage, given as
                           int, the same length for every stage,
                           str, comma-separated ints such as '30,20,10', or
                           list/tuple of int (copied, the caller's object is not modified);
                           if fewer entries than 'max_num_stage' are given the
                           last one is repeated
        """
        if isinstance(stage_step, int):
            steps = [stage_step] * max_num_stage
        elif isinstance(stage_step, str):
            steps = [int(piece) for piece in stage_step.split(',')]
        else:
            assert isinstance(stage_step, (list, tuple))
            # Work on a copy: the in-place arithmetic below must not leak
            # back into the caller's sequence.
            steps = list(stage_step)

        # Pad with the last length so that every stage has one.
        missing = max_num_stage - len(steps)
        if missing > 0:
            steps.extend([steps[-1]] * missing)

        # Turn per-stage lengths into cumulative end epochs (exclusive).
        for i in range(1, len(steps)):
            steps[i] += steps[i - 1]

        self.max_num_stage = len(steps)
        self.stage_step = steps

    def step(self, epoch):
        """
        :param epoch: int, global epoch index
        :return:
            (stage, epoch), the stage the epoch falls in and the epoch counted
            from the start of that stage; epochs past the final boundary are
            assigned to the last stage
        """
        stage = self.max_num_stage - 1
        for i, max_epoch in enumerate(self.stage_step):
            if epoch < max_epoch:
                stage = i
                break
        if stage > 0:
            epoch -= self.stage_step[stage - 1]
        return stage, epoch

<h2>Pruning</h2>

In [53]:
def prune_vanilla_elementwise(param, sparsity, fn_importance=lambda x: x.abs()):
    """
    element-wise vanilla pruning; zeroes the least important entries of 'param' in place
    :param param: torch.(cuda.)Tensor, weight of conv/fc layer (modified in place)
    :param sparsity: float, pruning sparsity, clamped to [0, 1]
    :param fn_importance: function, inputs 'param' and returns the importance of
                                    each position in 'param',
                                    default=lambda x: x.abs()
    :return:
        torch.(cuda.)BoolTensor shaped like 'param', True where an entry was pruned.
        When the requested sparsity leaves no entry to keep (including
        sparsity == 1.0) every entry is pruned.
    """
    sparsity = min(max(0.0, sparsity), 1.0)
    num_el = param.numel()
    num_pruned = int(math.ceil(num_el * sparsity))
    num_stayed = num_el - num_pruned

    if num_stayed <= 0:
        # Nothing survives.  Handle it here: taking the minimum of an empty
        # top-k below would raise.
        param.zero_()
        return torch.ones(param.shape, dtype=torch.bool, device=param.device)

    importance = fn_importance(param)
    flat_importance = importance.reshape(-1)
    if sparsity <= 0.5:
        # Fewer entries are pruned than kept: select the pruned ones directly.
        _, pruned_indices = torch.topk(flat_importance, k=num_pruned,
                                       dim=0, largest=False, sorted=False)
        mask = torch.zeros(param.shape, dtype=torch.bool, device=param.device)
        mask.view(-1)[pruned_indices] = True
    else:
        # Fewer entries are kept than pruned: find the smallest surviving
        # importance and prune everything strictly below it (ties survive).
        thr = torch.min(torch.topk(flat_importance, k=num_stayed,
                                   dim=0, largest=True, sorted=False)[0])
        mask = torch.lt(importance, thr)
    param.masked_fill_(mask, 0)
    return mask


def prune_vanilla_kernelwise(param, sparsity, fn_importance=lambda x: x.norm(1, -1)):
    """
    kernel-wise vanilla pruning, the importance determined by L1 norm by default;
    zeroes whole kernels of 'param' in place
    :param param: torch.(cuda.)Tensor with at least 3 dims, weight of conv layer
                  (modified in place); a kernel is the slice param[i, j]
    :param sparsity: float, fraction of kernels to prune, clamped to [0, 1]
    :param fn_importance: function, inputs 'param' as size (param.size(0) * param.size(1), -1) and
                                    returns the importance of each kernel in 'param',
                                    default=lambda x: x.norm(1, -1)
    :return:
        torch.(cuda.)BoolTensor shaped like 'param', True where an entry was pruned
        (sparsity == 1.0 prunes every kernel)
    """
    assert param.dim() >= 3
    sparsity = min(max(0.0, sparsity), 1.0)
    num_kernels = param.size(0) * param.size(1)
    num_pruned = int(math.ceil(num_kernels * sparsity))

    # One importance score per kernel; reshape only reads, so it may copy freely.
    kernel_importance = fn_importance(param.reshape(num_kernels, -1))
    _, pruned_kernels = torch.topk(kernel_importance, k=num_pruned,
                                   dim=0, largest=False, sorted=False)

    # Boolean mask: in-place masked fills in current PyTorch reject byte masks.
    mask = torch.zeros(param.shape, dtype=torch.bool, device=param.device)
    mask.view(num_kernels, -1)[pruned_kernels] = True
    param.masked_fill_(mask, 0)
    return mask


def prune_vanilla_filterwise(sparsity, param, fn_importance=lambda x: x.norm(1, -1)):
    """
    filter-wise vanilla pruning, the importance determined by L1 norm by default;
    zeroes whole filters of 'param' in place

    NOTE: the positional order here is (sparsity, param), unlike the element-wise
    and kernel-wise variants; it is kept for compatibility, so prefer keyword
    arguments when calling.

    :param sparsity: float, fraction of filters to prune, clamped to [0, 1]
    :param param: torch.(cuda.)Tensor with at least 3 dims, weight of conv layer
                  (modified in place); a filter is the slice param[i]
    :param fn_importance: function, inputs 'param' as size (param.size(0), -1) and
                                returns the importance of each filter in 'param',
                                default=lambda x: x.norm(1, -1)
    :return:
        torch.(cuda.)BoolTensor shaped like 'param', True where an entry was pruned
        (sparsity == 1.0 prunes every filter)
    """
    assert param.dim() >= 3
    sparsity = min(max(0.0, sparsity), 1.0)
    num_filters = param.size(0)
    num_pruned = int(math.ceil(num_filters * sparsity))

    # One importance score per filter; reshape only reads, so it may copy freely.
    filter_importance = fn_importance(param.reshape(num_filters, -1))
    _, pruned_filters = torch.topk(filter_importance, k=num_pruned,
                                   dim=0, largest=False, sorted=False)

    # Boolean mask: in-place masked fills in current PyTorch reject byte masks.
    mask = torch.zeros(param.shape, dtype=torch.bool, device=param.device)
    mask.view(num_filters, -1)[pruned_filters] = True
    param.masked_fill_(mask, 0)
    return mask


class VanillaPruner(object):
    """
    Applies vanilla (magnitude / norm based) pruning to a model according to a
    list of rules, and remembers the resulting masks so they can be re-applied
    after every optimizer step.

    Internally every rule is a list
        [name_pattern, granularity, sparsities, fn_importance, prune_function]
    where 'name_pattern' is a regular expression that must match a parameter
    name completely.
    """

    # Importance functions that a rule may refer to by name.
    _IMPORTANCE_FNS = {
        'abs': lambda x: x.abs(),
        'l1norm': lambda x: x.norm(1, -1),
        'l2norm': lambda x: x.norm(2, -1),
    }

    def __init__(self, rule=None):
        """
        Pruner Class for Vanilla Pruning Method
        :param rule: str, path to the rule file, each line formats
                          'param_name granularity sparsity_stage_0,sparsity_stage_1,...'
                          (three whitespace-separated fields, no spaces inside the
                          sparsity list; lines with any other field count are ignored)
                     list of list/tuple, [(param_name(str), granularity(str),
                                      sparsity(float) or [sparsity_stage_0(float), sparsity_stage_1,],
                                      fn_importance(optional, str or function))]
                     'granularity': str, choose from ['element', 'kernel', 'filter']
                     'fn_importance': str, choose from ['abs', 'l1norm', 'l2norm']
                     The entries are copied; the caller's objects are not modified.
        """
        if rule:
            if isinstance(rule, str):
                rule = self._read_rule_file(rule)
            rule = [self._normalize_rule(r) for r in rule]

        self.rule = rule

        self.masks = dict()

        print("=" * 89)
        if self.rule:
            print("Initializing Vanilla Pruner with rules:")
            for r in self.rule:
                print(r[:-1])
        else:
            print("Initializing Vanilla Pruner WITHOUT rules")
        print("=" * 89)

    @staticmethod
    def _read_rule_file(path):
        """Parse a rule file into [name, granularity, [sparsity, ...]] lists."""
        with open(path) as rule_file:
            fields = [line.split() for line in rule_file]
        return [[x[0], x[1], [float(s) for s in x[2].split(',')]]
                for x in fields if len(x) == 3]

    @staticmethod
    def _prune_fn_for(granularity):
        """Return the module-level pruning function implementing 'granularity'."""
        # Looked up at call time so the class itself can be defined before
        # the pruning functions are.
        if granularity == 'element':
            return prune_vanilla_elementwise
        if granularity == 'kernel':
            return prune_vanilla_kernelwise
        if granularity == 'filter':
            return prune_vanilla_filterwise
        raise NotImplementedError("unknown pruning granularity: {!r}".format(granularity))

    def _normalize_rule(self, r):
        """Copy one user rule into the internal 5-field list form."""
        r = list(r)  # tuples are accepted; lists are copied rather than mutated
        if not isinstance(r[2], Iterable):
            assert isinstance(r[2], float) or isinstance(r[2], int)
            r[2] = [float(r[2])]
        if len(r) == 3:
            r.append('default')
        r.append(self._prune_fn_for(r[1]))
        return r

    def _match_rule(self, param_name):
        """Return the first rule whose pattern matches all of 'param_name', else None."""
        for r in self.rule or ():
            if re.fullmatch(r[0], param_name) is not None:
                return r
        return None

    def load_state_dict(self, state_dict, replace_rule=True):
        """
        Recover Pruner
        :param state_dict: dict, a dictionary containing a whole state of the Pruner
                           (as produced by state_dict()); it is not modified
        :param replace_rule: bool, whether to use rule settings in 'state_dict'
        :return:
            void
        """
        if replace_rule:
            saved_rule = state_dict['rule']
            if saved_rule is None:
                self.rule = None
            else:
                # Saved rules lack the prune function; re-attach it on a copy.
                self.rule = [list(r) + [self._prune_fn_for(r[1])] for r in saved_rule]
        self.masks = state_dict['masks']
        print("=" * 89)
        print("Customizing Vanilla Pruner with rules:")
        for r in self.rule or ():
            print(r[:-1])
        print("=" * 89)

    def state_dict(self):
        """
        Returns a dictionary containing a whole state of the Pruner
        :return: dict with keys 'rule' (rules without their prune function,
                 or None when the pruner has no rules) and 'masks'
        """
        state_dict = dict()
        state_dict['rule'] = None if self.rule is None else [r[:-1] for r in self.rule]
        state_dict['masks'] = self.masks
        return state_dict

    def prune_param(self, param, param_name, stage=0, verbose=False):
        """
        prune parameter
        :param param: torch.(cuda.)tensor, pruned in place when a rule matches
        :param param_name: str, name of param
        :param stage: int, the pruning stage, default=0; indexes the rule's sparsity list
        :param verbose: bool, whether to print the pruning details
        :return:
            mask returned by the rule's prune function, or None when no rule matches
        """
        rule = self._match_rule(param_name)
        if rule is None:
            if verbose:
                print("{param_name:^30} | skipping".format(param_name=param_name))
            return None

        sparsity = rule[2][stage]
        fn_prune = rule[-1]
        fn_importance = rule[3]
        if verbose:
            print("{param_name:^30} | {stage:5d} | {spars:.3f}".
                  format(param_name=param_name, stage=stage, spars=sparsity))
        if fn_importance is None or fn_importance == 'default':
            # Let the prune function use its own default importance.
            return fn_prune(param=param, sparsity=sparsity)
        if isinstance(fn_importance, str):
            # Known names map to callables; anything else is passed through unchanged.
            fn_importance = self._IMPORTANCE_FNS.get(fn_importance, fn_importance)
        return fn_prune(param=param, sparsity=sparsity, fn_importance=fn_importance)

    def prune(self, model, stage=0, update_masks=False, verbose=False):
        """
        prune models
        :param model: torch.nn.Module
        :param stage: int, the pruning stage, default=0
        :param update_masks: bool, whether update masks; forced on while no mask
                             has been stored yet
        :param verbose: bool, whether to print the pruning details
        :return:
            void
        """
        no_masks_yet = len(self.masks) == 0
        update_masks = bool(update_masks) or no_masks_yet
        if verbose:
            print("=" * 89)
            print("Pruning Models")
            if no_masks_yet:
                print("Initializing Masks")
            elif update_masks:
                print("Updating Masks")
            print("=" * 89)
            print("{name:^30} | stage | sparsity".format(name='param_name'))
        for param_name, param in model.named_parameters():
            # deal with googlenet
            # NOTE(review): only names containing 'AuxLogits' are excluded --
            # confirm this matches how the model in use names its auxiliary heads.
            if 'AuxLogits' in param_name:
                continue
            # Parameters with a single dimension are never pruned.
            if param.dim() <= 1:
                continue
            if update_masks:
                mask = self.prune_param(param=param.data, param_name=param_name,
                                        stage=stage, verbose=verbose)
                if mask is not None:
                    self.masks[param_name] = mask
            elif param_name in self.masks:
                # .bool() keeps stored byte masks usable with masked_fill_.
                param.data.masked_fill_(self.masks[param_name].bool(), 0)
        if verbose:
            print("=" * 89)
            print("=" * 89)

<h3>Googlenet</h3>

In [40]:
# CIFAR-100 has 100 classes; the torchvision default head would emit 1000 logits.
net = models.googlenet(num_classes=100)

In [41]:
# Best accuracy seen so far (nothing evaluated yet).
best_prec1 = 0
# Wrap the model for data-parallel execution on device 0 and move it to the GPU.
net = torch.nn.DataParallel(net, device_ids=[0])
net = net.cuda()

In [42]:
 # Cross-entropy loss over the class logits, moved to the GPU.
 criterion = nn.CrossEntropyLoss().cuda()

In [55]:
# SGD with momentum and weight decay; the learning rate comes from the
# hyperparameter cell (l_r) instead of being repeated here as a literal.
optimizer = optim.SGD(net.parameters(), lr=l_r, momentum=0.9, weight_decay=5e-4)
# No rules are given, so the pruner leaves every parameter untouched.
pruner = VanillaPruner()
# NOTE(review): not referenced by the visible cells -- confirm it is still needed.
input_size = 224

Initializing Vanilla Pruner WITHOUT rules


In [79]:
def validate(val_loader, model, criterion, epoch):
    """
    Evaluate 'model' on every batch of 'val_loader' and print loss / accuracy.

    :param val_loader: iterable of (inputs, target) batches
    :param model: torch.nn.Module; may return either a logits tensor or an
                  object exposing a '.logits' attribute
    :param criterion: loss function called as criterion(logits, target);
                      assumed to return the mean loss over the batch
    :param epoch: int, accepted for interface compatibility; not used
    :return:
        float, top-1 accuracy in [0, 1] over all samples seen (0.0 for an empty loader)
    """
    # switch to evaluate mode
    model.eval()

    # Run on whatever device the model lives on instead of assuming CUDA.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device('cpu')

    print("=" * 89)

    loss_sum = 0.0
    num_correct = 0
    num_seen = 0
    with torch.no_grad():
        for inputs, target in val_loader:
            inputs = inputs.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)

            # compute output
            output = model(inputs)
            logits = getattr(output, 'logits', output)
            loss = criterion(logits, target)

            # measure accuracy and record loss (weighted by batch size so the
            # final average is per sample, not per batch)
            batch_size = target.size(0)
            loss_sum += loss.item() * batch_size
            num_correct += logits.max(1)[1].eq(target).sum().item()
            num_seen += batch_size

    denominator = max(num_seen, 1)  # guards against an empty loader
    accuracy = num_correct / denominator
    print("=" * 89)
    print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(
        loss_sum / denominator,
        accuracy
    ))
    return accuracy

In [81]:
def train(train_loader, model, criterion, optimizer, pruner, epoch, stage, print_freq=1):
    """
    Train 'model' for one pass over 'train_loader', re-applying the pruning
    masks after every optimizer step.

    :param train_loader: iterable of (inputs, target) batches exposing '.dataset'
    :param model: torch.nn.Module; may return either a logits tensor or an
                  object exposing a '.logits' attribute
    :param criterion: loss function called as criterion(logits, target)
    :param optimizer: torch optimizer over the model's parameters
    :param pruner: object with a prune(model=..., stage=..., update_masks=...) method
    :param epoch: int, epoch index, used for the progress message only
    :param stage: int, pruning stage forwarded to the pruner
    :param print_freq: int, print progress every 'print_freq' batches
                       (default 1: every batch; 0 disables printing)
    :return:
        void
    """
    # switch to train mode
    model.train()

    # Run on whatever device the model lives on instead of assuming CUDA.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device('cpu')

    print("=" * 89)

    total_samples = len(train_loader.dataset)
    trained_samples = 0  # running count, independent of the loader's batch size
    for batch_index, (inputs, target) in enumerate(train_loader):
        inputs = inputs.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        # compute output
        output = model(inputs)
        logits = getattr(output, 'logits', output)
        loss = criterion(logits, target)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # pruning: the step above may have moved pruned weights away from zero
        pruner.prune(model=model, stage=stage, update_masks=False)

        trained_samples += len(inputs)
        if print_freq and batch_index % print_freq == 0:
            print('Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format(
                loss.item(),
                optimizer.param_groups[0]['lr'],
                epoch=epoch,
                trained_samples=trained_samples,
                total_samples=total_samples
            ))

In [83]:
# Learning-rate schedule, currently disabled:
#train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[60, 120, 160], gamma=0.2)
# Epoch range for this run: epochs prev_epoch .. max_epoch - 1.
prev_epoch = 10
max_epoch = 20
# Pruning stage forwarded to the pruner for every epoch of this run.
stage = 0
start = time.time()
for epoch in range(prev_epoch,max_epoch):
    # NOTE(review): with prev_epoch = 10 this branch never executes, so the
    # explicit initial mask computation is skipped -- confirm this is intended.
    if epoch == 0:
        pruner.prune(model=net, stage=stage, update_masks=True)
    train(train_loader=trainloader, model=net, criterion=criterion, optimizer=optimizer,
              pruner=pruner, epoch=epoch,stage=stage)
end = time.time()
# Wall-clock duration of the loop above, in seconds.
print('Finished in',end-start)

Training Epoch: 10 [128/50000]	Loss: 3.6612	LR: 0.100000
Training Epoch: 10 [256/50000]	Loss: 3.5402	LR: 0.100000
Training Epoch: 10 [384/50000]	Loss: 3.8542	LR: 0.100000
Training Epoch: 10 [512/50000]	Loss: 3.6381	LR: 0.100000
Training Epoch: 10 [640/50000]	Loss: 3.5999	LR: 0.100000
Training Epoch: 10 [768/50000]	Loss: 3.6518	LR: 0.100000
Training Epoch: 10 [896/50000]	Loss: 3.6314	LR: 0.100000
Training Epoch: 10 [1024/50000]	Loss: 3.7106	LR: 0.100000
Training Epoch: 10 [1152/50000]	Loss: 3.6113	LR: 0.100000
Training Epoch: 10 [1280/50000]	Loss: 3.4598	LR: 0.100000
Training Epoch: 10 [1408/50000]	Loss: 3.5032	LR: 0.100000
Training Epoch: 10 [1536/50000]	Loss: 3.4323	LR: 0.100000
Training Epoch: 10 [1664/50000]	Loss: 3.7716	LR: 0.100000
Training Epoch: 10 [1792/50000]	Loss: 3.5260	LR: 0.100000
Training Epoch: 10 [1920/50000]	Loss: 3.5874	LR: 0.100000
Training Epoch: 10 [2048/50000]	Loss: 3.6199	LR: 0.100000
Training Epoch: 10 [2176/50000]	Loss: 3.4904	LR: 0.100000
Training Epoch: 10 [2

Training Epoch: 10 [17920/50000]	Loss: 3.5132	LR: 0.100000
Training Epoch: 10 [18048/50000]	Loss: 3.7002	LR: 0.100000
Training Epoch: 10 [18176/50000]	Loss: 3.6346	LR: 0.100000
Training Epoch: 10 [18304/50000]	Loss: 3.5236	LR: 0.100000
Training Epoch: 10 [18432/50000]	Loss: 3.6210	LR: 0.100000
Training Epoch: 10 [18560/50000]	Loss: 3.7660	LR: 0.100000
Training Epoch: 10 [18688/50000]	Loss: 3.5301	LR: 0.100000
Training Epoch: 10 [18816/50000]	Loss: 3.5936	LR: 0.100000
Training Epoch: 10 [18944/50000]	Loss: 3.8357	LR: 0.100000
Training Epoch: 10 [19072/50000]	Loss: 3.6361	LR: 0.100000
Training Epoch: 10 [19200/50000]	Loss: 3.4482	LR: 0.100000
Training Epoch: 10 [19328/50000]	Loss: 3.7320	LR: 0.100000
Training Epoch: 10 [19456/50000]	Loss: 3.7101	LR: 0.100000
Training Epoch: 10 [19584/50000]	Loss: 3.6969	LR: 0.100000
Training Epoch: 10 [19712/50000]	Loss: 3.6388	LR: 0.100000
Training Epoch: 10 [19840/50000]	Loss: 3.7326	LR: 0.100000
Training Epoch: 10 [19968/50000]	Loss: 3.5642	LR: 0.1000

Training Epoch: 10 [35968/50000]	Loss: 3.8116	LR: 0.100000
Training Epoch: 10 [36096/50000]	Loss: 3.4502	LR: 0.100000
Training Epoch: 10 [36224/50000]	Loss: 3.6222	LR: 0.100000
Training Epoch: 10 [36352/50000]	Loss: 3.6648	LR: 0.100000
Training Epoch: 10 [36480/50000]	Loss: 3.7872	LR: 0.100000
Training Epoch: 10 [36608/50000]	Loss: 3.5892	LR: 0.100000
Training Epoch: 10 [36736/50000]	Loss: 3.7615	LR: 0.100000
Training Epoch: 10 [36864/50000]	Loss: 3.5437	LR: 0.100000
Training Epoch: 10 [36992/50000]	Loss: 3.7191	LR: 0.100000
Training Epoch: 10 [37120/50000]	Loss: 3.5297	LR: 0.100000
Training Epoch: 10 [37248/50000]	Loss: 3.4678	LR: 0.100000
Training Epoch: 10 [37376/50000]	Loss: 3.4733	LR: 0.100000
Training Epoch: 10 [37504/50000]	Loss: 3.6786	LR: 0.100000
Training Epoch: 10 [37632/50000]	Loss: 3.6427	LR: 0.100000
Training Epoch: 10 [37760/50000]	Loss: 3.5790	LR: 0.100000
Training Epoch: 10 [37888/50000]	Loss: 3.7788	LR: 0.100000
Training Epoch: 10 [38016/50000]	Loss: 3.6915	LR: 0.1000

Training Epoch: 11 [3584/50000]	Loss: 3.6921	LR: 0.100000
Training Epoch: 11 [3712/50000]	Loss: 3.6447	LR: 0.100000
Training Epoch: 11 [3840/50000]	Loss: 3.6057	LR: 0.100000
Training Epoch: 11 [3968/50000]	Loss: 3.5995	LR: 0.100000
Training Epoch: 11 [4096/50000]	Loss: 3.5663	LR: 0.100000
Training Epoch: 11 [4224/50000]	Loss: 3.6077	LR: 0.100000
Training Epoch: 11 [4352/50000]	Loss: 3.5401	LR: 0.100000
Training Epoch: 11 [4480/50000]	Loss: 3.3848	LR: 0.100000
Training Epoch: 11 [4608/50000]	Loss: 3.4720	LR: 0.100000
Training Epoch: 11 [4736/50000]	Loss: 3.6819	LR: 0.100000
Training Epoch: 11 [4864/50000]	Loss: 3.6387	LR: 0.100000
Training Epoch: 11 [4992/50000]	Loss: 3.4576	LR: 0.100000
Training Epoch: 11 [5120/50000]	Loss: 3.6641	LR: 0.100000
Training Epoch: 11 [5248/50000]	Loss: 3.4544	LR: 0.100000
Training Epoch: 11 [5376/50000]	Loss: 3.6846	LR: 0.100000
Training Epoch: 11 [5504/50000]	Loss: 3.4869	LR: 0.100000
Training Epoch: 11 [5632/50000]	Loss: 3.6785	LR: 0.100000
Training Epoch

Training Epoch: 11 [21632/50000]	Loss: 3.6293	LR: 0.100000
Training Epoch: 11 [21760/50000]	Loss: 3.4215	LR: 0.100000
Training Epoch: 11 [21888/50000]	Loss: 3.4900	LR: 0.100000
Training Epoch: 11 [22016/50000]	Loss: 3.5548	LR: 0.100000
Training Epoch: 11 [22144/50000]	Loss: 3.6513	LR: 0.100000
Training Epoch: 11 [22272/50000]	Loss: 3.4390	LR: 0.100000
Training Epoch: 11 [22400/50000]	Loss: 3.7226	LR: 0.100000
Training Epoch: 11 [22528/50000]	Loss: 3.6482	LR: 0.100000
Training Epoch: 11 [22656/50000]	Loss: 3.6195	LR: 0.100000
Training Epoch: 11 [22784/50000]	Loss: 3.4092	LR: 0.100000
Training Epoch: 11 [22912/50000]	Loss: 3.6089	LR: 0.100000
Training Epoch: 11 [23040/50000]	Loss: 3.3694	LR: 0.100000
Training Epoch: 11 [23168/50000]	Loss: 3.7022	LR: 0.100000
Training Epoch: 11 [23296/50000]	Loss: 3.5209	LR: 0.100000
Training Epoch: 11 [23424/50000]	Loss: 3.7107	LR: 0.100000
Training Epoch: 11 [23552/50000]	Loss: 3.6250	LR: 0.100000
Training Epoch: 11 [23680/50000]	Loss: 3.8485	LR: 0.1000

Training Epoch: 11 [39680/50000]	Loss: 3.7741	LR: 0.100000
Training Epoch: 11 [39808/50000]	Loss: 3.6564	LR: 0.100000
Training Epoch: 11 [39936/50000]	Loss: 3.6421	LR: 0.100000
Training Epoch: 11 [40064/50000]	Loss: 3.3350	LR: 0.100000
Training Epoch: 11 [40192/50000]	Loss: 3.3188	LR: 0.100000
Training Epoch: 11 [40320/50000]	Loss: 3.4138	LR: 0.100000
Training Epoch: 11 [40448/50000]	Loss: 3.4784	LR: 0.100000
Training Epoch: 11 [40576/50000]	Loss: 3.6864	LR: 0.100000
Training Epoch: 11 [40704/50000]	Loss: 3.2788	LR: 0.100000
Training Epoch: 11 [40832/50000]	Loss: 3.3089	LR: 0.100000
Training Epoch: 11 [40960/50000]	Loss: 3.5313	LR: 0.100000
Training Epoch: 11 [41088/50000]	Loss: 3.5541	LR: 0.100000
Training Epoch: 11 [41216/50000]	Loss: 3.7470	LR: 0.100000
Training Epoch: 11 [41344/50000]	Loss: 3.4345	LR: 0.100000
Training Epoch: 11 [41472/50000]	Loss: 3.5608	LR: 0.100000
Training Epoch: 11 [41600/50000]	Loss: 3.6093	LR: 0.100000
Training Epoch: 11 [41728/50000]	Loss: 3.7978	LR: 0.1000

Training Epoch: 12 [7680/50000]	Loss: 3.6682	LR: 0.100000
Training Epoch: 12 [7808/50000]	Loss: 3.5102	LR: 0.100000
Training Epoch: 12 [7936/50000]	Loss: 3.6902	LR: 0.100000
Training Epoch: 12 [8064/50000]	Loss: 3.4903	LR: 0.100000
Training Epoch: 12 [8192/50000]	Loss: 3.4816	LR: 0.100000
Training Epoch: 12 [8320/50000]	Loss: 3.2170	LR: 0.100000
Training Epoch: 12 [8448/50000]	Loss: 3.3582	LR: 0.100000
Training Epoch: 12 [8576/50000]	Loss: 3.4668	LR: 0.100000
Training Epoch: 12 [8704/50000]	Loss: 3.4042	LR: 0.100000
Training Epoch: 12 [8832/50000]	Loss: 3.5927	LR: 0.100000
Training Epoch: 12 [8960/50000]	Loss: 3.6088	LR: 0.100000
Training Epoch: 12 [9088/50000]	Loss: 3.5000	LR: 0.100000
Training Epoch: 12 [9216/50000]	Loss: 3.4415	LR: 0.100000
Training Epoch: 12 [9344/50000]	Loss: 3.7633	LR: 0.100000
Training Epoch: 12 [9472/50000]	Loss: 3.6426	LR: 0.100000
Training Epoch: 12 [9600/50000]	Loss: 3.7827	LR: 0.100000
Training Epoch: 12 [9728/50000]	Loss: 3.5159	LR: 0.100000
Training Epoch

Training Epoch: 12 [25728/50000]	Loss: 3.7068	LR: 0.100000
Training Epoch: 12 [25856/50000]	Loss: 3.6580	LR: 0.100000
Training Epoch: 12 [25984/50000]	Loss: 3.3667	LR: 0.100000
Training Epoch: 12 [26112/50000]	Loss: 3.7266	LR: 0.100000
Training Epoch: 12 [26240/50000]	Loss: 3.5036	LR: 0.100000
Training Epoch: 12 [26368/50000]	Loss: 3.5411	LR: 0.100000
Training Epoch: 12 [26496/50000]	Loss: 3.3991	LR: 0.100000
Training Epoch: 12 [26624/50000]	Loss: 3.4280	LR: 0.100000
Training Epoch: 12 [26752/50000]	Loss: 3.6187	LR: 0.100000
Training Epoch: 12 [26880/50000]	Loss: 3.4627	LR: 0.100000
Training Epoch: 12 [27008/50000]	Loss: 3.5445	LR: 0.100000
Training Epoch: 12 [27136/50000]	Loss: 3.5799	LR: 0.100000
Training Epoch: 12 [27264/50000]	Loss: 3.7497	LR: 0.100000
Training Epoch: 12 [27392/50000]	Loss: 3.4250	LR: 0.100000
Training Epoch: 12 [27520/50000]	Loss: 3.4937	LR: 0.100000
Training Epoch: 12 [27648/50000]	Loss: 3.7339	LR: 0.100000
Training Epoch: 12 [27776/50000]	Loss: 3.6536	LR: 0.1000

Training Epoch: 12 [43776/50000]	Loss: 3.5965	LR: 0.100000
Training Epoch: 12 [43904/50000]	Loss: 3.5365	LR: 0.100000
Training Epoch: 12 [44032/50000]	Loss: 3.4752	LR: 0.100000
Training Epoch: 12 [44160/50000]	Loss: 3.5127	LR: 0.100000
Training Epoch: 12 [44288/50000]	Loss: 3.3791	LR: 0.100000
Training Epoch: 12 [44416/50000]	Loss: 3.6054	LR: 0.100000
Training Epoch: 12 [44544/50000]	Loss: 3.4882	LR: 0.100000
Training Epoch: 12 [44672/50000]	Loss: 3.3588	LR: 0.100000
Training Epoch: 12 [44800/50000]	Loss: 3.6617	LR: 0.100000
Training Epoch: 12 [44928/50000]	Loss: 3.4745	LR: 0.100000
Training Epoch: 12 [45056/50000]	Loss: 3.5018	LR: 0.100000
Training Epoch: 12 [45184/50000]	Loss: 3.4329	LR: 0.100000
Training Epoch: 12 [45312/50000]	Loss: 3.6024	LR: 0.100000
Training Epoch: 12 [45440/50000]	Loss: 3.6248	LR: 0.100000
Training Epoch: 12 [45568/50000]	Loss: 3.5768	LR: 0.100000
Training Epoch: 12 [45696/50000]	Loss: 3.6293	LR: 0.100000
Training Epoch: 12 [45824/50000]	Loss: 3.4609	LR: 0.1000

Training Epoch: 13 [11776/50000]	Loss: 3.3824	LR: 0.100000
Training Epoch: 13 [11904/50000]	Loss: 3.1884	LR: 0.100000
Training Epoch: 13 [12032/50000]	Loss: 3.5400	LR: 0.100000
Training Epoch: 13 [12160/50000]	Loss: 3.7381	LR: 0.100000
Training Epoch: 13 [12288/50000]	Loss: 3.4872	LR: 0.100000
Training Epoch: 13 [12416/50000]	Loss: 3.7589	LR: 0.100000
Training Epoch: 13 [12544/50000]	Loss: 3.4942	LR: 0.100000
Training Epoch: 13 [12672/50000]	Loss: 3.4954	LR: 0.100000
Training Epoch: 13 [12800/50000]	Loss: 3.6859	LR: 0.100000
Training Epoch: 13 [12928/50000]	Loss: 3.5415	LR: 0.100000
Training Epoch: 13 [13056/50000]	Loss: 3.4772	LR: 0.100000
Training Epoch: 13 [13184/50000]	Loss: 3.3909	LR: 0.100000
Training Epoch: 13 [13312/50000]	Loss: 3.5324	LR: 0.100000
Training Epoch: 13 [13440/50000]	Loss: 3.6173	LR: 0.100000
Training Epoch: 13 [13568/50000]	Loss: 3.7959	LR: 0.100000
Training Epoch: 13 [13696/50000]	Loss: 3.4388	LR: 0.100000
Training Epoch: 13 [13824/50000]	Loss: 3.5382	LR: 0.1000

Training Epoch: 13 [29824/50000]	Loss: 3.5845	LR: 0.100000
Training Epoch: 13 [29952/50000]	Loss: 3.5926	LR: 0.100000
Training Epoch: 13 [30080/50000]	Loss: 3.3141	LR: 0.100000
Training Epoch: 13 [30208/50000]	Loss: 3.4355	LR: 0.100000
Training Epoch: 13 [30336/50000]	Loss: 3.2314	LR: 0.100000
Training Epoch: 13 [30464/50000]	Loss: 3.2084	LR: 0.100000
Training Epoch: 13 [30592/50000]	Loss: 3.6026	LR: 0.100000
Training Epoch: 13 [30720/50000]	Loss: 3.4057	LR: 0.100000
Training Epoch: 13 [30848/50000]	Loss: 3.2256	LR: 0.100000
Training Epoch: 13 [30976/50000]	Loss: 3.6487	LR: 0.100000
Training Epoch: 13 [31104/50000]	Loss: 3.5003	LR: 0.100000
Training Epoch: 13 [31232/50000]	Loss: 3.2426	LR: 0.100000
Training Epoch: 13 [31360/50000]	Loss: 3.5086	LR: 0.100000
Training Epoch: 13 [31488/50000]	Loss: 3.4410	LR: 0.100000
Training Epoch: 13 [31616/50000]	Loss: 3.4050	LR: 0.100000
Training Epoch: 13 [31744/50000]	Loss: 3.5056	LR: 0.100000
Training Epoch: 13 [31872/50000]	Loss: 3.5155	LR: 0.1000

Training Epoch: 13 [47872/50000]	Loss: 3.5209	LR: 0.100000
Training Epoch: 13 [48000/50000]	Loss: 3.6114	LR: 0.100000
Training Epoch: 13 [48128/50000]	Loss: 3.3307	LR: 0.100000
Training Epoch: 13 [48256/50000]	Loss: 3.3782	LR: 0.100000
Training Epoch: 13 [48384/50000]	Loss: 3.4647	LR: 0.100000
Training Epoch: 13 [48512/50000]	Loss: 3.4387	LR: 0.100000
Training Epoch: 13 [48640/50000]	Loss: 3.4961	LR: 0.100000
Training Epoch: 13 [48768/50000]	Loss: 3.4569	LR: 0.100000
Training Epoch: 13 [48896/50000]	Loss: 3.4215	LR: 0.100000
Training Epoch: 13 [49024/50000]	Loss: 3.6030	LR: 0.100000
Training Epoch: 13 [49152/50000]	Loss: 3.2908	LR: 0.100000
Training Epoch: 13 [49280/50000]	Loss: 3.3504	LR: 0.100000
Training Epoch: 13 [49408/50000]	Loss: 3.6025	LR: 0.100000
Training Epoch: 13 [49536/50000]	Loss: 3.6114	LR: 0.100000
Training Epoch: 13 [49664/50000]	Loss: 3.6573	LR: 0.100000
Training Epoch: 13 [49792/50000]	Loss: 3.6284	LR: 0.100000
Training Epoch: 13 [49920/50000]	Loss: 3.6072	LR: 0.1000

Training Epoch: 14 [15872/50000]	Loss: 3.5961	LR: 0.100000
Training Epoch: 14 [16000/50000]	Loss: 3.6425	LR: 0.100000
Training Epoch: 14 [16128/50000]	Loss: 3.3873	LR: 0.100000
Training Epoch: 14 [16256/50000]	Loss: 3.4575	LR: 0.100000
Training Epoch: 14 [16384/50000]	Loss: 3.4420	LR: 0.100000
Training Epoch: 14 [16512/50000]	Loss: 3.7174	LR: 0.100000
Training Epoch: 14 [16640/50000]	Loss: 3.6880	LR: 0.100000
Training Epoch: 14 [16768/50000]	Loss: 3.6021	LR: 0.100000
Training Epoch: 14 [16896/50000]	Loss: 3.5752	LR: 0.100000
Training Epoch: 14 [17024/50000]	Loss: 3.4645	LR: 0.100000
Training Epoch: 14 [17152/50000]	Loss: 3.6236	LR: 0.100000
Training Epoch: 14 [17280/50000]	Loss: 3.4584	LR: 0.100000
Training Epoch: 14 [17408/50000]	Loss: 3.4095	LR: 0.100000
Training Epoch: 14 [17536/50000]	Loss: 3.6177	LR: 0.100000
Training Epoch: 14 [17664/50000]	Loss: 3.3543	LR: 0.100000
Training Epoch: 14 [17792/50000]	Loss: 3.4773	LR: 0.100000
Training Epoch: 14 [17920/50000]	Loss: 3.5113	LR: 0.1000

Training Epoch: 14 [33920/50000]	Loss: 3.6707	LR: 0.100000
Training Epoch: 14 [34048/50000]	Loss: 3.2900	LR: 0.100000
Training Epoch: 14 [34176/50000]	Loss: 3.4312	LR: 0.100000
Training Epoch: 14 [34304/50000]	Loss: 3.4664	LR: 0.100000
Training Epoch: 14 [34432/50000]	Loss: 3.5843	LR: 0.100000
Training Epoch: 14 [34560/50000]	Loss: 3.4711	LR: 0.100000
Training Epoch: 14 [34688/50000]	Loss: 3.7333	LR: 0.100000
Training Epoch: 14 [34816/50000]	Loss: 3.5017	LR: 0.100000
Training Epoch: 14 [34944/50000]	Loss: 3.5450	LR: 0.100000
Training Epoch: 14 [35072/50000]	Loss: 3.4722	LR: 0.100000
Training Epoch: 14 [35200/50000]	Loss: 3.5276	LR: 0.100000
Training Epoch: 14 [35328/50000]	Loss: 3.3836	LR: 0.100000
Training Epoch: 14 [35456/50000]	Loss: 3.3847	LR: 0.100000
Training Epoch: 14 [35584/50000]	Loss: 3.1934	LR: 0.100000
Training Epoch: 14 [35712/50000]	Loss: 3.4996	LR: 0.100000
Training Epoch: 14 [35840/50000]	Loss: 3.4674	LR: 0.100000
Training Epoch: 14 [35968/50000]	Loss: 3.3553	LR: 0.1000

Training Epoch: 15 [1664/50000]	Loss: 3.4623	LR: 0.100000
Training Epoch: 15 [1792/50000]	Loss: 3.6385	LR: 0.100000
Training Epoch: 15 [1920/50000]	Loss: 3.5614	LR: 0.100000
Training Epoch: 15 [2048/50000]	Loss: 3.3157	LR: 0.100000
Training Epoch: 15 [2176/50000]	Loss: 3.3473	LR: 0.100000
Training Epoch: 15 [2304/50000]	Loss: 3.5927	LR: 0.100000
Training Epoch: 15 [2432/50000]	Loss: 3.2505	LR: 0.100000
Training Epoch: 15 [2560/50000]	Loss: 3.7867	LR: 0.100000
Training Epoch: 15 [2688/50000]	Loss: 3.4634	LR: 0.100000
Training Epoch: 15 [2816/50000]	Loss: 3.5310	LR: 0.100000
Training Epoch: 15 [2944/50000]	Loss: 3.6192	LR: 0.100000
Training Epoch: 15 [3072/50000]	Loss: 3.3174	LR: 0.100000
Training Epoch: 15 [3200/50000]	Loss: 3.4534	LR: 0.100000
Training Epoch: 15 [3328/50000]	Loss: 3.2214	LR: 0.100000
Training Epoch: 15 [3456/50000]	Loss: 3.4288	LR: 0.100000
Training Epoch: 15 [3584/50000]	Loss: 3.3262	LR: 0.100000
Training Epoch: 15 [3712/50000]	Loss: 3.6206	LR: 0.100000
Training Epoch

Training Epoch: 15 [19712/50000]	Loss: 3.2693	LR: 0.100000
Training Epoch: 15 [19840/50000]	Loss: 3.4837	LR: 0.100000
Training Epoch: 15 [19968/50000]	Loss: 3.4730	LR: 0.100000
Training Epoch: 15 [20096/50000]	Loss: 3.6108	LR: 0.100000
Training Epoch: 15 [20224/50000]	Loss: 3.3932	LR: 0.100000
Training Epoch: 15 [20352/50000]	Loss: 3.5080	LR: 0.100000
Training Epoch: 15 [20480/50000]	Loss: 3.4961	LR: 0.100000
Training Epoch: 15 [20608/50000]	Loss: 3.5388	LR: 0.100000
Training Epoch: 15 [20736/50000]	Loss: 3.5080	LR: 0.100000
Training Epoch: 15 [20864/50000]	Loss: 3.3471	LR: 0.100000
Training Epoch: 15 [20992/50000]	Loss: 3.4762	LR: 0.100000
Training Epoch: 15 [21120/50000]	Loss: 3.6620	LR: 0.100000
Training Epoch: 15 [21248/50000]	Loss: 3.4507	LR: 0.100000
Training Epoch: 15 [21376/50000]	Loss: 3.5331	LR: 0.100000
Training Epoch: 15 [21504/50000]	Loss: 3.3804	LR: 0.100000
Training Epoch: 15 [21632/50000]	Loss: 3.5233	LR: 0.100000
Training Epoch: 15 [21760/50000]	Loss: 3.5669	LR: 0.1000

Training Epoch: 15 [37760/50000]	Loss: 3.3611	LR: 0.100000
Training Epoch: 15 [37888/50000]	Loss: 3.4236	LR: 0.100000
Training Epoch: 15 [38016/50000]	Loss: 3.4685	LR: 0.100000
Training Epoch: 15 [38144/50000]	Loss: 3.3788	LR: 0.100000
Training Epoch: 15 [38272/50000]	Loss: 3.3357	LR: 0.100000
Training Epoch: 15 [38400/50000]	Loss: 3.6473	LR: 0.100000
Training Epoch: 15 [38528/50000]	Loss: 3.2958	LR: 0.100000
Training Epoch: 15 [38656/50000]	Loss: 3.5422	LR: 0.100000
Training Epoch: 15 [38784/50000]	Loss: 3.4426	LR: 0.100000
Training Epoch: 15 [38912/50000]	Loss: 3.4523	LR: 0.100000
Training Epoch: 15 [39040/50000]	Loss: 3.5784	LR: 0.100000
Training Epoch: 15 [39168/50000]	Loss: 3.6002	LR: 0.100000
Training Epoch: 15 [39296/50000]	Loss: 3.3635	LR: 0.100000
Training Epoch: 15 [39424/50000]	Loss: 3.2817	LR: 0.100000
Training Epoch: 15 [39552/50000]	Loss: 3.5705	LR: 0.100000
Training Epoch: 15 [39680/50000]	Loss: 3.5002	LR: 0.100000
Training Epoch: 15 [39808/50000]	Loss: 3.3494	LR: 0.1000

Training Epoch: 16 [5760/50000]	Loss: 3.4014	LR: 0.100000
Training Epoch: 16 [5888/50000]	Loss: 3.4803	LR: 0.100000
Training Epoch: 16 [6016/50000]	Loss: 3.3929	LR: 0.100000
Training Epoch: 16 [6144/50000]	Loss: 3.2012	LR: 0.100000
Training Epoch: 16 [6272/50000]	Loss: 3.3914	LR: 0.100000
Training Epoch: 16 [6400/50000]	Loss: 3.4612	LR: 0.100000
Training Epoch: 16 [6528/50000]	Loss: 3.4139	LR: 0.100000
Training Epoch: 16 [6656/50000]	Loss: 3.1110	LR: 0.100000
Training Epoch: 16 [6784/50000]	Loss: 3.1657	LR: 0.100000
Training Epoch: 16 [6912/50000]	Loss: 3.6889	LR: 0.100000
Training Epoch: 16 [7040/50000]	Loss: 3.3619	LR: 0.100000
Training Epoch: 16 [7168/50000]	Loss: 3.1376	LR: 0.100000
Training Epoch: 16 [7296/50000]	Loss: 3.3275	LR: 0.100000
Training Epoch: 16 [7424/50000]	Loss: 3.4067	LR: 0.100000
Training Epoch: 16 [7552/50000]	Loss: 3.3550	LR: 0.100000
Training Epoch: 16 [7680/50000]	Loss: 3.5532	LR: 0.100000
Training Epoch: 16 [7808/50000]	Loss: 3.4919	LR: 0.100000
Training Epoch

Training Epoch: 16 [23808/50000]	Loss: 3.2889	LR: 0.100000
Training Epoch: 16 [23936/50000]	Loss: 3.3307	LR: 0.100000
Training Epoch: 16 [24064/50000]	Loss: 3.3624	LR: 0.100000
Training Epoch: 16 [24192/50000]	Loss: 3.2918	LR: 0.100000
Training Epoch: 16 [24320/50000]	Loss: 3.5194	LR: 0.100000
Training Epoch: 16 [24448/50000]	Loss: 3.4905	LR: 0.100000
Training Epoch: 16 [24576/50000]	Loss: 3.1210	LR: 0.100000
Training Epoch: 16 [24704/50000]	Loss: 3.3362	LR: 0.100000
Training Epoch: 16 [24832/50000]	Loss: 3.3292	LR: 0.100000
Training Epoch: 16 [24960/50000]	Loss: 3.4472	LR: 0.100000
Training Epoch: 16 [25088/50000]	Loss: 3.4862	LR: 0.100000
Training Epoch: 16 [25216/50000]	Loss: 3.2442	LR: 0.100000
Training Epoch: 16 [25344/50000]	Loss: 3.2068	LR: 0.100000
Training Epoch: 16 [25472/50000]	Loss: 3.3967	LR: 0.100000
Training Epoch: 16 [25600/50000]	Loss: 3.2136	LR: 0.100000
Training Epoch: 16 [25728/50000]	Loss: 3.3272	LR: 0.100000
Training Epoch: 16 [25856/50000]	Loss: 3.2208	LR: 0.1000

Training Epoch: 16 [41856/50000]	Loss: 3.3774	LR: 0.100000
Training Epoch: 16 [41984/50000]	Loss: 3.1064	LR: 0.100000
Training Epoch: 16 [42112/50000]	Loss: 3.2942	LR: 0.100000
Training Epoch: 16 [42240/50000]	Loss: 3.3883	LR: 0.100000
Training Epoch: 16 [42368/50000]	Loss: 3.3096	LR: 0.100000
Training Epoch: 16 [42496/50000]	Loss: 3.2673	LR: 0.100000
Training Epoch: 16 [42624/50000]	Loss: 3.3755	LR: 0.100000
Training Epoch: 16 [42752/50000]	Loss: 3.4738	LR: 0.100000
Training Epoch: 16 [42880/50000]	Loss: 3.3487	LR: 0.100000
Training Epoch: 16 [43008/50000]	Loss: 3.3855	LR: 0.100000
Training Epoch: 16 [43136/50000]	Loss: 3.4572	LR: 0.100000
Training Epoch: 16 [43264/50000]	Loss: 3.2383	LR: 0.100000
Training Epoch: 16 [43392/50000]	Loss: 3.4758	LR: 0.100000
Training Epoch: 16 [43520/50000]	Loss: 3.1268	LR: 0.100000
Training Epoch: 16 [43648/50000]	Loss: 3.2249	LR: 0.100000
Training Epoch: 16 [43776/50000]	Loss: 3.2765	LR: 0.100000
Training Epoch: 16 [43904/50000]	Loss: 3.2911	LR: 0.1000

Training Epoch: 17 [9856/50000]	Loss: 3.4209	LR: 0.100000
Training Epoch: 17 [9984/50000]	Loss: 3.2192	LR: 0.100000
Training Epoch: 17 [10112/50000]	Loss: 3.2168	LR: 0.100000
Training Epoch: 17 [10240/50000]	Loss: 3.3305	LR: 0.100000
Training Epoch: 17 [10368/50000]	Loss: 3.3050	LR: 0.100000
Training Epoch: 17 [10496/50000]	Loss: 3.5012	LR: 0.100000
Training Epoch: 17 [10624/50000]	Loss: 3.3584	LR: 0.100000
Training Epoch: 17 [10752/50000]	Loss: 3.4435	LR: 0.100000
Training Epoch: 17 [10880/50000]	Loss: 3.5612	LR: 0.100000
Training Epoch: 17 [11008/50000]	Loss: 3.3956	LR: 0.100000
Training Epoch: 17 [11136/50000]	Loss: 3.3311	LR: 0.100000
Training Epoch: 17 [11264/50000]	Loss: 3.3046	LR: 0.100000
Training Epoch: 17 [11392/50000]	Loss: 3.2035	LR: 0.100000
Training Epoch: 17 [11520/50000]	Loss: 3.5713	LR: 0.100000
Training Epoch: 17 [11648/50000]	Loss: 3.3136	LR: 0.100000
Training Epoch: 17 [11776/50000]	Loss: 3.5123	LR: 0.100000
Training Epoch: 17 [11904/50000]	Loss: 3.2867	LR: 0.100000

Training Epoch: 17 [27904/50000]	Loss: 3.4685	LR: 0.100000
Training Epoch: 17 [28032/50000]	Loss: 3.2566	LR: 0.100000
Training Epoch: 17 [28160/50000]	Loss: 3.6620	LR: 0.100000
Training Epoch: 17 [28288/50000]	Loss: 3.3096	LR: 0.100000
Training Epoch: 17 [28416/50000]	Loss: 3.1836	LR: 0.100000
Training Epoch: 17 [28544/50000]	Loss: 3.3906	LR: 0.100000
Training Epoch: 17 [28672/50000]	Loss: 3.1605	LR: 0.100000
Training Epoch: 17 [28800/50000]	Loss: 3.1930	LR: 0.100000
Training Epoch: 17 [28928/50000]	Loss: 3.0634	LR: 0.100000
Training Epoch: 17 [29056/50000]	Loss: 3.5552	LR: 0.100000
Training Epoch: 17 [29184/50000]	Loss: 3.5920	LR: 0.100000
Training Epoch: 17 [29312/50000]	Loss: 3.2766	LR: 0.100000
Training Epoch: 17 [29440/50000]	Loss: 3.4292	LR: 0.100000
Training Epoch: 17 [29568/50000]	Loss: 3.4538	LR: 0.100000
Training Epoch: 17 [29696/50000]	Loss: 3.4859	LR: 0.100000
Training Epoch: 17 [29824/50000]	Loss: 3.2251	LR: 0.100000
Training Epoch: 17 [29952/50000]	Loss: 3.3215	LR: 0.1000

Training Epoch: 17 [45952/50000]	Loss: 3.5165	LR: 0.100000
Training Epoch: 17 [46080/50000]	Loss: 3.2651	LR: 0.100000
Training Epoch: 17 [46208/50000]	Loss: 3.3628	LR: 0.100000
Training Epoch: 17 [46336/50000]	Loss: 3.0976	LR: 0.100000
Training Epoch: 17 [46464/50000]	Loss: 3.4816	LR: 0.100000
Training Epoch: 17 [46592/50000]	Loss: 3.4305	LR: 0.100000
Training Epoch: 17 [46720/50000]	Loss: 3.2894	LR: 0.100000
Training Epoch: 17 [46848/50000]	Loss: 3.3519	LR: 0.100000
Training Epoch: 17 [46976/50000]	Loss: 3.3187	LR: 0.100000
Training Epoch: 17 [47104/50000]	Loss: 3.3102	LR: 0.100000
Training Epoch: 17 [47232/50000]	Loss: 3.2241	LR: 0.100000
Training Epoch: 17 [47360/50000]	Loss: 3.3779	LR: 0.100000
Training Epoch: 17 [47488/50000]	Loss: 3.5925	LR: 0.100000
Training Epoch: 17 [47616/50000]	Loss: 3.2831	LR: 0.100000
Training Epoch: 17 [47744/50000]	Loss: 3.3087	LR: 0.100000
Training Epoch: 17 [47872/50000]	Loss: 3.3829	LR: 0.100000
Training Epoch: 17 [48000/50000]	Loss: 3.2015	LR: 0.1000

Training Epoch: 18 [13952/50000]	Loss: 3.3561	LR: 0.100000
Training Epoch: 18 [14080/50000]	Loss: 3.5020	LR: 0.100000
Training Epoch: 18 [14208/50000]	Loss: 3.2527	LR: 0.100000
Training Epoch: 18 [14336/50000]	Loss: 3.3573	LR: 0.100000
Training Epoch: 18 [14464/50000]	Loss: 3.3014	LR: 0.100000
Training Epoch: 18 [14592/50000]	Loss: 3.3988	LR: 0.100000
Training Epoch: 18 [14720/50000]	Loss: 3.1056	LR: 0.100000
Training Epoch: 18 [14848/50000]	Loss: 3.3429	LR: 0.100000
Training Epoch: 18 [14976/50000]	Loss: 3.4826	LR: 0.100000
Training Epoch: 18 [15104/50000]	Loss: 3.1729	LR: 0.100000
Training Epoch: 18 [15232/50000]	Loss: 3.0429	LR: 0.100000
Training Epoch: 18 [15360/50000]	Loss: 3.4048	LR: 0.100000
Training Epoch: 18 [15488/50000]	Loss: 3.4052	LR: 0.100000
Training Epoch: 18 [15616/50000]	Loss: 3.1716	LR: 0.100000
Training Epoch: 18 [15744/50000]	Loss: 3.4097	LR: 0.100000
Training Epoch: 18 [15872/50000]	Loss: 3.3361	LR: 0.100000
Training Epoch: 18 [16000/50000]	Loss: 3.1785	LR: 0.1000

Training Epoch: 18 [32000/50000]	Loss: 3.1299	LR: 0.100000
Training Epoch: 18 [32128/50000]	Loss: 3.2854	LR: 0.100000
Training Epoch: 18 [32256/50000]	Loss: 3.1843	LR: 0.100000
Training Epoch: 18 [32384/50000]	Loss: 3.1624	LR: 0.100000
Training Epoch: 18 [32512/50000]	Loss: 3.0489	LR: 0.100000
Training Epoch: 18 [32640/50000]	Loss: 3.2962	LR: 0.100000
Training Epoch: 18 [32768/50000]	Loss: 3.1951	LR: 0.100000
Training Epoch: 18 [32896/50000]	Loss: 3.3284	LR: 0.100000
Training Epoch: 18 [33024/50000]	Loss: 3.1901	LR: 0.100000
Training Epoch: 18 [33152/50000]	Loss: 3.3699	LR: 0.100000
Training Epoch: 18 [33280/50000]	Loss: 3.2232	LR: 0.100000
Training Epoch: 18 [33408/50000]	Loss: 3.3144	LR: 0.100000
Training Epoch: 18 [33536/50000]	Loss: 3.1662	LR: 0.100000
Training Epoch: 18 [33664/50000]	Loss: 3.2930	LR: 0.100000
Training Epoch: 18 [33792/50000]	Loss: 3.5123	LR: 0.100000
Training Epoch: 18 [33920/50000]	Loss: 3.4062	LR: 0.100000
Training Epoch: 18 [34048/50000]	Loss: 3.0703	LR: 0.1000

Training Epoch: 19 [128/50000]	Loss: 3.4924	LR: 0.100000
Training Epoch: 19 [256/50000]	Loss: 3.2524	LR: 0.100000
Training Epoch: 19 [384/50000]	Loss: 3.3659	LR: 0.100000
Training Epoch: 19 [512/50000]	Loss: 3.2611	LR: 0.100000
Training Epoch: 19 [640/50000]	Loss: 3.1274	LR: 0.100000
Training Epoch: 19 [768/50000]	Loss: 3.4294	LR: 0.100000
Training Epoch: 19 [896/50000]	Loss: 3.1733	LR: 0.100000
Training Epoch: 19 [1024/50000]	Loss: 3.3354	LR: 0.100000
Training Epoch: 19 [1152/50000]	Loss: 3.2702	LR: 0.100000
Training Epoch: 19 [1280/50000]	Loss: 3.4619	LR: 0.100000
Training Epoch: 19 [1408/50000]	Loss: 3.1905	LR: 0.100000
Training Epoch: 19 [1536/50000]	Loss: 3.0028	LR: 0.100000
Training Epoch: 19 [1664/50000]	Loss: 3.3766	LR: 0.100000
Training Epoch: 19 [1792/50000]	Loss: 3.5848	LR: 0.100000
Training Epoch: 19 [1920/50000]	Loss: 3.2645	LR: 0.100000
Training Epoch: 19 [2048/50000]	Loss: 3.3430	LR: 0.100000
Training Epoch: 19 [2176/50000]	Loss: 3.1444	LR: 0.100000
Training Epoch: 19 [2

Training Epoch: 19 [18176/50000]	Loss: 3.1881	LR: 0.100000
Training Epoch: 19 [18304/50000]	Loss: 3.1547	LR: 0.100000
Training Epoch: 19 [18432/50000]	Loss: 3.4728	LR: 0.100000
Training Epoch: 19 [18560/50000]	Loss: 3.3015	LR: 0.100000
Training Epoch: 19 [18688/50000]	Loss: 3.3425	LR: 0.100000
Training Epoch: 19 [18816/50000]	Loss: 3.2140	LR: 0.100000
Training Epoch: 19 [18944/50000]	Loss: 3.2822	LR: 0.100000
Training Epoch: 19 [19072/50000]	Loss: 3.3108	LR: 0.100000
Training Epoch: 19 [19200/50000]	Loss: 3.2237	LR: 0.100000
Training Epoch: 19 [19328/50000]	Loss: 3.3341	LR: 0.100000
Training Epoch: 19 [19456/50000]	Loss: 3.3864	LR: 0.100000
Training Epoch: 19 [19584/50000]	Loss: 3.2193	LR: 0.100000
Training Epoch: 19 [19712/50000]	Loss: 3.1483	LR: 0.100000
Training Epoch: 19 [19840/50000]	Loss: 3.5249	LR: 0.100000
Training Epoch: 19 [19968/50000]	Loss: 3.2981	LR: 0.100000
Training Epoch: 19 [20096/50000]	Loss: 3.4285	LR: 0.100000
Training Epoch: 19 [20224/50000]	Loss: 3.2310	LR: 0.1000

Training Epoch: 19 [36224/50000]	Loss: 3.4590	LR: 0.100000
Training Epoch: 19 [36352/50000]	Loss: 3.2737	LR: 0.100000
Training Epoch: 19 [36480/50000]	Loss: 3.2493	LR: 0.100000
Training Epoch: 19 [36608/50000]	Loss: 3.3453	LR: 0.100000
Training Epoch: 19 [36736/50000]	Loss: 3.2011	LR: 0.100000
Training Epoch: 19 [36864/50000]	Loss: 3.4326	LR: 0.100000
Training Epoch: 19 [36992/50000]	Loss: 3.2853	LR: 0.100000
Training Epoch: 19 [37120/50000]	Loss: 3.0976	LR: 0.100000
Training Epoch: 19 [37248/50000]	Loss: 3.3225	LR: 0.100000
Training Epoch: 19 [37376/50000]	Loss: 3.3932	LR: 0.100000
Training Epoch: 19 [37504/50000]	Loss: 3.3562	LR: 0.100000
Training Epoch: 19 [37632/50000]	Loss: 3.3791	LR: 0.100000
Training Epoch: 19 [37760/50000]	Loss: 3.1925	LR: 0.100000
Training Epoch: 19 [37888/50000]	Loss: 3.5126	LR: 0.100000
Training Epoch: 19 [38016/50000]	Loss: 3.4164	LR: 0.100000
Training Epoch: 19 [38144/50000]	Loss: 3.1567	LR: 0.100000
Training Epoch: 19 [38272/50000]	Loss: 3.0517	LR: 0.1000

<h3>Results</h3>
Trained for up to 20 epochs on Kaggle. Time taken: ~9 minutes.

In [85]:
# Write a checkpoint to disk: the value epoch+1 together with the model's
# and optimizer's state dicts, so training state can be reloaded later.
torch.save(
    obj={
        'epoch': epoch+1,
        'model_state_dict': net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    },
    f='weights_googlenet_20ep.pth',
)
print('saved')

saved
