# cifar-100 conv net with Caffe for DVIA

Based on guillaume-chevalier's implementation of NiN for Cifar-100 imageset. This is a variation of the same database scaled _up_ to 48x48 (DVIA images at 48x48 full resolution).

It is based on the NIN (Network In Network) architecture detailed in this paper: http://arxiv.org/pdf/1312.4400v3.pdf. 

https://github.com/guillaume-chevalier/python-caffe-custom-cifar-100-conv-net for original implementation.

## Convert the cifar-100 dataset to Caffe's HDF5 format
This step converts previously downloaded Cifar-100 database to 48x48 HDF5 DB.

In [1]:
%%time

!ipython convert-cifar-100.ipy

Converting...
Conversion was already done. Did not convert twice.

CPU times: user 16 ms, sys: 4 ms, total: 20 ms
Wall time: 1.61 s


## Build the model with Caffe. 

In [1]:
import numpy as np
import os, sys

scriptpath = os.path.dirname(os.path.realpath( "__file__" ))
caffe_root  = os.path.sep.join(scriptpath.split(os.path.sep)[:-2])
#caffe_root = os.path.join(os.environ['HOME'], 'Projects', 'dvcaffe')
db_root     = os.path.join(os.environ['HOME'], 'Projects', 'IMAGES', 'dvia', 'png.48x48')

import caffe
from caffe import layers as L
from caffe import params as P


print "caffe_root = {}".format(caffe_root)
print "db_root = {}".format(db_root)

  from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \
  from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \
  from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \


caffe_root = /home/maheriya/Projects/dvcaffe
db_root = /home/maheriya/Projects/IMAGES/dvia/png.48x48


In [2]:
weight_param = dict(lr_mult=1, decay_mult=1)
bias_param   = dict(lr_mult=2, decay_mult=0)
## Use for training from scratch
learned_param = [weight_param, bias_param]

frozen_weight_param = dict(lr_mult=1.0/5.0, decay_mult=1.0/5.0)
frozen_bias_param   = dict(lr_mult=2.0/5.0, decay_mult=0)
## Use for training from a pretrained model
frozen_param = [frozen_weight_param, frozen_bias_param]

def cnn_inner_layers(n, param=learned_param):
    '''
    n: caffe.NetSpec instance
    It is assumed that n.data is already created.
    '''
    n.conv1 = L.Convolution(n.data, kernel_size=5, stride=1, num_output=64, 
                            weight_filler=dict(type='xavier'), param=param)
    n.cccp1a = L.Convolution(n.conv1, kernel_size=1, num_output=42,
                             weight_filler=dict(type='xavier'), param=param)
    n.relu1a = L.ReLU(n.cccp1a, in_place=True)
    n.cccp1b = L.Convolution(n.relu1a, kernel_size=1, num_output=32,
                             weight_filler=dict(type='xavier'), param=param)
    n.pool1 = L.Pooling(n.cccp1b, kernel_size=3, stride=2, pool=P.Pooling.MAX)
    n.drop1 = L.Dropout(n.pool1, in_place=True)
    n.relu1b = L.ReLU(n.drop1, in_place=True)
    
    n.conv2 = L.Convolution(n.relu1b, kernel_size=3, num_output=64,
                            weight_filler=dict(type='xavier'), param=param)
    n.pool2 = L.Pooling(n.conv2, kernel_size=3, stride=2, pool=P.Pooling.MAX)
    n.drop2 = L.Dropout(n.pool2, in_place=True)
    n.relu2 = L.ReLU(n.drop2, in_place=True)
    
    # Last convolution layer. Note: Param always learned from scratch
    n.conv3 = L.Convolution(n.relu2, kernel_size=3, num_output=96,
                            weight_filler=dict(type='xavier'), param=learned_param)
    n.pool3 = L.Pooling(n.conv3, kernel_size=2, stride=2, pool=P.Pooling.AVE)
    n.relu3 = L.ReLU(n.pool3, in_place=True)
    
    # Last convolution layer. Note: Param always learned from scratch similar to the last convolution layer.
    n.fc1 = L.InnerProduct(n.relu3, num_output=600, weight_filler=dict(type='xavier'), param=learned_param)
    n.relu_last = L.ReLU(n.fc1, in_place=True)
    return n


def cnn_cifar(hdf5, batch_size):
    n = caffe.NetSpec()
    ## Input LMDB data layer
    n.data, n.label_coarse, n.label_fine = L.HDF5Data(batch_size=batch_size, source=hdf5, ntop=3)

    # Create inner layers
    n = cnn_inner_layers(n)

    # Output: 20-class and 100-class classifiers
    n.fc_coarse       = L.InnerProduct(n.relu_last, num_output=20, weight_filler=dict(type='xavier'), param=learned_param)
    n.accuracy_coarse = L.Accuracy(n.fc_coarse, n.label_coarse)
    n.loss_coarse     = L.SoftmaxWithLoss(n.fc_coarse, n.label_coarse, loss_weight=0.65)
    
    n.fc_fine         = L.InnerProduct(n.relu_last, num_output=100, weight_filler=dict(type='xavier'), param=learned_param)
    n.accuracy_fine   = L.Accuracy(n.fc_fine, n.label_fine)
    n.loss_fine       = L.SoftmaxWithLoss(n.fc_fine, n.label_fine, loss_weight=0.35)
    
    return n.to_proto()
    
with open('dvia_pretrain.prototxt', 'w') as f:
    f.write(str(cnn_cifar('cifar_100_caffe_hdf5/train.txt', 100)))
    
with open('dvia_pretest.prototxt', 'w') as f:
    f.write(str(cnn_cifar('cifar_100_caffe_hdf5/test.txt', 120)))

!python /usr/local/caffe/python/draw_net.py dvia_pretrain.prototxt cifar_net.png

  from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \
  from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \
  from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \
Drawing net to cifar_net.png


In [5]:
def cnn(lmdb, batch_size):
    n = caffe.NetSpec()
    ## Input LMDB data layer
    n.data, n.label = L.Data(batch_size=batch_size, source=lmdb, backend=P.Data.LMDB, 
                             transform_param=dict(scale=1./256, mirror=True), ntop=2)

    # Create inner layers
    n = cnn_inner_layers(n, frozen_param)

    # Output 4-class classifier
    n.fc_class = L.InnerProduct(n.relu_last, num_output=4, weight_filler=dict(type='xavier'), param=learned_param)
    n.accuracy_class = L.Accuracy(n.fc_class, n.label)
    n.loss_c = L.SoftmaxWithLoss(n.fc_class, n.label)
    
    return n.to_proto()
    
with open('dvia_train.prototxt', 'w') as f:
    trn_lmdb = os.path.join(db_root, 'data/dvia_48x48/dvia_trn_lmdb')
    f.write(str(cnn(trn_lmdb, 100)))
    
with open('dvia_test.prototxt', 'w') as f:
    val_lmdb = os.path.join(db_root, 'data/dvia_48x48/dvia_val_lmdb')
    f.write(str(cnn(val_lmdb, 120)))

!python /usr/local/caffe/python/draw_net.py dvia_train.prototxt dvia_net.png

  from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \
  from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \
  from ._caffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, \
Drawing net to dvia_net.png


## Pre-Train Using Cifar-100 DB (scaled to 48x48)
The purpose of this pre-training part is to take advantage of the Cifar-100 database to get better feature extractor as a initial condition for later training with our own image database. 

In [4]:
%%time
!caffe train -solver dvia_presolver.prototxt

I0515 14:05:32.977327 30099 caffe.cpp:185] Using GPUs 0
I0515 14:05:33.013499 30099 caffe.cpp:190] GPU 0: GeForce GTX 470
I0515 14:05:33.170708 30099 solver.cpp:48] Initializing solver from parameters: 
train_net: "dvia_pretrain.prototxt"
test_net: "dvia_pretest.prototxt"
test_iter: 100
test_interval: 1000
base_lr: 0.0006
display: 100
max_iter: 150000
lr_policy: "inv"
gamma: 0.0001
power: 0.75
momentum: 0
weight_decay: 0.001
snapshot: 50000
snapshot_prefix: "cifar_pretrain"
solver_mode: GPU
device_id: 0
rms_decay: 0.98
type: "RMSProp"
I0515 14:05:33.170972 30099 solver.cpp:81] Creating training net from train_net file: dvia_pretrain.prototxt
I0515 14:05:33.172169 30099 net.cpp:49] Initializing net from parameters: 
state {
  phase: TRAIN
}
layer {
  name: "data"
  type: "HDF5Data"
  top: "data"
  top: "label_coarse"
  top: "label_fine"
  hdf5_data_param {
    source: "cifar_100_caffe_hdf5/train.txt"
    batch_size: 100
  }
}
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data

In [6]:
!ls -rt cifar_pretrain_iter*.caffemodel | tail -n1 | xargs -i cp {} cifar_pretrained.caffemodel

## Pre-Training done!

## Load and visualise the untrained network's internal structure and shape
The network's structure (graph) visualisation tool of caffe is broken in the current release. We will simply print here the data shapes. 

In [15]:
caffe.set_mode_gpu()
solver = None
# trnet = caffe.Net('dvia_pretrain.prototxt', caffe.TRAIN)
# tstnet = caffe.Net('dvia_pretest.prototxt', caffe.TEST)
# tstnets = [ tstnet ]
# solver = caffe.RMSPropSolver('dvia_presolver.prototxt')
# solver.train_net = trnet
# solver.test_nets[0] = tstnets[0]

solver = caffe.RMSPropSolver('dvia_solver.prototxt')
#solver = caffe.get_solver('dvia_solver.prototxt')
print("Layers' features:")
[(k, v.data.shape) for k, v in solver.net.blobs.items()]
print("Parameters and shape:")
[(k, v[0].data.shape) for k, v in solver.net.params.items()]

## Solver's params

The solver's params for the created net are defined in a `.prototxt` file. 

Notice that because `max_iter: 100000`, the training will loop 2 times on the 50000 training data. Because we train data by minibatches of 100 as defined above when creating the net, there will be a total of `100000*100/50000 = 200` epochs on some of those pre-shuffled 100 images minibatches.

We will test the net on `test_iter: 100` different test images at each `test_interval: 1000` images trained. 
____

Here, **RMSProp** is used, it is SDG-based, it converges faster than a pure SGD and it is robust.
____

In [7]:
!cat dvia_solver.prototxt

train_net: "dvia_train.prototxt"
test_net: "dvia_test.prototxt"

test_iter: 100
test_interval: 1000

base_lr: 0.0001
momentum: 0.0
weight_decay: 0.001

lr_policy: "inv"
gamma: 0.0001
power: 0.75

display: 100

max_iter: 150000

snapshot: 50000
snapshot_prefix: "dvia_train"
solver_mode: GPU

type: "RMSProp"
rms_decay: 0.98


## Alternative way to train directly in Python
Since a recent update, there is no output in python by default, which is bad for debugging. 
Skip this cell and train with the second method shown below if needed. It is commented out in case you just chain some `shift+enter` ipython shortcuts. 

In [8]:
# %%time
# solver.solve()
solver = None

## Train by calling caffe in command line
Just set the parameters correctly. Be sure that the notebook is at the root of the ipython notebook server. 
You can run this in an external terminal if you open it in the notebook's directory. 

It is also possible to finetune an existing net with a different solver or different data. Here I do it, because I feel the net could better fit the data. 

In [9]:
%%time
!caffe train -solver dvia_solver.prototxt -weights cifar_pretrained.caffemodel

I0515 21:41:51.101392 10127 caffe.cpp:185] Using GPUs 0
I0515 21:41:51.207337 10127 caffe.cpp:190] GPU 0: GeForce GTX 470
I0515 21:41:51.421470 10127 solver.cpp:48] Initializing solver from parameters: 
train_net: "dvia_train.prototxt"
test_net: "dvia_test.prototxt"
test_iter: 100
test_interval: 1000
base_lr: 0.0001
display: 100
max_iter: 150000
lr_policy: "inv"
gamma: 0.0001
power: 0.75
momentum: 0
weight_decay: 0.001
snapshot: 50000
snapshot_prefix: "dvia_train"
solver_mode: GPU
device_id: 0
rms_decay: 0.98
type: "RMSProp"
I0515 21:41:51.421743 10127 solver.cpp:81] Creating training net from train_net file: dvia_train.prototxt
I0515 21:41:51.422803 10127 net.cpp:49] Initializing net from parameters: 
state {
  phase: TRAIN
}
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  transform_param {
    scale: 0.00390625
    mirror: true
  }
  data_param {
    source: "/home/maheriya/Projects/IMAGES/dvia/png.48x48/data/dvia_48x48/dvia_trn_lmdb"
    batch_size: 100
    back

Caffe brewed. 
## Test the model completely on test data
Let's test directly in command-line:

In [10]:
%%time
!caffe test -model dvia_test.prototxt -weights dvia_train_iter_150000.caffemodel -iterations 100

I0516 10:31:09.560021 28565 caffe.cpp:246] Use CPU.
I0516 10:31:10.824569 28565 net.cpp:49] Initializing net from parameters: 
state {
  phase: TEST
}
layer {
  name: "data"
  type: "Data"
  top: "data"
  top: "label"
  transform_param {
    scale: 0.00390625
    mirror: true
  }
  data_param {
    source: "/home/maheriya/Projects/IMAGES/dvia/png.48x48/data/dvia_48x48/dvia_val_lmdb"
    batch_size: 120
    backend: LMDB
  }
}
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 0.2
    decay_mult: 0.2
  }
  param {
    lr_mult: 0.4
    decay_mult: 0
  }
  convolution_param {
    num_output: 64
    kernel_size: 5
    stride: 1
    weight_filler {
      type: "xavier"
    }
  }
}
layer {
  name: "cccp1a"
  type: "Convolution"
  bottom: "conv1"
  top: "cccp1a"
  param {
    lr_mult: 0.2
    decay_mult: 0.2
  }
  param {
    lr_mult: 0.4
    decay_mult: 0
  }
  convolution_param {
    num_output: 42
    kernel_size: 1
    weight_filler {
    

## The model achieved over 85% accuracy
The above is purely test/validation database that is not used for training.

In [11]:
!jupyter nbconvert --to markdown dvia-train.ipynb

[NbConvertApp] Converting notebook dvia-train.ipynb to markdown
[NbConvertApp] Writing 1725612 bytes to dvia-train.md
