In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# library source files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import sys, shutil
sys.path.append('../../../fastai')

# this file contains all the main external libs used
from fastai.imports import *

from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

In [None]:
# Root of the shared Kaggle dataset: labels.csv and the train/ and test/ image
# folders are read from here.
DS_PATH = '../../../../_datasets/kaggle_dog-breed-identification'
# DS_PATH = '../../../datasets/kaggle_dog-breed-identification'

# Project-local working directory: holds the models/ and tmp/ folders plus the
# links to the dataset's train/ and test/ folders created in section 1.
PATH = 'data/kaggle_dog-breed-identification'

In [None]:
# Notebook-wide hyperparameters (sz and bsz are reassigned in section 3.2).
arch = resnet34  # architecture handed to tfms_from_model and ConvLearner.pretrained
sz = 224         # image size handed to tfms_from_model
bsz = 64         # batch size, forwarded to ImageClassifierData.from_csv as bs=

## 1. Data preparation

In [None]:
# Create the project-local working folders (no error if they already exist).
os.makedirs(f'{PATH}/models', exist_ok=True)
os.makedirs(f'{PATH}/tmp', exist_ok=True)

abs_ds_path = os.path.abspath(DS_PATH)

# Symlink the shared dataset folders into PATH so the same data can be reused
# by other projects. This is done in Python rather than with `!ln -s` so that
# re-running the cell is a no-op instead of failing with "File exists", and so
# that paths containing spaces are handled correctly.
for split in ('train', 'test'):
    link = os.path.join(PATH, split)
    # lexists() is also True for a dangling link, which we must not recreate.
    if not os.path.lexists(link):
        os.symlink(os.path.join(abs_ds_path, split), link)

In [None]:
# Load the Kaggle label file into a DataFrame and remember how many rows it
# has; the row count feeds the validation split further down.
labels_csv = f'{DS_PATH}/labels.csv'
labels_df = pd.read_csv(labels_csv)
n = labels_df.shape[0]

print(n)
labels_df.head()

In [None]:
# Pick the validation row indexes with fastai's get_cv_idxs helper.
# NOTE(review): the hold-out fraction is get_cv_idxs's own default (stated as
# 20% of train by the original author) — confirm against the fastai version in use.
val_idxs = get_cv_idxs(n)

# Sanity check: total labelled rows vs. size of the validation split.
print(n, len(val_idxs))

## 2. Review Data

In [None]:
# 1. What is the class distribution? (what are 10 most common classes)
print(f'Unique classes: {len(labels_df.breed.unique())}')

# Count rows per breed, then show the ten breeds with the most images.
breed_counts = labels_df.pivot_table(index='breed', aggfunc=len)
breed_counts.sort_values('id', ascending=False).head(10)

In [None]:
# 2. What is the image size distribution? (what is the avg. height(rows) and width(cols))
# Build the fastai data object first: its trn_ds.fnames list is what the
# following cells iterate over to read each training image's size.
tfms = tfms_from_model(arch, sz, aug_tfms=transforms_side_on, max_zoom=1.1)

data = ImageClassifierData.from_csv(PATH, csv_fname=f'{DS_PATH}/labels.csv', 
                                    folder='train', test_name='test', 
                                    bs=bsz, tfms=tfms, val_idxs=val_idxs, suffix='.jpg')

In [None]:
# Open the first training image straight from the dataset root and display it.
# (The trailing `; fname` is a bare expression in the middle of the cell, so
# it displays nothing.)
fname = f'{DS_PATH}/{data.trn_ds.fnames[0]}'; fname
img = Image.open(fname)

# NOTE: PIL reports size as (width, height).
print(img.size)
img

In [None]:
def _image_size(path):
    """Return the size tuple PIL reports for the image at `path`, i.e. (width, height).

    The file is opened in a `with` block so its handle is closed right away
    instead of being left open until the Image object is garbage collected.
    """
    with Image.open(path) as im:
        return im.size

# Map every training file name to its PIL size tuple.
trn_sizes = { fname: _image_size(f'{DS_PATH}/{fname}') for fname in data.trn_ds.fnames }
trn_sizes[data.trn_ds.fnames[0]]

In [None]:
# PIL's Image.size is (width, height): the first element of each pair is the
# number of columns and the second the number of rows. Unpack in that order so
# row_sz really holds heights and col_sz really holds widths.
col_sz, row_sz = zip(*trn_sizes.values())

row_sz = np.array(row_sz)  # heights (rows)
col_sz = np.array(col_sz)  # widths (cols)

print(f'Average image size (H x W): {int(row_sz.mean())} x {int(col_sz.mean())}')

In [None]:
# height distribution
# Left panel: every value in row_sz. Right panel: only values below 1000
# (presumably to stop a few very large images from squashing the histogram).
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,4) )
ax1.hist(row_sz)
ax2.hist(row_sz[row_sz < 1000])

In [None]:
# width distribution
# Left panel: every value in col_sz. Right panel: only values below 1000
# (presumably to stop a few very large images from squashing the histogram).
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,4) )
ax1.hist(col_sz)
ax2.hist(col_sz[col_sz < 1000])

## 3. Training

**Review: easy steps to train a world-class image classifier:**
1. Enable data augmentation, and precompute=True
1. Use lr_find() to find highest learning rate where loss is still clearly improving
1. Train last layer from precomputed activations for 1-2 epochs
1. Train last layer with data augmentation (i.e. precompute=False) for 2-3 epochs with cycle_len=1
1. Unfreeze all layers
1. Set earlier layers to 3x-10x lower learning rate than next higher layer
1. Use lr_find() again
1. Train full network with cycle_mult=2 until over-fitting

In [None]:
def get_data(sz, bsz, val_idxs=None, test_name='test'):
    """Build the ImageClassifierData object for the dog-breed dataset.

    Relies on the notebook globals `arch`, `PATH` and `DS_PATH`.

    Args:
        sz: image size passed to tfms_from_model.
        bsz: batch size, forwarded to from_csv as `bs`.
        val_idxs: row indexes to hold out for validation. When omitted (or
            None) `[0]` is used, i.e. all but one example is used for training.
        test_name: forwarded to from_csv as `test_name`.

    Returns:
        The object returned by ImageClassifierData.from_csv when `sz > 300`;
        otherwise the result of calling `.resize(340, 'tmp')` on it.
    """
    # 20171112 - due to current bug in framework, you can't set val_idxs=None so we
    #            set it to [0] which will use all but 1 example for training.
    # A None sentinel is used instead of a `[0]` default so that no single list
    # object is shared between calls and handed to framework code.
    if val_idxs is None:
        val_idxs = [0]

    tfms = tfms_from_model(arch, sz, aug_tfms=transforms_side_on, max_zoom=1.1)

    data = ImageClassifierData.from_csv(PATH, csv_fname=f'{DS_PATH}/labels.csv',
                                        folder='train', test_name=test_name,
                                        bs=bsz, tfms=tfms, val_idxs=val_idxs, suffix='.jpg')

    # Why is minimum size 300?
    # see http://forums.fast.ai/t/dog-breed-identification-challenge/7464/53?u=wgpubs
    # "Since we have max_zoom=1.1, I figured we should ensure our images are at least sz*1.1
    # and I figured resizing them to 340x340 would save plenty of time, and leave plenty of room to experiment."
    return data if sz > 300 else data.resize(340, 'tmp')

In [None]:
# 1. Enable data augmentation, and precompute=True
# The augmentation transforms are the aug_tfms configured inside get_data.
# NOTE(review): ps=0.5 is presumably the dropout probability of the new head — confirm.
data = get_data(sz, bsz, val_idxs)
learn = ConvLearner.pretrained(arch, data, precompute=True, ps=0.5)

In [None]:
# 2. Use lr_find() to find highest learning rate where loss is still clearly improving
learn.lr_find()

# Two diagnostic plots from the scheduler recorded by lr_find().
# NOTE(review): presumably plot_lr() shows the learning-rate schedule and
# plot() shows loss against learning rate — confirm in fastai.sgdr.
learn.sched.plot_lr(); plt.show()
learn.sched.plot(); plt.show()

In [None]:
# 3. Train last layer from precomputed activations for 1-2 epochs
# `lr` is a notebook-wide global: the later learn.fit calls reuse this value.
lr = 1e-2

learn.fit(lr, 2)

In [None]:
# 4. Train last layer with data augmentation (i.e. precompute=False) for 2-3 epochs with cycle_len=1
# Precomputed activations are switched off here so training runs on the
# augmented images rather than on cached activations.
learn.precompute = False

learn.fit(lr, 2, cycle_len=1)

In [None]:
# Still underfitting, which suggests cycle_len=1 is too short (the learning
# rate restarts before a better minimum is found), so lengthen successive
# cycles with cycle_mult=2.
learn.fit(lr, 3, cycle_len=1, cycle_mult=2)

In [None]:
# Evaluate with test-time augmentation and report accuracy.
# NOTE(review): presumably TTA() scores the validation set and returns
# log-probabilities plus the true labels — confirm for the fastai version in use.
log_preds, y = learn.TTA()
accuracy(log_preds, y)

In [None]:
# Checkpoint the learner after step 4 at sz=224 (resnet34).
learn.save('rn34_224_step4')

In [None]:
# Restore the sz=224 step-4 checkpoint (lets the notebook resume from here).
learn.load('rn34_224_step4')

In [None]:
# WILL NOT DO IN THIS PROBLEM - Because this training dataset is so similar to the pre-trained model's
# original training dataset, training the convolutional layers will not improve the network (in fact,
# this set is a subset of ImageNet, the same dataset that our pre-trained model was trained on)

# 5. Unfreeze all layers
# learn.unfreeze()

# 6. Set earlier layers to 3x-10x lower learning rate than next higher layer
# lr = np.array([1e-7, 1e-6, 1e-5])

# 7. Use lr_find() again
# learn.lr_find(lr/1000)
# learn.sched.plot()

# update differential lrs if lr_find() informs us too.

# 8. Train full network with cycle_mult=2 until over-fitting
# learn.fit(lr, 3, cycle_len=1, cycle_mult=2)
# learn.save('rn34_224_step8')

### 3.1 Continue training on larger images

In [None]:
# try increasing the size; moving to larger images can help reduce overfitting
# Swap the learner's data for a 299px version built with the same validation split.
learn.set_data(get_data(299, bsz, val_idxs))

learn.freeze() # just to make sure that every layer EXCEPT the last is frozen

In [None]:
# Continue training on the 299px data with the same lr and cycle_len=1.
learn.fit(lr, 3, cycle_len=1)

In [None]:
# Still underfitting, so add cycle_mult=2 to lengthen successive cycles and give
# the learner a chance to find better parameters before the next restart.
learn.fit(lr, 3, cycle_len=1, cycle_mult=2)

In [None]:
# last scores: [ 6.       0.53188  0.39652  0.87951]
# (recorded by the author from the final line of the previous fit output)
# Evaluate the 299px model with test-time augmentation and report accuracy.
log_preds, y = learn.TTA()
accuracy(log_preds, y)

In [None]:
# Checkpoint after training on the larger images.
# NOTE: the name says 229 although the image size set above is 299; the
# matching load cell uses the same name, so rename both together or not at all.
learn.save('rn34_229_step4')

In [None]:
# Restore the larger-image checkpoint. The name ("229") must match the string
# used in the corresponding learn.save call.
learn.load('rn34_229_step4')

### 3.2 Try using K-Fold CV - TODO!

In [None]:
# Reset the hyperparameters for the K-fold experiment, one assignment per line.
lr = 1e-2
sz = 224
bsz = 10

In [None]:
from sklearn.model_selection import StratifiedKFold

# shuffle=True without a seed yields different folds on every run; fix
# random_state so the split (and every score derived from it) is reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# get folds: a list of (train_idxs, val_idxs) pairs, stratified by breed
kfolds = list(skf.split(labels_df.id, labels_df.breed))
print(len(kfolds))

In [None]:
# Build data and a fresh learner for the first fold only: kfolds[0][1] holds
# that fold's validation indexes. test_name=None is forwarded to from_csv.
data = get_data(sz, bsz, kfolds[0][1], test_name=None)
learn = ConvLearner.pretrained(arch, data, precompute=False, ps=0.5)

## Submit predictions

In [None]:
# NOTE(review): no fit call appears between the creation of this learner (built
# with test_name=None) and this cell, so these are predictions from a learner
# that has not been trained in this section — confirm before submitting.
log_preds, y = learn.TTA()

## Analyzing Results