# Caltech Image Classification

In [None]:
# Notebook setup: enable the autoreload extension in mode 2 (re-import
# modules before executing code) and draw matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

In [None]:
import json

import boto3
import dill as dill

In [None]:
# Root directory holding the extracted dataset, the generated labels.csv and
# the saved model files. Keep the trailing slash: later cells build paths by
# plain string concatenation.
PATH = 'data/caltech_256/'
# Nominal image size. NOTE(review): not referenced again in the visible cells -
# get_data() is called with literal sizes (224, 299).
sz = 224
# Architecture handed to tfms_from_model() and ConvLearner.pretrained().
arch = resnext50
# Batch size passed to get_data().
bs=32

# Download the data

In [None]:
import os
import urllib.request

def download(url):
    """Download *url* into the current working directory unless already present.

    The local file name is the last ``/``-separated component of *url*.
    The payload is first written to ``<filename>.part`` and only renamed to
    its final name once the transfer has finished, so an interrupted download
    never leaves a truncated file that a later call would mistake for a
    complete one.

    Args:
        url: Address of the file to fetch.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed transfer
        (e.g. ``urllib.error.URLError``); the partial file is removed first.
    """
    filename = url.split("/")[-1]
    if os.path.exists(filename):
        return

    partial = filename + ".part"
    try:
        urllib.request.urlretrieve(url, partial)
        # Atomic on the same filesystem: the final name appears only when
        # the content is complete.
        os.replace(partial, filename)
    finally:
        # Only still present when the transfer or rename failed.
        if os.path.exists(partial):
            os.remove(partial)


# Caltech-256 image files
# Fetch the archive (download() skips the transfer when the tarball already
# exists in the working directory), create the models directory, unpack the
# archive under data/caltech_256/, rename the extracted folder to `train`,
# and finally delete the tarball.
# NOTE(review): if data/caltech_256/train already exists (e.g. on a re-run),
# `mv` places the extracted folder inside it instead of renaming.
download('http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar')
%mkdir -p data/caltech_256/models
!tar -xf 256_ObjectCategories.tar -C data/caltech_256/
!mv data/caltech_256/256_ObjectCategories data/caltech_256/train
!rm 256_ObjectCategories.tar

In [None]:
def read_dirs(path, folder):
    '''
    Collect image file names and labels from a one-directory-per-label tree.

    Every sub-directory of ``path/folder`` is treated as a label, except
    ``.ipynb_checkpoints`` and ``.DS_Store``. Entries that are not directories
    are skipped so a stray file cannot crash the scan. Only files whose name
    ends with ``.jpg`` are collected.

    Both directory listings are sorted, so the result does not depend on the
    order in which the filesystem happens to return entries.

    Args:
        path: Base directory.
        folder: Name of the sub-directory of ``path`` that contains one
            directory per label.

    Returns:
        A tuple ``(fnames, lbls, all_lbls)``:
        ``fnames`` - file paths relative to ``path/folder`` (``label/file``);
        ``lbls`` - the label of each entry in ``fnames``, same length/order;
        ``all_lbls`` - every label directory found, sorted, including ones
        that contain no ``.jpg`` file.
    '''
    ignored = ('.ipynb_checkpoints', '.DS_Store')
    full_path = os.path.join(path, folder)
    lbls, fnames, all_lbls = [], [], []
    for lbl in sorted(os.listdir(full_path)):
        lbl_dir = os.path.join(full_path, lbl)
        # Listing a plain file would raise NotADirectoryError.
        if lbl in ignored or not os.path.isdir(lbl_dir):
            continue
        all_lbls.append(lbl)
        for fname in sorted(os.listdir(lbl_dir)):
            if fname.endswith('.jpg'):
                fnames.append(os.path.join(lbl, fname))
                lbls.append(lbl)
    return fnames, lbls, all_lbls

In [None]:
# Scan PATH/train: relative image paths, their labels, and the label list.
fnames, lbls, all_lbls = read_dirs(PATH, 'train')

In [None]:
# Sanity check: number of images, of per-image labels, and of label folders.
len(fnames), len(lbls), len(all_lbls)

In [None]:
# One row per image: `id` is the path relative to the train folder,
# `label` is the name of the directory the image was found in.
label_df = pd.DataFrame({ 'id': fnames, 'label': lbls })

In [None]:
label_df.head()

In [None]:
# Number of images per label, largest first.
label_df.pivot_table(index='label', aggfunc=len).sort_values('id', ascending=False)

In [None]:
# Write the table to PATH/labels.csv, the file get_data() passes to
# ImageClassifierData.from_csv.
label_df.to_csv(f'{PATH}labels.csv', index=False)

In [None]:
# Read the CSV back into label_df.
label_df = pd.read_csv(f'{PATH}labels.csv')

In [None]:
# Number of labelled images. len() counts rows no matter how many columns the
# frame has, whereas DataFrame.size is rows x columns and only matches the
# row count after dividing by the exact column count.
n = len(label_df)
# Row indices used as the validation set (passed to from_csv in get_data()).
val_idxs = get_cv_idxs(n)

In [None]:
# Number of rows selected for validation.
len(val_idxs)

In [None]:
def get_data(sz, bs):
    """Build the image classification dataset for image size `sz`.

    Reads the module-level `arch`, `PATH` and `val_idxs`. Transforms come
    from `tfms_from_model` with side-on augmentations and `max_zoom=1.1`;
    images are read from the `train` folder using `PATH/labels.csv`.

    Args:
        sz: Image size handed to `tfms_from_model`.
        bs: Batch size handed to `ImageClassifierData.from_csv`.

    Returns:
        The dataset itself when `sz` is greater than 300; otherwise the
        result of `data.resize(340, 'tmp')` (presumably a pre-resized copy
        kept under `tmp` - confirm against the fastai documentation).
    """
    img_tfms = tfms_from_model(
        arch, sz, aug_tfms=transforms_side_on, max_zoom=1.1)
    dataset = ImageClassifierData.from_csv(
        PATH, 'train', f'{PATH}labels.csv',
        num_workers=4, val_idxs=val_idxs, tfms=img_tfms, bs=bs)
    if sz > 300:
        return dataset
    return dataset.resize(340, 'tmp')

In [None]:
# Dataset at size 224 with the configured batch size.
data = get_data(224, bs)

In [None]:
# Learner built from `arch` via ConvLearner.pretrained, with precompute
# enabled and ps=0.5 (presumably the dropout probability - confirm against
# the fastai documentation).
learn = ConvLearner.pretrained(arch, data, precompute=True, ps=0.5)

In [None]:
# Run the learning-rate finder; the following cells plot from learn.sched.
lrf=learn.lr_find()

In [None]:
learn.sched.plot_lr()

In [None]:
# Plot the finder output and mark the chosen learning rate with a red
# vertical line.
learn.sched.plot()
LEARN_RATE=3e-2
plt.axvline(x=LEARN_RATE, color="red");

In [None]:
# First fit while precompute is still True (set when the learner was
# created), then switch precompute off.
learn.fit(LEARN_RATE, 1)
learn.precompute=False

In [None]:
# precompute is set to False again here (already done at the end of the
# previous cell) before fitting with cycle_len=1.
learn.precompute=False
learn.fit(LEARN_RATE, 3, cycle_len=1)

In [None]:
# Unfreeze the model and define three learning rates, increasing by a factor
# of ten each (presumably one per layer group - confirm against fastai).
learn.unfreeze()
lr=np.array([3e-4,3e-3,3e-2])

In [None]:
learn.fit(lr, 3, cycle_len=1, cycle_mult=2)

In [None]:
# Serialise the whole model object into PATH/models/, using dill as the
# pickle module.
# NOTE(review): a file written this way is a pickle; loading it executes
# arbitrary code, so only load copies from a trusted source.
torch.save(learn.model, f'{PATH}models/caltech_256_resnext50_224.pt', pickle_module=dill)

In [None]:
# Save the learner state under the name '224_pre' ...
learn.save('224_pre')

In [None]:
# ... and load it back, so training can resume from this cell.
learn.load('224_pre')

In [None]:
# Switch the learner to the dataset built at size 299.
learn.set_data(get_data(299, bs))

In [None]:
# Same learning rates and cycle settings as the size-224 run.
learn.fit(lr, 3, cycle_len=1, cycle_mult=2)

In [None]:
# Serialise the size-299 model with dill; this is the file packed into
# model.tar.gz further down.
torch.save(learn.model, f'{PATH}models/caltech_256_resnext50_299.pt', pickle_module=dill)

In [None]:
learn.save('299')

In [None]:
learn.load('299')

In [None]:
# learn.TTA() returns log-predictions and targets. Exponentiate and average
# over axis 0 (presumably the augmentation axis - confirm against fastai).
log_preds,y = learn.TTA()
probs = np.mean(np.exp(log_preds),0)

# Accuracy of the averaged predictions against the targets.
accuracy_np(probs,y)

# Analyse results

In [None]:
# Predicted class index = position of the largest value along axis 1.
preds = np.argmax(probs, axis=1)

from sklearn.metrics import confusion_matrix
# Confusion matrix of targets (first argument) against predictions.
cm = confusion_matrix(y, preds)

In [None]:
plot_confusion_matrix(cm, data.classes)

## Save the class labels as a JSON file

In [None]:
# Write the label list next to the saved models. PATH already ends with '/',
# so no extra separator is added (same form as the model paths).
# NOTE(review): json.dump(json.dumps(...)) encodes twice - the file contains a
# JSON *string* whose content is the JSON list. Kept unchanged because the
# consumer of classes.json is not visible here; confirm whether it decodes
# twice before simplifying to a single json.dump(all_lbls, outfile).
with open(f'{PATH}models/classes.json', 'w') as outfile:
    json.dump(json.dumps(all_lbls), outfile)

## Create a tarball with the model file and classes.json

In [None]:
# Pack the size-299 model and classes.json into data/caltech_256/model.tar.gz;
# -C makes tar read both files from data/caltech_256/models/.
!tar czvf data/caltech_256/model.tar.gz -C data/caltech_256/models/ caltech_256_resnext50_299.pt classes.json

## Upload model to S3

In [None]:
import boto3

# Region of the default boto3 session and the caller's account id from STS.
# NOTE(review): region_name is None when no default region is configured,
# which would produce a bucket name ending in "-None" - confirm the
# environment always has a region set.
region = boto3.session.Session().region_name
account_id = boto3.client('sts').get_caller_identity().get('Account')

# Bucket name built as sagemaker-<account id>-<region>.
bucket = f'sagemaker-{account_id}-{region}'
print(f'Bucket is: {bucket}')

In [None]:
key='models/caltech256_fastai/model.tar.gz'   # S3 object key the model archive is stored under

In [None]:
# Upload PATH/model.tar.gz to s3://<bucket>/<key>.
boto3.client('s3').upload_file(PATH+"model.tar.gz", bucket, key)
print(f"Uploaded model artefacts to s3://{bucket}/{key}")