In [1]:
%matplotlib inline
from __future__ import print_function
import os,sys
# Parent directory of the current working directory. It is put at the front
# of sys.path below, presumably so that project-local modules (e.g. `vgg16`,
# imported further down) can be found -- verify against the repo layout.
notebook_code_root = os.path.dirname(os.getcwd())
# Data lives in a `data/` directory next to the code root. The trailing slash
# matters: later code builds paths by plain string concatenation.
data_root = os.path.dirname(notebook_code_root) + '/data/'
print("Data root: %s" % data_root)
sys.path.insert(0,notebook_code_root)

import bcolz
from keras.preprocessing import image
from keras.utils.np_utils import to_categorical
import matplotlib.pyplot as plt
import math
import numpy as np
from vgg16 import *

from IPython.display import FileLink

def save_array(fname, arr):
    """Persist `arr` as a bcolz carray rooted at `fname`.

    The carray is opened with mode 'w' and flushed before returning.
    """
    carr = bcolz.carray(arr, rootdir=fname, mode='w')
    carr.flush()
    
def load_array(fname):
    """Open the bcolz array stored at `fname` and return a full slice of it."""
    stored = bcolz.open(fname)
    return stored[:]


class NotebookData:
    """Resolves the directories of a dataset and lazily loads its arrays.

    All paths are built by string concatenation, so `data_dir` must end with
    a path separator. When `sample_mode` is true every directory is resolved
    under `<data_dir><data>/sample/` instead of `<data_dir><data>/`.

    The image arrays, classes, labels and class indices are loaded on first
    access through the accessor methods (`trn_data`, `val_data`, ...). With
    `preprocess=True` they are read from the image directories and saved to
    the preprocess directory; otherwise they are read back from there.
    """

    # (height, width) every image is resized to when read from disk.
    IMAGE_SIZE = (224, 224)

    def __init__(self,
                 data_dir,
                 data='statefarm',
                 results_dir='results',
                 sample_mode=True,
                 train=True,
                 preprocess=True):
        """Stores the options; no data is read until an accessor is called.

        Args:
            data_dir: directory containing the dataset directory; must end
                with a path separator.
            data: name of the dataset directory inside `data_dir`.
            results_dir: name of the results directory inside the root.
            sample_mode: use the `sample/` subset and force a batch size of
                `sample_batch_size`.
            train: stored as `self.train`; not read anywhere in this class.
            preprocess: if true, (re)build the saved arrays from the image
                directories; if false, load the previously saved arrays.
        """
        self.data_root = data_dir + data + '/'
        self.sample_root = self.data_root + 'sample/'
        self.sample_mode = sample_mode
        self.sample_batch_size = 1
        self.rdir = results_dir
        self.train = train
        self.preprocess = preprocess
        self.training_data = None
        self.validation_data = None
        self.training_labels = None
        self.validation_labels = None
        self.test_data = None
        self.cindices = None
        # Initialised here so the attributes exist before the first load.
        self.trn_classes = None
        self.val_classes = None

    def root_dir(self):
        """Returns the sample root in sample mode, else the full data root."""
        return self.sample_root if self.sample_mode else self.data_root

    def test_dir(self):
        """Returns the directory holding the test images."""
        return self.root_dir() + 'test/'

    def results_dir(self):
        """Returns the results directory (no trailing separator)."""
        return self.root_dir() + self.rdir

    def train_dir(self):
        """Returns the directory holding the training images."""
        return self.root_dir() + 'train/'

    def valid_dir(self):
        """Returns the directory holding the validation images."""
        return self.root_dir() + 'valid/'

    def pproc_dir(self):
        """Returns the directory where preprocessed arrays are saved."""
        return self.root_dir() + 'preprocessed/'

    def batch_size(self, requested_size=8):
        """Returns `requested_size`, or the sample batch size in sample mode."""
        return self.sample_batch_size if self.sample_mode else requested_size

    def _flow(self, generator, directory):
        """Returns an unshuffled, label-free, one-image-per-batch iterator."""
        return generator.flow_from_directory(
            directory,
            target_size=self.IMAGE_SIZE,
            shuffle=False,
            class_mode=None,
            batch_size=1)

    def load_data_and_labels(self):
        """Loads the batches and labels to the internal state.

           Upon loading, the data is accessible through the
           corresponding methods.
        """
        gen = image.ImageDataGenerator()
        if self.preprocess:
            print('Preprocessing data...')
            if not os.path.isdir(self.pproc_dir()):
                os.makedirs(self.pproc_dir())

            batch_arr = []
            for ld, segment in [(self.train_dir(), 'train'),
                                (self.valid_dir(), 'valid'),
                                (self.test_dir(), 'test')]:
                # TODO(ness): segment = os.basename(ld)
                flowgen = self._flow(gen, ld)
                # The iterator yields one image per batch, so pulling
                # `flowgen.n` batches reads every image exactly once.
                data = np.concatenate([flowgen.next() for _ in range(flowgen.n)])
                save_array(self.pproc_dir() + segment + '-bc', data)

                # Save the classes.
                save_array(self.pproc_dir() + segment + '-cl', flowgen.classes)

                batch_arr.append((data, flowgen.classes, flowgen.class_indices))

            train_set, valid_set, test_set = batch_arr

            # Set the data.
            self.training_data = train_set[0]
            self.validation_data = valid_set[0]
            self.test_data = test_set[0]

            # Class indices are dictionaries of the form
            # {'category_name': 0, 'category_name_2': 1}. They
            # make the mapping between numerical class indices and
            # a human-readable category name. They should be the same for
            # validation and training, so only the training ones are kept;
            # a mismatch is reported because it makes the labels incomparable.
            self.cindices = train_set[2]
            if valid_set[2] != self.cindices:
                print('WARNING: training and validation class indices differ.')

            # Classes are zero-indexed and represent a category in
            # numerical form. So if the classes are 'dog' and 'cat',
            # the possible class values will be 0 and 1.
            self.trn_classes = train_set[1]
            self.val_classes = valid_set[1]
            print_done = 'Done preprocessing.'
        else:
            print('Loading data...')
            # Load the pre-saved arrays. See the preprocessing branch for
            # the meaning of the data.
            self.training_data = load_array(self.pproc_dir() + 'train-bc')
            self.validation_data = load_array(self.pproc_dir() + 'valid-bc')
            self.test_data = load_array(self.pproc_dir() + 'test-bc')
            self.trn_classes = load_array(self.pproc_dir() + 'train-cl')
            self.val_classes = load_array(self.pproc_dir() + 'valid-cl')

            # The class indices are not saved, so they are read from a fresh
            # iterator over the training directory.
            self.cindices = self._flow(gen, self.train_dir()).class_indices
            print_done = 'Done loading.'

        # Labels are the one-hot encoded (i.e. categorical)
        # version of the classes. In other words, if there are
        # 5 classes and an element belongs to class 2,
        # its label will be [0,0,1,0,0] (index 2).
        # The class count is passed explicitly (positionally, because the
        # keyword name differs between Keras versions): otherwise the width is
        # inferred from the largest class present, and a split that happens to
        # miss the last class would get narrower labels than the other split.
        num_classes = len(self.cindices)
        self.training_labels = to_categorical(self.trn_classes, num_classes)
        self.validation_labels = to_categorical(self.val_classes, num_classes)
        print(print_done)

    def trn_data(self):
        """Returns the training images, loading everything on first use."""
        if self.training_data is None:
            self.load_data_and_labels()
        return self.training_data

    def val_data(self):
        """Returns the validation images, loading everything on first use."""
        if self.validation_data is None:
            self.load_data_and_labels()
        return self.validation_data

    def pp_test_data(self):
        """Returns the test images, loading everything on first use."""
        if self.test_data is None:
            self.load_data_and_labels()
        return self.test_data

    def trn_labels(self):
        """Returns the one-hot training labels, loading on first use."""
        if self.training_labels is None:
            self.load_data_and_labels()
        return self.training_labels

    def val_labels(self):
        """Returns the one-hot validation labels, loading on first use."""
        if self.validation_labels is None:
            self.load_data_and_labels()
        return self.validation_labels

    def class_indices(self):
        """Returns the category-name -> class-number mapping."""
        if self.cindices is None:
            self.load_data_and_labels()
        return self.cindices

    def __str__(self):
        """Returns a multi-line summary of the resolved directories."""
        return ('Options:\n'
            '  Testing directory: {0}\n'
            '  Training directory: {1}\n'
            '  Validation directory: {2}\n'
            '  Preprocess directory: {3}\n'
            '  Results directory: {4}'
                .format(self.test_dir(),
                        self.train_dir(),
                        self.valid_dir(),
                        self.pproc_dir(),
                        self.results_dir()))


# Notebook-wide options: work on the sample subset and load the previously
# saved arrays instead of regenerating them from the image directories.
opts = NotebookData(data_dir=data_root, sample_mode=True, preprocess=False)
# Human-readable names for the ten classes.
# NOTE(review): presumably listed in class-number order (c0..c9) -- confirm
# against the dataset description before using it to label predictions.
class_names = [
  'safe driving',
  'texting - right',
  'talking on the phone - right',
  'texting - left',
  'talking on the phone - left',
  'operating the radio',
  'drinking',
  'reaching behind',
  'hair and makeup',
  'talking to passenger',
]
print(opts)

def process_model(model,opt,name,create_submission=False):
    iter_path = opt.results_dir()+ '/' + name
    if not os.path.isdir(iter_path):
        os.makedirs(iter_path)
    model.save_weights(iter_path + '/temp_custom.h5')

    if create_submission:
        # Save the results to usable files.
        filenames = None
        generator = image.ImageDataGenerator()
        batches = generator.flow_from_directory(opt.test_dir(),
                                                target_size=(224,224),
                                                class_mode=None,
                                                batch_size=1)
        print('Batch count: %d' % batches.n)
        print('Batches: %s' % batches)
        preds = model.predict_generator(batches, batches.n,verbose=1)
        filenames = batches.filenames
        predictions_path = iter_path + '/temp_custom_preds.dat'
        save_array(predictions_path, preds)
        print('Saved predictions to: %s' % predictions_path)
        filenames_path = iter_path + '/temp_custom_filenames.dat'
        save_array(filenames_path, filenames)
        print('Saved filenames to: %s' % filenames_path)
    
        # Create the response file.
        file_column = [pth[8:] for pth in filenames]
        clipped_preds = np.clip(preds, 0.05, 0.95)
        preds_col = [','.join(['%.2f' % p for p in pred]) for pred in clipped_preds]
        entries = [','.join([f,p]) for f,p in zip(file_column, preds_col)]
        entries = np.array(entries)

        class_names = ['c%d' % i for i in range(10)]
        title_row = ','.join(['img'] + class_names)

        submission_file_name = iter_path + 'submission.csv'
        np.savetxt(submission_file_name,
                   entries,
                   fmt='%s',
                   header=title_row,
                   comments='')
    
        FileLink(submission_file_name)
    
# Define a fit method to save on time.
def fit_model(model, tbatches, vbatches, opt, batch_size=8, epochs=5):
    """Trains `model` on `tbatches`, validating on `vbatches`.

    The batch size actually used is whatever `opt.batch_size(batch_size)`
    returns; it is written onto both iterators before training starts.
    """
    effective_size = opt.batch_size(batch_size)
    for batches in (tbatches, vbatches):
        batches.batch_size = effective_size
    model.fit_generator(tbatches,
                        validation_data=vbatches,
                        epochs=epochs)
    
def load_model(model, opt, iter_name):
    """Loads the weights saved for iteration `iter_name` into `model`.

    The weights file is `<results dir>/<iter_name>/temp_custom.h5`; its path
    is printed before loading and 'loaded' is printed afterwards.
    """
    iteration_dir = opt.results_dir() + '/' + iter_name
    weights_path = iteration_dir + '/temp_custom.h5'
    print(weights_path)
    model.load_weights(weights_path)
    print('loaded')

Data root: /home/ubuntu/data/


Using Theano backend.


Options:
  Testing directory: /home/ubuntu/data/statefarm/sample/test/
  Training directory: /home/ubuntu/data/statefarm/sample/train/
  Validation directory: /home/ubuntu/data/statefarm/sample/valid/
  Preprocess directory: /home/ubuntu/data/statefarm/sample/preprocessed/
  Results directory: /home/ubuntu/data/statefarm/sample/results


Using cuDNN version 5103 on context None
Mapped name None to device cuda: Tesla K80 (0000:00:1E.0)


In [None]:
# Load the default model.
vgg_mod = Vgg16()
# Batch iterators over the training and validation image directories (the
# printed repr in the cell output shows a Keras DirectoryIterator).
training_batches = vgg_mod.get_batches(opts.train_dir())
validation_batches = vgg_mod.get_batches(opts.valid_dir())
print(training_batches)

# NOTE(review): presumably adapts the model's output layer to the classes
# found in `training_batches` -- confirm in vgg16.py.
vgg_mod.finetune(training_batches)

vgg_mod.model.compile(optimizer=Adam(lr=0.001), # Learning rate for the first fine-tuning pass; later cells lower it to 0.0001.
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])
print('Tuning model compiled. Starting training.')
# `opts` was created with sample_mode=True, so `opts.batch_size` replaces the
# requested batch size of 64 with the sample batch size (1).
fit_model(vgg_mod.model,
          training_batches,
          validation_batches,
          opts,
          batch_size=64,
          epochs=5)

Found 160 images belonging to 10 classes.
Found 20 images belonging to 10 classes.
<keras.preprocessing.image.DirectoryIterator object at 0x7faeb8511cd0>
Tuning model compiled. Starting training.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
#vgg_mod.model.summary()

#process_model(vgg_mod.model, opts, 'v2/top-layer', create_submission=True)
# Baseline predictions on the preprocessed test images, taken before any
# change is made to the model in this cell.
preds, idxs, classes = vgg_mod.predict(opts.pp_test_data())
#load_model(vgg_mod.model, opts, 'v1')

# Now that we've finetuned, we mark the last two layers as trainable
# to see if it improves the results. (The commented-out snippet below would
# instead locate the first Dense layer.)
# first_dense_idx = [index
#                    for (index, layer)
#                    in enumerate(vgg_mod.model.layers)
#                    if type(layer) is Dense][0]
for layer in vgg_mod.model.layers[-2:]:
    layer.trainable = True

# Predictions after flipping the trainable flags but before recompiling;
# kept to compare against the baseline above.
preds2, idxs2, classes2 = vgg_mod.predict(opts.pp_test_data())
    
vgg_mod.model.compile(optimizer=Adam(lr=0.0001), # Lower learning rate than the first pass (0.001); we are already close.
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])
# Predictions after recompiling but before any further training.
preds3, idxs3, classes3 = vgg_mod.predict(opts.pp_test_data())
print('All-dense-layer model compiled. Starting training.')
# In sample mode the requested batch size of 64 is replaced by 1 inside
# fit_model (see NotebookData.batch_size).
fit_model(vgg_mod.model,
          training_batches,
          validation_batches,
          opts,
          batch_size=64,
          epochs=5)
# Predictions after the additional training.
preds4, idxs4, classes4 = vgg_mod.predict(opts.pp_test_data())

# One row per test image, one column per stage (baseline, after unfreezing,
# after recompiling, after training), holding the third value returned by
# `vgg_mod.predict` at each stage.
preds_all = np.stack([classes, classes2, classes3, classes4], axis=1)
print(preds_all)

Loading data...
Found 160 images belonging to 10 classes.
Done loading.
All-dense-layer model compiled. Starting training.
Epoch 1/5
Epoch 2/5
  4/160 [..............................] - ETA: 10s - loss: 0.8298 - acc: 0.7500

In [None]:
# Freeze every layer, then re-enable training for the final layer only.
for layer in vgg_mod.model.layers:
    layer.trainable = False

vgg_mod.model.layers[-1].trainable = True
# Recompile after changing the trainable flags, before training again.
vgg_mod.model.compile(optimizer=Adam(lr=0.0001), # Lower learning rate than the first pass (0.001); we are already close.
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])
# In sample mode the requested batch size of 64 is replaced by 1 inside
# fit_model (see NotebookData.batch_size).
fit_model(vgg_mod.model,
          training_batches,
          validation_batches,
          opts,
          batch_size=64,
          epochs=5)