In [0]:
# Use this cell to change the working directory to one level above.
# The default directory is the "content" folder; its parent directory was
# preferable because it made uploads more convenient for me.

# The uploaded files are the helper and model modules that the training cell
# imports later (food_classifier_models, helper_funcs). They were written
# previously and are imported from the working directory rather than copied
# into this notebook.

# Move up one directory, then list its contents to confirm the uploads are there.
%cd .. 
!ls

In [0]:
# Use this cell to mount your Google Drive; it is useful for storing any
# output files in a safe location.
# The drive is mounted at /content/gdrive, which is the prefix that
# RESULTS_BASE in the training cell builds on.

from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Use this cell to connect to the Google Cloud Storage bucket that holds your
# dataset: it authenticates the Colab user and, when a TPU runtime is
# attached, passes the credentials on to the TPU.

import json
import os
import pprint
import re
import time
import tensorflow as tf

from google.colab import auth

bucket = 'YOUR_BUCKET_NAME'

auth.authenticate_user()

if 'COLAB_TPU_ADDR' not in os.environ:
  # No TPU attached: an empty master string is used for the session below.
  TF_MASTER = ''
else:
  print("Found colab tpu addr\n")
  TF_MASTER = 'grpc://' + os.environ['COLAB_TPU_ADDR']

  # Upload credentials to TPU.
  with tf.Session(TF_MASTER) as sess:
    # Credentials file read here is the one at /content/adc.json.
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(sess, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

# List the devices visible through TF_MASTER as a sanity check.
with tf.Session(TF_MASTER) as session:
  pprint.pprint(session.list_devices())

In [0]:
########################################################################
###################  VERY IMPORTANT  ###################################
# Use this cell to downgrade TensorFlow to version 1.13; the code is not 
# compatible with 1.14
#
# NOTE(review): tensorflow is already imported by the GCS/TPU setup cell
# earlier in this notebook, so the `import` below returns the module that is
# already loaded rather than the freshly installed one. If the version printed
# here is not 1.13.2, restart the runtime after the install and re-run the
# notebook (or run this cell before anything imports tensorflow).
!pip install tensorflow==1.13.2
import tensorflow as tf
# Print the version of the loaded module so the downgrade can be verified.
print(tf.__version__)

##################### 

The cell below corresponds to the main component of this notebook. It is responsible for training the network and saving the results (weights, accuracy, etc.) to Google Drive. 

Weights are exported in .h5 format. Code for saving the entire model is also included (commented out by default at the end of the training cell); the full-model files were found to be either glitchy, buggy, or redundant with respect to the exported weights, and so were not used. Results are saved into a .csv file for processing and plotting at a later time. 

In terms of process flow, training-related tasks were performed largely using this notebook, and various experiments were conducted by commenting/uncommenting various code blocks according to my needs. Non-training tasks were performed locally, since they did not require the TPU.

##################### 

In [0]:
# Below are import statements to use required frameworks
from __future__ import absolute_import, division, print_function

import os
import sys

# ----------------------------------------------------------- #

import tensorflow as tf
import numpy as np

import traceback	# for error tracebacks
import csv

# Imports below are for files in the working directory
import food_classifier_models as models
import helper_funcs as helper

## END IMPORTS


##
# Hyperparameters

# --- Optimizer settings ---
INITIAL_LEARNING_RATE = 0.01
LR_DECAY = 0.0005
MOMENTUM = 0.9

# --- Training length and step schedule ---
NUM_EPOCHS = 60
# Each entry is [epoch, learning rate]: update_lr switches to that learning
# rate when the given epoch is reached (step decay).
EPOCH_UPDATES = [[25, 0.002], [45, 0.0004]]

# --- Input pipeline ---
# Batch size is multiplied by 8 by the TPU, so a value of 64 actually
# corresponds to 512. If receiving errors, try to ensure that all samples are
# captured in the batch, and drop the remainder.
BATCH_SIZE = 64
# Each entry in the shuffle buffer is approximately 320kB, so 1000 elements
# correspond to ~320MB. Choose a size that matches the amount of memory you'd
# like to use.
SHUFFLE_BUFFER_SIZE = 6144

# --- Steps per epoch ---
# Tailor STEPS_PER_EPOCH to the number of batched entries you'd expect given
# your chosen image augmentation scheme. For UEC100 at my batch size that was
# 22 with no augmentation, 44 with minimal, and 200 with more augmentation;
# the UEC256 figures below are presumably listed in the same order.
#STEPS_PER_EPOCH = 22 # uecfood100 22 44 200
STEPS_PER_EPOCH = 110 # uecfood256 55 110 497
#VALID_STEPS = 5 # validation steps for UEC100
VALID_STEPS = 10 # validation steps for UEC256
# Convenience constants: these select which experiment the training cell runs.
MODEL = 0  # 0 for mobilenetv2, 1 for densenet
USE_WIDESLICE = False
IS_TRAINED = False

DATASET = 1  # 0 for UEC100, 1 for UEC256


# Dataset paths

# Base locations: the Cloud Storage bucket holding the tfrecords files and the
# (mounted) Google Drive folder that receives the training outputs.
GCS_BASE = 'gs://YOUR_BUCKET_NAME' # <------------- ENTER YOUR BUCKET NAME
UEC256_DIR = '/uecfood256_split' # <--- Folder path from base bucket
RESULTS_BASE = '/content/gdrive/My Drive/PATH_TO_RESULTS' 
#                                 ^^^^^^^^^^^^^^ ENTER YOUR GDRIVE PATH
# The five tfrecords splits. input_dataset trains on all but the last one and
# valid_dataset validates on the last one.
UEC_TFRECORD_SPLIT = ['/val0.tfrecords', '/val1.tfrecords', '/val2.tfrecords',
                      '/val3.tfrecords', '/val4.tfrecords']


# Per-dataset settings, kept in one table to maintain a single point of change:
#   DATASET -> (number of food classes, base path of the tfrecords files)
_DATASET_SETTINGS = {
  0: (100, GCS_BASE),               # UEC100: records sit at the bucket root
  1: (256, GCS_BASE + UEC256_DIR),  # UEC256: records sit in UEC256_DIR
}

# Reject unsupported selectors here, with a clear message, instead of leaving
# the derived values as None and failing later in an unrelated place.
if DATASET not in _DATASET_SETTINGS:
  raise ValueError(
      "Unsupported DATASET value: {!r} (expected 0 or 1)".format(DATASET))
if MODEL not in (0, 1):
  raise ValueError(
      "Unsupported MODEL value: {!r} (expected 0 or 1)".format(MODEL))

NUM_CATEGORY, DS_BASE = _DATASET_SETTINGS[DATASET]


# Paths and file names
# NOTE(review): the full-model file names are identical for UEC100 and UEC256
# ('modelmobnet.h5' / 'modeldense.h5'), unlike the weights and metrics names.
# Saving full models for both datasets into the same folder would overwrite
# one with the other -- confirm whether that is intended.
UEC100_MOBNET_WEIGHTS = 'mob100weights.h5'
UEC100_MOBNET = 'modelmobnet.h5'
UEC100_MOBNET_METRICS = 'metricsmobnet.csv'

UEC256_MOBNET_WEIGHTS = 'mob256weights.h5'
UEC256_MOBNET = 'modelmobnet.h5'
UEC256_MOBNET_METRICS = 'metricsmobnet256.csv'

UEC100_DENSE_WEIGHTS = 'dense100weights.h5'
UEC100_DENSE = 'modeldense.h5'
UEC100_DENSE_METRICS = 'metricsdense.csv'

UEC256_DENSE_WEIGHTS = 'dense256weights.h5'
UEC256_DENSE = 'modeldense.h5'
UEC256_DENSE_METRICS = 'metricsdense256.csv'

# (DATASET, MODEL) -> (weights file, full-model file, metrics file)
_RESULT_FILES = {
  (0, 0): (UEC100_MOBNET_WEIGHTS, UEC100_MOBNET, UEC100_MOBNET_METRICS),
  (0, 1): (UEC100_DENSE_WEIGHTS, UEC100_DENSE, UEC100_DENSE_METRICS),
  (1, 0): (UEC256_MOBNET_WEIGHTS, UEC256_MOBNET, UEC256_MOBNET_METRICS),
  (1, 1): (UEC256_DENSE_WEIGHTS, UEC256_DENSE, UEC256_DENSE_METRICS),
}

# Join with os.path.join so a separator is always placed between the results
# folder and the file name. Plain string concatenation produced paths such as
# '.../PATH_TO_RESULTSmob256weights.h5' whenever RESULTS_BASE had no trailing
# slash.
BASE_PATH_WEIGHTS, BASE_PATH_MODEL, BASE_PATH_METRICS = [
  os.path.join(RESULTS_BASE, file_name)
  for file_name in _RESULT_FILES[(DATASET, MODEL)]
]


## END constant definitions


def update_lr(epoch, lr, schedule=None):
  '''
  Callback function that updates the learning rate for the Keras fit method
  (it is wrapped in a LearningRateScheduler by the training code).

  @param epoch : The current epoch, as supplied by Keras
  @param lr : The current learning rate
  @param schedule : Optional list of [epoch, learning_rate] entries. Defaults
  to the module-level EPOCH_UPDATES, which keeps the original two-argument
  call working unchanged.
  @return the learning rate of the first schedule entry whose epoch equals
  `epoch`; lr otherwise. The result is always a float.
  '''
  if schedule is None:
    schedule = EPOCH_UPDATES

  for entry in schedule:
    update_epoch, updated_lr = entry[0], entry[1]
    if epoch == update_epoch:
      # The entry holds the new learning rate itself, not a multiplier.
      print("Updating lr: ", updated_lr)
      return float(updated_lr)

  # No entry matches this epoch: keep the current learning rate.
  return float(lr)
  
        
def input_dataset():
  '''
  Function wrapper that builds the training dataset. The function itself is
  handed to model.fit, since keras-TPU did not support direct feeding at the
  time of experimentation (around ~Summer 2019).

  The pipeline reads every tfrecords split except the last one, parses and
  caches the examples, applies augmentation, then shuffles, batches and
  repeats.

  @return training_dataset : The modified training dataset, including batching 
  and shuffles
  '''
  # All dataset paths except the last one (which is used for validation)
  training_dataset_paths = [DS_BASE + split_file for split_file in UEC_TFRECORD_SPLIT[:-1]]
  training_dataset = tf.data.Dataset.list_files(training_dataset_paths)
  # tf.data.experimental.parallel_interleave replaces the deprecated
  # tf.contrib.data alias and matches the other tf.data.experimental calls
  # used in this file.
  training_dataset = training_dataset.apply(tf.data.experimental.parallel_interleave(
      tf.data.TFRecordDataset, cycle_length=4, sloppy=True))
  # Parses each example (or entry) in the .tfrecords file
  training_dataset = training_dataset.map(helper.parse_tfrecord_example, num_parallel_calls=8)
  # Cache the parsed examples before augmentation is applied
  training_dataset = training_dataset.cache()
  # Apply augmentation. This example evaluates my third scheme
  training_dataset = training_dataset.map(helper.augment_image_three, num_parallel_calls=8)
  # My augmentation scheme returns a list of images, which needs to be unbatched
  training_dataset = training_dataset.apply(tf.data.experimental.unbatch())
  training_dataset = training_dataset.shuffle(SHUFFLE_BUFFER_SIZE, reshuffle_each_iteration=True)

  # Re-batches the shuffled data, dropping remainders to avoid errors
  training_dataset = training_dataset.batch(BATCH_SIZE, drop_remainder=True)
  # NOTE(review): prefetch runs after batch(), so its argument counts batches,
  # not images; BATCH_SIZE batches is a large buffer -- confirm it is intended.
  training_dataset = training_dataset.prefetch(BATCH_SIZE)
  training_dataset = training_dataset.repeat()

  return training_dataset

def valid_dataset():
  '''
  Function wrapper that builds the validation dataset, similar to
  input_dataset. It reads only the last tfrecords split, parses and caches the
  examples, shuffles them, then resizes and batches them in one step.

  @return validation_dataset : The modified validation dataset
  '''
  # The last split is held out from training and used for validation
  testing_dataset_path = DS_BASE + UEC_TFRECORD_SPLIT[-1]
  validation_dataset = tf.data.Dataset.list_files(testing_dataset_path)
  # tf.data.experimental.parallel_interleave replaces the deprecated
  # tf.contrib.data alias and matches the other tf.data.experimental calls
  # used in this file.
  validation_dataset = validation_dataset.apply(tf.data.experimental.parallel_interleave(
      tf.data.TFRecordDataset, cycle_length=4, sloppy=True))
  validation_dataset = validation_dataset.map(helper.parse_tfrecord_example, num_parallel_calls=8)
  validation_dataset = validation_dataset.cache()
  # 2850 arbitrarily chosen based on manually-determined size of dataset. Note
  # that this is less efficient for UEC100, which has fewer entries; consider 
  # using 2 different values. I wanted convenience, so left it as is
  validation_dataset = validation_dataset.shuffle(2850, reshuffle_each_iteration=True)
  # Resize and batch in a single fused step, dropping the incomplete final batch
  validation_dataset = validation_dataset.apply(
      tf.data.experimental.map_and_batch(
        map_func=helper.resize_validation_image, 
        batch_size=BATCH_SIZE, 
        drop_remainder=True,
        num_parallel_calls=8
      ))
  # NOTE(review): prefetch runs after batching, so its argument counts batches,
  # not images -- confirm that buffering BATCH_SIZE batches is intended.
  validation_dataset = validation_dataset.prefetch(BATCH_SIZE)
  validation_dataset = validation_dataset.repeat()

  return validation_dataset



# Main training script. Builds the model selected by MODEL, compiles it,
# converts it for the Colab TPU, trains it, and then writes the training
# history (.csv) and the weights (.h5) to the configured results paths.
# Manual changes are required to re-run with different experimental settings,
# like model.
if __name__ == "__main__":
  # Resolve the TPU address first so that a missing TPU runtime is reported
  # immediately and clearly, rather than as a KeyError after model creation.
  tpu_host = os.environ.get('COLAB_TPU_ADDR')
  if not tpu_host:
    raise RuntimeError(
        "COLAB_TPU_ADDR is not set; this training cell requires a Colab TPU runtime.")
  tpu_address = 'grpc://' + tpu_host

  tf.keras.backend.clear_session() # destroys old graphs to clear clutter

  print("tf version: ", tf.VERSION) # <-------- ENSURE VERSION 1.13.2
  print("keras version: ", tf.keras.__version__)
  
  if MODEL == 0:
    print("\nCreating mobnet model...\n")
    model = models.MobNetVTwo(BATCH_SIZE, NUM_CATEGORY, USE_WIDESLICE)
    
    #############################
    # Load weights here if desired. Comment the portion out if you dont want to 
    #############################
    #model.load_weights('mob100weights.h5')
    #model.load_weights('mob256weights.h5')
    
  elif MODEL == 1:
    print("\nCreating dense model...\n")
    model = models.DenseNet(BATCH_SIZE, NUM_CATEGORY, USE_WIDESLICE)

    #############################
    # Load weights here if desired. Comment the portion out if you dont want to 
    #############################
    #model.load_weights('dense100weights.h5')
    #model.load_weights('dense256weights.h5')

  else:
    # Without this branch an unsupported MODEL value would only surface below
    # as a NameError on `model`.
    raise ValueError(
        "Unsupported MODEL value: {!r} (expected 0 or 1)".format(MODEL))
  
  
  #############################
  # Use below to determine how much fine-tuning you wish to do 
  #############################
  num_layers = len(model.layers)
  
  # By default every layer is trainable. Swap in the commented block below to
  # freeze the first 80% of the layers instead.
  for i, layer in enumerate(model.layers):
    layer.trainable = True
    
#     threshold = int(0.8 * num_layers)
#     if i <= threshold:
#       layer.trainable = False
#     else:
#       layer.trainable = True
  
  
  
  # This prints the model summary for review. Uncomment if you'd like to see it
  # model.summary()
  
  # Creates a stochastic gradient descent optimizer for training. Experiment 
  # with your own if you'd like
  optimizer = tf.keras.optimizers.SGD(lr=INITIAL_LEARNING_RATE, momentum=MOMENTUM, decay=LR_DECAY, nesterov=True)
  
  print("\nModel built. Compiling model...\n")

  print ('TPU address is', tpu_address)
  
  model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['categorical_accuracy']
  )
  
  
  # This is necessary to use the TPU from a Keras model
  model = tf.contrib.tpu.keras_to_tpu_model(
      model,
      strategy=tf.contrib.tpu.TPUDistributionStrategy(
          tf.contrib.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
      )
  )
  
  # Learning rate callback to update the learning rate at each epoch. This 
  # assigns the previously-defined callback function
  lr_updater_cbk = tf.keras.callbacks.LearningRateScheduler(update_lr)
  print("\nModel compiled. Beginning training loop...\n")
  
  
  # Train the model and get the history. The dataset builder functions
  # themselves (not their results) are passed to fit.
  history = model.fit(
    x=input_dataset,
    epochs=NUM_EPOCHS,
    verbose=2,
    callbacks=[lr_updater_cbk],
    validation_data=valid_dataset,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_steps=VALID_STEPS
  )
  
  print("\nCompleted training. Saving history to drive")
  
  # Saves history to gdrive for later processing: one row per metric, holding
  # the metric name followed by its list of per-epoch values. The `with`
  # block closes the file, so no explicit close() is needed afterwards.
  with open(BASE_PATH_METRICS, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in history.history.items():
      writer.writerow([key, value])
  
  print("\nSaving weights to drive.")
  savepath = BASE_PATH_WEIGHTS
  model.save_weights(savepath, overwrite=True)
  print("\nWeights saved.")
  
  # Use below to save model if you'd like. I personally did not find much use 
  # for it, and it may have been buggy for me too -- I do not recall anymore.
  
#   print("\nAttempting to save model")
#   model.save(
#     BASE_PATH_MODEL,
#     overwrite=True,
#     include_optimizer=True
#   )
  
  print("\nSaved. Exiting program")

tf version:  1.13.2
keras version:  2.2.4-tf

Creating mobnet model...

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
mobnetv2_in (InputLayer)        (512, 224, 224, 3)   0                                            
__________________________________________________________________________________________________
Conv1_pad (ZeroPadding2D)       (512, 225, 225, 3)   0           mobnetv2_in[0][0]                
__________________________________________________________________________________________________
Conv1 (Conv2D)                  (512, 112, 112, 48)  1296        Conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_Conv1 (BatchNormalizationV1) (512, 112, 112, 48)  192         Conv1[0][0]                      
_____________________________________

RuntimeError: ignored