In [None]:
%%bash
# Pin the versions this notebook was executed with (they match the versions
# printed by the import cell), so a re-run does not pull incompatible releases.
pip install -U tensorflow==1.3.0
pip install Keras==2.0.8

In [107]:
import tensorflow as tf
import keras
import keras.backend as K
from keras.models import Model
from keras.layers import Dense, Flatten, GlobalAveragePooling2D, Input
from keras.optimizers import Adam, Adagrad, RMSprop, SGD

print(tf.__version__, keras.__version__)

import numpy as np
import pandas as pd

import google.datalab.storage as storage

('1.3.0', '2.0.8')


In [124]:
# Fix NumPy's global RNG state so the shuffles done with np.random are repeatable.
np.random.seed(42)

In [26]:
# Notebook-wide configuration

# Bucket layout
BUCKET = 'aus-cloud-ml-handson'
PREFIX = 'cdisc-data/bottleneck-features/'
SUFFIX = '.npy'  # extension of files written with numpy's save

# Locations derived from the bucket and prefix
GCS_PATH = 'gs://%s/%s' % (BUCKET, PREFIX)
DATA_PATH = ''.join([GCS_PATH, 'input-ssd/'])
TRAIN_BSON_PATH, TEST_BSON_PATH = (
  DATA_PATH + bson_name for bson_name in ('train.bson', 'test.bson')
)

# Dataset sizes
NUM_TRAIN_PRODUCTS = 7069896
NUM_TEST_PRODUCTS = 1768172
NUM_CLASSES = 5270

# Image and batch dimensions (IMG_HEIGTH keeps its existing spelling so that
# any code referring to it keeps working)
IMG_WIDTH = 180
IMG_HEIGTH = 180
BATCH_SIZE = 128

TODO:
- Read bottleneck features and labels
- List all files in alphabetical order
- Build classifier NN model
- Merge predictions

In [78]:
# Read bottleneck features and labels
# List all files in alphabetical order
def read_files_from_gcs(bucket, prefix, suffix=''):
  """Lazily list the object keys in a GCS bucket under a prefix.

  Args:
    bucket: name of the bucket to list.
    prefix: only objects whose key starts with this prefix are requested.
    suffix: only keys that END with this string are kept; the default ''
      keeps every key.

  Returns:
    A generator of object keys, in the order the bucket listing yields them.
  """
  gcs_bucket = storage.Bucket(bucket)
  # endswith (not `in`): a suffix such as '.npy' must not match 'x.npy.bak'
  return (
    obj.key
    for obj in gcs_bucket.objects(prefix=prefix)
    if obj.key.endswith(suffix)
  )

def load_numpy_gcs(full_filepath):
  """Load an array stored in numpy's .npy format through tf.gfile.

  Args:
    full_filepath: path of the file to read, as accepted by tf.gfile.Open
      (the callers in this notebook pass gs:// URIs).

  Returns:
    Whatever np.load returns for the file contents.
  """
  # .npy is a binary format: open with 'rb' so the bytes are never decoded as text
  with tf.gfile.Open(full_filepath, 'rb') as f:
    return np.load(f)

In [47]:
# train_features = read_files_from_gcs(bucket=BUCKET, prefix=PREFIX+'train_features_')
# train_features = np.array(list(train_features))

# train_labels = read_files_from_gcs(bucket=BUCKET, prefix=PREFIX+'train_labels_')
# train_labels = np.array(list(train_labels))

# test_features = read_files_from_gcs(bucket=BUCKET, prefix=PREFIX+'test_features_')
# test_features = np.array(list(test_features))

# train_features.shape, train_labels.shape, test_features.shape

In [58]:
# Save file lists to not reload again (slow!)
# The bucket is only listed for a list whose cache file is missing, so this
# cell also runs on a fresh kernel where the in-memory lists do not exist yet.
for list_name in ('train_features', 'train_labels', 'test_features'):
  cache_path = './{}.txt'.format(list_name)
  if not tf.gfile.Exists(cache_path):
    listed_keys = read_files_from_gcs(bucket=BUCKET, prefix=PREFIX + list_name + '_')
    np.savetxt(cache_path, np.array(list(listed_keys)), fmt='%s')

In [65]:
# Load file lists
# Each cached text file is read back as an array of strings.
cached_lists = ('./train_features.txt', './train_labels.txt', './test_features.txt')
train_features, train_labels, test_features = [
  np.loadtxt(cached_list, dtype='str') for cached_list in cached_lists
]

In [90]:
def read_batches_from_gcs(fn_features, fn_labels):
  """Load one features file and its labels file from the BUCKET bucket.

  Args:
    fn_features: object key of the features file inside BUCKET.
    fn_labels: object key of the labels file inside BUCKET.

  Returns:
    A (features, labels) tuple holding what load_numpy_gcs returns for each file.

  Raises:
    Exception: if the trailing identifier of the features file name does not
      occur in the labels file name.
  """
  gcs_root = 'gs://' + BUCKET + '/'
  features_uri = gcs_root + fn_features
  labels_uri = gcs_root + fn_labels

  # Sanity check: the last two '_'-separated pieces of the features name must
  # also appear in the labels name, otherwise the two files are not a pair.
  file_id = '_'.join(features_uri.rsplit('_', 2)[-2:])
  if file_id not in labels_uri:
    raise Exception("Files not matching! {} {}".format(features_uri, labels_uri))

  return load_numpy_gcs(features_uri), load_numpy_gcs(labels_uri)

In [91]:
# Read the second features/labels file pair to look at the stored shapes.
bx, by_rows = read_batches_from_gcs(train_features[1], train_labels[1])
# Reduce every label row to the index of its largest entry.
by = np.argmax(by_rows, axis=1)

bx.shape, by.shape

((131, 6, 6, 2048), (131,))

In [93]:
bx.shape[1:]

(6, 6, 2048)

array([ 'cdisc-data/bottleneck-features/train_features_180crop_2017-10-13T10-24-31.717406_0.npy',
       'cdisc-data/bottleneck-features/train_labels_2017-10-13T10-24-31.717406_0.npy'],
      dtype='|S90')

In [139]:
def batches_iterator(train_features, train_labels, random=True):
  """Yield one (features, labels) batch per stored file pair.

  Args:
    train_features: sequence of feature file names.
    train_labels: sequence of label file names, aligned index by index with
      train_features.
    random: when true, the file pairs are visited in an order shuffled with
      np.random.permutation; otherwise in the given order.

  Yields:
    Tuples (bx, by): bx is the features array as loaded, by holds the argmax
    along axis 1 of the loaded labels array.
  """
  # Pair the names up row by row so shuffling keeps features and labels together.
  file_pairs = np.stack((train_features, train_labels), axis=1)
  if random:
    file_pairs = np.random.permutation(file_pairs)
  for features_name, labels_name in file_pairs:
    batch_x, batch_label_rows = read_batches_from_gcs(features_name, labels_name)
    yield batch_x, np.argmax(batch_label_rows, axis=1)

In [144]:
# One batch is produced per features file, so the file count is the batch count.
total_batches = len(train_features)
# Shuffled iterator over all training file pairs.
batches_iter = batches_iterator(train_features, train_labels, random=True)

In [143]:
# Inspect one batch from a throwaway iterator, so that the training iterator
# (batches_iter) still holds every batch when it is used for fitting.
bx, by = next(batches_iterator(train_features, train_labels, random=True))
bx.shape, by.shape

((130, 6, 6, 2048), (130,))

In [145]:
# NN model (Xception-like)
# The input takes one sample of the loaded feature batches, i.e. the batch
# shape without its leading batch axis.
input_layer = Input(shape=bx.shape[1:])
x = GlobalAveragePooling2D()(input_layer)
pred_layer = Dense(NUM_CLASSES, activation='softmax', name='predictions')(x)

# Build model for provided classes
model = Model(inputs=input_layer, outputs=pred_layer)


In [146]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 6, 6, 2048)        0         
_________________________________________________________________
global_average_pooling2d_5 ( (None, 2048)              0         
_________________________________________________________________
predictions (Dense)          (None, 5270)              10798230  
Total params: 10,798,230
Trainable params: 10,798,230
Non-trainable params: 0
_________________________________________________________________


In [147]:
# The batch iterator yields integer class indices (argmax of the label rows),
# which is what the sparse variant of categorical cross-entropy consumes.
adam = Adam(lr=1E-2)
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [None]:
def _endless_batches():
  """Yield training batches forever, starting a new shuffled pass whenever one ends."""
  while True:
    for batch in batches_iterator(train_features, train_labels, random=True):
      yield batch

# fit_generator expects a generator that never runs dry; a single finite pass
# (possibly already advanced by earlier cells) can end before steps_per_epoch
# batches have been delivered.
model.fit_generator(_endless_batches(), steps_per_epoch=total_batches, epochs=1)