In [1]:
%matplotlib inline
from google.cloud import storage
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import io
import numpy as np
from pathlib import Path
import tensorflow as tf
import os
from PIL import Image
import json
import time
import re
import googleapiclient.discovery as discovery
from googleapiclient import errors

In [2]:
# Define bucket and blob prefix

project = 'optimum-treat-262616'
photo_bucket_name = 'catflap-photos-raw'
model_bucket_name = 'cat-detection-models'
prefix = '2020-06-01'

# Create input json file

In [3]:
# Set up buckets

client = storage.Client()
photo_bucket = client.get_bucket(photo_bucket_name)
model_bucket = client.get_bucket(model_bucket_name)

In [4]:
# Get list of blob names

blobs = photo_bucket.list_blobs(prefix=prefix)
blob_list = [blob.name for blob in blobs]
print(len(blob_list))

5822


In [9]:
blob_name

'2020-06-01_195953.jpg'

In [8]:
# Read labels into pandas dataframe

batch_input_filename = f'/home/jupyter/batch-input-{prefix}.json'
with open(batch_input_filename, 'w') as fp:
    for idx, blob_name in enumerate(blob_list[:]):

        # Read blob from GCS
        blob = photo_bucket.blob(blob_name)
        blob_str = blob.download_as_string()
        bytes_io = io.BytesIO(blob_str)
        img = mpimg.imread(bytes_io, format='jpg')
        img_red_downsample = img[::10,::10,0]

        # Write to file
        json_instances_dict = {'flatten_input': img_red_downsample.tolist(), 'key': blob_name}
        json.dump(json_instances_dict, fp)
        fp.write('\n')
        
        if idx % 100 == 0:
            print(pd.Timestamp.now(), idx)

2020-06-08 22:18:03.256225 0
2020-06-08 22:18:14.249526 100
2020-06-08 22:18:24.887001 200
2020-06-08 22:18:35.719818 300
2020-06-08 22:18:46.465935 400
2020-06-08 22:18:56.710459 500
2020-06-08 22:19:06.812917 600
2020-06-08 22:19:17.041849 700
2020-06-08 22:19:27.906782 800
2020-06-08 22:19:38.059869 900
2020-06-08 22:19:48.982764 1000
2020-06-08 22:19:59.772029 1100
2020-06-08 22:20:10.196476 1200
2020-06-08 22:20:20.370394 1300
2020-06-08 22:20:30.579216 1400
2020-06-08 22:20:41.030114 1500
2020-06-08 22:20:51.259740 1600
2020-06-08 22:21:01.741141 1700
2020-06-08 22:21:11.698408 1800
2020-06-08 22:21:22.245964 1900
2020-06-08 22:21:32.177410 2000
2020-06-08 22:21:42.416589 2100
2020-06-08 22:21:52.588404 2200
2020-06-08 22:22:02.680263 2300
2020-06-08 22:22:12.593219 2400
2020-06-08 22:22:22.860303 2500
2020-06-08 22:22:32.941069 2600
2020-06-08 22:22:43.170889 2700
2020-06-08 22:22:53.303760 2800
2020-06-08 22:23:03.470608 2900
2020-06-08 22:23:13.311755 3000
2020-06-08 22:23:23.

UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7faf5abbea70>

In [10]:
# Upload batch input json file to GCS

batch_input_blob = model_bucket.blob('batch-input-keys/'+prefix+'.json')
batch_input_blob.upload_from_filename(batch_input_filename)

# Submit batch prediction job

In [11]:
def make_batch_job_body(project_name, input_paths, output_path,
        model_name, region, data_format='JSON',
        version_name=None, max_worker_count=None,
        runtime_version=None):

    project_id = 'projects/{}'.format(project_name)
    model_id = '{}/models/{}'.format(project_id, model_name)
    if version_name:
        version_id = '{}/versions/{}'.format(model_id, version_name)

    # Make a jobName of the format "model_name_batch_predict_YYYYMMDD_HHMMSS"
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.gmtime())

    # Make sure the project name is formatted correctly to work as the basis
    # of a valid job name.
    clean_project_name = re.sub(r'\W+', '_', project_name)

    job_id = '{}_{}_{}'.format(clean_project_name, model_name,
                           timestamp)

    # Start building the request dictionary with required information.
    body = {'jobId': job_id,
            'predictionInput': {
                'dataFormat': data_format,
                'inputPaths': input_paths,
                'outputPath': output_path,
                'region': region}}

    # Use the version if present, the model (its default version) if not.
    if version_name:
        body['predictionInput']['versionName'] = version_id
    else:
        body['predictionInput']['modelName'] = model_id

    # Only include a maximum number of workers or a runtime version if specified.
    # Otherwise let the service use its defaults.
    if max_worker_count:
        body['predictionInput']['maxWorkerCount'] = max_worker_count

    if runtime_version:
        body['predictionInput']['runtimeVersion'] = runtime_version

    return body

In [12]:
# Create batch job body

batch_predict_body = make_batch_job_body(
    project_name = project, 
    input_paths = f'gs://{model_bucket_name}/batch-input-keys/{prefix}.json', 
    output_path = f'gs://{model_bucket_name}/batch-output-keys/{prefix}/',
    model_name = 'logistic_regression_v2', 
    region = 'europe-west2',
    version_name='logistic_regression_v2', 
    max_worker_count=20)

batch_predict_body

{'jobId': 'optimum_treat_262616_logistic_regression_v2_20200608_223238',
 'predictionInput': {'dataFormat': 'JSON',
  'inputPaths': 'gs://cat-detection-models/batch-input-keys/2020-06-01.json',
  'outputPath': 'gs://cat-detection-models/batch-output-keys/2020-06-01/',
  'region': 'europe-west2',
  'versionName': 'projects/optimum-treat-262616/models/logistic_regression_v2/versions/logistic_regression_v2',
  'maxWorkerCount': 20}}

In [13]:
# Submit batch prediction job

project_id = 'projects/{}'.format(project)

ml = discovery.build('ml', 'v1')
request = ml.projects().jobs().create(parent=project_id, body=batch_predict_body)

try:
    response = request.execute()

    print('Job requested.')

    # The state returned will almost always be QUEUED.
    print('state : {}'.format(response['state']))

except errors.HttpError as err:
    # Something went wrong, print out some information.
    print('There was an error getting the prediction results.' +
          'Check the details:')
    print(err._get_reason())

Job requested.
state : QUEUED
