In [1]:
%matplotlib inline
from google.cloud import storage
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import io
import numpy as np
from pathlib import Path
import tensorflow as tf
import os
from PIL import Image
import json
import time
import re
import googleapiclient.discovery as discovery
from googleapiclient import errors

In [2]:
# Notebook configuration: GCP project, the two GCS buckets, and the
# blob-name prefix used to select which photos go into this batch.

prefix = '2020-06-01'

project = 'optimum-treat-262616'
model_bucket_name = 'cat-detection-models'
photo_bucket_name = 'catflap-photos-raw'

# Create input json file

In [3]:
# Set up buckets
# A single storage client (constructed with no explicit arguments) is used
# to look up both buckets by the names defined in the configuration cell.

client = storage.Client()
# Bucket the photos are listed and downloaded from.
photo_bucket = client.get_bucket(photo_bucket_name)
# Bucket the batch-input file is uploaded to (also named in the job's
# input/output paths).
model_bucket = client.get_bucket(model_bucket_name)

In [4]:
# Collect the names of all photo blobs whose name starts with `prefix`,
# then report how many were found.

blobs = photo_bucket.list_blobs(prefix=prefix)
blob_list = [photo_blob.name for photo_blob in blobs]
print(len(blob_list))

5822


In [25]:
# Build the batch-prediction input file: download each photo, downsample it,
# and write one JSON instance per line (newline-delimited JSON).

batch_input_filename = '/home/jupyter/batch-input.json'
with open(batch_input_filename, 'w') as fp:
    for idx, blob_name in enumerate(blob_list):

        # Read blob from GCS and decode the JPEG bytes into an array
        blob = photo_bucket.blob(blob_name)
        blob_str = blob.download_as_string()
        bytes_io = io.BytesIO(blob_str)
        img = mpimg.imread(bytes_io, format='jpg')

        # Keep every 10th element along the first two axes and only index 0
        # of the third axis (presumably the red channel of an RGB image, per
        # the variable name -- TODO confirm all photos are 3-channel).
        img_red_downsample = img[::10,::10,0]

        # Write to file: one JSON object per line.
        # NOTE(review): 'flatten_input' is presumably the model's input
        # tensor name -- verify against the deployed model signature.
        json_instances_dict = {'flatten_input': img_red_downsample.tolist()}
        json.dump(json_instances_dict, fp)
        fp.write('\n')

        # Progress log every 100 photos
        if idx % 100 == 0:
            print(pd.Timestamp.now(), idx)

2020-06-07 15:11:49.912670 0
2020-06-07 15:11:57.674853 100
2020-06-07 15:12:04.841299 200
2020-06-07 15:12:12.972489 300
2020-06-07 15:12:23.971038 400
2020-06-07 15:12:34.884835 500
2020-06-07 15:12:46.433051 600
2020-06-07 15:12:57.735909 700
2020-06-07 15:13:09.199341 800
2020-06-07 15:13:20.556126 900
2020-06-07 15:13:31.906892 1000
2020-06-07 15:13:42.954666 1100
2020-06-07 15:13:54.119665 1200
2020-06-07 15:14:05.650092 1300
2020-06-07 15:14:16.753098 1400
2020-06-07 15:14:27.982812 1500
2020-06-07 15:14:38.684336 1600
2020-06-07 15:14:49.863019 1700
2020-06-07 15:15:00.798460 1800
2020-06-07 15:15:12.271462 1900
2020-06-07 15:15:22.810127 2000
2020-06-07 15:15:34.062328 2100
2020-06-07 15:15:44.920822 2200
2020-06-07 15:15:56.332327 2300
2020-06-07 15:16:07.542723 2400
2020-06-07 15:16:18.361117 2500
2020-06-07 15:16:29.450744 2600
2020-06-07 15:16:39.895238 2700
2020-06-07 15:16:50.901991 2800
2020-06-07 15:17:01.500834 2900
2020-06-07 15:17:12.221955 3000
2020-06-07 15:17:22.

In [29]:
# Copy the local batch-input file into the model bucket under
# batch-input/<prefix>.json

batch_input_blob = model_bucket.blob(f'batch-input/{prefix}.json')
batch_input_blob.upload_from_filename(batch_input_filename)

# Submit batch prediction job

In [4]:
def make_batch_job_body(project_name, input_paths, output_path,
        model_name, region, data_format='JSON',
        version_name=None, max_worker_count=None,
        runtime_version=None):
    """Assemble the request body dictionary for a batch prediction job.

    Args:
        project_name: Project name; used in the model resource path and,
            with non-word characters replaced by '_', in the job id.
        input_paths: Stored unchanged under 'inputPaths'.
        output_path: Stored unchanged under 'outputPath'.
        model_name: Model name; used in the resource path and the job id.
        region: Stored unchanged under 'region'.
        data_format: Stored unchanged under 'dataFormat'.
        version_name: If truthy, the body targets that version via
            'versionName'; otherwise it targets the model via 'modelName'.
        max_worker_count: If truthy, stored under 'maxWorkerCount'.
        runtime_version: If truthy, stored under 'runtimeVersion'.

    Returns:
        A dict with 'jobId' (``<clean project>_<model>_<YYYYMMDD_HHMMSS>``,
        timestamp in UTC) and 'predictionInput'.
    """
    model_id = 'projects/{}/models/{}'.format(project_name, model_name)

    # UTC timestamp that makes the job id differ from run to run.
    stamp = time.strftime('%Y%m%d_%H%M%S', time.gmtime())

    # Collapse every run of non-word characters in the project name into a
    # single underscore before using it in the job id.
    safe_project = re.sub(r'\W+', '_', project_name)
    job_id = f'{safe_project}_{model_name}_{stamp}'

    # Required fields first.
    prediction_input = {
        'dataFormat': data_format,
        'inputPaths': input_paths,
        'outputPath': output_path,
        'region': region,
    }

    # Target a specific version when one is given, else the model itself.
    if version_name:
        prediction_input['versionName'] = '{}/versions/{}'.format(
            model_id, version_name)
    else:
        prediction_input['modelName'] = model_id

    # Optional settings are only sent when the caller supplied a value.
    optional_fields = (
        ('maxWorkerCount', max_worker_count),
        ('runtimeVersion', runtime_version),
    )
    for field_name, field_value in optional_fields:
        if field_value:
            prediction_input[field_name] = field_value

    return {'jobId': job_id, 'predictionInput': prediction_input}

In [16]:
# Build the request body for this prefix's batch prediction job and
# display it for inspection.
# NOTE(review): input_paths is passed as a single string rather than a
# list of paths -- confirm this is what the service expects.

batch_predict_body = make_batch_job_body(
    project_name=project,
    model_name='logistic_regression_v1',
    version_name='logistic_regression_v1',
    region='europe-west2',
    input_paths=f'gs://{model_bucket_name}/batch-input/{prefix}.json',
    output_path=f'gs://{model_bucket_name}/batch-output/{prefix}/',
    max_worker_count=20,
)

batch_predict_body

{'jobId': 'optimum_treat_262616_logistic_regression_v1_20200607_165247',
 'predictionInput': {'dataFormat': 'JSON',
  'inputPaths': 'gs://cat-detection-models/batch-input/2020-06-01.json',
  'outputPath': 'gs://cat-detection-models/batch-output/2020-06-01/',
  'region': 'europe-west2',
  'versionName': 'projects/optimum-treat-262616/models/logistic_regression_v1/versions/logistic_regression_v1',
  'maxWorkerCount': 20}}

In [19]:
# Submit batch prediction job

project_id = 'projects/{}'.format(project)

# Build a client for the 'ml' v1 API and prepare the job-creation request.
ml = discovery.build('ml', 'v1')
request = ml.projects().jobs().create(parent=project_id, body=batch_predict_body)

try:
    response = request.execute()

    print('Job requested.')

    # The state returned will almost always be QUEUED.
    print('state : {}'.format(response['state']))

except errors.HttpError as err:
    # The HTTP error is caught and only its reason is printed, so a rejected
    # request does not raise out of this cell.
    print('There was an error submitting the batch prediction job. ' +
          'Check the details:')
    # NOTE(review): _get_reason() is a private method of HttpError; confirm
    # it exists in the installed googleapiclient version.
    print(err._get_reason())

Job requested.
state : QUEUED
