In [None]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

### Import packages

In [None]:
import os
import json
import logging
import pandas as pd
import numpy as np

from datetime import datetime
from pytz import timezone

## 1. Load the training and prediction data from GCS

### Configure Global Variables

In [None]:
# Ask the gcloud CLI for the currently configured project. IPython's `!` runs the
# shell command and captures its output lines as a list; stderr is discarded
# via `2>/dev/null`.
project_id = !gcloud config list --format 'value(core.project)' 2>/dev/null
print(project_id)

In [None]:
# Notebook-wide settings; adjust the values marked "Replace" for your environment.
PROJECT = project_id[0]                     # Replace with your project ID
USER = "test_user"                          # Replace with your user name
BUCKET_NAME = f"{project_id[0]}-vertex-ai"  # Replace with your gcs bucket name

FOLDER_NAME = "sklearn_models"
ALGORITHM = "isolation_forest"
TIMEZONE = "US/Pacific"
REGION = "us-central1"                      # bucket should be in same region as Vertex AI

# Train/test CSV locations: gs://<bucket>/<folder>_data/<algorithm>/<split>/<split>.csv
TRAIN_FEATURE_PATH = "gs://{}/{}_data/{}/train/train.csv".format(
    BUCKET_NAME, FOLDER_NAME, ALGORITHM
)
TEST_FEATURE_PATH = "gs://{}/{}_data/{}/test/test.csv".format(
    BUCKET_NAME, FOLDER_NAME, ALGORITHM
)

In [None]:
# Echo the resolved configuration, one setting per line.
print(
    f"Project:      {PROJECT}",
    f"Bucket Name: {BUCKET_NAME}",
    f"Training Data URI:  {TRAIN_FEATURE_PATH}",
    f"Test Data URI:      {TEST_FEATURE_PATH}",
    sep="\n",
)

### Model file URI

In [None]:
JOB_NAME = "custom_container_isolation_forest_gke"

# The job name doubles as the GCS folder name under which outputs are stored.
JOB_DIR = f"gs://{BUCKET_NAME}/{FOLDER_NAME}/{JOB_NAME}"

# The model artifact is read from the job's output folder.
ARTIFACT_URI = JOB_DIR
print("ARTIFACT_URI = ", ARTIFACT_URI)

### Load Train, Test datasets and the model file

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import genfromtxt
from sklearn.ensemble import IsolationForest
import tensorflow as tf
import joblib

# Fixed seed so the synthetic outliers generated below are reproducible.
rng = np.random.RandomState(42)

# header=None: the CSVs are read as having no header row, so every line is data.
train_df = pd.read_csv(TRAIN_FEATURE_PATH, header=None, delimiter=',', index_col=False)
x_train = train_df.to_numpy()
test_df = pd.read_csv(TEST_FEATURE_PATH, header=None, delimiter=',', index_col=False)
x_test = test_df.to_numpy()

# Load the model artifact from GCS.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load artifacts from a bucket you trust.
model_file = ARTIFACT_URI + '/' + 'model.joblib'
# Open the GCS object in a context manager so the handle is closed after loading
# (previously the GFile was opened inline and never closed).
with tf.io.gfile.GFile(model_file, 'rb') as model_fh:
    clf = joblib.load(model_fh)

# Generate some abnormal novel observations: 20 two-column points drawn
# uniformly from [-4, 4).
x_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

## 2. Plot the decision function and the samples

In [None]:
# Plot the model's decision function as a filled contour, then overlay the
# training samples, the test samples, and the synthetic outliers.

# Evaluate the decision function on a 50x50 grid covering [-5, 5] on both axes.
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)  # back to grid shape for contourf

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

# Use the arrays loaded earlier (x_train / x_test). The previous code referenced
# x_train_from_csv / x_test_from_csv, which are never defined and raised NameError.
b1 = plt.scatter(x_train[:, 0], x_train[:, 1], c='white',
                 s=20, edgecolor='k')
b2 = plt.scatter(x_test[:, 0], x_test[:, 1], c='green',
                 s=20, edgecolor='k')
c = plt.scatter(x_outliers[:, 0], x_outliers[:, 1], c='red',
                s=20, edgecolor='k')
plt.axis('tight')
# Clamp the view to the same square the grid was evaluated on.
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([b1, b2, c],
           ["training observations",
            "new regular observations", "new abnormal observations"],
           loc="upper left")
plt.show()

## 3. (Optional) Run predictions locally

In [None]:
# Display the test feature array that is passed to the model below.
x_test

In [None]:
# Run predictions on the test features with the model loaded from GCS.
# The bare name on the last line renders the result as the cell output.
y_pred_dataset = clf.predict(x_test)
y_pred_dataset