<a href="https://colab.research.google.com/github/neo4j-partners/hands-on-lab-neo4j-and-vertex-ai/blob/main/Lab%206%20-%20Vertex%20AI/vertex_ai_embedding.ipynb" target="_blank">
  <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
</a>

# Install additional Packages
First, install a few additional packages.

In [None]:
# Cloud Storage client library (imported later as google.cloud.storage).
!pip install --quiet google-cloud-storage
# Vertex AI SDK (imported later as google.cloud.aiplatform).
!pip install --quiet google.cloud.aiplatform

# Restart the kernel
After you install the additional packages, you need to restart the notebook kernel so it can find the packages.  When you run this, you may get a notification that the kernel crashed.  You can disregard that.

In [None]:
import IPython

# Fetch the running IPython application and ask its kernel to shut down.
# The True argument is presumably the "restart" flag (the text above says the
# kernel is restarted) -- confirm against the ipykernel documentation.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

# Split the Data
Now let's grab the data set and split it into a training and a test set.

In [None]:
# TODO: download form13.csv from the Lab 5 bucket (e.g. with wget) before running the next cell

In [None]:
import pandas

# Load the full data set; form13.csv must already be in the working directory.
df = pandas.read_csv('form13.csv')

# Training set: the 03-31-2021 rows followed by the 06-30-2021 rows.
# DataFrame.append was removed in pandas 2.0, so the two slices are combined
# with pandas.concat, which keeps the same row order and original index.
train = pandas.concat(
    [
        df.loc[df['reportCalendarOrQuarter'] == '03-31-2021'],
        df.loc[df['reportCalendarOrQuarter'] == '06-30-2021'],
    ]
)
train.to_csv('train.csv', index=False)

# Test set: the 09-30-2021 rows only.
test = df.loc[df['reportCalendarOrQuarter'] == '09-30-2021']
test.to_csv('test.csv', index=False)

# Authenticate your Google Cloud account


In [None]:
# Edit these variables!
# PROJECT_ID: the Google Cloud project used for the environment variable,
# the Vertex AI SDK initialisation and the feature store resource paths below.
PROJECT_ID = "YOUR-PROJECT-ID"
# STORAGE_BUCKET: name of the Cloud Storage bucket the CSV files are uploaded to.
STORAGE_BUCKET = "YOUR-BUCKET-NAME"

In [None]:
import os
# Publish the project ID through the GCLOUD_PROJECT environment variable.
# Presumably the Google Cloud clients created later read it to choose a
# default project -- confirm against the client library documentation.
os.environ["GCLOUD_PROJECT"] = PROJECT_ID

In [None]:
# You can leave these defaults
# REGION is passed to aiplatform.init and used to build the regional API endpoint.
REGION = "us-central1"
# STORAGE_PATH is the prefix (folder) inside the bucket for uploaded files.
STORAGE_PATH = "embedding"

In [None]:
# Authenticate interactively when running inside Google Colab.
# Only a missing google.colab module (i.e. not running in Colab) is ignored;
# a real authentication failure is no longer silently swallowed.
try:
    from google.colab import auth as google_auth
except ImportError:
    # Not in Colab: rely on whatever credentials the environment provides.
    pass
else:
    google_auth.authenticate_user()

# Upload to a GCP Cloud Storage Bucket

To get the data into Vertex AI, we must first put it in a bucket as a CSV.

In [None]:
from google.cloud import storage
# Create the Cloud Storage client with no explicit arguments.
# NOTE(review): presumably it picks up the project from GCLOUD_PROJECT set
# earlier and the credentials from the environment -- confirm.
client = storage.Client()

In [None]:
# Reuse the bucket when it already exists so this cell can be re-run safely;
# lookup_bucket returns None when the bucket is not found.
bucket = client.lookup_bucket(STORAGE_BUCKET)
if bucket is None:
    bucket = client.create_bucket(STORAGE_BUCKET)

In [None]:
# Upload our files to that bucket
for filename in ['train.csv', 'test.csv']:
    # Object names in Cloud Storage always use '/' as the separator, so the
    # path is built explicitly instead of with the OS-dependent os.path.join.
    upload_path = f"{STORAGE_PATH}/{filename}"
    blob = bucket.blob(upload_path)
    blob.upload_from_filename(filename)

# Train a Model on GCP
We'll use the engineered features to train an AutoML tabular model. Deploying it to an endpoint is covered in the optional exercises below.

In [None]:
from google.cloud import aiplatform

# Point the Vertex AI SDK at our project and region.
aiplatform.init(project=PROJECT_ID, location=REGION)

# Register the uploaded training CSV as a tabular dataset. The gs:// URI is
# built explicitly with '/' because os.path.join would use the OS separator.
dataset = aiplatform.TabularDataset.create(
    display_name="form13raw",
    gcs_source=f"gs://{STORAGE_BUCKET}/{STORAGE_PATH}/train.csv",
)
dataset.wait()

print(f'\tDataset: "{dataset.display_name}"')
print(f'\tname: "{dataset.resource_name}"')

In [None]:
# Configure an AutoML tabular training job for a classification objective.
# The job is started by the job.run(...) call in the next cell.
job = aiplatform.AutoMLTabularTrainingJob(
    display_name='train-form13embedding-automl-1',
    optimization_prediction_type='classification'
)

In [None]:
# Run the training job on the dataset created above and keep the result as
# `model`. The label column is 'target'; the data is split 80/10/10 into
# training/validation/test fractions. A budget of 1000 milli node hours
# corresponds to the one-hour run described in the text below.
model = job.run(
    dataset=dataset,
    target_column='target',
    training_fraction_split=0.8,
    validation_fraction_split=0.1,
    test_fraction_split=0.1,
    model_display_name="form13embedding",
    disable_early_stopping=False,
    budget_milli_node_hours=1000,
)

This job will run for an hour.  That's the minimum time for an AutoML job.  We're going to move on to the next notebook.  You can check on the job later in the [Google Cloud Console](https://console.cloud.google.com/) to see the results.  There's a link to the specific job in the output of the cell above.

# Optional Exercises
The following exercises are optional.  Here we deploy our model, create a feature store, and then use features from that store to request a prediction.  This is a walkthrough of how we might operationalize the model we created above.

In [None]:
# Deploy the trained model on an n1-standard-4 machine; the returned endpoint
# is used for the prediction and cleanup cells further down.
endpoint = model.deploy(machine_type="n1-standard-4")

## Loading Data into GCP Feature Store
In this section, we'll take our dataframe with newly engineered features and load that into GCP feature store.

In [None]:
from google.cloud.aiplatform_v1 import FeaturestoreServiceClient

# NOTE(review): FEATURESTORE_ID and ENTITY_NAME are not assigned anywhere in
# the visible notebook -- define them before running this cell.

# The feature store API is served from a region-specific host.
api_endpoint = f"{REGION}-aiplatform.googleapis.com"
fs_client = FeaturestoreServiceClient(
    client_options=dict(api_endpoint=api_endpoint)
)

# Fully qualified resource names reused by the cells below.
resource_path = fs_client.common_location_path(PROJECT_ID, REGION)
fs_path = fs_client.featurestore_path(PROJECT_ID, REGION, FEATURESTORE_ID)
entity_path = fs_client.entity_type_path(
    PROJECT_ID,
    REGION,
    FEATURESTORE_ID,
    ENTITY_NAME,
)

First, let's check whether the Feature Store already exists.

In [None]:
from grpc import StatusCode


def check_has_resource(callable):
    """Report whether the resource fetched by ``callable`` exists.

    ``callable`` is invoked with no arguments. If it returns normally the
    resource is considered present and True is returned. If it raises an
    exception whose ``grpc_status_code`` attribute equals
    ``StatusCode.NOT_FOUND``, False is returned. Any other exception is
    re-raised to the caller.
    """
    try:
        callable()
    except Exception as err:
        missing = (
            hasattr(err, "grpc_status_code")
            and err.grpc_status_code == StatusCode.NOT_FOUND
        )
        if missing:
            return False
        # Anything other than "not found" is a genuine failure.
        raise
    return True

In [None]:
# Try to fetch the feature store by its resource path; a NOT_FOUND error is
# turned into False by check_has_resource, any other error propagates.
feature_store_exists = check_has_resource(
    lambda: fs_client.get_featurestore(name=fs_path)
)

In [None]:
from google.cloud.aiplatform_v1.types import (
    entity_type as entity_type_pb2,
    feature as feature_pb2,
    featurestore as featurestore_pb2,
    featurestore_service as featurestore_service_pb2,
    io as io_pb2,
)

# Create the feature store only when the earlier lookup did not find it.
if not feature_store_exists:
    create_lro = fs_client.create_featurestore(
        request=featurestore_service_pb2.CreateFeaturestoreRequest(
            parent=resource_path,
            featurestore_id=FEATURESTORE_ID,
            featurestore=featurestore_pb2.Featurestore(
                # A single online-serving node is requested.
                online_serving_config=(
                    featurestore_pb2.Featurestore.OnlineServingConfig(
                        fixed_node_count=1
                    )
                ),
            ),
        )
    )

    # Fetch the operation's result and show it.
    print(create_lro.result())

In [None]:
# Look up the entity type; NOT_FOUND becomes False via check_has_resource.
entity_type_exists = check_has_resource(
    lambda: fs_client.get_entity_type(name=entity_path)
)

# When the entity type is missing, create it together with one DOUBLE feature
# per embedding dimension (embedding_0 ... embedding_{EMBEDDING_DIMENSION-1}).
if not entity_type_exists:
    users_entity_type_lro = fs_client.create_entity_type(
        request=featurestore_service_pb2.CreateEntityTypeRequest(
            parent=fs_path,
            entity_type_id=ENTITY_NAME,
            entity_type=entity_type_pb2.EntityType(
                description="Main entity type",
            ),
        )
    )
    print(users_entity_type_lro.result())

    feature_requests = [
        featurestore_service_pb2.CreateFeatureRequest(
            feature=feature_pb2.Feature(
                value_type=feature_pb2.Feature.ValueType.DOUBLE,
                description=f"Embedding {i} from Neo4j",
            ),
            feature_id=f"embedding_{i}",
        )
        for i in range(EMBEDDING_DIMENSION)
    ]
    create_features_lro = fs_client.batch_create_features(
        parent=entity_path,
        requests=feature_requests,
    )
    print(create_features_lro.result())

In [None]:
from google.protobuf.timestamp_pb2 import Timestamp

# One feature spec per embedding dimension, matching the feature IDs
# embedding_0 ... embedding_{EMBEDDING_DIMENSION-1}.
feature_specs = [
    featurestore_service_pb2.ImportFeatureValuesRequest.FeatureSpec(
        id=f"embedding_{i}"
    )
    for i in range(EMBEDDING_DIMENSION)
]

# Stamp every imported value with the current time, truncated to whole
# seconds (nanos is reset to 0).
feature_time = Timestamp()
feature_time.GetCurrentTime()
feature_time.nanos = 0

# NOTE(review): FEATURES_FILENAME is not assigned anywhere in the visible
# notebook -- define it (and upload that file) before running this cell.
# The gs:// URI is built explicitly with '/' because os.path.join would use
# the OS-dependent separator.
import_request = fs_client.import_feature_values(
    featurestore_service_pb2.ImportFeatureValuesRequest(
        entity_type=entity_path,
        csv_source=io_pb2.CsvSource(
            gcs_source=io_pb2.GcsSource(
                uris=[f"gs://{STORAGE_BUCKET}/{STORAGE_PATH}/{FEATURES_FILENAME}"]
            )
        ),
        entity_id_field="nodeId",
        feature_specs=feature_specs,
        worker_count=1,
        feature_time=feature_time,
    )
)

print(import_request.result())

## Sending a prediction using features from the feature store

In [None]:
from google.cloud.aiplatform_v1 import FeaturestoreOnlineServingServiceClient

# Client for reading feature values, pointed at the same regional API
# endpoint as the feature store admin client.
data_client = FeaturestoreOnlineServingServiceClient(
    client_options={"api_endpoint": api_endpoint}
)

In [None]:
# Retrieve Neo4j embeddings from feature store
from google.cloud.aiplatform_v1.types import FeatureSelector, IdMatcher
from google.cloud.aiplatform_v1.types import (
    featurestore_online_service as featurestore_online_service_pb2,
)

# Select every embedding feature by ID.
feature_selector = FeatureSelector(
    id_matcher=IdMatcher(
        ids=[f"embedding_{i}" for i in range(EMBEDDING_DIMENSION)]
    )
)

# Read the selected features for the entity with ID "5".
fs_features = data_client.read_feature_values(
    request=featurestore_online_service_pb2.ReadFeatureValuesRequest(
        entity_type=entity_path,
        entity_id="5",
        feature_selector=feature_selector,
    )
)

# Pair each feature ID from the response header with its value, as a string.
saved_embeddings = {
    descriptor.id: str(datum.value.double_value)
    for descriptor, datum in zip(
        fs_features.header.feature_descriptors,
        fs_features.entity_view.data,
    )
}

In [None]:
# Combine with other features. These might be sourced per transaction
# NOTE(review): these two feature names are not shown elsewhere in the
# notebook -- confirm they match the columns the model was trained on.
all_features = {
    "num_transactions": "80",
    "total_dollar_amnt": "7484459.618641878",
    **saved_embeddings,
}

# A single prediction instance with every value rendered as a string.
instances = [{name: str(val) for name, val in all_features.items()}]

In [None]:
# Send a prediction
# Posts the single instance built above to the deployed endpoint; as the
# cell's last expression, the response is displayed by the notebook.
endpoint.predict(instances=instances)

# Cleanup


In [None]:
# Delete the feature store and tear down the endpoint.
# force=True on the feature store request also removes its entity types and
# features; .result() waits for the delete operation to finish.
fs_client.delete_featurestore(
    request=featurestore_service_pb2.DeleteFeaturestoreRequest(
        name=fs_client.featurestore_path(PROJECT_ID, REGION, FEATURESTORE_ID),
        force=True,
    )
).result()

# The model deployed earlier is still attached to the endpoint, and an
# endpoint with deployed models cannot be deleted as-is. force=True
# undeploys all models first, then deletes the endpoint.
endpoint.delete(force=True)