In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vertex AI Feature Store

In this notebook, we will build and use a [Vertex AI Feature Store](https://cloud.google.com/vertex-ai/docs/featurestore).



## Prerequisites
**Note:** This notebook and repository are supporting artifacts for the "Google Machine Learning and Generative AI for Solutions Architects" book. The book describes the concepts associated with this notebook, and for some of the activities, the book contains instructions that should be performed before running the steps in the notebooks. Each top-level folder in this repo is associated with a chapter in the book. Please ensure that you have read the relevant chapter sections before performing the activities in this notebook.

**There are also important generic prerequisite steps outlined [here](https://github.com/PacktPublishing/Google-Machine-Learning-for-Solutions-Architects/blob/main/Prerequisite-steps/Prerequisites.ipynb).**


**Attention:** The code in this notebook creates Google Cloud resources that can incur costs.

Refer to the Google Cloud pricing documentation for details.

For example:

* [Vertex AI Pricing](https://cloud.google.com/vertex-ai/pricing)
* [BigQuery Pricing](https://cloud.google.com/bigquery/pricing)


## Install and import required libraries

In [None]:
! pip install --upgrade --quiet google-cloud-bigquery google-cloud-aiplatform

## Restart the kernel

The code in the next cell will restart the kernel, which is sometimes required after installing/upgrading packages.

When prompted, click OK to restart the kernel.

The sleep command simply prevents further cells from executing before the kernel restarts.

In [None]:
import IPython

# Look up the running IPython application and ask its kernel to shut down;
# passing True requests a restart so freshly installed packages are picked up.
IPython.Application.instance().kernel.do_shutdown(True)


In [None]:
import time
# Pause for 10 seconds so that a "Run All" does not race ahead of the kernel
# restart requested in the previous cell.
time.sleep(10)

# (Wait for kernel to restart before proceeding...)

In [None]:
from google.cloud import bigquery
from google.cloud.aiplatform_v1beta1 import FeatureOnlineStoreAdminServiceClient, FeatureOnlineStoreServiceClient, FeatureRegistryServiceClient
from google.cloud.aiplatform_v1beta1.types import feature as feature_pb2
from google.cloud.aiplatform_v1beta1.types import feature_registry_service as feature_registry_service_pb2
from google.cloud.aiplatform_v1beta1.types import featurestore_service as featurestore_service_pb2
from google.cloud.aiplatform_v1beta1.types import feature_online_store_service as feature_online_store_service_pb2
from google.cloud.aiplatform_v1beta1.types import feature_group as feature_group_pb2
from google.cloud.aiplatform_v1beta1.types import feature_online_store as feature_online_store_pb2
from google.cloud.aiplatform_v1beta1.types import feature_online_store_admin_service as feature_online_store_admin_service_pb2
from google.cloud.aiplatform_v1beta1.types import feature_view as feature_view_pb2
from google.cloud.aiplatform_v1beta1.types import io as io_pb2

In [None]:
# Ask the gcloud CLI for the currently configured project. IPython's `!` capture
# returns the command's output lines as a list.
PROJECT_ID_DETAILS = !gcloud config get-value project
PROJECT_ID = PROJECT_ID_DETAILS[0]  # The project ID is item 0 in the list returned by the gcloud command
# Region used in the resource paths of every Vertex AI resource created below.
REGION="us-central1"
# Regional Vertex AI API endpoint handed to the admin and registry clients.
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

# Access and explore the Data: 

The NYC Taxi Trip Records dataset is available on BigQuery as a public dataset.
In the next cell, we will:

* **Familiarize ourselves with the data:** Understand the dataset's structure, including the available fields such as pick-up/drop-off times and locations, distances, fares, etc.
* **Identify key features:** Determine which features are relevant for our use case, for example average trip distance, total trips per hour, average fare, and average fare per mile.

In [None]:
# Create a BigQuery client. No project or credentials are passed explicitly, so
# the client library resolves them from the environment.
client = bigquery.Client()

In [None]:
# Location of the public NYC yellow-taxi table (project / dataset / table)
public_dataset_name = 'new_york_taxi_trips'
public_project_name = 'bigquery-public-data'
public_table_name = 'tlc_yellow_trips_2020'

# Aggregate trips per pickup hour to get a feel for candidate features.
# Rows with a non-positive distance or fare are excluded up front.
query = f"""
SELECT
    EXTRACT(HOUR FROM pickup_datetime) AS hour,
    AVG(trip_distance) AS avg_trip_distance,
    COUNT(*) AS total_trips,
    AVG(fare_amount) AS avg_fare,
    AVG((fare_amount / NULLIF(trip_distance, 0))) AS avg_fare_per_mile
FROM
    `{public_project_name}.{public_dataset_name}.{public_table_name}`
WHERE
    trip_distance > 0 AND fare_amount > 0
GROUP BY
    hour
ORDER BY
    hour
"""

# Submit the query; iterating over the job below waits for and streams the rows
query_job = client.query(query)

# One summary line per pickup hour
print("Hourly Trip Data:")
for row in query_job:
    print(
        f"Hour: {row.hour}, "
        f"Avg Trip Distance: {row.avg_trip_distance}, "
        f"Total Trips: {row.total_trips}, "
        f"Avg Fare: {row.avg_fare}, "
        f"Avg Fare per Mile: {row.avg_fare_per_mile}"
    )

# Data Preparation and Feature Engineering

The following python code will perform:

Data Extraction: Selects all columns (`SELECT *`) from the taxi trip data, alongside the engineered features listed below.

Feature Engineering:
1. Calculates fare_per_mile by dividing the fare amount by the trip distance.
1. Extracts pickup_hour and pickup_day_of_week from the pickup datetime.
1. Extracts dropoff_hour and dropoff_day_of_week from the dropoff datetime.
1. Filters out records with zero or negative trip distances or fares.
1. Limits the query to 1000 rows for initial analysis.


In [None]:
# Every source column plus the engineered features, restricted to valid trips
# and capped at 1000 rows for a quick look.
query = f"""
SELECT
    *,
    (fare_amount / NULLIF(trip_distance, 0)) AS fare_per_mile,
    EXTRACT(HOUR FROM pickup_datetime) AS pickup_hour,
    EXTRACT(DAYOFWEEK FROM pickup_datetime) AS pickup_day_of_week,
    EXTRACT(HOUR FROM dropoff_datetime) AS dropoff_hour,
    EXTRACT(DAYOFWEEK FROM dropoff_datetime) AS dropoff_day_of_week
FROM
    `{public_project_name}.{public_dataset_name}.{public_table_name}`
WHERE
    trip_distance > 0 AND
    fare_amount > 0
LIMIT 1000
"""

# Submit the query
query_job = client.query(query)

# Show only the first 5 rows as a sanity check; row_count ends up holding the
# number of rows that were printed.
row_count = 0
for row_count, row in enumerate(query_job, start=1):
    print(row)
    if row_count >= 5:
        break


# Create dataset to store features

In [None]:
# Define your dataset ID (replace with your desired dataset name).
# dataset_id is the fully qualified "<project>.<dataset>" form that is passed to
# bigquery.Dataset() when the dataset is created.
feature_dataset_name = 'feature_store_for_nyc_taxi_data'
dataset_id = f"{PROJECT_ID}.{feature_dataset_name}"

In [None]:
# Create the dataset that will hold the feature view.
# exists_ok=True makes this cell safe to re-run when the dataset already exists.
dataset = bigquery.Dataset(dataset_id)
dataset.location = "US"  # Choose your location, e.g., "US", "EU", etc.
client.create_dataset(dataset, exists_ok=True)

## Create a view

In [None]:
# Define our dataset and view name
view_name = 'nyc_taxi_data_view'
view_id = f"{dataset_id}.{view_name}"

# Define the fully qualified name of your BigQuery view ("bq://" URI form, used
# later as the input_uri of the feature group's BigQuery source)
BQ_VIEW_ID_FQN = f"bq://{view_id}"

# SQL query to define the view.
# - entity_id is built from the pickup time and the pickup location id.
#   NOTE(review): trips sharing both values get the same entity_id - confirm
#   that this is acceptable for the use case.
# - feature_timestamp is an alias of pickup_datetime.
view_query = f"""
SELECT
  pickup_datetime,
  dropoff_datetime,
  passenger_count,
  CAST(trip_distance AS FLOAT64) AS trip_distance,  -- Cast to FLOAT64 (supported by Feature Store)
  CAST(fare_amount AS FLOAT64) AS fare_amount,      -- Cast to FLOAT64 (supported by Feature Store)
  CAST((fare_amount / NULLIF(trip_distance, 0)) AS FLOAT64) AS fare_per_mile, -- Cast to FLOAT64 (supported by Feature Store)
  EXTRACT(HOUR FROM pickup_datetime) AS pickup_hour,
  EXTRACT(DAYOFWEEK FROM pickup_datetime) AS pickup_day_of_week,
  EXTRACT(HOUR FROM dropoff_datetime) AS dropoff_hour,
  EXTRACT(DAYOFWEEK FROM dropoff_datetime) AS dropoff_day_of_week,
  CONCAT(pickup_datetime, '-', CAST(pickup_location_id AS STRING)) AS entity_id,
  pickup_datetime AS feature_timestamp
FROM
  `{public_project_name}.{public_dataset_name}.{public_table_name}`
WHERE
  trip_distance > 0 AND fare_amount > 0
"""

# Create the view: a Table object with view_query set is created as a view.
# exists_ok=True makes the call a no-op if a table/view with this id already
# exists (NOTE(review): presumably an existing view is left unchanged - confirm).
view = bigquery.Table(view_id)
view.view_query = view_query
client.create_table(view, exists_ok=True)


In [None]:
# Sample up to 100 distinct entity IDs from the view; they are used later to
# test online serving.
query = f"""
SELECT DISTINCT entity_id
FROM `{view_id}`
LIMIT 100  -- Adjust the limit as needed
"""

# Submit the query
query_job = client.query(query)

# Gather the entity_id of every returned row into a list
entity_ids = [row.entity_id for row in query_job]

## Set up and start online serving

Some of the cells in this section contain repurposed and/or modified code from [this example notebook](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/online_feature_serving_and_fetching_bigquery_data_with_feature_store_optimized.ipynb).

Now for the exciting part! To serve data in a feature store, you need to do the following:

1. Create an online store cluster to host the data.
    * Create a `FeatureOnlineStore` instance with autoscaling.
    * Choose Optimized as the storage type.
1. Define the data (`FeatureView`) to be served by the newly-created instance. This can map to either of the following:
    * The BigQuery view that you just created for serving data.
    * The `FeatureGroup` and `Feature` we will create to host feature metadata.

We recommend NOT sending loads larger than 7500 QPS to one FeatureOnlineStore.
In general, we recommend creating multiple gRPC connections to one FeatureOnlineStore, and evenly distribute your loads across them. More connections and smaller per-connection QPS typically help with internal load balancing and scaling, reducing the chance of seeing higher tail-latencies. Specifically:
1. If your FetchFeatureValues response payload size is small (e.g. less than 1 kB), you may create one connection for up to every 2000 QPS.
2. If your FetchFeatureValues response payload size can be large (e.g. more than a few kB or 10s of kB), we recommend you to create one connection for up to every 250 QPS, and we also recommend avoiding sudden increases of loads.

## Initialize the admin and registry clients

In [None]:
# Both clients talk to the same regional Vertex AI endpoint.
regional_client_options = {"api_endpoint": API_ENDPOINT}

# Manages online stores, feature views and syncs
admin_client = FeatureOnlineStoreAdminServiceClient(
    client_options=regional_client_options
)
# Manages feature groups and features
registry_client = FeatureRegistryServiceClient(
    client_options=regional_client_options
)

## Create online store instance

In [None]:
FEATURE_ONLINE_STORE_ID = "feature_store_nyc_taxi_online"  # @param {type:"string"}

# Online store using the Optimized storage type with its default settings
online_store_config = feature_online_store_pb2.FeatureOnlineStore(
    optimized=feature_online_store_pb2.FeatureOnlineStore.Optimized()
)

# Describe the store to create under this project/region ...
create_store_request = feature_online_store_admin_service_pb2.CreateFeatureOnlineStoreRequest(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}",
    feature_online_store_id=FEATURE_ONLINE_STORE_ID,
    feature_online_store=online_store_config,
)

# ... and start the creation. The call returns a long-running operation (LRO)
# whose result is awaited in the next cell.
create_store_lro = admin_client.create_feature_online_store(create_store_request)

In [None]:
# Wait for the LRO to finish and get the LRO result.
# result() blocks until the operation completes.
print(create_store_lro.result())

In [None]:
# List the online stores in this project/region to verify that the store was
# created. As the last expression of the cell, the result is displayed.
admin_client.list_feature_online_stores(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}"
)

## Create FeatureGroup/Features

Create a FeatureGroup pointing to the created BigQuery view for the demo. We will then create features for each column we would like to register.

##### Data source preparation guidelines for Feature Registry data source

Note that if you choose to use Feature Registry source, Feature Store only provides the option to support time-series sources for which Feature Store will generate latest featureValues.

Use the following guidelines to understand the schema and constraints while creating the BigQuery source:

* The BigQuery table or view *must* have a column with `string` values to use as the (entity) IDs. You'll need to specify that this column is the ID column during `FeatureGroup` creation. Note that the size of each value in this column must be less than 4 KB.
* The BigQuery table or view *must* have a column named `feature_timestamp` with `timestamp` values to use as timestamp column.
* Feature Registry sources are treated as sparse by default i.e. a point in time lookup (BQ.ML_FEATURES_AT_TIME()) to generate latest featureValues per entityId.
* Provide the values for each feature in a separate column. Supported data types are `bool`, `int`, `double`, `string`, `timestamp`, arrays of these data types, and `bytes`. Note that the `timestamp` data type is converted to `int64` during data sync.
* Feature Store validates the schema during `FeatureView`/`FeatureGroup`/`Feature` creation. However, it doesn't revalidate the schema during each data sync. Columns with unsupported data types added after `FeatureView` creation time are ignored.
* The BigQuery table or view must be in either the same region as the online store, or in a multi-region that overlaps with the online store. For example, if the online store is in `us-central1`, the BigQuery source can be in `us-central1` or in `US`.

In [None]:
FEATURE_GROUP_ID = "feature_group_nyc_taxi"  # Replace with your feature group ID
# Columns of the BigQuery view that will be registered as features
FEATURE_IDS = [
    "pickup_datetime",
    "dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "fare_per_mile",
    "pickup_hour",
    "pickup_day_of_week",
    "dropoff_hour",
    "dropoff_day_of_week",
]

# Point the feature group at the BigQuery view created earlier.
# NOTE(review): no entity ID column is specified here; presumably the service
# falls back to a column named `entity_id` - confirm.
view_source = io_pb2.BigQuerySource(input_uri=BQ_VIEW_ID_FQN)
feature_group_config = feature_group_pb2.FeatureGroup(
    big_query=feature_group_pb2.FeatureGroup.BigQuery(big_query_source=view_source)
)

# Start the feature group creation; the returned long-running operation is
# awaited in the next cell.
create_group_request = feature_registry_service_pb2.CreateFeatureGroupRequest(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}",
    feature_group_id=FEATURE_GROUP_ID,
    feature_group=feature_group_config,
)
create_group_lro = registry_client.create_feature_group(create_group_request)


In [None]:
# Block until the feature group creation finishes, then show the result.
print(create_group_lro.result())

In [None]:
# Register one Feature per column listed in FEATURE_IDS. Each call returns a
# long-running operation; they are collected here and awaited in the next cell.
feature_group_path = (
    f"projects/{PROJECT_ID}/locations/{REGION}/featureGroups/{FEATURE_GROUP_ID}"
)

create_feature_lros = []
# The loop variable is deliberately not called `id`: a module-level `id` would
# shadow the Python builtin for the rest of the notebook session.
for feature_id in FEATURE_IDS:
    create_feature_lros.append(
        registry_client.create_feature(
            featurestore_service_pb2.CreateFeatureRequest(
                parent=feature_group_path,
                feature_id=feature_id,
                feature=feature_pb2.Feature(),
            )
        )
    )

In [None]:
# Wait for every feature creation operation in turn and print its result.
for lro in create_feature_lros:
    print(lro.result())

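Create a `FeatureView` instance for the FeatureGroup/features you created earlier in this tutorial (which are backed by the BigQuery view), and schedule the data sync with a cron expression. The schedule used below, `56 * * * *` in the `America/Los_Angeles` time zone, runs the sync every hour at minute 56; adjust `CRON_SCHEDULE` if you want a different frequency (for example, `0 1 * * *` for 1:00 AM daily).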

In [None]:
FEATURE_VIEW_ID = "registry_product"  # @param {type:"string"}
# Sync schedule, evaluated in the America/Los_Angeles time zone. In standard
# five-field cron syntax "56 * * * *" fires at minute 56 of every hour.
CRON_SCHEDULE = "TZ=America/Los_Angeles 56 * * * *"  # @param {type:"string"}

# Serve all registered features of the feature group through this view
registered_features = feature_view_pb2.FeatureView.FeatureRegistrySource.FeatureGroup(
    feature_group_id=FEATURE_GROUP_ID, feature_ids=FEATURE_IDS
)
feature_registry_source = feature_view_pb2.FeatureView.FeatureRegistrySource(
    feature_groups=[registered_features]
)

sync_config = feature_view_pb2.FeatureView.SyncConfig(cron=CRON_SCHEDULE)

# Create the feature view inside the online store created earlier
create_view_request = feature_online_store_admin_service_pb2.CreateFeatureViewRequest(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}",
    feature_view_id=FEATURE_VIEW_ID,
    feature_view=feature_view_pb2.FeatureView(
        feature_registry_source=feature_registry_source,
        sync_config=sync_config,
    ),
)
create_view_lro = admin_client.create_feature_view(create_view_request)

# Block until the long-running operation completes, then show the result
print(create_view_lro.result())

In [None]:
# List all feature views under FEATURE_ONLINE_STORE_ID to confirm that the new
# view exists. As the last expression of the cell, the result is displayed.
admin_client.list_feature_views(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}"
)

### Start sync manually

The sync pipeline executes according to the schedule specified in the `FeatureView` instance.

To skip the wait and execute the sync pipeline immediately, start the sync manually.

In [None]:
# Trigger a sync of the feature view right away instead of waiting for the cron
# schedule. The response's `feature_view_sync` field names the sync job and is
# used by the polling cell that follows.
sync_response = admin_client.sync_feature_view(
    feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}"
)

The `sync_response` contains the ID of the sync job.

Use `get_feature_view_sync` to check the status of the job.

In [None]:
import time

# Poll the sync job every 30 seconds until it reports an end time.
feature_view_sync = admin_client.get_feature_view_sync(
    name=sync_response.feature_view_sync
)
while not feature_view_sync.run_time.end_time.seconds > 0:
    print("Sync ongoing, waiting for 30 seconds.")
    time.sleep(30)
    feature_view_sync = admin_client.get_feature_view_sync(
        name=sync_response.feature_view_sync
    )

# The job has finished: a final status code of 0 is reported as success,
# anything else as failure.
status = "Succeed" if feature_view_sync.final_status.code == 0 else "Failed"
print(f"Sync {status} for {feature_view_sync.name}.")

Use `list_feature_view_syncs` to view all your syncs.

In [None]:
# List every sync job that has been run for this feature view.
admin_client.list_feature_view_syncs(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}"
)

### Start online serving

After the data sync is complete, use the `FetchFeatureValues` API to retrieve the data. 

**Retrieve your features**

In [None]:
# Get the Optimized online store. The next cell reads its
# dedicated_serving_endpoint from this response to build the serving client.
response = admin_client.get_feature_online_store(
    name=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}"
)
print(response)

In [None]:
# Feature values are fetched from the store's own dedicated public endpoint,
# not from the regional admin endpoint.
serving_endpoint = response.dedicated_serving_endpoint.public_endpoint_domain_name

data_client = FeatureOnlineStoreServiceClient(
    client_options={"api_endpoint": serving_endpoint}
)
print(data_client)

**Note:** Sometimes it can take a short while for the API endpoint to come online and become reachable. The following code will periodically check for the feature view to be available, and will print the status.

While waiting for it to become available, you may see expected error messages such as: 

`503 failed to connect to all addresses; last error: UNAVAILABLE: ipv4:XXX.XXX.XXX.XXX:443: Socket closed`. 

When you see a message saying **"Feature view is online and available"** then you can proceed to the next cell.

In [None]:
from google.api_core.exceptions import GoogleAPICallError
import time

# Define a timeout (e.g., 10 minutes)
timeout = 600  # seconds
start_time = time.time()

# Probe the serving endpoint every 30 seconds until a fetch succeeds or the
# timeout elapses. The `else` branch of the `while` runs only when the loop
# ends without `break`, i.e. when no attempt succeeded in time. This way a
# successful fetch can never be followed by a spurious "Timed out" message.
while time.time() - start_time < timeout:
    try:
        # Attempt to fetch feature values (as a test)
        test_response = data_client.fetch_feature_values(
            request=feature_online_store_service_pb2.FetchFeatureValuesRequest(
                feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}",
                id=entity_ids[0],
                format=feature_online_store_service_pb2.FetchFeatureValuesRequest.Format.PROTO_STRUCT,
            )
        )
    except GoogleAPICallError as e:
        # Handle exceptions related to unavailability or other API errors
        print(f"Waiting for feature view to be available: {e}")
        time.sleep(30)  # Wait for 30 seconds before retrying
    else:
        print("Feature view is online and available.")
        break
else:
    print("Timed out waiting for feature view to be available.")


In [None]:
# Fetch the feature values for the first sampled entity ID from the online
# store. As the last expression of the cell, the response is displayed.
data_client.fetch_feature_values(
    request=feature_online_store_service_pb2.FetchFeatureValuesRequest(
        feature_view=f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}/featureViews/{FEATURE_VIEW_ID}",
        id=entity_ids[0],
        format=feature_online_store_service_pb2.FetchFeatureValuesRequest.Format.PROTO_STRUCT,
    )
)

# Train a model

In [None]:
import pandas as pd

# Initialize a BigQuery client
client = bigquery.Client()

# Define your query: the model inputs (distance, pickup hour, pickup day of
# week) and the target (fare amount), read from the feature view.
# NOTE(review): the query has no LIMIT, so every row of the view is loaded into
# memory by to_dataframe() below - consider sampling if this is too large.
# NOTE(review): pickup_hour and pickup_day_of_week are recomputed here with the
# same expressions the view already uses for its columns of the same name.
query = f"""
SELECT 
  trip_distance, 
  fare_amount, 
  EXTRACT(HOUR FROM pickup_datetime) AS pickup_hour,
  EXTRACT(DAYOFWEEK FROM pickup_datetime) AS pickup_day_of_week
FROM 
  `{view_id}`
"""

# Run the query and convert to a pandas DataFrame
df = client.query(query).to_dataframe()


In [None]:
# Model inputs (X) and prediction target (y): the fare amount is predicted
# from the trip distance and the pickup hour / day of week.
X = df[['trip_distance', 'pickup_hour', 'pickup_day_of_week']]
y = df['fare_amount']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Create the linear regression model and fit it on the training split in one
# step (fit() returns the fitted estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Fare predictions for the held-out test split
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error

# Calculate Mean Absolute Error (MAE) between the actual and predicted fares
# of the test split.
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# That's it! Well Done!

# Clean up

When you no longer need the resources created by this notebook, you can delete them as follows.

**Note: if you do not delete the resources, you will continue to pay for them.**

In [None]:
# Guard for the Feature Store clean-up cell below; nothing is deleted while
# this is False.
clean_up = False  # Set to True if you want to delete the resources

## Delete Feature Store resources

In [None]:
if clean_up:
    # Resources are removed from the most dependent to the least dependent:
    # feature view -> features -> feature group -> online store. Every delete
    # call returns a long-running operation, and each one is awaited with
    # .result() so that the next step only starts once the previous deletion
    # has actually finished.
    online_store_path = (
        f"projects/{PROJECT_ID}/locations/{REGION}/featureOnlineStores/{FEATURE_ONLINE_STORE_ID}"
    )
    feature_group_path = (
        f"projects/{PROJECT_ID}/locations/{REGION}/featureGroups/{FEATURE_GROUP_ID}"
    )

    try:
        # 1. Delete Feature View
        delete_view_op = admin_client.delete_feature_view(
            name=f"{online_store_path}/featureViews/{FEATURE_VIEW_ID}"
        )
        # Wait for Feature View deletion to complete
        delete_view_op.result()
        print("Feature View deleted successfully!")
    except Exception as e:
        print(f"Error deleting Feature View: {e}")

    # 2. Delete Features
    # The resource names are built from FEATURE_IDS rather than read from the
    # creation operations, so this step does not depend on those operations
    # having succeeded. Each feature is handled on its own so one failure does
    # not prevent the remaining features from being deleted.
    all_features_deleted = True
    for feature_id in FEATURE_IDS:
        try:
            registry_client.delete_feature(
                name=f"{feature_group_path}/features/{feature_id}"
            ).result()
        except Exception as e:
            all_features_deleted = False
            print(f"Error deleting Feature {feature_id}: {e}")
    if all_features_deleted:
        print("Features deleted successfully!")

    try:
        # 3. Delete Feature Group
        delete_group_op = registry_client.delete_feature_group(
            name=feature_group_path
        )
        # Wait for Feature Group deletion to complete (on this operation, not
        # on the feature view's)
        delete_group_op.result()
        print("Feature Group deleted successfully!")
    except Exception as e:
        print(f"Error deleting Feature Group: {e}")

    try:
        # 4. Delete Feature Online Store
        delete_store_op = admin_client.delete_feature_online_store(
            name=online_store_path
        )
        # Wait for the deletion to complete before reporting success
        delete_store_op.result()
        print("Feature Online Store deleted successfully!")
    except Exception as e:
        print(f"Error deleting Feature Online Store: {e}")

else:
    print("clean_up parameter is set to False.")


## Delete BigQuery dataset and view

# WARNING: THE BIGQUERY DATASET AND VIEW CREATED IN THIS NOTEBOOK ARE ALSO USED IN CHAPTER 14. IF YOU PLAN TO PROCEED WITH THE ACTIVITIES IN CHAPTER 14, DO NOT DELETE THESE RESOURCES YET. 

If you want to delete the BigQuery resources, set the delete_bq parameter to True.

In [None]:
# Guard for the BigQuery clean-up cell below; set to True to delete the view
# and the dataset created by this notebook.
delete_bq = False

In [None]:
if not delete_bq:
    print("delete_bq parameter is set to False.")
else:
    # Remove the view first; not_found_ok=True tolerates it being gone already.
    try:
        client.delete_table(view_id, not_found_ok=True)
        print(f"Deleted view: {view_id}")
    except Exception as e:
        print(f"Error deleting view: {e}")

    # Then remove the dataset together with anything still inside it.
    try:
        client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True)
        print(f"Deleted dataset: {dataset_id}")
    except Exception as e:
        print(f"Error deleting dataset: {e}")

## Delete GCS Bucket
The bucket can be reused throughout multiple activities in the book. Sometimes, activities in certain chapters make use of artifacts from previous chapters that are stored in the GCS bucket.

I highly recommend **not deleting the bucket** unless you will be performing no further activities in the book. For this reason, there's a separate `delete_bucket` variable to specify if you want to delete the bucket.

If you want to delete the bucket, set the `delete_bucket` parameter to `True`.

In [None]:
# Guard for the bucket clean-up cell below; set to True only if the GCS bucket
# is no longer needed.
delete_bucket = False

In [None]:
if delete_bucket == True:
    # Delete the bucket
    ! gcloud storage rm --recursive gs://$BUCKET
else:
    print("delete_bucket parameter is set to False")