# Minimal Ride Hailing Example

### Configuration

Restart your kernel after installing these packages.

In [1]:
!pip install protobuf gcsfs feast -U -q --user

### Basic Imports and Feast Client initialization

In [2]:
import os
import time
from pprint import pprint
from datetime import datetime
# Timestamp of this run, formatted as month-day-year-hour-minute-second
now = datetime.now()
date_time_str = now.strftime("%m-%d-%Y-%H-%M-%S")
print(date_time_str)

# Show every environment variable whose name starts with FEAST_
feast_env = dict(
    filter(lambda item: item[0].startswith("FEAST_"), os.environ.items())
)
pprint(feast_env)

# GCS bucket used as the staging area; note that the value ends with a slash
staging_bucket = 'gs://my-feast-playground-s-11-35de9347/'

03-25-2021-12-53-24
{'FEAST_CORE_URL': 'feast-release-feast-core:6565',
 'FEAST_HISTORICAL_FEATURE_OUTPUT_LOCATION': 'file:///home/jovyan/historical_feature_output',
 'FEAST_HISTORICAL_SERVING_URL': 'feast-release-feast-batch-serving:6566',
 'FEAST_JUPYTER_SERVICE_PORT': 'tcp://10.79.252.133:80',
 'FEAST_JUPYTER_SERVICE_PORT_80_TCP': 'tcp://10.79.252.133:80',
 'FEAST_JUPYTER_SERVICE_PORT_80_TCP_ADDR': '10.79.252.133',
 'FEAST_JUPYTER_SERVICE_PORT_80_TCP_PORT': '80',
 'FEAST_JUPYTER_SERVICE_PORT_80_TCP_PROTO': 'tcp',
 'FEAST_JUPYTER_SERVICE_SERVICE_HOST': '10.79.252.133',
 'FEAST_JUPYTER_SERVICE_SERVICE_PORT': '80',
 'FEAST_REDIS_HOST': 'feast-release-redis-master',
 'FEAST_RELEASE_FEAST_CORE_PORT': 'tcp://10.79.242.226:80',
 'FEAST_RELEASE_FEAST_CORE_PORT_6565_TCP': 'tcp://10.79.242.226:6565',
 'FEAST_RELEASE_FEAST_CORE_PORT_6565_TCP_ADDR': '10.79.242.226',
 'FEAST_RELEASE_FEAST_CORE_PORT_6565_TCP_PORT': '6565',
 'FEAST_RELEASE_FEAST_CORE_PORT_6565_TCP_PROTO': 'tcp',
 'FEAST_RELEASE_FE

In [3]:
from feast import Client, Feature, Entity, ValueType, FeatureTable
from feast.data_source import FileSource, KafkaSource
from feast.data_format import ParquetFormat, AvroFormat
# Explicit connection and Spark settings for this deployment, gathered in one
# place. (The original cell also carried a commented-out alternative:
# client = Client().)
client_options = dict(
    core_url="feast-release-feast-core:6565",
    serving_url="feast-release-feast-serving:6566",
    spark_launcher="k8s",
    spark_staging_location=staging_bucket,
    spark_k8s_namespace="default",
    executor_instances=2,
    redis_host="feast-release-redis-headless",
    # staging_bucket already ends with "/", so plain concatenation yields a valid URI
    historical_feature_output_location=f"{staging_bucket}historical",
)

client = Client(**client_options)

# All subsequent registry calls operate on the "default" project
client.set_project("default")


Feast is an open source project that collects anonymized usage statistics. To opt out or learn more see https://docs.feast.dev/v/master/advanced/telemetry


### Declare Features and Entities

In [4]:
# Entity the feature tables in this notebook are keyed on: an INT64 driver identifier
driver_id = Entity(name="driver_id", description="Driver identifier", value_type=ValueType.INT64)

  and should_run_async(code)


In [5]:
# Daily updated features (these are grouped into the driver_statistics feature table)
acc_rate = Feature("acc_rate", ValueType.FLOAT)
conv_rate = Feature("conv_rate", ValueType.FLOAT)
avg_daily_trips = Feature("avg_daily_trips", ValueType.INT32)

# Real-time updated feature (this one goes into the driver_trips feature table)
trips_today = Feature("trips_today", ValueType.INT32)
# NOTE(review): fixed 5 s pause with no visible purpose in this cell — confirm whether it is still needed
time.sleep(5)

In [6]:
# Per-run location for the offline (batch) data: the staging bucket, which
# already ends with "/", followed by this run's timestamp
demo_data_location = f"{staging_bucket}{date_time_str}"
print(demo_data_location)

gs://my-feast-playground-s-11-35de9347/03-25-2021-12-53-24


In [7]:
# Build the gs:// URI with an explicit "/" separator. os.path.join is meant for
# local filesystem paths and inserts the OS-specific separator (a backslash on
# Windows), which would corrupt the URI there.
driver_statistics_source_uri = f"{demo_data_location.rstrip('/')}/driver_statistics"

# Feature table holding the daily driver statistics, keyed on driver_id and
# backed by Parquet files at the URI above.
driver_statistics = FeatureTable(
    name = "driver_statistics",
    entities = ["driver_id"],
    features = [
        acc_rate,
        conv_rate,
        avg_daily_trips
    ],
    batch_source=FileSource(
        # Column names must match the DataFrames ingested into this table
        event_timestamp_column="datetime",
        created_timestamp_column="created",
        file_format=ParquetFormat(),
        file_url=driver_statistics_source_uri,
        date_partition_column="date"
    )
)

In [8]:
# Build the gs:// URI with an explicit "/" separator. os.path.join is meant for
# local filesystem paths and inserts the OS-specific separator (a backslash on
# Windows), which would corrupt the URI there.
driver_trips_source_uri = f"{demo_data_location.rstrip('/')}/driver_trips"

# Feature table holding the real-time trips counter, keyed on driver_id and
# backed by Parquet files at the URI above.
driver_trips = FeatureTable(
    name = "driver_trips",
    entities = ["driver_id"],
    features = [
        trips_today
    ],
    batch_source=FileSource(
        # Column names must match the DataFrames ingested into this table
        event_timestamp_column="datetime",
        created_timestamp_column="created",
        file_format=ParquetFormat(),
        file_url=driver_trips_source_uri,
        date_partition_column="date"
    )
)

### Registering entities and feature tables in Feast Core

In [9]:
# Register the entity first, then the two feature tables that reference it
for feast_object in (driver_id, driver_statistics, driver_trips):
    client.apply(feast_object)

In [10]:
# Read both feature tables back from the registry and print their YAML form
for table_name in ("driver_statistics", "driver_trips"):
    registered_table = client.get_feature_table(table_name)
    print(registered_table.to_yaml())

spec:
  name: driver_statistics
  entities:
  - driver_id
  features:
  - name: avg_daily_trips
    valueType: INT32
  - name: conv_rate
    valueType: FLOAT
  - name: acc_rate
    valueType: FLOAT
  batchSource:
    type: BATCH_FILE
    eventTimestampColumn: datetime
    datePartitionColumn: date
    createdTimestampColumn: created
    fileOptions:
      fileFormat:
        parquetFormat: {}
      fileUrl: gs://my-feast-playground-s-11-35de9347/03-25-2021-12-53-24/driver_statistics
meta:
  createdTimestamp: '2021-03-25T12:53:30Z'

spec:
  name: driver_trips
  entities:
  - driver_id
  features:
  - name: trips_today
    valueType: INT32
  batchSource:
    type: BATCH_FILE
    eventTimestampColumn: datetime
    datePartitionColumn: date
    createdTimestampColumn: created
    fileOptions:
      fileFormat:
        parquetFormat: {}
      fileUrl: gs://my-feast-playground-s-11-35de9347/03-25-2021-12-53-24/driver_trips
meta:
  createdTimestamp: '2021-03-25T12:53:31Z'



### Populating batch source

Feast is agnostic to how the batch source is populated, as long as it complies with the Feature Table specification. Therefore, any existing ETL tool can be used for data ingestion. Alternatively, you can use the Feast SDK to ingest a pandas DataFrame into the batch source.

In [11]:
import pandas as pd
import numpy as np
from datetime import datetime

In [12]:
def generate_entities(size=100, max_id=999999):
    """Draw a sample of distinct random driver ids.

    Args:
        size: Number of ids to draw. Defaults to 100, the value that was
            previously hard-coded.
        max_id: Exclusive upper bound; ids are sampled from ``range(max_id)``.
            Defaults to 999999, the value that was previously hard-coded.

    Returns:
        A 1-D numpy array of ``size`` unique integers in ``[0, max_id)``.

    Raises:
        ValueError: Raised by numpy when ``size`` exceeds ``max_id``, because
            the sample is drawn without replacement.
    """
    return np.random.choice(max_id, size=size, replace=False)

In [13]:
def generate_trips(entities):
    """Build a random ``trips_today`` DataFrame with one row per driver id.

    Args:
        entities: Sequence (e.g. numpy array) of driver ids. Any length is
            accepted; every generated column is sized from ``len(entities)``.

    Returns:
        A DataFrame with the columns ``driver_id``, ``trips_today`` (int32 in
        ``[0, 1000)``), ``datetime`` (random event time between 2020-10-10 and
        2020-10-20) and ``created`` (the time this function was called).
    """
    row_count = len(entities)

    # Epoch-second bounds of the event window; cast to int because randint
    # expects integer bounds while timestamp() returns a float.
    window_start = int(datetime(2020, 10, 10).timestamp())
    window_end = int(datetime(2020, 10, 20).timestamp())

    df = pd.DataFrame(columns=["driver_id", "trips_today", "datetime", "created"])
    df['driver_id'] = entities
    df['trips_today'] = np.random.randint(0, 1000, size=row_count).astype(np.int32)
    df['datetime'] = pd.to_datetime(
        np.random.randint(window_start, window_end, size=row_count),
        unit="s"
    )
    # Scalar timestamp, broadcast to every row
    df['created'] = pd.to_datetime(datetime.now())
    return df


In [14]:
def generate_stats(entities):
    """Build a random driver-statistics DataFrame with one row per driver id.

    Args:
        entities: Sequence (e.g. numpy array) of driver ids. Any length is
            accepted; every generated column is sized from ``len(entities)``.

    Returns:
        A DataFrame with the columns ``driver_id``, ``conv_rate`` and
        ``acc_rate`` (float32 in ``[0, 1)``), ``avg_daily_trips`` (int32 in
        ``[0, 1000)``), ``datetime`` (random event time between 2020-10-10 and
        2020-10-20) and ``created`` (the time this function was called).
    """
    row_count = len(entities)

    # Epoch-second bounds of the event window; cast to int because randint
    # expects integer bounds while timestamp() returns a float.
    window_start = int(datetime(2020, 10, 10).timestamp())
    window_end = int(datetime(2020, 10, 20).timestamp())

    df = pd.DataFrame(columns=["driver_id", "conv_rate", "acc_rate", "avg_daily_trips", "datetime", "created"])
    df['driver_id'] = entities
    df['conv_rate'] = np.random.random(size=row_count).astype(np.float32)
    df['acc_rate'] = np.random.random(size=row_count).astype(np.float32)
    df['avg_daily_trips'] = np.random.randint(0, 1000, size=row_count).astype(np.int32)
    df['datetime'] = pd.to_datetime(
        np.random.randint(window_start, window_end, size=row_count),
        unit="s"
    )
    # Scalar timestamp, broadcast to every row
    df['created'] = pd.to_datetime(datetime.now())
    return df

In [15]:
# Generate one shared set of driver ids, then a statistics frame and a trips
# frame for those same ids
entities = generate_entities()
stats_df = generate_stats(entities)
trips_df = generate_trips(entities)

In [16]:
# Write each generated DataFrame to the batch source of its feature table
for feature_table, source_df in (
    (driver_statistics, stats_df),
    (driver_trips, trips_df),
):
    client.ingest(feature_table, source_df)

Removing temporary file(s)...
Data has been successfully ingested into FeatureTable batch source.
Removing temporary file(s)...
Data has been successfully ingested into FeatureTable batch source.


## Historical Retrieval For Training

Create a training dataset from offline feature tables

In [17]:
import gcsfs
from pyarrow.parquet import ParquetDataset
from urllib.parse import urlparse

In [18]:
# Pick 10 of the ingested driver ids and give each a random event time
# between 2020-10-18 and 2020-10-20 (expressed in epoch seconds)
requested_driver_ids = np.random.choice(entities, 10, replace=False)
requested_event_seconds = np.random.randint(
    datetime(2020, 10, 18).timestamp(),
    datetime(2020, 10, 20).timestamp(),
    size=10,
)

entities_with_timestamp = pd.DataFrame(
    {
        'driver_id': requested_driver_ids,
        'event_timestamp': pd.to_datetime(requested_event_seconds, unit='s'),
    }
)
entities_with_timestamp

Unnamed: 0,driver_id,event_timestamp
0,768360,2020-10-18 19:16:52
1,951779,2020-10-19 12:31:19
2,705569,2020-10-19 09:34:18
3,918271,2020-10-19 17:48:33
4,542045,2020-10-19 10:03:29
5,206107,2020-10-19 15:07:53
6,853946,2020-10-18 18:49:29
7,39913,2020-10-18 22:52:13
8,900434,2020-10-18 10:31:29
9,154045,2020-10-19 13:04:20


In [19]:
# Features to join onto the entity rows, written as "<feature table>:<feature>"
historical_feature_refs = [
    "driver_statistics:avg_daily_trips",
    "driver_statistics:conv_rate",
    "driver_statistics:acc_rate",
    "driver_trips:trips_today",
]

# get_historical_features returns as soon as the Spark job has been submitted
# successfully; it does not wait for the job to finish.
job = client.get_historical_features(
    feature_refs=historical_feature_refs,
    entity_source=entities_with_timestamp,
)

  and should_run_async(code)


In [20]:
# get_output_file_uri blocks until the Spark job has completed; the returned
# URI is printed so the output location is visible in the notebook.
output_file_uri = job.get_output_file_uri()
print(output_file_uri)

gs://my-feast-playground-s-11-35de9347/historical/a7b3baea-6a33-4897-8596-c39d8a3e7e2e


In [21]:
# Download the training dataset that the historical retrieval job produced
time.sleep(10)  # fixed pause before reading the job output
parsed_uri = urlparse(output_file_uri)  # NOTE(review): not used further in this cell
fs = gcsfs.GCSFileSystem()

# Collect the part-* files under the output URI; each path returned by glob()
# is prefixed with "gs://" before it is passed to pyarrow
part_file_pattern = f"{output_file_uri}/part-*"
files = [f"gs://{path}" for path in fs.glob(part_file_pattern)]

ds = ParquetDataset(files, filesystem=fs)
training_df = ds.read().to_pandas()
training_df

Unnamed: 0,driver_id,event_timestamp,driver_statistics__avg_daily_trips,driver_statistics__conv_rate,driver_statistics__acc_rate,driver_trips__trips_today
0,853946,2020-10-18 18:49:29,,,,768
1,542045,2020-10-19 10:03:29,602.0,0.001133,0.140484,306
2,705569,2020-10-19 09:34:18,958.0,0.521007,0.386785,96
3,918271,2020-10-19 17:48:33,940.0,0.009663,0.909216,638
4,154045,2020-10-19 13:04:20,440.0,0.968196,0.553506,530
5,951779,2020-10-19 12:31:19,57.0,0.172013,0.838158,772
6,900434,2020-10-18 10:31:29,220.0,0.796095,0.775111,772
7,206107,2020-10-19 15:07:53,492.0,0.269022,0.299133,522
8,39913,2020-10-18 22:52:13,351.0,0.822276,0.992605,375
9,768360,2020-10-18 19:16:52,263.0,0.95405,0.094763,767


The retrieved result can now be used for model training.

## Populating Online Storage with Batch Ingestion

To populate the online storage, we can use the Feast SDK to start a Spark batch job that extracts the features from the batch source and then loads them into the online store.

In [22]:
time.sleep(5)  # fixed pause before submitting the ingestion job

# Time window passed to the ingestion job; it matches the range used when
# generating the demo data
ingestion_window_start = datetime(2020, 10, 10)
ingestion_window_end = datetime(2020, 10, 20)

job = client.start_offline_to_online_ingestion(
    driver_statistics,
    ingestion_window_start,
    ingestion_window_end,
)

  and should_run_async(code)


In [23]:
# It will take some time before the Spark job is completed. Poll its status
# until it reaches a terminal state (bounded by max_wait_seconds) rather than
# sleeping for a fixed time and discarding the status.
# NOTE(review): assumes get_status() returns an enum whose terminal members are
# named COMPLETED and FAILED — confirm against the installed Feast version. If
# the names differ, the loop simply runs until max_wait_seconds has elapsed.
poll_interval_seconds = 15
max_wait_seconds = 600
terminal_status_names = ("COMPLETED", "FAILED")

waited_seconds = 0
while True:
    job_status = job.get_status()
    status_name = getattr(job_status, "name", str(job_status))
    if status_name in terminal_status_names or waited_seconds >= max_wait_seconds:
        break
    time.sleep(poll_interval_seconds)
    waited_seconds += poll_interval_seconds

# Last expression of the cell, so the final status is shown in the output
job_status


Once the job is completed, the SDK can be used to retrieve the result from the online store.

In [24]:
# Fixed pause kept from the original cell; it now runs first so that the
# sample below is the cell's last expression and is actually displayed.
time.sleep(15)

# Pick 10 of the ingested driver ids and wrap each one as an entity row
# ({"driver_id": <id>}), the shape passed to get_online_features.
sampled_ids = np.random.choice(entities, 10, replace=False)
entities_sample = [{"driver_id": sampled_id} for sampled_id in sampled_ids]
entities_sample

In [27]:
# Fixed pause before querying (presumably to give the ingestion job more time
# to finish — confirm)
time.sleep(15)

# Ask the online store for avg_daily_trips of every sampled driver and convert
# the response to a plain dict
online_response = client.get_online_features(
    feature_refs=["driver_statistics:avg_daily_trips"],
    entity_rows=entities_sample,
)
features = online_response.to_dict()
features


  and should_run_async(code)


{'driver_statistics:avg_daily_trips': [None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None],
 'driver_id': [601619,
  429198,
  78691,
  705569,
  486634,
  265905,
  912559,
  184790,
  37658,
  279076]}

In [26]:
# Tabular view of the online-store response: one row per requested driver_id
pd.DataFrame(features)

Unnamed: 0,driver_statistics:avg_daily_trips,driver_id
0,,601619
1,,429198
2,,78691
3,,705569
4,,486634
5,,265905
6,,912559
7,,184790
8,,37658
9,,279076
