In [2]:
import boto3
import sagemaker

original_boto3_version = boto3.__version__
%pip install 'boto3>1.17.21'

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
from sagemaker.session import Session

region = boto3.Session().region_name

boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

In [4]:
default_s3_bucket_name = feature_store_session.default_bucket()
prefix = "sagemaker-featurestore-assignment"

print(default_s3_bucket_name)

sagemaker-us-east-1-652903355321


In [5]:
from sagemaker import get_execution_role

# You can modify the following to use a role of your choosing. See the documentation for how to create this.
role = get_execution_role()
print(role)

arn:aws:iam::652903355321:role/LabRole


In [6]:
import os
# current working directory
current_directory = os.getcwd()
# elative path to the data file within the current directory
file_path1 = os.path.join(current_directory, 'housing_gmaps_data_raw.csv')
file_path2 = os.path.join(current_directory, 'housing.csv')
print("Path of the data file:", file_path1)
print("Path of the data file:", file_path2)

Path of the data file: /root/MLOPSAssignments/housing_gmaps_data_raw.csv
Path of the data file: /root/MLOPSAssignments/housing.csv


In [12]:
s3 = boto3.resource('s3')
s3.Object(default_s3_bucket_name, 'housingdata/csv/housing_gmaps_data_raw.csv').put(Body=open(file_path1, 'rb'))
s3.Object(default_s3_bucket_name, 'housingdata/csv/housing.csv').put(Body=open(file_path2, 'rb'))

{'ResponseMetadata': {'RequestId': '62GSN1M5QSV7W7R9',
  'HostId': 'JitZavZ8WEuNGU1Jle0OZ2gjGvRWKRIicGVxhgi2cuqvwACsvUILCvo9ckyjZpinV9ZZzXmQv48=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'JitZavZ8WEuNGU1Jle0OZ2gjGvRWKRIicGVxhgi2cuqvwACsvUILCvo9ckyjZpinV9ZZzXmQv48=',
   'x-amz-request-id': '62GSN1M5QSV7W7R9',
   'date': 'Sat, 18 May 2024 12:33:50 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"d1c47305887e2252bf1ccbd74ff159a6"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"d1c47305887e2252bf1ccbd74ff159a6"',
 'ServerSideEncryption': 'AES256'}

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io

s3_client = boto3.client("s3", region_name=region)

housing_data_bucket_name = default_s3_bucket_name
housing_gmaps_file_key = (
    "housingdata/csv/housing_gmaps_data_raw.csv"
)
housing_file_key = (
    "housingdata/csv/housing.csv"
)

housing_data_object = s3_client.get_object(
    Bucket=housing_data_bucket_name, Key=housing_file_key
)
housing_gmaps_data_object = s3_client.get_object(
    Bucket=housing_data_bucket_name, Key=housing_gmaps_file_key
)

housing_data = pd.read_csv(io.BytesIO(housing_data_object["Body"].read()))
housing_gmaps_data = pd.read_csv(io.BytesIO(housing_gmaps_data_object["Body"].read()))


In [17]:
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [18]:
housing_gmaps_data.head()

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude,...,establishment-natural_feature,airport-establishment-point_of_interest,political-sublocality-sublocality_level_1,administrative_area_level_3-political,post_box,establishment-light_rail_station-point_of_interest-transit_station,establishment-point_of_interest,aquarium-establishment-park-point_of_interest-tourist_attraction-zoo,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88,...,,,,,,,,,,
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86,...,,,,,,,,,,
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85,...,,,,,,,,,,
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,...,,,,,,,,,,
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84,...,,,,,,,,,,


In [23]:
# Merge the dataframes on longitude and latitude
housing_data_merged = pd.merge(housing_data, housing_gmaps_data, on=['longitude', 'latitude'], how='left')
housing_data_merged.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,...,establishment-natural_feature,airport-establishment-point_of_interest,political-sublocality-sublocality_level_1,administrative_area_level_3-political,post_box,establishment-light_rail_station-point_of_interest-transit_station,establishment-point_of_interest,aquarium-establishment-park-point_of_interest-tourist_attraction-zoo,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,...,,,,,,,,,,
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,...,,,,,,,,,,
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,...,,,,,,,,,,
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,...,,,,,,,,,,
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,...,,,,,,,,,,


In [24]:
# Fill null values in the 'neighborhood-political' column with "Unknown"
housing_data_merged['neighborhood-political'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  housing_data_merged['neighborhood-political'].fillna('Unknown', inplace=True)


In [39]:
#housing_data_processed=housing_data_merged[['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity', 'neighborhood-political','street_number','route','locality-political','administrative_area_level_2-political','administrative_area_level_1-political','country-political','postal_code','address']]
housing_data_processed=housing_data_merged[['neighborhood-political','ocean_proximity', 'median_house_value','housing_median_age', 'households', 'total_bedrooms', 'locality-political','postal_code']]
housing_data_processed.rename(columns={'neighborhood-political': 'neighborhood'}, inplace=True)
housing_data_processed.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_data_processed.rename(columns={'neighborhood-political': 'neighborhood'}, inplace=True)


Unnamed: 0,neighborhood,ocean_proximity,median_house_value,housing_median_age,households,total_bedrooms,locality-political,postal_code
0,Unknown,NEAR BAY,452600.0,41.0,126.0,129.0,Berkeley,94705.0
1,Merriewood,NEAR BAY,358500.0,21.0,1138.0,1106.0,Oakland,94611.0
2,Upper Rockridge,NEAR BAY,352100.0,52.0,177.0,190.0,Oakland,94618.0
3,Rockridge,NEAR BAY,341300.0,52.0,219.0,235.0,Oakland,94618.0
4,Rockridge,NEAR BAY,342200.0,52.0,259.0,280.0,Oakland,94618.0


# Feature Engineering

In [47]:
# Feature Engineering

housing_data_processed = housing_data_processed.round(5)
housing_data_processed = housing_data_processed.fillna(0)

# Feature transformations for this dataset are applied before ingestion into FeatureStore.
# One hot encode card4, card6
encoded_ocean_proximity = pd.get_dummies(housing_data_processed["ocean_proximity"])

transformed_housing_data = pd.concat(
    [housing_data_processed, encoded_ocean_proximity], axis=1
)
# blank space is not allowed in feature name
transformed_housing_data = transformed_housing_data.rename(
    columns={"NEAR BAY": "NEAR_BAY","NEAR OCEAN": "NEAR_OCEAN","<1H OCEAN": "Lessthen_1H_OCEAN" }
)

In [48]:
transformed_housing_data.head()

Unnamed: 0,neighborhood,ocean_proximity,median_house_value,housing_median_age,households,total_bedrooms,locality-political,postal_code,Lessthen_1H_OCEAN,INLAND,ISLAND,NEAR_BAY,NEAR_OCEAN
0,Unknown,NEAR BAY,452600.0,41.0,126.0,129.0,Berkeley,94705.0,False,False,False,True,False
1,Merriewood,NEAR BAY,358500.0,21.0,1138.0,1106.0,Oakland,94611.0,False,False,False,True,False
2,Upper Rockridge,NEAR BAY,352100.0,52.0,177.0,190.0,Oakland,94618.0,False,False,False,True,False
3,Rockridge,NEAR BAY,341300.0,52.0,219.0,235.0,Oakland,94618.0,False,False,False,True,False
4,Rockridge,NEAR BAY,342200.0,52.0,259.0,280.0,Oakland,94618.0,False,False,False,True,False


# Ingest Data Into Feature Store

In [49]:
from time import gmtime, strftime, sleep

neighborhood_feature_group_name = "neighborhood-feature-group-" + strftime("%d-%H-%M-%S", gmtime())
print(neighborhood_feature_group_name)

neighborhood-feature-group-18-13-26-36


In [50]:
from sagemaker.feature_store.feature_group import FeatureGroup

neighbourhood_feature_group = FeatureGroup(
    name=neighborhood_feature_group_name, sagemaker_session=feature_store_session
)

In [51]:
import time

current_time_sec = int(round(time.time()))


def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == "object":
            data_frame[label] = data_frame[label].astype("str").astype("string")

def cast_boolean_to_string(data_frame):
    for col in data_frame:
        if data_frame[col].dtype == "bool":
            data_frame[col] = data_frame[col].replace({True: 'True', False: 'False'}).astype("string")


# cast object dtype to string. The SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
cast_object_to_string(transformed_housing_data)

# cast object dtype to string.
cast_boolean_to_string(transformed_housing_data)

# record identifier and event time feature names
record_identifier_feature_name = "neighborhood"
event_time_feature_name = "EventTime"

# append EventTime feature
transformed_housing_data[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(transformed_housing_data), dtype="float64"
)

# load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
neighbourhood_feature_group.load_feature_definitions(data_frame=transformed_housing_data)

[FeatureDefinition(feature_name='neighborhood', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='ocean_proximity', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='median_house_value', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='housing_median_age', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='households', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='total_bedrooms', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='locality-political', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='postal_code', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, col

In [52]:
neighbourhood_feature_group.create(
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")


wait_for_feature_group_creation_complete(feature_group=neighbourhood_feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup neighborhood-feature-group-18-13-26-36 successfully created.


In [53]:
neighbourhood_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:652903355321:feature-group/neighborhood-feature-group-18-13-26-36',
 'FeatureGroupName': 'neighborhood-feature-group-18-13-26-36',
 'RecordIdentifierFeatureName': 'neighborhood',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'neighborhood',
   'FeatureType': 'String'},
  {'FeatureName': 'ocean_proximity', 'FeatureType': 'String'},
  {'FeatureName': 'median_house_value', 'FeatureType': 'Fractional'},
  {'FeatureName': 'housing_median_age', 'FeatureType': 'Fractional'},
  {'FeatureName': 'households', 'FeatureType': 'Fractional'},
  {'FeatureName': 'total_bedrooms', 'FeatureType': 'Fractional'},
  {'FeatureName': 'locality-political', 'FeatureType': 'String'},
  {'FeatureName': 'postal_code', 'FeatureType': 'Fractional'},
  {'FeatureName': 'Lessthen_1H_OCEAN', 'FeatureType': 'String'},
  {'FeatureName': 'INLAND', 'FeatureType': 'String'},
  {'FeatureName': 'ISLAND', 'FeatureType': 'String'},
  {'FeatureName'

In [54]:
sagemaker_client.list_feature_groups()  # use boto client to list FeatureGroups

{'FeatureGroupSummaries': [{'FeatureGroupName': 'neighborhood-feature-group-18-13-26-36',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:652903355321:feature-group/neighborhood-feature-group-18-13-26-36',
   'CreationTime': datetime.datetime(2024, 5, 18, 13, 26, 54, 453000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'}],
 'ResponseMetadata': {'RequestId': '292d6f79-9b1c-405a-8100-22ea9b64d502',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '292d6f79-9b1c-405a-8100-22ea9b64d502',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '266',
   'date': 'Sat, 18 May 2024 13:29:31 GMT'},
  'RetryAttempts': 0}}

In [55]:
neighbourhood_feature_group.ingest(data_frame=transformed_housing_data, max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='neighborhood-feature-group-18-13-26-36', feature_definitions={'neighborhood': {'FeatureName': 'neighborhood', 'FeatureType': 'String'}, 'ocean_proximity': {'FeatureName': 'ocean_proximity', 'FeatureType': 'String'}, 'median_house_value': {'FeatureName': 'median_house_value', 'FeatureType': 'Fractional'}, 'housing_median_age': {'FeatureName': 'housing_median_age', 'FeatureType': 'Fractional'}, 'households': {'FeatureName': 'households', 'FeatureType': 'Fractional'}, 'total_bedrooms': {'FeatureName': 'total_bedrooms', 'FeatureType': 'Fractional'}, 'locality-political': {'FeatureName': 'locality-political', 'FeatureType': 'String'}, 'postal_code': {'FeatureName': 'postal_code', 'FeatureType': 'Fractional'}, 'Lessthen_1H_OCEAN': {'FeatureName': 'Lessthen_1H_OCEAN', 'FeatureType': 'String'}, 'INLAND': {'FeatureName': 'INLAND', 'FeatureType': 'String'}, 'ISLAND': {'FeatureName': 'ISLAND', 'FeatureType': 'String'}, 'NEAR_BAY': {'FeatureName': 'NEAR_B

In [56]:
record_identifier_value = "Merriewood"

featurestore_runtime.get_record(
    FeatureGroupName=neighborhood_feature_group_name,
    RecordIdentifierValueAsString=record_identifier_value,
)

{'ResponseMetadata': {'RequestId': 'c0a00d53-58fd-42e5-b37f-14fdf30ea1c0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c0a00d53-58fd-42e5-b37f-14fdf30ea1c0',
   'content-type': 'application/json',
   'content-length': '1173',
   'date': 'Sat, 18 May 2024 13:36:48 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'neighborhood', 'ValueAsString': 'Merriewood'},
  {'FeatureName': 'ocean_proximity', 'ValueAsString': 'NEAR BAY'},
  {'FeatureName': 'median_house_value', 'ValueAsString': '325900.0'},
  {'FeatureName': 'housing_median_age', 'ValueAsString': '34.0'},
  {'FeatureName': 'households', 'ValueAsString': '496.0'},
  {'FeatureName': 'total_bedrooms', 'ValueAsString': '490.0'},
  {'FeatureName': 'locality-political', 'ValueAsString': 'Oakland'},
  {'FeatureName': 'postal_code', 'ValueAsString': '94611.0'},
  {'FeatureName': 'Lessthen_1H_OCEAN', 'ValueAsString': 'False'},
  {'FeatureName': 'INLAND', 'ValueAsString': 'False'},
  {'FeatureName': 'ISLAND', 'Valu

In [58]:
account_id = boto3.client("sts").get_caller_identity()["Account"]
print(account_id)

neighbourhood_feature_group_resolved_output_s3_uri = (
    neighbourhood_feature_group.describe()
    .get("OfflineStoreConfig")
    .get("S3StorageConfig")
    .get("ResolvedOutputS3Uri")
)

neighbourhood_feature_group_s3_prefix = neighbourhood_feature_group_resolved_output_s3_uri.replace(
    f"s3://{default_s3_bucket_name}/", ""
)

offline_store_contents = None
while offline_store_contents is None:
    objects_in_bucket = s3_client.list_objects(
        Bucket=default_s3_bucket_name, Prefix=neighbourhood_feature_group_s3_prefix
    )
    if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
        offline_store_contents = objects_in_bucket["Contents"]
    else:
        print("Waiting for data in offline store...\n")
        sleep(60)

print("Data available.")

652903355321
Data available.


In [59]:
print(neighbourhood_feature_group.as_hive_ddl())

CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.neighborhood-feature-group-18-13-26-36 (
  neighborhood STRING
  ocean_proximity STRING
  median_house_value FLOAT
  housing_median_age FLOAT
  households FLOAT
  total_bedrooms FLOAT
  locality-political STRING
  postal_code FLOAT
  Lessthen_1H_OCEAN STRING
  INLAND STRING
  ISLAND STRING
  NEAR_BAY STRING
  NEAR_OCEAN STRING
  EventTime FLOAT
  write_time TIMESTAMP
  event_time TIMESTAMP
  is_deleted BOOLEAN
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
  STORED AS
  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'
LOCATION 's3://sagemaker-us-east-1-652903355321/sagemaker-featurestore-assignment/652903355321/sagemaker/us-east-1/offline-store/neighborhood-feature-group-18-13-26-36-1716038814/data'


In [71]:
neighbourhood_query = neighbourhood_feature_group.athena_query()

neighbourhood_table = neighbourhood_query.table_name
search_string="Sundale"

query_string = (
    'SELECT * FROM '
    + neighbourhood_table    
    
)
print("Running " + query_string)

# run Athena query. The output is loaded to a Pandas dataframe.
# dataset = pd.DataFrame()
neighbourhood_query.run(
    query_string=query_string,
    output_location="s3://" + default_s3_bucket_name + "/" + prefix + "/query_results/",
)
neighbourhood_query.wait()
dataset = neighbourhood_query.as_dataframe()

dataset

Running SELECT * FROM neighborhood_feature_group_18_13_26_36_1716038814


Unnamed: 0,neighborhood,ocean_proximity,median_house_value,housing_median_age,households,total_bedrooms,locality-political,postal_code,lessthen_1h_ocean,inland,island,near_bay,near_ocean,eventtime,write_time,api_invocation_time,is_deleted
0,Piedmont Avenue,NEAR BAY,224100.0,52.0,519.0,559.0,Oakland,94611.0,False,False,False,True,False,1.716039e+09,2024-05-18 13:38:20.978,2024-05-18 13:33:16.000,False
1,Mission Valley,NEAR OCEAN,441700.0,52.0,285.0,301.0,San Diego,92108.0,False,False,False,False,True,1.716039e+09,2024-05-18 13:38:13.148,2024-05-18 13:33:19.000,False
2,Unknown,NEAR BAY,209100.0,40.0,589.0,617.0,El Cerrito,94530.0,False,False,False,True,False,1.716039e+09,2024-05-18 13:34:03.265,2024-05-18 13:33:40.000,False
3,Temescal,NEAR BAY,191400.0,52.0,309.0,330.0,Oakland,94609.0,False,False,False,True,False,1.716039e+09,2024-05-18 13:38:13.074,2024-05-18 13:33:15.000,False
4,Unknown,NEAR BAY,217200.0,37.0,462.0,489.0,El Cerrito,94530.0,False,False,False,True,False,1.716039e+09,2024-05-18 13:34:03.265,2024-05-18 13:33:40.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,Shandin Hills,INLAND,115000.0,33.0,260.0,282.0,San Bernardino,92405.0,False,True,False,False,False,1.716039e+09,2024-05-18 13:38:13.107,2024-05-18 13:34:48.000,False
20636,Shandin Hills,INLAND,88000.0,38.0,138.0,163.0,San Bernardino,92405.0,False,True,False,False,False,1.716039e+09,2024-05-18 13:38:13.107,2024-05-18 13:34:48.000,False
20637,Shandin Hills,INLAND,85500.0,40.0,182.0,199.0,San Bernardino,92405.0,False,True,False,False,False,1.716039e+09,2024-05-18 13:38:13.107,2024-05-18 13:34:49.000,False
20638,Shandin Hills,INLAND,90800.0,45.0,174.0,166.0,San Bernardino,92405.0,False,True,False,False,False,1.716039e+09,2024-05-18 13:38:13.107,2024-05-18 13:34:49.000,False


In [104]:
neighbourhood_query = neighbourhood_feature_group.athena_query()

neighbourhood_table = neighbourhood_query.table_name
search_string='Alessandro'
query_string = (
    'SELECT * FROM '
    + neighbourhood_table
     + ' WHERE '
    + 'neighborhood = "'
    + search_string
    +'"'
    
)

query_string = query_string.replace('"', "'")
print("Running " + query_string)

# run Athena query. The output is loaded to a Pandas dataframe.
# dataset = pd.DataFrame()
neighbourhood_query.run(
    query_string=query_string,
    output_location="s3://" + default_s3_bucket_name + "/" + prefix + "/query_results/",
)
neighbourhood_query.wait()
dataset = neighbourhood_query.as_dataframe()

dataset

Running SELECT * FROM neighborhood_feature_group_18_13_26_36_1716038814 WHERE neighborhood = 'Alessandro'


Unnamed: 0,neighborhood,ocean_proximity,median_house_value,housing_median_age,households,total_bedrooms,locality-political,postal_code,lessthen_1h_ocean,inland,island,near_bay,near_ocean,eventtime,write_time,api_invocation_time,is_deleted
0,Alessandro,INLAND,70500.0,37.0,563.0,574.0,San Bernardino,92411.0,False,True,False,False,False,1716039000.0,2024-05-18 13:38:33.193,2024-05-18 13:34:48.000,False
1,Alessandro,INLAND,66400.0,39.0,477.0,498.0,San Bernardino,92411.0,False,True,False,False,False,1716039000.0,2024-05-18 13:38:33.193,2024-05-18 13:34:48.000,False
2,Alessandro,INLAND,68100.0,38.0,324.0,337.0,San Bernardino,92411.0,False,True,False,False,False,1716039000.0,2024-05-18 13:38:33.193,2024-05-18 13:34:48.000,False
3,Alessandro,INLAND,67300.0,41.0,270.0,302.0,San Bernardino,92411.0,False,True,False,False,False,1716039000.0,2024-05-18 13:38:33.193,2024-05-18 13:34:48.000,False
